ikarus/src/unicode/extract-info.ss

302 lines
10 KiB
Scheme
Raw Normal View History

#!/usr/bin/env ikarus --r6rs-script
;;; this file is a mess.
(import
(ikarus)
(unicode-data))
(define (hex-string->number str)
(or (string->number (string-append "#x" str))
(error 'hex-string->number "invalid ~s" str)))
(define (find-char c s)
(let f ([i 0] [n (string-length s)])
(cond
[(= i n) #f]
[(char=? (string-ref s i) c) i]
[else (f (add1 i) n)])))
(define (extract-range str)
(cond
[(find-char #\. str)
=>
(lambda (i)
(cons (hex-string->number (substring str 0 i))
(hex-string->number (substring str (+ i 2) (string-length str)))))]
[else
(let ([n (hex-string->number str)])
(cons n n))]))
(define (codes-in-cats ls cats)
(let f ([ls ls] [ac '()])
(cond
[(null? ls) (reverse ac)]
[(memq (cdar ls) cats)
(f (cdr ls) (cons (caar ls) ac))]
[else (f (cdr ls) ac)])))
(define (make-xonxoff ls)
;;; makes a list where if your index is at an odd
;;; position, then you're ON. If your index is not in
;;; the list, then look for the index before you.
(cons 0
(let f ([i 1] [on? #f]
[ls (if (= (car ls) 0)
(error 'make-xonxoff "first is on")
ls)])
(cond
[(null? ls) (list i)]
[(= i (car ls))
(if on?
(f (+ i 1) #t (cdr ls))
(cons i (f (+ i 1) #t (cdr ls))))]
[else
(if on?
(cons i (f (+ i 1) #f ls))
(f (+ i 1) #f ls))]))))
(define (search-on? n v)
(let ([k (- (vector-length v) 1)])
(let f ([i 0] [k k])
(cond
[(fx= i k) (odd? i)]
[else
(let ([j (fxsra (+ i k 1) 1)])
(cond
[(<= (vector-ref v j) n) (f j k)]
[else (f i (- j 1))]))]))))
(define (verify vec ls)
(let f ([i 0] [ls ls])
(unless (> i #x10FFFF)
(let-values ([(on? ls)
(cond
[(null? ls) (values #f '())]
[(= i (car ls)) (values #t (cdr ls))]
[else (values #f ls)])])
(unless (equal? on? (search-on? i vec))
(error #f "did not pass on ~s" i))
(f (+ i 1) ls)))))
(define (cat fields)
(let ([num (car fields)]
[cat (caddr fields)])
(cons
(read (open-input-string (format "#x~a" num)))
(string->symbol cat))))
(define constituent-property #x010000)
(define uppercase-property #x020000)
(define lowercase-property #x040000)
(define titlecase-property #x080000)
(define alphabetic-property #x100000)
(define numeric-property #x200000)
(define whitespace-property #x400000)
;;; Uppercase = Lu + Other_Uppercase
;;; Lowercase = Ll + Other_Lowercase
;;; Titlecase = Lt
;;; Alphabetic = Lu + Ll + Lt + Lm + Lo + Nl + Other_Alphabetic
;;; Numeric = ???
;;; White_Space =
(define proplist-properties
`(["Other_Uppercase" ,uppercase-property]
["Other_Lowercase" ,lowercase-property]
["Other_Alphabetic" ,alphabetic-property]
["White_Space" ,whitespace-property]))
(define categories
;;; 30 categories
`([Lu ,(+ 00 constituent-property uppercase-property alphabetic-property) "Letter, Uppercase"]
[Ll ,(+ 01 constituent-property lowercase-property alphabetic-property) "Letter, Lowercase"]
[Lt ,(+ 02 constituent-property titlecase-property alphabetic-property) "Letter, Titlecase"]
[Lm ,(+ 03 constituent-property alphabetic-property) "Letter, Modifier"]
[Lo ,(+ 04 constituent-property alphabetic-property) "Letter, Other"]
[Mn ,(+ 05 constituent-property) "Mark, Nonspacing"]
[Mc ,(+ 06 constituent-property) "Mark, Spacing Combining"]
[Me ,(+ 07 constituent-property) "Mark, Enclosing"]
[Nd ,(+ 08 constituent-property numeric-property) "Number, Decimal Digit"]
[Nl ,(+ 09 constituent-property alphabetic-property numeric-property) "Number, Letter"]
[No ,(+ 10 constituent-property numeric-property) "Number, Other"]
[Pc ,(+ 11 constituent-property) "Punctuation, Connector"]
[Pd ,(+ 12 constituent-property) "Punctuation, Dash"]
[Ps ,(+ 13 ) "Punctuation, Open"]
[Pe ,(+ 14 ) "Punctuation, Close"]
[Pi ,(+ 15 ) "Punctuation, Initial quote"]
[Pf ,(+ 16 ) "Punctuation, Final quote"]
[Po ,(+ 17 constituent-property) "Punctuation, Other"]
[Sm ,(+ 18 constituent-property) "Symbol, Math"]
[Sc ,(+ 19 constituent-property) "Symbol, Currency"]
[Sk ,(+ 20 constituent-property) "Symbol, Modifier"]
[So ,(+ 21 constituent-property) "Symbol, Other"]
[Zs ,(+ 22 ) "Separator, Space"]
[Zl ,(+ 23 ) "Separator, Line"]
[Zp ,(+ 24 ) "Separator, Paragraph"]
[Cc ,(+ 25 ) "Other, Control"]
[Cf ,(+ 26 ) "Other, Format"]
[Cs ,(+ 27 ) "Other, Surrogate"]
[Co ,(+ 28 constituent-property) "Other, Private Use"]
[Cn ,(+ 29 ) "Other, Not Assigned"]
))
(define (category-index x)
(cond
[(assq x categories) => cadr]
[else (error 'category-index "invalid cat ~s" x)]))
(define (insert-missing ls)
(let ([Cn-index (category-index 'Cn)])
(let f ([ls ls] [i 0] [ac '()])
(cond
[(> i #x10FFFF) (reverse ac)]
[(null? ls)
(f ls (+ i 1) (cons (cons i Cn-index) ac))]
[(= i (caar ls))
(f (cdr ls) (+ i 1)
(cons (cons (caar ls) (category-index (cdar ls))) ac))]
[else
(f ls (+ i 1) (cons (cons i Cn-index) ac))]))))
(define (make-cats-table ls)
(let f ([i 1] [st (car ls)] [ls (cdr ls)] [ac '()])
(cond
[(null? ls) (reverse (cons (cons i st) ac))]
[(equal? (cdar ls) (cdr st)) (f (add1 i) st (cdr ls) ac)]
[else
(f 1 (car ls) (cdr ls) (cons (cons i st) ac))])))
(define (merge-sequences ls)
(define (split ls)
(cond
[(null? ls) (values '() '())]
[(= (caar ls) 1)
(let-values ([(chain no-chain) (split (cdr ls))])
(values (cons (cdar ls) chain) no-chain))]
[else
(values '() ls)]))
(define (mk-chain a chain)
(cond
[(null? chain) a]
[else
(cons (car a)
(list->vector
(cons (cdr a)
(map cdr chain))))]))
(cond
[(null? ls) '()]
[(= (caar ls) 1)
(let-values ([(chain no-chain) (split (cdr ls))])
(cons (mk-chain (cdar ls) chain)
(merge-sequences no-chain)))]
[else (cons (cdar ls) (merge-sequences (cdr ls)))]))
(define (iota i n)
(let f ([i i] [n n] [ac '()])
(cond
[(= i n) ac]
[else (f i (sub1 n) (cons (sub1 n) ac))])))
;;; first, make a big vector for all characters
;;; place all in category Cn, unless proven otherwise
(let ([v (make-vector (+ #x10FFFF 1) (category-index 'Cn))])
(let ([ls (get-unicode-data "UNIDATA/UnicodeData.txt")])
;;; interesting parts of each element in ls are:
;;; field0: the character index, numeric
;;; field2: the category, symbolic
(for-each
(lambda (x)
(let ([idx (hex-string->number (list-ref x 0))]
[cat (category-index (string->symbol (list-ref x 2)))])
(vector-set! v idx cat)))
ls))
;;; every element of v now maps to the category-index.
(let ([ls (get-unicode-data "UNIDATA/PropList.txt")])
;;; field0 is a range
;;; field1 is a property name
(for-each
(lambda (x)
(let ([range (extract-range (car x))]
[name (cadr x)])
(cond
[(assoc name proplist-properties) =>
(lambda (a)
(let ([n (cadr a)])
(let f ([i (car range)] [j (cdr range)])
(unless (> i j)
(vector-set! v i (fxlogor (vector-ref v i) n))
(f (add1 i) j)))))])))
ls))
(let ([table
(merge-sequences
(make-cats-table
(map cons
(iota 0 (vector-length v))
(vector->list v))))])
(with-output-to-file "unicode-charinfo.ss"
(lambda ()
(printf ";;; DO NOT EDIT\n")
(printf ";;; automatically generated\n")
(printf ";;; ~s elements in vectors\n\n" (length table))
(pretty-print
`(begin
(define constituent-property ,constituent-property )
(define uppercase-property ,uppercase-property )
(define lowercase-property ,lowercase-property )
(define titlecase-property ,titlecase-property )
(define alphabetic-property ,alphabetic-property )
(define numeric-property ,numeric-property )
(define whitespace-property ,whitespace-property)))
(pretty-print
`(define unicode-categories-lookup-vector
',(list->vector (map car table))))
(pretty-print
`(define unicode-categories-values-vector
',(list->vector (map cdr table))))
(pretty-print
`(define unicode-categories-name-vector
',(list->vector (map car categories)))))
'replace))
(exit 0)
(let ([ls (map cat (get-unicode-data "UNIDATA/UnicodeData.txt"))])
(let ([wanted
(codes-in-cats ls
'(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pd Pc Po Sc Sm Sk So Co))]
[cats-table (merge-sequences
(make-cats-table
(insert-missing ls)))])
(let ([xonxoff (list->vector (make-xonxoff wanted))])
(verify xonxoff wanted)
(with-output-to-file "unicode-info.ss"
(lambda ()
(printf ";;; DO NOT EDIT\n")
(printf ";;; automatically generated\n")
(printf ";;; ~s elements in vector\n\n" (vector-length xonxoff))
(pretty-print
`(define unicode-constituents-vector ',xonxoff))
(printf ";;; ~s elements in cats\n" (length cats-table))
(pretty-print
`(define unicode-categories-lookup-vector
',(list->vector (map car cats-table))))
(pretty-print
`(define unicode-categories-values-vector
',(list->vector (map cdr cats-table))))
(pretty-print
`(define unicode-categories-name-vector
',(list->vector (map car categories)))))
'replace)))))
(printf "Happy Happy Joy Joy\n")