ikarus/src/unicode/extract-categories.ss

130 lines
3.5 KiB
Scheme
Executable File

#!/usr/bin/env ikarus --r6rs-script
(import (ikarus))
(define (read-line)
(let f ([ac '()])
(let ([x (read-char)])
(cond
[(eof-object? x)
(if (null? ac)
(eof-object)
(list->string (reverse ac)))]
[(char=? x #\newline)
(if (null? ac) (f) (list->string (reverse ac)))]
[else (f (cons x ac))]))))
(define (find-semi str i n)
(cond
[(or (fx= i n)
(char=? (string-ref str i) #\;)) i]
[else (find-semi str (+ i 1) n)]))
(define (split str)
(let f ([i 0] [n (string-length str)])
(cond
[(= i n) '()]
[else
(let ([j (find-semi str i n)])
(cond
[(= j n) (list (substring str i j))]
[else
(cons (substring str i j)
(f (+ j 1) n))]))])))
(define (extract-uni-data)
(let f ([ls '()])
(let ([line (read-line)])
(cond
[(eof-object? line)
(reverse ls)]
[else
(let ([fields (split line)])
(let ([num (car fields)]
[cat (caddr fields)])
(f (cons
(cons
(read
(open-input-string (format "#x~a" num)))
(string->symbol cat))
ls))))]))))
(define (codes-in-cats ls cats)
(let f ([ls ls] [ac '()])
(cond
[(null? ls) (reverse ac)]
[(memq (cdar ls) cats)
(f (cdr ls) (cons (caar ls) ac))]
[else (f (cdr ls) ac)])))
(define (make-xonxoff ls)
;;; makes a list where if your index is at an odd
;;; position, then you're ON. If your index is not in
;;; the list, then look for the index before you.
(cons 0
(let f ([i 1] [on? #f]
[ls (if (= (car ls) 0)
(error 'make-xonxoff "first is on")
ls)])
(cond
[(null? ls) (list i)]
[(= i (car ls))
(if on?
(f (+ i 1) #t (cdr ls))
(cons i (f (+ i 1) #t (cdr ls))))]
[else
(if on?
(cons i (f (+ i 1) #f ls))
(f (+ i 1) #f ls))]))))
(define (odd? n) (= (fxlogand n 1) 1))
(define (search-on? n v)
(let ([k (- (vector-length v) 1)])
(let f ([i 0] [k k])
(cond
[(fx= i k) (odd? i)]
[else
(let ([j (fxsra (+ i k 1) 1)])
(cond
[(<= (vector-ref v j) n) (f j k)]
[else (f i (- j 1))]))]))))
(define (verify vec ls)
(let f ([i 0] [ls ls])
(unless (> i #x10FFFF)
(let-values ([(on? ls)
(cond
[(null? ls) (values #f '())]
[(= i (car ls)) (values #t (cdr ls))]
[else (values #f ls)])])
(unless (equal? on? (search-on? i vec))
(error #f "did not pass on ~s" i))
(f (+ i 1) ls)))))
(let ([ls
(with-input-from-file
"UNIDATA/UnicodeData.txt"
extract-uni-data)])
(let ([wanted
(codes-in-cats ls
'(Lu Ll Lt Lm Lo Mn Mc Me Nd Nl No Pd Pc Po Sc Sm Sk So Co))])
(let ([xonxoff (list->vector (make-xonxoff wanted))])
(verify xonxoff wanted)
(with-output-to-file "unicode-constituents.ss"
(lambda ()
(printf ";;; DO NOT EDIT\n")
(printf ";;; automatically generated\n")
(printf ";;; ~s elements in vector\n\n" (vector-length xonxoff))
(pretty-print
`(define unicode-constituents-vector ',xonxoff)))
'replace))))
(printf "Happy Happy Joy Joy\n")