ikarus/scheme/unicode/extract-info.ss

254 lines
9.2 KiB
Scheme
Executable File

#!/usr/bin/env ikarus --r6rs-script
;;; Ikarus Scheme -- A compiler for R6RS Scheme.
;;; Copyright (C) 2006,2007,2008 Abdulaziz Ghuloum
;;;
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License version 3 as
;;; published by the Free Software Foundation.
;;;
;;; This program is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; this file is a mess.
(import
(ikarus)
(unicode-data))
(define license
'";;; Ikarus Scheme -- A compiler for R6RS Scheme.
;;; Copyright (C) 2006,2007,2008 Abdulaziz Ghuloum
;;;
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License version 3 as
;;; published by the Free Software Foundation.
;;;
;;; This program is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
")
(define (hex-string->number str)
(or (string->number (string-append "#x" str))
(error 'hex-string->number "invalid ~s" str)))
(define (find-char c s)
(let f ([i 0] [n (string-length s)])
(cond
[(= i n) #f]
[(char=? (string-ref s i) c) i]
[else (f (add1 i) n)])))
(define (extract-range str)
(cond
[(find-char #\. str)
=>
(lambda (i)
(cons (hex-string->number (substring str 0 i))
(hex-string->number (substring str (+ i 2) (string-length str)))))]
[else
(let ([n (hex-string->number str)])
(cons n n))]))
(define constituent-property #x010000)
(define uppercase-property #x020000)
(define lowercase-property #x040000)
(define titlecase-property #x080000)
(define alphabetic-property #x100000)
(define numeric-property #x200000)
(define whitespace-property #x400000)
;;; Uppercase = Lu + Other_Uppercase
;;; Lowercase = Ll + Other_Lowercase
;;; Titlecase = Lt
;;; Alphabetic = Lu + Ll + Lt + Lm + Lo + Nl + Other_Alphabetic
;;; Numeric = ???
;;; White_Space =
(define proplist-properties
`(["Other_Uppercase" ,uppercase-property]
["Other_Lowercase" ,lowercase-property]
["Other_Alphabetic" ,alphabetic-property]
["White_Space" ,whitespace-property]))
(define categories
;;; 30 categories
`([Lu ,(+ 00 constituent-property uppercase-property alphabetic-property) "Letter, Uppercase"]
[Ll ,(+ 01 constituent-property lowercase-property alphabetic-property) "Letter, Lowercase"]
[Lt ,(+ 02 constituent-property titlecase-property alphabetic-property) "Letter, Titlecase"]
[Lm ,(+ 03 constituent-property alphabetic-property) "Letter, Modifier"]
[Lo ,(+ 04 constituent-property alphabetic-property) "Letter, Other"]
[Mn ,(+ 05 constituent-property) "Mark, Nonspacing"]
[Mc ,(+ 06 ) "Mark, Spacing Combining"]
[Me ,(+ 07 ) "Mark, Enclosing"]
[Nd ,(+ 08 ) "Number, Decimal Digit"]
[Nl ,(+ 09 constituent-property alphabetic-property) "Number, Letter"]
[No ,(+ 10 constituent-property) "Number, Other"]
[Pc ,(+ 11 constituent-property) "Punctuation, Connector"]
[Pd ,(+ 12 constituent-property) "Punctuation, Dash"]
[Ps ,(+ 13 ) "Punctuation, Open"]
[Pe ,(+ 14 ) "Punctuation, Close"]
[Pi ,(+ 15 ) "Punctuation, Initial quote"]
[Pf ,(+ 16 ) "Punctuation, Final quote"]
[Po ,(+ 17 constituent-property) "Punctuation, Other"]
[Sm ,(+ 18 constituent-property) "Symbol, Math"]
[Sc ,(+ 19 constituent-property) "Symbol, Currency"]
[Sk ,(+ 20 constituent-property) "Symbol, Modifier"]
[So ,(+ 21 constituent-property) "Symbol, Other"]
[Zs ,(+ 22 ) "Separator, Space"]
[Zl ,(+ 23 ) "Separator, Line"]
[Zp ,(+ 24 ) "Separator, Paragraph"]
[Cc ,(+ 25 ) "Other, Control"]
[Cf ,(+ 26 ) "Other, Format"]
[Cs ,(+ 27 ) "Other, Surrogate"]
[Co ,(+ 28 constituent-property) "Other, Private Use"]
[Cn ,(+ 29 ) "Other, Not Assigned"]
))
(define (category-index x)
(cond
[(assq x categories) => cadr]
[else (error 'category-index "invalid cat ~s" x)]))
(define (make-cats-table ls)
(let f ([i 1] [st (car ls)] [ls (cdr ls)] [ac '()])
(cond
[(null? ls) (reverse (cons (cons i st) ac))]
[(equal? (cdar ls) (cdr st)) (f (add1 i) st (cdr ls) ac)]
[else
(f 1 (car ls) (cdr ls) (cons (cons i st) ac))])))
(define (merge-sequences ls)
(define (split ls)
(cond
[(null? ls) (values '() '())]
[(= (caar ls) 1)
(let-values ([(chain no-chain) (split (cdr ls))])
(values (cons (cdar ls) chain) no-chain))]
[else
(values '() ls)]))
(define (mk-chain a chain)
(cond
[(null? chain) a]
[else
(cons (car a)
(list->vector
(cons (cdr a)
(map cdr chain))))]))
(cond
[(null? ls) '()]
[(= (caar ls) 1)
(let-values ([(chain no-chain) (split (cdr ls))])
(cons (mk-chain (cdar ls) chain)
(merge-sequences no-chain)))]
[else (cons (cdar ls) (merge-sequences (cdr ls)))]))
(define (iota i n)
(let f ([i i] [n n] [ac '()])
(cond
[(= i n) ac]
[else (f i (sub1 n) (cons (sub1 n) ac))])))
;;; first, make a big vector for all characters
;;; place all in category Cn, unless proven otherwise
(let ([v (make-vector (+ #x10FFFF 1) (category-index 'Cn))])
(let ([ls (get-unicode-data "UNIDATA/UnicodeData.txt")])
;;; interesting parts of each element in ls are:
;;; field0: the character index, numeric
;;; field2: the category, symbolic
;;; field8: if set, then the char has the numeric property
(define (setprop idx prop)
(vector-set! v idx prop))
(let ([ls (map
(lambda (x)
(let ([idx (hex-string->number (list-ref x 0))]
[cat (category-index (string->symbol (list-ref x 2)))]
[num? (list-ref x 8)])
(cons idx
(if (string=? num? "")
cat
(fxlogor cat numeric-property)))))
ls)])
(let f ([ls ls])
(cond
[(null? ls) (void)]
[(null? (cdr ls)) (setprop (caar ls) (cdar ls))]
[(or (= (+ 1 (caar ls)) (caadr ls))
(not (= (cdar ls) (cdadr ls))))
(setprop (caar ls) (cdar ls))
(f (cdr ls))]
[else
(let f ([i (caar ls)] [j (caadr ls)] [p (cdar ls)])
(unless (> i j)
(setprop i p)
(f (add1 i) j p)))
(f (cddr ls))]))))
;;; every element of v now maps to the category-index.
(let ([ls (get-unicode-data "UNIDATA/PropList.txt")])
;;; field0 is a range
;;; field1 is a property name
(for-each
(lambda (x)
(let ([range (extract-range (car x))]
[name (cadr x)])
(cond
[(assoc name proplist-properties) =>
(lambda (a)
(let ([n (cadr a)])
(let f ([i (car range)] [j (cdr range)])
(unless (> i j)
(vector-set! v i (fxlogor (vector-ref v i) n))
(f (add1 i) j)))))])))
ls))
(let ([table
(merge-sequences
(make-cats-table
(map cons
(iota 0 (vector-length v))
(vector->list v))))])
(with-output-to-file "unicode-charinfo.ss"
(lambda ()
(display license)
(printf ";;; DO NOT EDIT\n")
(printf ";;; automatically generated\n")
(printf ";;; ~s elements in vectors\n\n" (length table))
(pretty-print
`(begin
(define constituent-property ,constituent-property )
(define uppercase-property ,uppercase-property )
(define lowercase-property ,lowercase-property )
(define titlecase-property ,titlecase-property )
(define alphabetic-property ,alphabetic-property )
(define numeric-property ,numeric-property )
(define whitespace-property ,whitespace-property)))
(pretty-print
`(define unicode-categories-lookup-vector
',(list->vector (map car table))))
(pretty-print
`(define unicode-categories-values-vector
',(list->vector (map cdr table))))
(pretty-print
`(define unicode-categories-name-vector
',(list->vector (map car categories)))))
'replace)))
(printf "Happy Happy Joy Joy\n")