371 lines
18 KiB
Scheme
371 lines
18 KiB
Scheme
;;; Copyright (C) 2008 Abdulaziz Ghuloum, R. Kent Dybvig
|
|
;;; Copyright (C) 2006,2007 Abdulaziz Ghuloum
|
|
;;;
|
|
;;; Permission is hereby granted, free of charge, to any person obtaining a
|
|
;;; copy of this software and associated documentation files (the "Software"),
|
|
;;; to deal in the Software without restriction, including without limitation
|
|
;;; the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
;;; and/or sell copies of the Software, and to permit persons to whom the
|
|
;;; Software is furnished to do so, subject to the following conditions:
|
|
;;;
|
|
;;; The above copyright notice and this permission notice shall be included in
|
|
;;; all copies or substantial portions of the Software.
|
|
;;;
|
|
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
;;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
;;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
;;; THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
;;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
;;; DEALINGS IN THE SOFTWARE.
|
|
|
|
(import
|
|
(ikarus)
|
|
(unicode-data))
|
|
|
|
(include "extract-common.ss")
|
|
|
|
(define ptr-bytes 4)
|
|
(define code-point-limit #x110000)
|
|
(define-table (make-table table-ref table-set! table-ref-code)
|
|
(make-vector vector-ref vector-set!)
|
|
code-point-limit #x40 #x40)
|
|
|
|
(define (extract-range str)
|
|
(define (find-char c s)
|
|
(let f ([i 0] [n (string-length s)])
|
|
(cond
|
|
[(= i n) #f]
|
|
[(char=? (string-ref s i) c) i]
|
|
[else (f (+ i 1) n)])))
|
|
(cond
|
|
[(find-char #\. str) =>
|
|
(lambda (i)
|
|
(cons
|
|
(hex->num (substring str 0 i))
|
|
(hex->num (substring str (+ i 2) (string-length str)))))]
|
|
[else (let ([n (hex->num str)]) (cons n n))]))
|
|
|
|
; fixnum field laid out as follows:
|
|
; bits 0-5: category number
|
|
; bits 6-9: wordbreak property
|
|
; bits 10-17: combining class
|
|
; bits 18-29: case/type property bits
|
|
|
|
(define-syntax define-bitfields
|
|
(lambda (x)
|
|
(define construct-name
|
|
(lambda (template-identifier . args)
|
|
(datum->syntax template-identifier
|
|
(string->symbol
|
|
(apply string-append
|
|
(map (lambda (x) (format "~a" (syntax->datum x)))
|
|
args))))))
|
|
(define extract
|
|
(lambda (fld* bit def*)
|
|
(define (enumerate ls)
|
|
(let f ([ls ls] [i 0])
|
|
(cond
|
|
[(null? ls) '()]
|
|
[else (cons i (f (cdr ls) (+ i 1)))])))
|
|
(assert (< bit (fixnum-width)))
|
|
(if (null? fld*)
|
|
def*
|
|
(syntax-case (car fld*) (flag enumeration integer)
|
|
[(flag name) (identifier? #'name)
|
|
(extract (cdr fld*) (+ bit 1)
|
|
#`((define name #,(fxsll 1 bit)) #,@def*))]
|
|
[(enumeration name id ...)
|
|
(and (identifier? #'name) (for-all identifier? #'(id ...)))
|
|
(let ([width (bitwise-length (length #'(id ...)))])
|
|
(with-syntax ([name-shift (construct-name #'name #'name "-shift")]
|
|
[name-mask (construct-name #'name #'name "-mask")])
|
|
(extract (cdr fld*) (+ bit width)
|
|
#`((define name-shift #,bit)
|
|
(define name-mask #,(fx- (fxsll 1 width) 1))
|
|
#,@(map (lambda (id val) #`(define #,id #,val))
|
|
#'(id ...)
|
|
(enumerate #'(id ...)))
|
|
#,@def*))))]
|
|
[(integer name width) (identifier? #'name)
|
|
(let ([width (syntax->datum #'width)])
|
|
(with-syntax ([name-shift (construct-name #'name #'name "-shift")]
|
|
[name-mask (construct-name #'name #'name "-mask")])
|
|
(extract (cdr fld*) (+ bit width)
|
|
#`((define name-shift #,bit)
|
|
(define name-mask #,(fx- (fxsll 1 width) 1))
|
|
#,@def*))))]))))
|
|
(syntax-case x ()
|
|
[(_ fld ...)
|
|
#`(begin #,@(extract #'(fld ...) 0 #'()))])))
|
|
|
|
(define-bitfields
|
|
(flag cased-property)
|
|
(flag case-ignorable-property)
|
|
(flag constituent-property)
|
|
(flag uppercase-property)
|
|
(flag lowercase-property)
|
|
(flag titlecase-property)
|
|
(flag alphabetic-property)
|
|
(flag numeric-property)
|
|
(flag whitespace-property)
|
|
(enumeration category Lu-cat Ll-cat Lt-cat Lm-cat Lo-cat
|
|
Mn-cat Mc-cat Me-cat Nd-cat Nl-cat No-cat Pc-cat Pd-cat
|
|
Ps-cat Pe-cat Pi-cat Pf-cat Po-cat Sm-cat Sc-cat Sk-cat
|
|
So-cat Zs-cat Zl-cat Zp-cat Cc-cat Cf-cat Cs-cat Co-cat
|
|
Cn-cat)
|
|
; default wb-other-property must be zero, so must be listed first
|
|
(enumeration wbproperty wb-other-property wb-aletter-property
|
|
wb-numeric-property wb-katakana-property
|
|
wb-extend-property wb-format-property wb-midnum-property
|
|
wb-midletter-property wb-midnumlet-property
|
|
wb-extendnumlet-property wb-cr-property wb-lf-property
|
|
wb-newline-property)
|
|
(integer combining-class 8))
|
|
|
|
;;; Uppercase = Lu + Other_Uppercase
|
|
;;; Lowercase = Ll + Other_Lowercase
|
|
;;; Titlecase = Lt
|
|
;;; Alphabetic = Lu + Ll + Lt + Lm + Lo + Nl + Other_Alphabetic
|
|
;;; Numeric = ???
|
|
;;; White_Space =
|
|
|
|
;;; cased property:
|
|
;;; D120: A character C is defined to be cased if and only if C has the
|
|
;;; Lowercase or Uppercase property or has a General_Category value of
|
|
;;; Titlecase_Letter
|
|
;;;
|
|
;;; case-ignorable property:
|
|
;;; D121 A character C is defined to be case-ignorable if C has the
|
|
;;; value MidLetter or the value MidNumLet for the Word_Break property
|
|
;;; or its General_Category is one of Nonspacing_Mark (Mn),
|
|
;;; Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or
|
|
;;; Modifier_Symbol (Sk).
|
|
|
|
(define name->wbprop
|
|
(lambda (name)
|
|
(case (string->symbol name)
|
|
[(ALetter) (fxsll wb-aletter-property wbproperty-shift)]
|
|
[(Numeric) (fxsll wb-numeric-property wbproperty-shift)]
|
|
[(Katakana) (fxsll wb-katakana-property wbproperty-shift)]
|
|
[(MidLetter) (fxior (fxsll wb-midletter-property wbproperty-shift) case-ignorable-property)]
|
|
[(Extend) (fxsll wb-extend-property wbproperty-shift)]
|
|
[(Format) (fxsll wb-format-property wbproperty-shift)]
|
|
[(MidNum) (fxsll wb-midnum-property wbproperty-shift)]
|
|
[(MidNumLet) (fxior (fxsll wb-midnumlet-property wbproperty-shift) case-ignorable-property)]
|
|
[(ExtendNumLet) (fxsll wb-extendnumlet-property wbproperty-shift)]
|
|
[(CR) (fxsll wb-cr-property wbproperty-shift)]
|
|
[(LF) (fxsll wb-lf-property wbproperty-shift)]
|
|
[(Newline) (fxsll wb-newline-property wbproperty-shift)]
|
|
[else (error 'name->wbprop "unexpected property ~a" name)])))
|
|
|
|
(define proplist-properties
|
|
`(["Other_Uppercase" ,uppercase-property]
|
|
["Other_Lowercase" ,lowercase-property]
|
|
["Other_Alphabetic" ,alphabetic-property]
|
|
["White_Space" ,whitespace-property]))
|
|
|
|
(define categories
|
|
;;; 30 categories
|
|
`([Lu ,(+ (fxsll Lu-cat category-shift) constituent-property uppercase-property alphabetic-property) "Letter, Uppercase"]
|
|
[Ll ,(+ (fxsll Ll-cat category-shift) constituent-property lowercase-property alphabetic-property) "Letter, Lowercase"]
|
|
[Lt ,(+ (fxsll Lt-cat category-shift) constituent-property titlecase-property alphabetic-property cased-property) "Letter, Titlecase"]
|
|
[Lm ,(+ (fxsll Lm-cat category-shift) constituent-property alphabetic-property case-ignorable-property) "Letter, Modifier"]
|
|
[Lo ,(+ (fxsll Lo-cat category-shift) constituent-property alphabetic-property) "Letter, Other"]
|
|
[Mn ,(+ (fxsll Mn-cat category-shift) constituent-property case-ignorable-property) "Mark, Nonspacing"]
|
|
[Mc ,(+ (fxsll Mc-cat category-shift) ) "Mark, Spacing Combining"]
|
|
[Me ,(+ (fxsll Me-cat category-shift) case-ignorable-property) "Mark, Enclosing"]
|
|
[Nd ,(+ (fxsll Nd-cat category-shift) ) "Number, Decimal Digit"]
|
|
[Nl ,(+ (fxsll Nl-cat category-shift) constituent-property alphabetic-property) "Number, Letter"]
|
|
[No ,(+ (fxsll No-cat category-shift) constituent-property) "Number, Other"]
|
|
[Pc ,(+ (fxsll Pc-cat category-shift) constituent-property) "Punctuation, Connector"]
|
|
[Pd ,(+ (fxsll Pd-cat category-shift) constituent-property) "Punctuation, Dash"]
|
|
[Ps ,(+ (fxsll Ps-cat category-shift) ) "Punctuation, Open"]
|
|
[Pe ,(+ (fxsll Pe-cat category-shift) ) "Punctuation, Close"]
|
|
[Pi ,(+ (fxsll Pi-cat category-shift) ) "Punctuation, Initial quote"]
|
|
[Pf ,(+ (fxsll Pf-cat category-shift) ) "Punctuation, Final quote"]
|
|
[Po ,(+ (fxsll Po-cat category-shift) constituent-property) "Punctuation, Other"]
|
|
[Sm ,(+ (fxsll Sm-cat category-shift) constituent-property) "Symbol, Math"]
|
|
[Sc ,(+ (fxsll Sc-cat category-shift) constituent-property) "Symbol, Currency"]
|
|
[Sk ,(+ (fxsll Sk-cat category-shift) constituent-property case-ignorable-property) "Symbol, Modifier"]
|
|
[So ,(+ (fxsll So-cat category-shift) constituent-property) "Symbol, Other"]
|
|
[Zs ,(+ (fxsll Zs-cat category-shift) ) "Separator, Space"]
|
|
[Zl ,(+ (fxsll Zl-cat category-shift) ) "Separator, Line"]
|
|
[Zp ,(+ (fxsll Zp-cat category-shift) ) "Separator, Paragraph"]
|
|
[Cc ,(+ (fxsll Cc-cat category-shift) ) "Other, Control"]
|
|
[Cf ,(+ (fxsll Cf-cat category-shift) case-ignorable-property) "Other, Format"]
|
|
[Cs ,(+ (fxsll Cs-cat category-shift) ) "Other, Surrogate"]
|
|
[Co ,(+ (fxsll Co-cat category-shift) constituent-property) "Other, Private Use"]
|
|
[Cn ,(+ (fxsll Cn-cat category-shift) ) "Other, Not Assigned"]
|
|
))
|
|
|
|
(define (category/flags x)
|
|
(cond
|
|
[(assq x categories) => cadr]
|
|
[else (error 'category/flags "invalid cat ~s" x)]))
|
|
|
|
(define (make-cats-table ls)
|
|
(let f ([i 1] [st (car ls)] [ls (cdr ls)] [ac '()])
|
|
(cond
|
|
[(null? ls) (reverse (cons (cons i st) ac))]
|
|
[(equal? (cdar ls) (cdr st)) (f (+ i 1) st (cdr ls) ac)]
|
|
[else (f 1 (car ls) (cdr ls) (cons (cons i st) ac))])))
|
|
|
|
; create table, placing all in category Cn until proven otherwise
|
|
(let ([tbl (make-table (category/flags 'Cn))])
|
|
(define (setprop n prop) (table-set! tbl n prop))
|
|
(define (getprop n) (table-ref tbl n))
|
|
;;; interesting parts of each element in UnicodeData.txt are:
|
|
;;; field0: the character index, numeric
|
|
;;; field2: the category, symbolic
|
|
;;; field3: the combining class (0-255)
|
|
;;; field8: if set, then the char has the numeric property
|
|
;;; field12: if set, then the char has upper-case mapping and is thus cased
|
|
;;; field13: if set, then the char has lower-case mapping and is thus cased
|
|
(let f ([ls (map (lambda (x)
|
|
(let ([n (hex->num (list-ref x 0))]
|
|
[cclass (string->number (list-ref x 3))]
|
|
[cat/flags (category/flags (string->symbol (list-ref x 2)))]
|
|
[num (if (string=? (list-ref x 8) "") 0 numeric-property)]
|
|
[cased (if (and (string=? (list-ref x 12) "") (string=? (list-ref x 13) ""))
|
|
0 cased-property)])
|
|
(cons n (fxior num cased
|
|
(fxsll cclass combining-class-shift)
|
|
cat/flags))))
|
|
(get-unicode-data "UNIDATA/UnicodeData.txt"))])
|
|
(unless (null? ls)
|
|
(cond
|
|
[(null? (cdr ls)) (setprop (caar ls) (cdar ls))]
|
|
[(or (= (+ 1 (caar ls)) (caadr ls))
|
|
(not (= (cdar ls) (cdadr ls))))
|
|
(setprop (caar ls) (cdar ls))
|
|
(f (cdr ls))]
|
|
[else
|
|
(let f ([n (caar ls)] [j (caadr ls)] [p (cdar ls)])
|
|
(unless (> n j) (setprop n p) (f (+ n 1) j p)))
|
|
(f (cddr ls))])))
|
|
;;; interesting parts of each element in WordBreakProperty.txt are:
|
|
;;; field0: the character index, numeric
|
|
;;; field1: the word-break property
|
|
(for-each
|
|
(lambda (x)
|
|
(let ([range (extract-range (list-ref x 0))])
|
|
(let f ([i (car range)] [j (cdr range)])
|
|
(unless (> i j)
|
|
(let ([prop (getprop i)])
|
|
(unless (fx= (fxand (fxsra prop wbproperty-shift) wbproperty-mask) 0)
|
|
(error #f "multiple word break properties found for ~x" i))
|
|
(setprop i (fxior prop (name->wbprop (list-ref x 1))))
|
|
(f (+ i 1) j))))))
|
|
(get-unicode-data "UNIDATA/WordBreakProperty.txt"))
|
|
;;; interesting parts of each element in PropList.txt are:
|
|
;;; field0: range of character indices
|
|
;;; field1: property name
|
|
(for-each
|
|
(lambda (x)
|
|
(let ([range (extract-range (list-ref x 0))]
|
|
[name (list-ref x 1)])
|
|
(cond
|
|
[(assoc name proplist-properties) =>
|
|
(lambda (a)
|
|
(let ([n (cadr a)])
|
|
(let f ([i (car range)] [j (cdr range)])
|
|
(unless (> i j)
|
|
(setprop i (fxlogor (getprop i) n))
|
|
(f (+ i 1) j)))))])))
|
|
(get-unicode-data "UNIDATA/PropList.txt"))
|
|
(commonize* tbl)
|
|
(with-output-to-file* "unicode-charinfo.ss"
|
|
(lambda ()
|
|
(parameterize ([print-graph #t])
|
|
(pretty-print
|
|
`(module ($char-constituent? $char-upper-case? $char-lower-case? $char-title-case? $char-alphabetic?
|
|
$char-numeric? $char-whitespace? $char-cased? $char-case-ignorable? $char-category
|
|
$wb-aletter? $wb-numeric? $wb-katakana? $wb-extend? $wb-format? $wb-midnum? $wb-midletter?
|
|
$wb-midnumlet? $wb-extendnumlet? $char-combining-class $char-dump)
|
|
(define category-mask ,category-mask)
|
|
(define unicode-category-table ',tbl)
|
|
(define unicode-category-names
|
|
',(list->vector (map car categories)))
|
|
(define table-ref ,table-ref-code)
|
|
(define (getprop n) (table-ref unicode-category-table n))
|
|
(define $char-constituent?
|
|
(lambda (c)
|
|
(fxlogtest (getprop (char->integer c)) ,constituent-property)))
|
|
(define $char-upper-case?
|
|
(lambda (c)
|
|
(fxlogtest (getprop (char->integer c)) ,uppercase-property)))
|
|
(define $char-lower-case?
|
|
(lambda (c)
|
|
(fxlogtest (getprop (char->integer c)) ,lowercase-property)))
|
|
(define $char-title-case?
|
|
(lambda (c)
|
|
(fxlogtest (getprop (char->integer c)) ,titlecase-property)))
|
|
(define $char-alphabetic?
|
|
(lambda (c)
|
|
(fxlogtest (getprop (char->integer c)) ,alphabetic-property)))
|
|
(define $char-numeric?
|
|
(lambda (c)
|
|
(fxlogtest (getprop (char->integer c)) ,numeric-property)))
|
|
(define $char-whitespace?
|
|
(lambda (c)
|
|
(fxlogtest (getprop (char->integer c)) ,whitespace-property)))
|
|
(define $char-cased?
|
|
(lambda (c)
|
|
(fxlogtest (getprop (char->integer c)) ,cased-property)))
|
|
(define $char-case-ignorable?
|
|
(lambda (c)
|
|
(fxlogtest (getprop (char->integer c)) ,case-ignorable-property)))
|
|
(define (wb prop)
|
|
(lambda (c)
|
|
(fx= (fxand
|
|
(fxsra
|
|
(getprop (char->integer c))
|
|
,wbproperty-shift)
|
|
,wbproperty-mask)
|
|
prop)))
|
|
(define $wb-aletter? (wb ,wb-aletter-property))
|
|
(define $wb-numeric? (wb ,wb-numeric-property))
|
|
(define $wb-katakana? (wb ,wb-katakana-property))
|
|
(define $wb-extend? (wb ,wb-extend-property))
|
|
(define $wb-format? (wb ,wb-format-property))
|
|
(define $wb-midnum? (wb ,wb-midnum-property))
|
|
(define $wb-midletter? (wb ,wb-midletter-property))
|
|
(define $wb-midnumlet? (wb ,wb-midnumlet-property))
|
|
(define $wb-extendnumlet? (wb ,wb-extendnumlet-property))
|
|
(define $char-combining-class
|
|
(lambda (c)
|
|
(fxand (fxsra (getprop (char->integer c)) ,combining-class-shift)
|
|
,combining-class-mask)))
|
|
(define $char-category
|
|
(lambda (c)
|
|
(vector-ref unicode-category-names
|
|
(fxand (fxsra (getprop (char->integer c)) ,category-shift)
|
|
,category-mask))))
|
|
(define $char-dump
|
|
(lambda (c)
|
|
(define (list-true . args) (remq #f args))
|
|
(list-true
|
|
(and ($char-constituent? c) 'constituent)
|
|
(and ($char-upper-case? c) 'upper-case)
|
|
(and ($char-lower-case? c) 'lower-case)
|
|
(and ($char-title-case? c) 'title-case)
|
|
(and ($char-alphabetic? c) 'alphabetic)
|
|
(and ($char-numeric? c) 'whitespace)
|
|
(and ($char-whitespace? c) 'whitespace)
|
|
(and ($char-cased? c) 'cased)
|
|
(and ($char-case-ignorable? c) 'case-ignorable)
|
|
(and ($wb-aletter? c) 'aletter)
|
|
(and ($wb-numeric? c) 'numeric)
|
|
(and ($wb-katakana? c) 'katakana)
|
|
(and ($wb-extend? c) 'extend)
|
|
(and ($wb-format? c) 'format)
|
|
(and ($wb-midnum? c) 'midnum)
|
|
(and ($wb-midletter? c) 'midletter)
|
|
(and ($wb-midnumlet? c) 'midnumlet)
|
|
(and ($wb-extendnumlet? c) 'extendnumlet)
|
|
`(combining-class ,($char-combining-class c))
|
|
($char-category c))))))))))
|
|
|
|
(printf "Happy Happy Joy Joy ~s\n" (sizeof cache))
|