- speedup of unicode normalization routines.

- added unicode normalization tests.
This commit is contained in:
Abdulaziz Ghuloum 2008-10-29 03:15:17 -04:00
parent 15e8775c67
commit a4ba327173
5 changed files with 373 additions and 100 deletions

View File

@ -489,112 +489,110 @@
(define $decompose
; might should optimize for sequences of ascii characters
(lambda (s canonical?)
(define (canonical<? c1 c2)
(fx< ($char-combining-class c1) ($char-combining-class c2)))
(define (sort-and-flush comb*)
(for-each write-char
(list-sort canonical<? (reverse comb*))))
(define ($char-decomp c)
(if (and (char<=? hangul-sbase c) (char<=? c hangul-slimit))
($hangul-decomp c)
(if canonical?
($str-decomp-canon c)
($str-decomp-compat c))))
(with-output-to-string
(lambda ()
(let ([n (string-length s)])
(define (push-and-go c* c** i comb*)
(if (char? c*)
(go c* c** i comb*)
(go (car c*) (cons (cdr c*) c**) i comb*)))
(define (pop-and-go c** i comb*)
(if (null? c**)
(if (fx= i n)
(let ([n (string-length s)] [ac '()])
(define (canonical>? c1 c2)
(fx> ($char-combining-class c1) ($char-combining-class c2)))
(define (sort-and-flush comb*)
(unless (null? comb*)
(set! ac (append (list-sort canonical>? comb*) ac))))
(define ($char-decomp c)
(if (and (char<=? hangul-sbase c) (char<=? c hangul-slimit))
($hangul-decomp c)
(if canonical?
($str-decomp-canon c)
($str-decomp-compat c))))
(define (push-and-go c* c** i comb*)
(if (char? c*)
(go c* c** i comb*)
(go (car c*) (cons (cdr c*) c**) i comb*)))
(define (pop-and-go c** i comb*)
(if (null? c**)
(if (fx= i n)
(sort-and-flush comb*)
(go (string-ref s i) '() (fx+ i 1) comb*))
(push-and-go (car c**) (cdr c**) i comb*)))
(define (go c c** i comb*)
(let ([c* ($char-decomp c)])
(if (eq? c c*) ; should be eqv?
(if (fxzero? ($char-combining-class c))
(begin
(sort-and-flush comb*)
(go (string-ref s i) '() (fx+ i 1) comb*))
(push-and-go (car c**) (cdr c**) i comb*)))
(define (go c c** i comb*)
(let ([c* ($char-decomp c)])
(if (eq? c c*) ; should be eqv?
(if (fxzero? ($char-combining-class c))
(begin
(sort-and-flush comb*)
(write-char c)
(pop-and-go c** i '()))
(pop-and-go c** i (cons c comb*)))
(push-and-go c* c** i comb*))))
(pop-and-go '() 0 '()))))))
(set! ac (cons c ac))
(pop-and-go c** i '()))
(pop-and-go c** i (cons c comb*)))
(push-and-go c* c** i comb*))))
(pop-and-go '() 0 '())
(list->string (reverse ac)))))
(define $compose
(let ([comp-table #f])
(define (lookup-composite c1 c2)
; needs to handle HANGUL
(hashtable-ref comp-table (cons c1 c2) #f))
(define (init!)
(set! comp-table
(make-hashtable
(lambda (x)
(fxxor
(fxsll (char->integer (car x)) 7)
(char->integer (cdr x))))
(lambda (x y)
(and (char=? (car x) (car y))
(char=? (cdr x) (cdr y))))))
(vector-for-each
(lambda (c* c) (hashtable-set! comp-table c* c))
(car ($composition-pairs))
(cdr ($composition-pairs))))
(lambda (s)
(unless comp-table
(set! comp-table
(make-hashtable
(lambda (x)
(fxxor
(fxsll (char->integer (car x)) 7)
(char->integer (cdr x))))
(lambda (x y)
(and (char=? (car x) (car y))
(char=? (cdr x) (cdr y))))))
(vector-for-each
(lambda (c* c) (hashtable-set! comp-table c* c))
(car ($composition-pairs))
(cdr ($composition-pairs))))
(with-output-to-string
(lambda ()
(let ([n (string-length s)])
(define (dump c acc)
(write-char c)
(for-each write-char (reverse acc)))
(define (s0 i)
(unless (fx= i n)
(let ([c (string-ref s i)])
(if (fxzero? ($char-combining-class c))
(s1 (fx+ i 1) c)
(begin (write-char c) (s0 (fx+ i 1)))))))
(define (s1 i c)
(if (fx= i n)
(write-char c)
(let ([c1 (string-ref s i)])
(cond
[(and (and (char<=? hangul-lbase c)
(char<=? c hangul-llimit))
(and (char<=? hangul-vbase c1)
(char<=? c1 hangul-vlimit)))
(s1 (fx+ i 1)
(let ([lindex (char- c hangul-lbase)]
[vindex (char- c1 hangul-vbase)])
(integer->char
(fx+ (char->integer hangul-sbase)
(fx* (fx+ (fx* lindex hangul-vcount) vindex)
hangul-tcount)))))]
[(and (and (char<=? hangul-sbase c)
(char<=? c hangul-slimit))
(and (char<=? hangul-tbase c1)
(char<=? c1 hangul-tlimit))
(let ([sindex (char- c hangul-sbase)])
(fxzero? (fxmod sindex hangul-tcount))))
(let ([tindex (char- c1 hangul-tbase)])
(s1 (fx+ i 1) (integer->char (fx+ (char->integer c) tindex))))]
[else (s2 i c -1 '())]))))
(define (s2 i c class acc)
(if (fx= i n)
(dump c acc)
(let ([c1 (string-ref s i)])
(let ([class1 ($char-combining-class c1)])
(cond
[(and (fx< class class1) (lookup-composite c c1)) =>
(lambda (c) (s2 (fx+ i 1) c class acc))]
[(fx= class1 0)
(dump c acc)
(s1 (fx+ i 1) c1)]
[else (s2 (fx+ i 1) c class1 (cons c1 acc))])))))
(s0 0)))))))
(unless comp-table (init!))
(let ([ac '()] [n (string-length s)])
(define (dump c acc)
(set! ac (cons c ac))
(unless (null? acc) (set! ac (append acc ac))))
(define (s0 i)
(unless (fx= i n)
(let ([c (string-ref s i)])
(if (fxzero? ($char-combining-class c))
(s1 (fx+ i 1) c)
(begin (set! ac (cons c ac)) (s0 (fx+ i 1)))))))
(define (s1 i c)
(if (fx= i n)
(set! ac (cons c ac))
(let ([c1 (string-ref s i)])
(cond
[(and (and (char<=? hangul-lbase c)
(char<=? c hangul-llimit))
(and (char<=? hangul-vbase c1)
(char<=? c1 hangul-vlimit)))
(s1 (fx+ i 1)
(let ([lindex (char- c hangul-lbase)]
[vindex (char- c1 hangul-vbase)])
(integer->char
(fx+ (char->integer hangul-sbase)
(fx* (fx+ (fx* lindex hangul-vcount) vindex)
hangul-tcount)))))]
[(and (and (char<=? hangul-sbase c)
(char<=? c hangul-slimit))
(and (char<=? hangul-tbase c1)
(char<=? c1 hangul-tlimit))
(let ([sindex (char- c hangul-sbase)])
(fxzero? (fxmod sindex hangul-tcount))))
(let ([tindex (char- c1 hangul-tbase)])
(s1 (fx+ i 1) (integer->char (fx+ (char->integer c) tindex))))]
[else (s2 i c -1 '())]))))
(define (s2 i c class acc)
(if (fx= i n)
(dump c acc)
(let ([c1 (string-ref s i)])
(let ([class1 ($char-combining-class c1)])
(cond
[(and (fx< class class1) (lookup-composite c c1)) =>
(lambda (c) (s2 (fx+ i 1) c class acc))]
[(fx= class1 0)
(dump c acc)
(s1 (fx+ i 1) c1)]
[else (s2 (fx+ i 1) c class1 (cons c1 acc))])))))
(s0 0)
(list->string (reverse ac))))))
(define-string-op string-normalize-nfd
(lambda (s) ($decompose s #t)))

View File

@ -22,7 +22,7 @@
lists strings bytevectors hashtables fixnums bignums numerics
bitwise enums pointers sorting io fasl reader case-folding
parse-flonums string-to-number bignum-to-flonum div-and-mod
fldiv-and-mod))
fldiv-and-mod unicode normalization))
(define (run-test-from-library x)
(printf "[testing ~a] ..." x)

View File

@ -0,0 +1,123 @@
(library (tests normalization)
(export run-tests)
(import (ikarus) (unicode-data))
(define (reset) (error 'reset "yukk"))
(define (enumerate ls)
(let f ([ls ls] [i 0])
(cond
[(null? ls) '()]
[else (cons i (f (cdr ls) (+ i 1)))])))
(define (list-head ls n)
(if (zero? n)
'()
(cons (car ls) (list-head (cdr ls) (- n 1)))))
(define (split str)
(remove ""
(let f ([i 0] [n (string-length str)])
(cond
[(= i n) (list (substring str 0 n))]
[(char=? (string-ref str i) #\space)
(cons (substring str 0 i)
(split (substring str (+ i 1) n)))]
[else (f (add1 i) n)]))))
(define (conv x)
(list->string
(map (lambda (x) (integer->char (string->number x 16)))
(split x))))
(define (run-tests)
(let ([data (map (lambda (x) (map conv (list-head x 5)))
(filter (lambda (x) (>= (length x) 5))
(get-unicode-data
"unicode/UNIDATA/NormalizationTest.txt")))])
(define NFD string-normalize-nfd)
(define NFKD string-normalize-nfkd)
(define NFC string-normalize-nfc)
(define NFKC string-normalize-nfkc)
(define (test1)
(for-each
(lambda (x testno)
(apply
(lambda (c1 c2 c3 c4 c5)
(unless (and (string=? c2 (NFC c1) (NFC c2) (NFC c3))
(string=? c4 (NFC c4) (NFC c5)))
(parameterize ([print-unicode #f])
(printf "test 1[~s] failed for ~s\n" testno x)
(printf " c2 = ~s\n" c2)
(printf " NFC(c1) = ~s\n" (NFC c1))
(printf " NFC(c2) = ~s\n" (NFC c2))
(printf " NFC(c3) = ~s\n" (NFC c3))
(printf " c4 = ~s\n" c4)
(printf " NFC(c4) = ~s\n" (NFC c4))
(printf " NFC(c5) = ~s\n" (NFC c5))
(reset))))
x))
data (enumerate data)))
(define (test2)
(for-each
(lambda (x testno)
(apply
(lambda (c1 c2 c3 c4 c5)
(unless (and (string=? c3 (NFD c1) (NFD c2) (NFD c3))
(string=? c5 (NFD c4) (NFD c5)))
(parameterize ([print-unicode #f])
(printf "test 2[~s] failed for ~s\n" testno x)
(printf " c3 = ~s\n" c3)
(printf " NFD(c1) = ~s\n" (NFD c1))
(printf " NFD(c2) = ~s\n" (NFD c2))
(printf " NFD(c3) = ~s\n" (NFD c3))
(printf " c5 = ~s\n" c5)
(printf " NFD(c4) = ~s\n" (NFD c4))
(printf " NFD(c5) = ~s\n" (NFD c5))
(reset))))
x))
data (enumerate data)))
(define (test3)
(for-each
(lambda (x testno)
(apply
(lambda (c1 c2 c3 c4 c5)
(unless (string=? c4 (NFKC c1) (NFKC c2) (NFKC c3) (NFKC c4) (NFKC c5))
(parameterize ([print-unicode #f])
(printf "test 3[~s] failed for ~s\n" testno x)
(printf " c4 = ~s\n" c4)
(printf " NFKC(c1) = ~s\n" (NFKC c1))
(printf " NFKC(c2) = ~s\n" (NFKC c2))
(printf " NFKC(c3) = ~s\n" (NFKC c3))
(printf " NFKC(c4) = ~s\n" (NFKC c4))
(printf " NFKC(c5) = ~s\n" (NFKC c5))
(reset))))
x))
data (enumerate data)))
(define (test4)
(for-each
(lambda (x testno)
(apply
(lambda (c1 c2 c3 c4 c5)
(unless (string=? c5 (NFKD c1) (NFKD c2) (NFKD c3) (NFKD c4) (NFKD c5))
(parameterize ([print-unicode #f])
(printf "test 4[~s] failed for ~s\n" testno x)
(printf " c5 = ~s\n" c5)
(printf " NFKD(c1) = ~s\n" (NFKD c1))
(printf " NFKD(c2) = ~s\n" (NFKD c2))
(printf " NFKD(c3) = ~s\n" (NFKD c3))
(printf " NFKD(c4) = ~s\n" (NFKD c4))
(printf " NFKD(c5) = ~s\n" (NFKD c5))
(reset))))
x))
data (enumerate data)))
(printf " running ~s tests ..." (length data))
(test1)
(test2)
(test3)
(test4))))

151
scheme/tests/unicode.ss Normal file
View File

@ -0,0 +1,151 @@
(library (tests unicode)
(export run-tests)
(import (ikarus))
(define-syntax test
(lambda (q)
(syntax-case q ()
[(_ x y)
#`(unless (equal? x y)
(syntax-error #''#,q "test failed"))])))
(define (run-unicode-tests)
(test (char-upcase #\i) #\I)
(test (char-downcase #\i) #\i)
(test (char-titlecase #\i) #\I)
(test (char-foldcase #\i) #\i)
(test (char-upcase #\xDF) #\xDF)
(test (char-downcase #\xDF) #\xDF)
(test (char-titlecase #\xDF) #\xDF)
(test (char-foldcase #\xDF) #\xDF)
(test (char-upcase #\x3A3) #\x3A3)
(test (char-downcase #\x3A3) #\x3C3)
(test (char-titlecase #\x3A3) #\x3A3)
(test (char-foldcase #\x3A3) #\x3C3)
(test (char-upcase #\x3C2) #\x3A3)
(test (char-downcase #\x3C2) #\x3C2)
(test (char-titlecase #\x3C2) #\x3A3)
(test (char-foldcase #\x3C2) #\x3C3)
(test (char-ci<? #\z #\Z) #f)
(test (char-ci<? #\Z #\z) #f)
(test (char-ci<? #\a #\Z) #t)
(test (char-ci<? #\Z #\a) #f)
(test (char-ci<=? #\z #\Z) #t)
(test (char-ci<=? #\Z #\z) #t)
(test (char-ci<=? #\a #\Z) #t)
(test (char-ci<=? #\Z #\a) #f)
(test (char-ci=? #\z #\a) #f)
(test (char-ci=? #\z #\Z) #t)
(test (char-ci=? #\x3C2 #\x3C3) #t)
(test (char-ci>? #\z #\Z) #f)
(test (char-ci>? #\Z #\z) #f)
(test (char-ci>? #\a #\Z) #f)
(test (char-ci>? #\Z #\a) #t)
(test (char-ci>=? #\Z #\z) #t)
(test (char-ci>=? #\z #\Z) #t)
(test (char-ci>=? #\z #\Z) #t)
(test (char-ci>=? #\a #\z) #f)
(test (char-alphabetic? #\a) #t)
(test (char-alphabetic? #\1) #f)
(test (char-numeric? #\1) #t)
(test (char-numeric? #\a) #f)
(test (char-whitespace? #\space) #t)
(test (char-whitespace? #\x00A0) #t)
(test (char-whitespace? #\a) #f)
(test (char-upper-case? #\a) #f)
(test (char-upper-case? #\A) #t)
(test (char-upper-case? #\x3A3) #t)
(test (char-lower-case? #\a) #t)
(test (char-lower-case? #\A) #f)
(test (char-lower-case? #\x3C3) #t)
(test (char-lower-case? #\x00AA) #t)
(test (char-title-case? #\a) #f)
(test (char-title-case? #\A) #f)
(test (char-title-case? #\I) #f)
(test (char-title-case? #\x01C5) #t)
(test (char-general-category #\a) 'Ll)
(test (char-general-category #\space) 'Zs)
(test (char-general-category #\x10FFFF) 'Cn)
(test (string-upcase "Hi") "HI")
(test (string-upcase "HI") "HI")
(test (string-downcase "Hi") "hi")
(test (string-downcase "hi") "hi")
(test (string-foldcase "Hi") "hi")
(test (string-foldcase "HI") "hi")
(test (string-foldcase "hi") "hi")
(test (string-upcase "Stra\xDF;e") "STRASSE")
(test (string-downcase "Stra\xDF;e") "stra\xDF;e")
(test (string-foldcase "Stra\xDF;e") "strasse")
(test (string-downcase "STRASSE") "strasse")
(test (string-downcase "\x3A3;") "\x3C3;")
(test (string-upcase "\x39E;\x391;\x39F;\x3A3;") "\x39E;\x391;\x39F;\x3A3;")
(test (string-downcase "\x39E;\x391;\x39F;\x3A3;") "\x3BE;\x3B1;\x3BF;\x3C2;")
(test (string-downcase "\x39E;\x391;\x39F;\x3A3;\x3A3;") "\x3BE;\x3B1;\x3BF;\x3C3;\x3C2;")
(test (string-downcase "\x39E;\x391;\x39F;\x3A3; \x3A3;") "\x3BE;\x3B1;\x3BF;\x3C2; \x3C3;")
(test (string-foldcase "\x39E;\x391;\x39F;\x3A3;") "\x3BE;\x3B1;\x3BF;\x3C3;")
(test (string-upcase "\x3BE;\x3B1;\x3BF;\x3C3;") "\x39E;\x391;\x39F;\x3A3;")
(test (string-upcase "\x3BE;\x3B1;\x3BF;\x3C2;") "\x39E;\x391;\x39F;\x3A3;")
(test (string-titlecase "kNock KNoCK") "Knock Knock")
(test (string-titlecase "who's there?") "Who's There?")
(test (string-titlecase "r6rs") "R6rs") ; this example appears to be wrong in R6RS (Sept 2007 version)
(test (string-titlecase "R6RS") "R6rs") ; this one, too
(test (string-downcase "A\x3A3;'x") "a\x3C3;'x") ; ' is a MidLetter
(test (string-ci<? "a" "Z") #t)
(test (string-ci<? "A" "z") #t)
(test (string-ci<? "Z" "a") #f)
(test (string-ci<? "z" "A") #f)
(test (string-ci<? "z" "Z") #f)
(test (string-ci<? "Z" "z") #f)
(test (string-ci>? "a" "Z") #f)
(test (string-ci>? "A" "z") #f)
(test (string-ci>? "Z" "a") #t)
(test (string-ci>? "z" "A") #t)
(test (string-ci>? "z" "Z") #f)
(test (string-ci>? "Z" "z") #f)
(test (string-ci=? "z" "Z") #t)
(test (string-ci=? "z" "a") #f)
(test (string-ci=? "Stra\xDF;e" "Strasse") #t)
(test (string-ci=? "Stra\xDF;e" "STRASSE") #t)
(test (string-ci=? "\x39E;\x391;\x39F;\x3A3;" "\x3BE;\x3B1;\x3BF;\x3C2;") #t)
(test (string-ci=? "\x39E;\x391;\x39F;\x3A3;" "\x3BE;\x3B1;\x3BF;\x3C3;") #t)
(test (string-ci<=? "a" "Z") #t)
(test (string-ci<=? "A" "z") #t)
(test (string-ci<=? "Z" "a") #f)
(test (string-ci<=? "z" "A") #f)
(test (string-ci<=? "z" "Z") #t)
(test (string-ci<=? "Z" "z") #t)
(test (string-ci>=? "a" "Z") #f)
(test (string-ci>=? "A" "z") #f)
(test (string-ci>=? "Z" "a") #t)
(test (string-ci>=? "z" "A") #t)
(test (string-ci>=? "z" "Z") #t)
(test (string-ci>=? "Z" "z") #t)
(test (string-normalize-nfd "\xE9;") "\x65;\x301;")
(test (string-normalize-nfc "\xE9;") "\xE9;")
(test (string-normalize-nfd "\x65;\x301;") "\x65;\x301;")
(test (string-normalize-nfc "\x65;\x301;") "\xE9;")
(test (string-normalize-nfkd "\xE9;") "\x65;\x301;")
(test (string-normalize-nfkc "\xE9;") "\xE9;")
(test (string-normalize-nfkd "\x65;\x301;") "\x65;\x301;")
(test (string-normalize-nfkc "\x65;\x301;") "\xE9;"))
(define (run-tests)
(run-unicode-tests))
)

1
scheme/unicode-data.ss Symbolic link
View File

@ -0,0 +1 @@
unicode/unicode-data.ss