- speedup of unicode normalization routines.
- added unicode normalization tests.
This commit is contained in:
parent
15e8775c67
commit
a4ba327173
|
@ -489,20 +489,18 @@
|
|||
(define $decompose
|
||||
; might should optimize for sequences of ascii characters
|
||||
(lambda (s canonical?)
|
||||
(define (canonical<? c1 c2)
|
||||
(fx< ($char-combining-class c1) ($char-combining-class c2)))
|
||||
(let ([n (string-length s)] [ac '()])
|
||||
(define (canonical>? c1 c2)
|
||||
(fx> ($char-combining-class c1) ($char-combining-class c2)))
|
||||
(define (sort-and-flush comb*)
|
||||
(for-each write-char
|
||||
(list-sort canonical<? (reverse comb*))))
|
||||
(unless (null? comb*)
|
||||
(set! ac (append (list-sort canonical>? comb*) ac))))
|
||||
(define ($char-decomp c)
|
||||
(if (and (char<=? hangul-sbase c) (char<=? c hangul-slimit))
|
||||
($hangul-decomp c)
|
||||
(if canonical?
|
||||
($str-decomp-canon c)
|
||||
($str-decomp-compat c))))
|
||||
(with-output-to-string
|
||||
(lambda ()
|
||||
(let ([n (string-length s)])
|
||||
(define (push-and-go c* c** i comb*)
|
||||
(if (char? c*)
|
||||
(go c* c** i comb*)
|
||||
|
@ -519,19 +517,18 @@
|
|||
(if (fxzero? ($char-combining-class c))
|
||||
(begin
|
||||
(sort-and-flush comb*)
|
||||
(write-char c)
|
||||
(set! ac (cons c ac))
|
||||
(pop-and-go c** i '()))
|
||||
(pop-and-go c** i (cons c comb*)))
|
||||
(push-and-go c* c** i comb*))))
|
||||
(pop-and-go '() 0 '()))))))
|
||||
(pop-and-go '() 0 '())
|
||||
(list->string (reverse ac)))))
|
||||
|
||||
(define $compose
|
||||
(let ([comp-table #f])
|
||||
(define (lookup-composite c1 c2)
|
||||
; needs to handle HANGUL
|
||||
(hashtable-ref comp-table (cons c1 c2) #f))
|
||||
(lambda (s)
|
||||
(unless comp-table
|
||||
(define (init!)
|
||||
(set! comp-table
|
||||
(make-hashtable
|
||||
(lambda (x)
|
||||
|
@ -545,21 +542,21 @@
|
|||
(lambda (c* c) (hashtable-set! comp-table c* c))
|
||||
(car ($composition-pairs))
|
||||
(cdr ($composition-pairs))))
|
||||
(with-output-to-string
|
||||
(lambda ()
|
||||
(let ([n (string-length s)])
|
||||
(lambda (s)
|
||||
(unless comp-table (init!))
|
||||
(let ([ac '()] [n (string-length s)])
|
||||
(define (dump c acc)
|
||||
(write-char c)
|
||||
(for-each write-char (reverse acc)))
|
||||
(set! ac (cons c ac))
|
||||
(unless (null? acc) (set! ac (append acc ac))))
|
||||
(define (s0 i)
|
||||
(unless (fx= i n)
|
||||
(let ([c (string-ref s i)])
|
||||
(if (fxzero? ($char-combining-class c))
|
||||
(s1 (fx+ i 1) c)
|
||||
(begin (write-char c) (s0 (fx+ i 1)))))))
|
||||
(begin (set! ac (cons c ac)) (s0 (fx+ i 1)))))))
|
||||
(define (s1 i c)
|
||||
(if (fx= i n)
|
||||
(write-char c)
|
||||
(set! ac (cons c ac))
|
||||
(let ([c1 (string-ref s i)])
|
||||
(cond
|
||||
[(and (and (char<=? hangul-lbase c)
|
||||
|
@ -594,7 +591,8 @@
|
|||
(dump c acc)
|
||||
(s1 (fx+ i 1) c1)]
|
||||
[else (s2 (fx+ i 1) c class1 (cons c1 acc))])))))
|
||||
(s0 0)))))))
|
||||
(s0 0)
|
||||
(list->string (reverse ac))))))
|
||||
|
||||
(define-string-op string-normalize-nfd
|
||||
(lambda (s) ($decompose s #t)))
|
||||
|
|
|
@ -22,7 +22,7 @@
|
|||
lists strings bytevectors hashtables fixnums bignums numerics
|
||||
bitwise enums pointers sorting io fasl reader case-folding
|
||||
parse-flonums string-to-number bignum-to-flonum div-and-mod
|
||||
fldiv-and-mod))
|
||||
fldiv-and-mod unicode normalization))
|
||||
|
||||
(define (run-test-from-library x)
|
||||
(printf "[testing ~a] ..." x)
|
||||
|
|
|
@ -0,0 +1,123 @@
|
|||
|
||||
(library (tests normalization)
|
||||
(export run-tests)
|
||||
(import (ikarus) (unicode-data))
|
||||
|
||||
(define (reset) (error 'reset "yukk"))
|
||||
(define (enumerate ls)
|
||||
(let f ([ls ls] [i 0])
|
||||
(cond
|
||||
[(null? ls) '()]
|
||||
[else (cons i (f (cdr ls) (+ i 1)))])))
|
||||
(define (list-head ls n)
|
||||
(if (zero? n)
|
||||
'()
|
||||
(cons (car ls) (list-head (cdr ls) (- n 1)))))
|
||||
|
||||
(define (split str)
|
||||
(remove ""
|
||||
(let f ([i 0] [n (string-length str)])
|
||||
(cond
|
||||
[(= i n) (list (substring str 0 n))]
|
||||
[(char=? (string-ref str i) #\space)
|
||||
(cons (substring str 0 i)
|
||||
(split (substring str (+ i 1) n)))]
|
||||
[else (f (add1 i) n)]))))
|
||||
|
||||
(define (conv x)
|
||||
(list->string
|
||||
(map (lambda (x) (integer->char (string->number x 16)))
|
||||
(split x))))
|
||||
|
||||
(define (run-tests)
|
||||
(let ([data (map (lambda (x) (map conv (list-head x 5)))
|
||||
(filter (lambda (x) (>= (length x) 5))
|
||||
(get-unicode-data
|
||||
"unicode/UNIDATA/NormalizationTest.txt")))])
|
||||
(define NFD string-normalize-nfd)
|
||||
(define NFKD string-normalize-nfkd)
|
||||
(define NFC string-normalize-nfc)
|
||||
(define NFKC string-normalize-nfkc)
|
||||
|
||||
(define (test1)
|
||||
(for-each
|
||||
(lambda (x testno)
|
||||
(apply
|
||||
(lambda (c1 c2 c3 c4 c5)
|
||||
(unless (and (string=? c2 (NFC c1) (NFC c2) (NFC c3))
|
||||
(string=? c4 (NFC c4) (NFC c5)))
|
||||
(parameterize ([print-unicode #f])
|
||||
(printf "test 1[~s] failed for ~s\n" testno x)
|
||||
(printf " c2 = ~s\n" c2)
|
||||
(printf " NFC(c1) = ~s\n" (NFC c1))
|
||||
(printf " NFC(c2) = ~s\n" (NFC c2))
|
||||
(printf " NFC(c3) = ~s\n" (NFC c3))
|
||||
(printf " c4 = ~s\n" c4)
|
||||
(printf " NFC(c4) = ~s\n" (NFC c4))
|
||||
(printf " NFC(c5) = ~s\n" (NFC c5))
|
||||
(reset))))
|
||||
x))
|
||||
data (enumerate data)))
|
||||
|
||||
(define (test2)
|
||||
(for-each
|
||||
(lambda (x testno)
|
||||
(apply
|
||||
(lambda (c1 c2 c3 c4 c5)
|
||||
(unless (and (string=? c3 (NFD c1) (NFD c2) (NFD c3))
|
||||
(string=? c5 (NFD c4) (NFD c5)))
|
||||
(parameterize ([print-unicode #f])
|
||||
(printf "test 2[~s] failed for ~s\n" testno x)
|
||||
(printf " c3 = ~s\n" c3)
|
||||
(printf " NFD(c1) = ~s\n" (NFD c1))
|
||||
(printf " NFD(c2) = ~s\n" (NFD c2))
|
||||
(printf " NFD(c3) = ~s\n" (NFD c3))
|
||||
(printf " c5 = ~s\n" c5)
|
||||
(printf " NFD(c4) = ~s\n" (NFD c4))
|
||||
(printf " NFD(c5) = ~s\n" (NFD c5))
|
||||
(reset))))
|
||||
x))
|
||||
data (enumerate data)))
|
||||
|
||||
(define (test3)
|
||||
(for-each
|
||||
(lambda (x testno)
|
||||
(apply
|
||||
(lambda (c1 c2 c3 c4 c5)
|
||||
(unless (string=? c4 (NFKC c1) (NFKC c2) (NFKC c3) (NFKC c4) (NFKC c5))
|
||||
(parameterize ([print-unicode #f])
|
||||
(printf "test 3[~s] failed for ~s\n" testno x)
|
||||
(printf " c4 = ~s\n" c4)
|
||||
(printf " NFKC(c1) = ~s\n" (NFKC c1))
|
||||
(printf " NFKC(c2) = ~s\n" (NFKC c2))
|
||||
(printf " NFKC(c3) = ~s\n" (NFKC c3))
|
||||
(printf " NFKC(c4) = ~s\n" (NFKC c4))
|
||||
(printf " NFKC(c5) = ~s\n" (NFKC c5))
|
||||
(reset))))
|
||||
x))
|
||||
data (enumerate data)))
|
||||
|
||||
(define (test4)
|
||||
(for-each
|
||||
(lambda (x testno)
|
||||
(apply
|
||||
(lambda (c1 c2 c3 c4 c5)
|
||||
(unless (string=? c5 (NFKD c1) (NFKD c2) (NFKD c3) (NFKD c4) (NFKD c5))
|
||||
(parameterize ([print-unicode #f])
|
||||
(printf "test 4[~s] failed for ~s\n" testno x)
|
||||
(printf " c5 = ~s\n" c5)
|
||||
(printf " NFKD(c1) = ~s\n" (NFKD c1))
|
||||
(printf " NFKD(c2) = ~s\n" (NFKD c2))
|
||||
(printf " NFKD(c3) = ~s\n" (NFKD c3))
|
||||
(printf " NFKD(c4) = ~s\n" (NFKD c4))
|
||||
(printf " NFKD(c5) = ~s\n" (NFKD c5))
|
||||
(reset))))
|
||||
x))
|
||||
data (enumerate data)))
|
||||
|
||||
(printf " running ~s tests ..." (length data))
|
||||
(test1)
|
||||
(test2)
|
||||
(test3)
|
||||
(test4))))
|
||||
|
|
@ -0,0 +1,151 @@
|
|||
|
||||
(library (tests unicode)
|
||||
(export run-tests)
|
||||
(import (ikarus))
|
||||
|
||||
(define-syntax test
|
||||
(lambda (q)
|
||||
(syntax-case q ()
|
||||
[(_ x y)
|
||||
#`(unless (equal? x y)
|
||||
(syntax-error #''#,q "test failed"))])))
|
||||
|
||||
(define (run-unicode-tests)
|
||||
(test (char-upcase #\i) #\I)
|
||||
(test (char-downcase #\i) #\i)
|
||||
(test (char-titlecase #\i) #\I)
|
||||
(test (char-foldcase #\i) #\i)
|
||||
|
||||
(test (char-upcase #\xDF) #\xDF)
|
||||
(test (char-downcase #\xDF) #\xDF)
|
||||
(test (char-titlecase #\xDF) #\xDF)
|
||||
(test (char-foldcase #\xDF) #\xDF)
|
||||
|
||||
(test (char-upcase #\x3A3) #\x3A3)
|
||||
(test (char-downcase #\x3A3) #\x3C3)
|
||||
(test (char-titlecase #\x3A3) #\x3A3)
|
||||
(test (char-foldcase #\x3A3) #\x3C3)
|
||||
|
||||
(test (char-upcase #\x3C2) #\x3A3)
|
||||
(test (char-downcase #\x3C2) #\x3C2)
|
||||
(test (char-titlecase #\x3C2) #\x3A3)
|
||||
(test (char-foldcase #\x3C2) #\x3C3)
|
||||
|
||||
(test (char-ci<? #\z #\Z) #f)
|
||||
(test (char-ci<? #\Z #\z) #f)
|
||||
(test (char-ci<? #\a #\Z) #t)
|
||||
(test (char-ci<? #\Z #\a) #f)
|
||||
(test (char-ci<=? #\z #\Z) #t)
|
||||
(test (char-ci<=? #\Z #\z) #t)
|
||||
(test (char-ci<=? #\a #\Z) #t)
|
||||
(test (char-ci<=? #\Z #\a) #f)
|
||||
(test (char-ci=? #\z #\a) #f)
|
||||
(test (char-ci=? #\z #\Z) #t)
|
||||
(test (char-ci=? #\x3C2 #\x3C3) #t)
|
||||
(test (char-ci>? #\z #\Z) #f)
|
||||
(test (char-ci>? #\Z #\z) #f)
|
||||
(test (char-ci>? #\a #\Z) #f)
|
||||
(test (char-ci>? #\Z #\a) #t)
|
||||
(test (char-ci>=? #\Z #\z) #t)
|
||||
(test (char-ci>=? #\z #\Z) #t)
|
||||
(test (char-ci>=? #\z #\Z) #t)
|
||||
(test (char-ci>=? #\a #\z) #f)
|
||||
|
||||
(test (char-alphabetic? #\a) #t)
|
||||
(test (char-alphabetic? #\1) #f)
|
||||
(test (char-numeric? #\1) #t)
|
||||
(test (char-numeric? #\a) #f)
|
||||
(test (char-whitespace? #\space) #t)
|
||||
(test (char-whitespace? #\x00A0) #t)
|
||||
(test (char-whitespace? #\a) #f)
|
||||
(test (char-upper-case? #\a) #f)
|
||||
(test (char-upper-case? #\A) #t)
|
||||
(test (char-upper-case? #\x3A3) #t)
|
||||
(test (char-lower-case? #\a) #t)
|
||||
(test (char-lower-case? #\A) #f)
|
||||
(test (char-lower-case? #\x3C3) #t)
|
||||
(test (char-lower-case? #\x00AA) #t)
|
||||
(test (char-title-case? #\a) #f)
|
||||
(test (char-title-case? #\A) #f)
|
||||
(test (char-title-case? #\I) #f)
|
||||
(test (char-title-case? #\x01C5) #t)
|
||||
|
||||
(test (char-general-category #\a) 'Ll)
|
||||
(test (char-general-category #\space) 'Zs)
|
||||
(test (char-general-category #\x10FFFF) 'Cn)
|
||||
|
||||
(test (string-upcase "Hi") "HI")
|
||||
(test (string-upcase "HI") "HI")
|
||||
(test (string-downcase "Hi") "hi")
|
||||
(test (string-downcase "hi") "hi")
|
||||
(test (string-foldcase "Hi") "hi")
|
||||
(test (string-foldcase "HI") "hi")
|
||||
(test (string-foldcase "hi") "hi")
|
||||
|
||||
(test (string-upcase "Stra\xDF;e") "STRASSE")
|
||||
(test (string-downcase "Stra\xDF;e") "stra\xDF;e")
|
||||
(test (string-foldcase "Stra\xDF;e") "strasse")
|
||||
(test (string-downcase "STRASSE") "strasse")
|
||||
|
||||
(test (string-downcase "\x3A3;") "\x3C3;")
|
||||
|
||||
(test (string-upcase "\x39E;\x391;\x39F;\x3A3;") "\x39E;\x391;\x39F;\x3A3;")
|
||||
(test (string-downcase "\x39E;\x391;\x39F;\x3A3;") "\x3BE;\x3B1;\x3BF;\x3C2;")
|
||||
(test (string-downcase "\x39E;\x391;\x39F;\x3A3;\x3A3;") "\x3BE;\x3B1;\x3BF;\x3C3;\x3C2;")
|
||||
(test (string-downcase "\x39E;\x391;\x39F;\x3A3; \x3A3;") "\x3BE;\x3B1;\x3BF;\x3C2; \x3C3;")
|
||||
(test (string-foldcase "\x39E;\x391;\x39F;\x3A3;") "\x3BE;\x3B1;\x3BF;\x3C3;")
|
||||
(test (string-upcase "\x3BE;\x3B1;\x3BF;\x3C3;") "\x39E;\x391;\x39F;\x3A3;")
|
||||
(test (string-upcase "\x3BE;\x3B1;\x3BF;\x3C2;") "\x39E;\x391;\x39F;\x3A3;")
|
||||
|
||||
(test (string-titlecase "kNock KNoCK") "Knock Knock")
|
||||
(test (string-titlecase "who's there?") "Who's There?")
|
||||
(test (string-titlecase "r6rs") "R6rs") ; this example appears to be wrong in R6RS (Sept 2007 version)
|
||||
(test (string-titlecase "R6RS") "R6rs") ; this one, too
|
||||
|
||||
(test (string-downcase "A\x3A3;'x") "a\x3C3;'x") ; ' is a MidLetter
|
||||
|
||||
(test (string-ci<? "a" "Z") #t)
|
||||
(test (string-ci<? "A" "z") #t)
|
||||
(test (string-ci<? "Z" "a") #f)
|
||||
(test (string-ci<? "z" "A") #f)
|
||||
(test (string-ci<? "z" "Z") #f)
|
||||
(test (string-ci<? "Z" "z") #f)
|
||||
(test (string-ci>? "a" "Z") #f)
|
||||
(test (string-ci>? "A" "z") #f)
|
||||
(test (string-ci>? "Z" "a") #t)
|
||||
(test (string-ci>? "z" "A") #t)
|
||||
(test (string-ci>? "z" "Z") #f)
|
||||
(test (string-ci>? "Z" "z") #f)
|
||||
(test (string-ci=? "z" "Z") #t)
|
||||
(test (string-ci=? "z" "a") #f)
|
||||
(test (string-ci=? "Stra\xDF;e" "Strasse") #t)
|
||||
(test (string-ci=? "Stra\xDF;e" "STRASSE") #t)
|
||||
(test (string-ci=? "\x39E;\x391;\x39F;\x3A3;" "\x3BE;\x3B1;\x3BF;\x3C2;") #t)
|
||||
(test (string-ci=? "\x39E;\x391;\x39F;\x3A3;" "\x3BE;\x3B1;\x3BF;\x3C3;") #t)
|
||||
(test (string-ci<=? "a" "Z") #t)
|
||||
(test (string-ci<=? "A" "z") #t)
|
||||
(test (string-ci<=? "Z" "a") #f)
|
||||
(test (string-ci<=? "z" "A") #f)
|
||||
(test (string-ci<=? "z" "Z") #t)
|
||||
(test (string-ci<=? "Z" "z") #t)
|
||||
(test (string-ci>=? "a" "Z") #f)
|
||||
(test (string-ci>=? "A" "z") #f)
|
||||
(test (string-ci>=? "Z" "a") #t)
|
||||
(test (string-ci>=? "z" "A") #t)
|
||||
(test (string-ci>=? "z" "Z") #t)
|
||||
(test (string-ci>=? "Z" "z") #t)
|
||||
|
||||
(test (string-normalize-nfd "\xE9;") "\x65;\x301;")
|
||||
(test (string-normalize-nfc "\xE9;") "\xE9;")
|
||||
(test (string-normalize-nfd "\x65;\x301;") "\x65;\x301;")
|
||||
(test (string-normalize-nfc "\x65;\x301;") "\xE9;")
|
||||
|
||||
(test (string-normalize-nfkd "\xE9;") "\x65;\x301;")
|
||||
(test (string-normalize-nfkc "\xE9;") "\xE9;")
|
||||
(test (string-normalize-nfkd "\x65;\x301;") "\x65;\x301;")
|
||||
(test (string-normalize-nfkc "\x65;\x301;") "\xE9;"))
|
||||
|
||||
(define (run-tests)
|
||||
(run-unicode-tests))
|
||||
)
|
||||
|
|
@ -0,0 +1 @@
|
|||
unicode/unicode-data.ss
|
Loading…
Reference in New Issue