From a4ba327173c14840fbde74cc947713aabf669c78 Mon Sep 17 00:00:00 2001 From: Abdulaziz Ghuloum Date: Wed, 29 Oct 2008 03:15:17 -0400 Subject: [PATCH] - speedup of unicode normalization routines. - added unicode normalization tests. --- scheme/ikarus.unicode.ss | 196 +++++++++++++++++----------------- scheme/run-tests.ss | 2 +- scheme/tests/normalization.ss | 123 +++++++++++++++++++++ scheme/tests/unicode.ss | 151 ++++++++++++++++++++++++++ scheme/unicode-data.ss | 1 + 5 files changed, 373 insertions(+), 100 deletions(-) create mode 100644 scheme/tests/normalization.ss create mode 100644 scheme/tests/unicode.ss create mode 120000 scheme/unicode-data.ss diff --git a/scheme/ikarus.unicode.ss b/scheme/ikarus.unicode.ss index 0f1d2e1..e0f3e9f 100644 --- a/scheme/ikarus.unicode.ss +++ b/scheme/ikarus.unicode.ss @@ -489,112 +489,110 @@ (define $decompose ; might should optimize for sequences of ascii characters (lambda (s canonical?) - (define (canonical? c1 c2) + (fx> ($char-combining-class c1) ($char-combining-class c2))) + (define (sort-and-flush comb*) + (unless (null? comb*) + (set! ac (append (list-sort canonical>? comb*) ac)))) + (define ($char-decomp c) + (if (and (char<=? hangul-sbase c) (char<=? c hangul-slimit)) + ($hangul-decomp c) + (if canonical? + ($str-decomp-canon c) + ($str-decomp-compat c)))) + (define (push-and-go c* c** i comb*) + (if (char? c*) + (go c* c** i comb*) + (go (car c*) (cons (cdr c*) c**) i comb*))) + (define (pop-and-go c** i comb*) + (if (null? c**) + (if (fx= i n) + (sort-and-flush comb*) + (go (string-ref s i) '() (fx+ i 1) comb*)) + (push-and-go (car c**) (cdr c**) i comb*))) + (define (go c c** i comb*) + (let ([c* ($char-decomp c)]) + (if (eq? c c*) ; should be eqv? + (if (fxzero? ($char-combining-class c)) + (begin (sort-and-flush comb*) - (go (string-ref s i) '() (fx+ i 1) comb*)) - (push-and-go (car c**) (cdr c**) i comb*))) - (define (go c c** i comb*) - (let ([c* ($char-decomp c)]) - (if (eq? c c*) ; should be eqv? - (if (fxzero? ($char-combining-class c)) - (begin - (sort-and-flush comb*) - (write-char c) - (pop-and-go c** i '())) - (pop-and-go c** i (cons c comb*))) - (push-and-go c* c** i comb*)))) - (pop-and-go '() 0 '())))))) + (set! ac (cons c ac)) + (pop-and-go c** i '())) + (pop-and-go c** i (cons c comb*))) + (push-and-go c* c** i comb*)))) + (pop-and-go '() 0 '()) + (list->string (reverse ac))))) (define $compose (let ([comp-table #f]) (define (lookup-composite c1 c2) - ; needs to handle HANGUL (hashtable-ref comp-table (cons c1 c2) #f)) + (define (init!) + (set! comp-table + (make-hashtable + (lambda (x) + (fxxor + (fxsll (char->integer (car x)) 7) + (char->integer (cdr x)))) + (lambda (x y) + (and (char=? (car x) (car y)) + (char=? (cdr x) (cdr y)))))) + (vector-for-each + (lambda (c* c) (hashtable-set! comp-table c* c)) + (car ($composition-pairs)) + (cdr ($composition-pairs)))) (lambda (s) - (unless comp-table - (set! comp-table - (make-hashtable - (lambda (x) - (fxxor - (fxsll (char->integer (car x)) 7) - (char->integer (cdr x)))) - (lambda (x y) - (and (char=? (car x) (car y)) - (char=? (cdr x) (cdr y)))))) - (vector-for-each - (lambda (c* c) (hashtable-set! comp-table c* c)) - (car ($composition-pairs)) - (cdr ($composition-pairs)))) - (with-output-to-string - (lambda () - (let ([n (string-length s)]) - (define (dump c acc) - (write-char c) - (for-each write-char (reverse acc))) - (define (s0 i) - (unless (fx= i n) - (let ([c (string-ref s i)]) - (if (fxzero? ($char-combining-class c)) - (s1 (fx+ i 1) c) - (begin (write-char c) (s0 (fx+ i 1))))))) - (define (s1 i c) - (if (fx= i n) - (write-char c) - (let ([c1 (string-ref s i)]) - (cond - [(and (and (char<=? hangul-lbase c) - (char<=? c hangul-llimit)) - (and (char<=? hangul-vbase c1) - (char<=? c1 hangul-vlimit))) - (s1 (fx+ i 1) - (let ([lindex (char- c hangul-lbase)] - [vindex (char- c1 hangul-vbase)]) - (integer->char - (fx+ (char->integer hangul-sbase) - (fx* (fx+ (fx* lindex hangul-vcount) vindex) - hangul-tcount)))))] - [(and (and (char<=? hangul-sbase c) - (char<=? c hangul-slimit)) - (and (char<=? hangul-tbase c1) - (char<=? c1 hangul-tlimit)) - (let ([sindex (char- c hangul-sbase)]) - (fxzero? (fxmod sindex hangul-tcount)))) - (let ([tindex (char- c1 hangul-tbase)]) - (s1 (fx+ i 1) (integer->char (fx+ (char->integer c) tindex))))] - [else (s2 i c -1 '())])))) - (define (s2 i c class acc) - (if (fx= i n) - (dump c acc) - (let ([c1 (string-ref s i)]) - (let ([class1 ($char-combining-class c1)]) - (cond - [(and (fx< class class1) (lookup-composite c c1)) => - (lambda (c) (s2 (fx+ i 1) c class acc))] - [(fx= class1 0) - (dump c acc) - (s1 (fx+ i 1) c1)] - [else (s2 (fx+ i 1) c class1 (cons c1 acc))]))))) - (s0 0))))))) + (unless comp-table (init!)) + (let ([ac '()] [n (string-length s)]) + (define (dump c acc) + (set! ac (cons c ac)) + (unless (null? acc) (set! ac (append acc ac)))) + (define (s0 i) + (unless (fx= i n) + (let ([c (string-ref s i)]) + (if (fxzero? ($char-combining-class c)) + (s1 (fx+ i 1) c) + (begin (set! ac (cons c ac)) (s0 (fx+ i 1))))))) + (define (s1 i c) + (if (fx= i n) + (set! ac (cons c ac)) + (let ([c1 (string-ref s i)]) + (cond + [(and (and (char<=? hangul-lbase c) + (char<=? c hangul-llimit)) + (and (char<=? hangul-vbase c1) + (char<=? c1 hangul-vlimit))) + (s1 (fx+ i 1) + (let ([lindex (char- c hangul-lbase)] + [vindex (char- c1 hangul-vbase)]) + (integer->char + (fx+ (char->integer hangul-sbase) + (fx* (fx+ (fx* lindex hangul-vcount) vindex) + hangul-tcount)))))] + [(and (and (char<=? hangul-sbase c) + (char<=? c hangul-slimit)) + (and (char<=? hangul-tbase c1) + (char<=? c1 hangul-tlimit)) + (let ([sindex (char- c hangul-sbase)]) + (fxzero? (fxmod sindex hangul-tcount)))) + (let ([tindex (char- c1 hangul-tbase)]) + (s1 (fx+ i 1) (integer->char (fx+ (char->integer c) tindex))))] + [else (s2 i c -1 '())])))) + (define (s2 i c class acc) + (if (fx= i n) + (dump c acc) + (let ([c1 (string-ref s i)]) + (let ([class1 ($char-combining-class c1)]) + (cond + [(and (fx< class class1) (lookup-composite c c1)) => + (lambda (c) (s2 (fx+ i 1) c class acc))] + [(fx= class1 0) + (dump c acc) + (s1 (fx+ i 1) c1)] + [else (s2 (fx+ i 1) c class1 (cons c1 acc))]))))) + (s0 0) + (list->string (reverse ac)))))) (define-string-op string-normalize-nfd (lambda (s) ($decompose s #t))) diff --git a/scheme/run-tests.ss b/scheme/run-tests.ss index ac92f6c..1944b04 100755 --- a/scheme/run-tests.ss +++ b/scheme/run-tests.ss @@ -22,7 +22,7 @@ lists strings bytevectors hashtables fixnums bignums numerics bitwise enums pointers sorting io fasl reader case-folding parse-flonums string-to-number bignum-to-flonum div-and-mod - fldiv-and-mod)) + fldiv-and-mod unicode normalization)) (define (run-test-from-library x) (printf "[testing ~a] ..." x) diff --git a/scheme/tests/normalization.ss b/scheme/tests/normalization.ss new file mode 100644 index 0000000..86a5a5f --- /dev/null +++ b/scheme/tests/normalization.ss @@ -0,0 +1,123 @@ + +(library (tests normalization) + (export run-tests) + (import (ikarus) (unicode-data)) + +(define (reset) (error 'reset "yukk")) +(define (enumerate ls) + (let f ([ls ls] [i 0]) + (cond + [(null? ls) '()] + [else (cons i (f (cdr ls) (+ i 1)))]))) +(define (list-head ls n) + (if (zero? n) + '() + (cons (car ls) (list-head (cdr ls) (- n 1))))) + +(define (split str) + (remove "" + (let f ([i 0] [n (string-length str)]) + (cond + [(= i n) (list (substring str 0 n))] + [(char=? (string-ref str i) #\space) + (cons (substring str 0 i) + (split (substring str (+ i 1) n)))] + [else (f (add1 i) n)])))) + +(define (conv x) + (list->string + (map (lambda (x) (integer->char (string->number x 16))) + (split x)))) + +(define (run-tests) + (let ([data (map (lambda (x) (map conv (list-head x 5))) + (filter (lambda (x) (>= (length x) 5)) + (get-unicode-data + "unicode/UNIDATA/NormalizationTest.txt")))]) + (define NFD string-normalize-nfd) + (define NFKD string-normalize-nfkd) + (define NFC string-normalize-nfc) + (define NFKC string-normalize-nfkc) + + (define (test1) + (for-each + (lambda (x testno) + (apply + (lambda (c1 c2 c3 c4 c5) + (unless (and (string=? c2 (NFC c1) (NFC c2) (NFC c3)) + (string=? c4 (NFC c4) (NFC c5))) + (parameterize ([print-unicode #f]) + (printf "test 1[~s] failed for ~s\n" testno x) + (printf " c2 = ~s\n" c2) + (printf " NFC(c1) = ~s\n" (NFC c1)) + (printf " NFC(c2) = ~s\n" (NFC c2)) + (printf " NFC(c3) = ~s\n" (NFC c3)) + (printf " c4 = ~s\n" c4) + (printf " NFC(c4) = ~s\n" (NFC c4)) + (printf " NFC(c5) = ~s\n" (NFC c5)) + (reset)))) + x)) + data (enumerate data))) + + (define (test2) + (for-each + (lambda (x testno) + (apply + (lambda (c1 c2 c3 c4 c5) + (unless (and (string=? c3 (NFD c1) (NFD c2) (NFD c3)) + (string=? c5 (NFD c4) (NFD c5))) + (parameterize ([print-unicode #f]) + (printf "test 2[~s] failed for ~s\n" testno x) + (printf " c3 = ~s\n" c3) + (printf " NFD(c1) = ~s\n" (NFD c1)) + (printf " NFD(c2) = ~s\n" (NFD c2)) + (printf " NFD(c3) = ~s\n" (NFD c3)) + (printf " c5 = ~s\n" c5) + (printf " NFD(c4) = ~s\n" (NFD c4)) + (printf " NFD(c5) = ~s\n" (NFD c5)) + (reset)))) + x)) + data (enumerate data))) + + (define (test3) + (for-each + (lambda (x testno) + (apply + (lambda (c1 c2 c3 c4 c5) + (unless (string=? c4 (NFKC c1) (NFKC c2) (NFKC c3) (NFKC c4) (NFKC c5)) + (parameterize ([print-unicode #f]) + (printf "test 3[~s] failed for ~s\n" testno x) + (printf " c4 = ~s\n" c4) + (printf " NFKC(c1) = ~s\n" (NFKC c1)) + (printf " NFKC(c2) = ~s\n" (NFKC c2)) + (printf " NFKC(c3) = ~s\n" (NFKC c3)) + (printf " NFKC(c4) = ~s\n" (NFKC c4)) + (printf " NFKC(c5) = ~s\n" (NFKC c5)) + (reset)))) + x)) + data (enumerate data))) + + (define (test4) + (for-each + (lambda (x testno) + (apply + (lambda (c1 c2 c3 c4 c5) + (unless (string=? c5 (NFKD c1) (NFKD c2) (NFKD c3) (NFKD c4) (NFKD c5)) + (parameterize ([print-unicode #f]) + (printf "test 4[~s] failed for ~s\n" testno x) + (printf " c5 = ~s\n" c5) + (printf " NFKD(c1) = ~s\n" (NFKD c1)) + (printf " NFKD(c2) = ~s\n" (NFKD c2)) + (printf " NFKD(c3) = ~s\n" (NFKD c3)) + (printf " NFKD(c4) = ~s\n" (NFKD c4)) + (printf " NFKD(c5) = ~s\n" (NFKD c5)) + (reset)))) + x)) + data (enumerate data))) + + (printf " running ~s tests ..." (length data)) + (test1) + (test2) + (test3) + (test4)))) + diff --git a/scheme/tests/unicode.ss b/scheme/tests/unicode.ss new file mode 100644 index 0000000..e2f1752 --- /dev/null +++ b/scheme/tests/unicode.ss @@ -0,0 +1,151 @@ + +(library (tests unicode) + (export run-tests) + (import (ikarus)) + + (define-syntax test + (lambda (q) + (syntax-case q () + [(_ x y) + #`(unless (equal? x y) + (syntax-error #''#,q "test failed"))]))) + + (define (run-unicode-tests) + (test (char-upcase #\i) #\I) + (test (char-downcase #\i) #\i) + (test (char-titlecase #\i) #\I) + (test (char-foldcase #\i) #\i) + + (test (char-upcase #\xDF) #\xDF) + (test (char-downcase #\xDF) #\xDF) + (test (char-titlecase #\xDF) #\xDF) + (test (char-foldcase #\xDF) #\xDF) + + (test (char-upcase #\x3A3) #\x3A3) + (test (char-downcase #\x3A3) #\x3C3) + (test (char-titlecase #\x3A3) #\x3A3) + (test (char-foldcase #\x3A3) #\x3C3) + + (test (char-upcase #\x3C2) #\x3A3) + (test (char-downcase #\x3C2) #\x3C2) + (test (char-titlecase #\x3C2) #\x3A3) + (test (char-foldcase #\x3C2) #\x3C3) + + (test (char-ci? #\z #\Z) #f) + (test (char-ci>? #\Z #\z) #f) + (test (char-ci>? #\a #\Z) #f) + (test (char-ci>? #\Z #\a) #t) + (test (char-ci>=? #\Z #\z) #t) + (test (char-ci>=? #\z #\Z) #t) + (test (char-ci>=? #\z #\Z) #t) + (test (char-ci>=? #\a #\z) #f) + + (test (char-alphabetic? #\a) #t) + (test (char-alphabetic? #\1) #f) + (test (char-numeric? #\1) #t) + (test (char-numeric? #\a) #f) + (test (char-whitespace? #\space) #t) + (test (char-whitespace? #\x00A0) #t) + (test (char-whitespace? #\a) #f) + (test (char-upper-case? #\a) #f) + (test (char-upper-case? #\A) #t) + (test (char-upper-case? #\x3A3) #t) + (test (char-lower-case? #\a) #t) + (test (char-lower-case? #\A) #f) + (test (char-lower-case? #\x3C3) #t) + (test (char-lower-case? #\x00AA) #t) + (test (char-title-case? #\a) #f) + (test (char-title-case? #\A) #f) + (test (char-title-case? #\I) #f) + (test (char-title-case? #\x01C5) #t) + + (test (char-general-category #\a) 'Ll) + (test (char-general-category #\space) 'Zs) + (test (char-general-category #\x10FFFF) 'Cn) + + (test (string-upcase "Hi") "HI") + (test (string-upcase "HI") "HI") + (test (string-downcase "Hi") "hi") + (test (string-downcase "hi") "hi") + (test (string-foldcase "Hi") "hi") + (test (string-foldcase "HI") "hi") + (test (string-foldcase "hi") "hi") + + (test (string-upcase "Stra\xDF;e") "STRASSE") + (test (string-downcase "Stra\xDF;e") "stra\xDF;e") + (test (string-foldcase "Stra\xDF;e") "strasse") + (test (string-downcase "STRASSE") "strasse") + + (test (string-downcase "\x3A3;") "\x3C3;") + + (test (string-upcase "\x39E;\x391;\x39F;\x3A3;") "\x39E;\x391;\x39F;\x3A3;") + (test (string-downcase "\x39E;\x391;\x39F;\x3A3;") "\x3BE;\x3B1;\x3BF;\x3C2;") + (test (string-downcase "\x39E;\x391;\x39F;\x3A3;\x3A3;") "\x3BE;\x3B1;\x3BF;\x3C3;\x3C2;") + (test (string-downcase "\x39E;\x391;\x39F;\x3A3; \x3A3;") "\x3BE;\x3B1;\x3BF;\x3C2; \x3C3;") + (test (string-foldcase "\x39E;\x391;\x39F;\x3A3;") "\x3BE;\x3B1;\x3BF;\x3C3;") + (test (string-upcase "\x3BE;\x3B1;\x3BF;\x3C3;") "\x39E;\x391;\x39F;\x3A3;") + (test (string-upcase "\x3BE;\x3B1;\x3BF;\x3C2;") "\x39E;\x391;\x39F;\x3A3;") + + (test (string-titlecase "kNock KNoCK") "Knock Knock") + (test (string-titlecase "who's there?") "Who's There?") + (test (string-titlecase "r6rs") "R6rs") ; this example appears to be wrong in R6RS (Sept 2007 version) + (test (string-titlecase "R6RS") "R6rs") ; this one, too + + (test (string-downcase "A\x3A3;'x") "a\x3C3;'x") ; ' is a MidLetter + + (test (string-ci? "a" "Z") #f) + (test (string-ci>? "A" "z") #f) + (test (string-ci>? "Z" "a") #t) + (test (string-ci>? "z" "A") #t) + (test (string-ci>? "z" "Z") #f) + (test (string-ci>? "Z" "z") #f) + (test (string-ci=? "z" "Z") #t) + (test (string-ci=? "z" "a") #f) + (test (string-ci=? "Stra\xDF;e" "Strasse") #t) + (test (string-ci=? "Stra\xDF;e" "STRASSE") #t) + (test (string-ci=? "\x39E;\x391;\x39F;\x3A3;" "\x3BE;\x3B1;\x3BF;\x3C2;") #t) + (test (string-ci=? "\x39E;\x391;\x39F;\x3A3;" "\x3BE;\x3B1;\x3BF;\x3C3;") #t) + (test (string-ci<=? "a" "Z") #t) + (test (string-ci<=? "A" "z") #t) + (test (string-ci<=? "Z" "a") #f) + (test (string-ci<=? "z" "A") #f) + (test (string-ci<=? "z" "Z") #t) + (test (string-ci<=? "Z" "z") #t) + (test (string-ci>=? "a" "Z") #f) + (test (string-ci>=? "A" "z") #f) + (test (string-ci>=? "Z" "a") #t) + (test (string-ci>=? "z" "A") #t) + (test (string-ci>=? "z" "Z") #t) + (test (string-ci>=? "Z" "z") #t) + + (test (string-normalize-nfd "\xE9;") "\x65;\x301;") + (test (string-normalize-nfc "\xE9;") "\xE9;") + (test (string-normalize-nfd "\x65;\x301;") "\x65;\x301;") + (test (string-normalize-nfc "\x65;\x301;") "\xE9;") + + (test (string-normalize-nfkd "\xE9;") "\x65;\x301;") + (test (string-normalize-nfkc "\xE9;") "\xE9;") + (test (string-normalize-nfkd "\x65;\x301;") "\x65;\x301;") + (test (string-normalize-nfkc "\x65;\x301;") "\xE9;")) + + (define (run-tests) + (run-unicode-tests)) + ) + diff --git a/scheme/unicode-data.ss b/scheme/unicode-data.ss new file mode 120000 index 0000000..808594d --- /dev/null +++ b/scheme/unicode-data.ss @@ -0,0 +1 @@ +unicode/unicode-data.ss \ No newline at end of file