- reduced latency of construction of unicode composition tables.

This commit is contained in:
Abdulaziz Ghuloum 2008-10-29 02:11:53 -04:00
parent 9a3666d3ea
commit 15e8775c67
6 changed files with 1716 additions and 1273 deletions

Binary file not shown.

View File

@ -3037,14 +3037,6 @@ procedures.
not recognize the \texttt{|p} notation.
\item The following procedures are missing from \texttt{(rnrs unicode)}:
\begin{Verbatim}
string-titlecase
string-normalize-nfc string-normalize-nfd
string-normalize-nfkc string-normalize-nfkd
\end{Verbatim}
\item The following procedures are missing from \texttt{(rnrs arithmetic
bitwise)}:
\begin{Verbatim}
@ -3060,8 +3052,7 @@ fxreverse-bit-field fxrotate-bit-field
\item The following procedures are missing from \texttt{(rnrs hashtables)}:
\begin{Verbatim}
make-eqv-hashtable make-hashtable equal-hash
hashtable-hash-function hashtable-equivalence-function
make-eqv-hashtable equal-hash
\end{Verbatim}
\item The following procedures are missing from \texttt{(rnrs io ports)}:

View File

@ -98,13 +98,6 @@
(not (fxzero? (fxand x y))))
(define (char- x y)
(fx- (char->integer x) (char->integer y)))
(define (iota n)
(let f ([n n] [ac '()])
(cond
[(= n 0) ac]
[else
(let ([n (- n 1)])
(f n (cons n ac)))])))
(include "unicode/unicode-char-cases.ss")
(include "unicode/unicode-charinfo.ss")
@ -180,7 +173,7 @@
(chars (cdr ac)
(let f ([p (cdar ac)] [n n])
(cond
[(pair? p) (f (cdr p) (+ n 1))]
[(pair? p) (f (cdr p) (fx+ n 1))]
[else n])))]))
(define (extend src ac src-len dst-len)
(let f ([str str] [dst (make-string dst-len)] [i 0] [j 0] [ac (reverse ac)] [sigma* '()])
@ -212,7 +205,7 @@
(and (not (fx= i n))
(or ($char-cased? (string-ref str i))
(and ($char-case-ignorable? (string-ref str i))
(scan (+ i incr) incr n)))))
(scan (fx+ i incr) incr n)))))
(and (scan (fx- i 1) -1 -1) (not (scan (fx+ i 1) +1 (string-length str)))))
; scanning requires we have some character in place...guess nonfinal sigma
(for-each (lambda (i) (string-set! str i nonfinal-sigma)) sigma*)
@ -548,13 +541,10 @@
(lambda (x y)
(and (char=? (car x) (car y))
(char=? (cdr x) (cdr y))))))
(for-each
(lambda (i)
(unless (and (fx<= #xD800 i) (fx<= i #xDFFF))
(unless (memv i ($composition-exclusions))
(let* ([c (integer->char i)] [c* ($str-decomp-canon c)])
(when (pair? c*) (hashtable-set! comp-table c* c))))))
(iota #x110000)))
(vector-for-each
(lambda (c* c) (hashtable-set! comp-table c* c))
(car ($composition-pairs))
(cdr ($composition-pairs))))
(with-output-to-string
(lambda ()
(let ([n (string-length s)])
@ -607,23 +597,15 @@
(s0 0)))))))
(define-string-op string-normalize-nfd
(lambda (s)
; need string? check
($decompose s #t)))
(lambda (s) ($decompose s #t)))
(define-string-op string-normalize-nfkd
(lambda (s)
; need string? check
($decompose s #f)))
(lambda (s) ($decompose s #f)))
(define-string-op string-normalize-nfc
(lambda (s)
; need string? check
($compose ($decompose s #t))))
(lambda (s) ($compose ($decompose s #t))))
(define-string-op string-normalize-nfkc
(lambda (s)
; need string? check
($compose ($decompose s #f))))
(lambda (s) ($compose ($decompose s #f))))
))

View File

@ -1 +1 @@
1648
1650

View File

@ -25,7 +25,13 @@
; dropping support for s16 inner vectors for now
(include "extract-common.ss")
(define (iota n)
(let f ([n n] [ac '()])
(cond
[(= n 0) ac]
[else
(let ([n (- n 1)])
(f n (cons n ac)))])))
(define ptr-bytes 4)
(define code-point-limit #x110000) ; as of Unicode 5.1
@ -161,12 +167,39 @@
(verify-identity! n cdrec)
(table-set! table n (acc cdrec)))))
ls)
(time (commonize* table))
(commonize* table)
table)))
(define (get-composition-exclusions)
(map hex->num
(map car (get-unicode-data "UNIDATA/CompositionExclusions.txt"))))
(define (get-composition-pairs decomp-canon-table)
(define ($str-decomp-canon c)
(define (strop tbl c)
(let ([n (char->integer c)])
(if (and (fx< table-limit code-point-limit)
(fx>= n table-limit))
c
(let ([x (table-ref tbl n)])
(if (fixnum? x)
(integer->char (fx+ x n))
x)))))
(strop decomp-canon-table c))
(let ([exclusions
(map hex->num
(map car (get-unicode-data
"UNIDATA/CompositionExclusions.txt")))]
[pairs '()])
(for-each
(lambda (i)
(unless (and (fx<= #xD800 i) (fx<= i #xDFFF))
(unless (memv i exclusions)
(let* ([c (integer->char i)] [c* ($str-decomp-canon c)])
(when (pair? c*)
(set! pairs (cons (cons c* c) pairs)))))))
(iota #x110000))
(cons (list->vector (map car pairs))
(list->vector (map cdr pairs)))))
(let ([ls (map data-case (get-unicode-data "UNIDATA/UnicodeData.txt"))])
(insert-foldcase-data! ls (get-unicode-data "UNIDATA/CaseFolding.txt"))
@ -179,7 +212,8 @@
(pretty-print
`(module ($char-upcase $char-downcase $char-titlecase $char-foldcase
$str-upcase $str-downcase $str-titlecase $str-foldcase
$str-decomp-canon $str-decomp-compat $composition-exclusions)
$str-decomp-canon $str-decomp-compat
$composition-pairs)
(define char-upcase-table ',(build-table chardata-ucchar ls))
(define char-downcase-table ',(build-table chardata-lcchar ls))
(define char-titlecase-table ',(build-table chardata-tcchar ls))
@ -218,7 +252,8 @@
(define ($str-foldcase c) (strop string-foldcase-table c))
(define ($str-decomp-canon c) (strop decomp-canon-table c))
(define ($str-decomp-compat c) (strop decomp-compat-table c))
(define ($composition-exclusions)
',(get-composition-exclusions))))))))
(define ($composition-pairs)
',(get-composition-pairs
(build-table chardata-decomp-canon ls)))))))))
(printf "Happy Happy Joy Joy ~a\n" (sizeof cache))

File diff suppressed because it is too large Load Diff