;;; Copyright (C) 2008 Abdulaziz Ghuloum, R. Kent Dybvig ;;; Copyright (C) 2006,2007 Abdulaziz Ghuloum ;;; ;;; Permission is hereby granted, free of charge, to any person obtaining a ;;; copy of this software and associated documentation files (the "Software"), ;;; to deal in the Software without restriction, including without limitation ;;; the rights to use, copy, modify, merge, publish, distribute, sublicense, ;;; and/or sell copies of the Software, and to permit persons to whom the ;;; Software is furnished to do so, subject to the following conditions: ;;; ;;; The above copyright notice and this permission notice shall be included in ;;; all copies or substantial portions of the Software. ;;; ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ;;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ;;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ;;; THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ;;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ;;; DEALINGS IN THE SOFTWARE. (import (ikarus) (unicode-data)) ; dropping support for s16 inner vectors for now (include "extract-common.ss") (define (iota n) (let f ([n n] [ac '()]) (cond [(= n 0) ac] [else (let ([n (- n 1)]) (f n (cons n ac)))]))) (define ptr-bytes 4) (define code-point-limit #x110000) ; as of Unicode 5.1 #;(define table-limit #x30000) (define table-limit code-point-limit) (define-table (make-table table-ref table-set! table-ref-code) (make-vector vector-ref vector-set!) table-limit #x40 #x40) (define-record-type chardata (fields (immutable ucchar) (immutable lcchar) (immutable tcchar) (mutable fcchar) (mutable ucstr) (mutable lcstr) (mutable tcstr) (mutable fcstr) (immutable decomp-canon) (immutable decomp-compat)) (protocol (lambda (new) (lambda (ucchar lcchar tcchar decomp-canon decomp-compat) (new ucchar lcchar tcchar 0 ucchar lcchar tcchar 0 decomp-canon decomp-compat))))) (define (find-cdrec idx ls) (cond [(assq idx ls) => cdr] [else (error 'find-cdrec "~s is missing" idx)])) (define data-case (lambda (fields) (let ([n (hex->num (car fields))] [uc (list-ref fields 12)] [lc (list-ref fields 13)] [tc (list-ref fields 14)]) (define (f x) (if (string=? x "") 0 (- (hex->num x) n))) (cons n (make-chardata (f uc) (f lc) (f tc) (parse-decomp n (list-ref fields 5) #f) (parse-decomp n (list-ref fields 5) #t)))))) (define (split str) (remove "" (let f ([i 0] [n (string-length str)]) (cond [(= i n) (list (substring str 0 n))] [(char=? (string-ref str i) #\space) (cons (substring str 0 i) (split (substring str (+ i 1) n)))] [else (f (add1 i) n)])))) (define (improperize ls) (cond [(null? (cdr ls)) (car ls)] [else (cons (car ls) (improperize (cdr ls)))])) (define (c*->off c* n) (if (= (length c*) 1) (- (car c*) n) (improperize (map integer->char c*)))) (define (codes->off str n) (c*->off (map hex->num (split str)) n)) ;;; decomposition field looks like: ;;; hex-value* ;;; hex-value* ;;; latter appear to be for compatibility decomposition only (define (parse-decomp n str compat?) (let f ([ls (split str)]) (cond [(null? ls) 0] [(char=? (string-ref (car ls) 0) #\<) (if compat? (c*->off (map hex->num (cdr ls)) n) 0)] [else (c*->off (map hex->num ls) n)]))) (define (insert-foldcase-data! ls data) (for-each (lambda (fields) (let ([n (hex->num (car fields))]) (let ([cdrec (find-cdrec n ls)] [offset (codes->off (caddr fields) n)]) (chardata-fcchar-set! cdrec offset) (chardata-fcstr-set! cdrec offset)))) (filter (lambda (fields) (equal? (cadr fields) "C")) data)) (for-each (lambda (fields) (let ([n (hex->num (car fields))]) (chardata-fcstr-set! (find-cdrec n ls) (codes->off (caddr fields) n)))) (filter (lambda (fields) (equal? (cadr fields) "F")) data))) (define (insert-specialcase-data! ls data) (for-each (lambda (fields) (let ([n (hex->num (car fields))]) (let ([cdrec (find-cdrec n ls)]) (chardata-lcstr-set! cdrec (codes->off (list-ref fields 1) n)) (chardata-tcstr-set! cdrec (codes->off (list-ref fields 2) n)) (chardata-ucstr-set! cdrec (codes->off (list-ref fields 3) n))))) (filter (lambda (fields) (= 0 (string-length (list-ref fields 4)))) data))) (define verify-identity! (lambda (n cdrec) (define (zeros? . args) (andmap (lambda (x) (eqv? x 0)) args)) (unless (zeros? (chardata-ucchar cdrec) (chardata-lcchar cdrec) (chardata-tcchar cdrec) (chardata-fcchar cdrec) (chardata-ucstr cdrec) (chardata-lcstr cdrec) (chardata-tcstr cdrec) (chardata-fcstr cdrec) (chardata-decomp-canon cdrec) (chardata-decomp-compat cdrec)) (error 'verify-identity "failed for ~x, ~s" n cdrec)))) (define build-table (lambda (acc ls) (let ([table (make-table 0)]) (for-each (lambda (x) (let ([n (car x)] [cdrec (cdr x)]) (unless (< n code-point-limit) (error 'build-table "code point value ~s is at or above declared limit ~s" n code-point-limit)) (if (>= n table-limit) (verify-identity! n cdrec) (table-set! table n (acc cdrec))))) ls) (commonize* table) table))) (define (get-composition-pairs decomp-canon-table) (define ($str-decomp-canon c) (define (strop tbl c) (let ([n (char->integer c)]) (if (and (fx< table-limit code-point-limit) (fx>= n table-limit)) c (let ([x (table-ref tbl n)]) (if (fixnum? x) (integer->char (fx+ x n)) x))))) (strop decomp-canon-table c)) (let ([exclusions (map hex->num (map car (get-unicode-data "UNIDATA/CompositionExclusions.txt")))] [pairs '()]) (for-each (lambda (i) (unless (and (fx<= #xD800 i) (fx<= i #xDFFF)) (unless (memv i exclusions) (let* ([c (integer->char i)] [c* ($str-decomp-canon c)]) (when (pair? c*) (set! pairs (cons (cons c* c) pairs))))))) (iota #x110000)) (cons (list->vector (map car pairs)) (list->vector (map cdr pairs))))) (let ([ls (map data-case (get-unicode-data "UNIDATA/UnicodeData.txt"))]) (insert-foldcase-data! ls (get-unicode-data "UNIDATA/CaseFolding.txt")) (insert-specialcase-data! ls (get-unicode-data "UNIDATA/SpecialCasing.txt")) ; insert final sigma flag for char-downcase conversion (chardata-lcstr-set! (find-cdrec #x3a3 ls) 'sigma) (with-output-to-file* "unicode-char-cases.ss" (lambda () (parameterize ([print-graph #t] [print-unicode #f]) (pretty-print `(module ($char-upcase $char-downcase $char-titlecase $char-foldcase $str-upcase $str-downcase $str-titlecase $str-foldcase $str-decomp-canon $str-decomp-compat $composition-pairs) (define char-upcase-table ',(build-table chardata-ucchar ls)) (define char-downcase-table ',(build-table chardata-lcchar ls)) (define char-titlecase-table ',(build-table chardata-tcchar ls)) (define char-foldcase-table ',(build-table chardata-fcchar ls)) (define string-upcase-table ',(build-table chardata-ucstr ls)) (define string-downcase-table ',(build-table chardata-lcstr ls)) (define string-titlecase-table ',(build-table chardata-tcstr ls)) (define string-foldcase-table ',(build-table chardata-fcstr ls)) (define decomp-canon-table ',(build-table chardata-decomp-canon ls)) (define decomp-compat-table ',(build-table chardata-decomp-compat ls)) (define table-limit ,table-limit) (define code-point-limit ,code-point-limit) (define table-ref ,table-ref-code) (define (charop tbl c) (let ([n (char->integer c)]) (if (and (fx< table-limit code-point-limit) (fx>= n table-limit)) c (integer->char (fx+ (table-ref tbl n) n))))) (define (strop tbl c) (let ([n (char->integer c)]) (if (and (fx< table-limit code-point-limit) (fx>= n table-limit)) c (let ([x (table-ref tbl n)]) (if (fixnum? x) (integer->char (fx+ x n)) x))))) (define ($char-upcase c) (charop char-upcase-table c)) (define ($char-downcase c) (charop char-downcase-table c)) (define ($char-titlecase c) (charop char-titlecase-table c)) (define ($char-foldcase c) (charop char-foldcase-table c)) (define ($str-upcase c) (strop string-upcase-table c)) (define ($str-downcase c) (strop string-downcase-table c)) (define ($str-titlecase c) (strop string-titlecase-table c)) (define ($str-foldcase c) (strop string-foldcase-table c)) (define ($str-decomp-canon c) (strop decomp-canon-table c)) (define ($str-decomp-compat c) (strop decomp-compat-table c)) (define ($composition-pairs) ',(get-composition-pairs (build-table chardata-decomp-canon ls))))))))) (printf "Happy Happy Joy Joy ~a\n" (sizeof cache))