ikarus/scheme/unicode/extract-char-cases.ss

222 lines
7.2 KiB
Scheme
Executable File

#!/usr/bin/env ikarus --r6rs-script
;;; Ikarus Scheme -- A compiler for R6RS Scheme.
;;; Copyright (C) 2006,2007,2008 Abdulaziz Ghuloum
;;;
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License version 3 as
;;; published by the Free Software Foundation.
;;;
;;; This program is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
(import
(ikarus)
(unicode-data))
(define license
'";;; Ikarus Scheme -- A compiler for R6RS Scheme.
;;; Copyright (C) 2006,2007,2008 Abdulaziz Ghuloum
;;;
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License version 3 as
;;; published by the Free Software Foundation.
;;;
;;; This program is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
")
(define (hex->num x)
(read (open-string-input-port (format "#x~a" x))))
(define data-case
(lambda (fields)
(let ([num (car fields)]
[uc (list-ref fields uc-index)]
[lc (list-ref fields lc-index)]
[tc (list-ref fields tc-index)])
(let ([n (hex->num num)])
(define (f x)
(if (string=? x "") 0 (- (hex->num x) n)))
;#(UC LC TC FC string-FC)
(cons n (vector (f uc) (f lc) (f tc) #f 0 (f uc) (f lc) (f tc)))))))
(define (remove-dups ls)
(let f ([ls ls] [last #f])
(cond
[(null? ls) '()]
[(equal? (cdar ls) last) (f (cdr ls) last)]
[else
(cons (car ls) (f (cdr ls) (cdar ls)))])))
(define (compute-foldcase ls)
(define (find-vec idx)
(cond
[(assq idx ls) => cdr]
[else (error 'find-vec "~s is missing" idx)]))
(define (upper i)
(+ i (vector-ref (find-vec i) 0)))
(define (lower i)
(+ i (vector-ref (find-vec i) 1)))
(define (set-folder! i j)
(vector-set! (find-vec i) 3 (- j i)))
(for-each
(lambda (x)
(let ([idx (car x)] [vec (cdr x)])
(vector-set! vec 3
(- (lower (upper idx)) idx))))
ls)
(for-each
(lambda (idx)
(let ([vec (find-vec idx)])
(vector-set! vec 3 0)))
;; turkic chars
'(#x130 #x131))
ls)
(define uc-index 12)
(define lc-index 13)
(define tc-index 14)
(define (remove-spaces str)
(cond
[(= (string-length str) 0) str]
[(char=? (string-ref str 0) #\space)
(remove-spaces (substring str 1 (string-length str)))]
[else str]))
(define (split str)
(let f ([i 0] [n (string-length str)])
(cond
[(= i n) (list (substring str 0 n))]
[(char=? (string-ref str i) #\space)
(cons (substring str 0 i)
(split (substring str (+ i 1) n)))]
[else (f (add1 i) n)])))
(define (improperize ls)
(cond
[(null? (cdr ls)) (car ls)]
[else (cons (car ls) (improperize (cdr ls)))]))
(define (convert-full-fold-fields ls)
(cond
[(null? ls) '()]
[else
(let ([fields (car ls)])
(let ([cat (remove-spaces (cadr fields))])
(cond
[(member cat '("C" "F"))
(let ([n (hex->num (remove-spaces (car fields)))])
(let ([c* (map hex->num
(map remove-spaces
(split
(remove-spaces (caddr fields)))))])
(cons
(cons n
(if (= (length c*) 1)
(- (car c*) n)
(improperize (map integer->char c*))))
(convert-full-fold-fields (cdr ls)))))]
[else (convert-full-fold-fields (cdr ls))])))]))
(define-struct spcase (lc tc uc))
(define (convert-special-casing ls)
(cond
[(null? ls) '()]
[else
(let ([fields (car ls)])
(cond
[(or (<= (length fields) 4)
(= 0 (string-length (remove-spaces (list-ref fields 4)))))
(let ([n (hex->num (remove-spaces (car fields)))])
(define (field-data str)
(let ([c* (map hex->num
(map remove-spaces
(split (remove-spaces str))))])
(if (= (length c*) 1)
(- (car c*) n)
(improperize (map integer->char c*)))))
(cons
(cons n
(make-spcase
(field-data (list-ref fields 1))
(field-data (list-ref fields 2))
(field-data (list-ref fields 3))))
(convert-special-casing (cdr ls))))]
[else (convert-special-casing (cdr ls))]))]))
(define (with-output-to-file* file thunk)
(when (file-exists? file) (delete-file file))
(with-output-to-file file thunk))
(let ([ls
;;; get initial table
(compute-foldcase
(map data-case
(get-unicode-data "UNIDATA/UnicodeData.txt")))])
;;; compute the string-foldcase data
(for-each
(lambda (x)
(let ([n (car x)] [chars (cdr x)])
(cond
[(assq n ls) =>
(lambda (p)
(vector-set! (cdr p) 4 chars))]
[else (error #f "not there" n)])))
(convert-full-fold-fields
(get-unicode-data "UNIDATA/CaseFolding.txt")))
(for-each
(lambda (x)
(let ([n (car x)] [cases (cdr x)])
(cond
[(assq n ls) =>
(lambda (p)
(let ([v (cdr p)])
(vector-set! (cdr p) 5 (spcase-uc cases))
(vector-set! (cdr p) 6 (spcase-lc cases))
(vector-set! (cdr p) 7 (spcase-tc cases))))]
[else (error #f "not here" n)])))
(convert-special-casing
(get-unicode-data "UNIDATA/SpecialCasing.txt")))
;;; done
(let ([ls (remove-dups ls)])
(define (p name idx)
(pretty-print
`(define ,name
',(list->vector (map (lambda (x) (vector-ref (cdr x) idx)) ls)))))
(parameterize ([print-unicode #f] [pretty-width 80])
(let ([v0 (list->vector (map car ls))])
(with-output-to-file* "unicode-char-cases.ss"
(lambda ()
(display license)
(printf ";;; DO NOT EDIT\n;;; automatically generated\n")
(printf ";;; ~s entries in table\n" (vector-length v0))
(pretty-print `(define charcase-search-vector ',v0))
(p 'char-upcase-adjustment-vector 0)
(p 'char-downcase-adjustment-vector 1)
(p 'char-titlecase-adjustment-vector 2)
(p 'char-foldcase-adjustment-vector 3)
(p 'string-foldcase-adjustment-vector 4)
(p 'string-upcase-adjustment-vector 5)
(p 'string-downcase-adjustment-vector 6)
(p 'string-titlecase-adjustment-vector 7)
))))))
(printf "Happy Happy Joy Joy\n")