#!/usr/bin/env ikarus --r6rs-script
;;; Ikarus Scheme -- A compiler for R6RS Scheme.
;;; Copyright (C) 2006,2007  Abdulaziz Ghuloum
;;; 
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License version 3 as
;;; published by the Free Software Foundation.
;;; 
;;; This program is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; General Public License for more details.
;;; 
;;; You should have received a copy of the GNU General Public License
;;; along with this program.  If not, see <http://www.gnu.org/licenses/>.


;;; this file is a mess.

(import 
  (ikarus)
  (unicode-data))

(define license 
'";;; Ikarus Scheme -- A compiler for R6RS Scheme.
;;; Copyright (C) 2006,2007  Abdulaziz Ghuloum
;;; 
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License version 3 as
;;; published by the Free Software Foundation.
;;; 
;;; This program is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; General Public License for more details.
;;; 
;;; You should have received a copy of the GNU General Public License
;;; along with this program.  If not, see <http://www.gnu.org/licenses/>.

")

(define (hex-string->number str)
  (or (string->number (string-append "#x" str)) 
      (error 'hex-string->number "invalid ~s" str)))

(define (find-char c s)
  (let f ([i 0] [n (string-length s)])
    (cond
      [(= i n) #f]
      [(char=? (string-ref s i) c) i]
      [else (f (add1 i) n)])))

(define (extract-range str)
 (cond
   [(find-char #\. str)
    =>
    (lambda (i) 
      (cons (hex-string->number (substring str 0 i))
            (hex-string->number (substring str (+ i 2) (string-length str)))))]
   [else 
    (let ([n (hex-string->number str)])
      (cons n n))]))




(define constituent-property  #x010000)
(define uppercase-property    #x020000)
(define lowercase-property    #x040000)
(define titlecase-property    #x080000)
(define alphabetic-property   #x100000)
(define numeric-property      #x200000)
(define whitespace-property   #x400000)

;;; Uppercase = Lu + Other_Uppercase
;;; Lowercase = Ll + Other_Lowercase
;;; Titlecase = Lt
;;; Alphabetic = Lu + Ll + Lt + Lm + Lo + Nl + Other_Alphabetic 
;;; Numeric = ???
;;; White_Space = 

(define proplist-properties
  `(["Other_Uppercase"  ,uppercase-property]
    ["Other_Lowercase"  ,lowercase-property]
    ["Other_Alphabetic" ,alphabetic-property]
    ["White_Space"      ,whitespace-property]))

(define categories
  ;;; 30 categories
  `([Lu ,(+ 00 constituent-property uppercase-property alphabetic-property) "Letter, Uppercase"]
    [Ll ,(+ 01 constituent-property lowercase-property alphabetic-property) "Letter, Lowercase"]
    [Lt ,(+ 02 constituent-property titlecase-property alphabetic-property) "Letter, Titlecase"]
    [Lm ,(+ 03 constituent-property alphabetic-property)  "Letter, Modifier"]
    [Lo ,(+ 04 constituent-property alphabetic-property)  "Letter, Other"]
    [Mn ,(+ 05 constituent-property)  "Mark, Nonspacing"]
    [Mc ,(+ 06 constituent-property)  "Mark, Spacing Combining"]
    [Me ,(+ 07 constituent-property)  "Mark, Enclosing"]
    [Nd ,(+ 08 constituent-property)  "Number, Decimal Digit"]
    [Nl ,(+ 09 constituent-property alphabetic-property)  "Number, Letter"]
    [No ,(+ 10 constituent-property)  "Number, Other"]
    [Pc ,(+ 11 constituent-property)  "Punctuation, Connector"]
    [Pd ,(+ 12 constituent-property)  "Punctuation, Dash"]
    [Ps ,(+ 13 )                      "Punctuation, Open"]
    [Pe ,(+ 14 )                      "Punctuation, Close"]
    [Pi ,(+ 15 )                      "Punctuation, Initial quote"]
    [Pf ,(+ 16 )                      "Punctuation, Final quote"]
    [Po ,(+ 17 constituent-property)  "Punctuation, Other"]
    [Sm ,(+ 18 constituent-property)  "Symbol, Math"]
    [Sc ,(+ 19 constituent-property)  "Symbol, Currency"]
    [Sk ,(+ 20 constituent-property)  "Symbol, Modifier"]
    [So ,(+ 21 constituent-property)  "Symbol, Other"]
    [Zs ,(+ 22 )                      "Separator, Space"]
    [Zl ,(+ 23 )                      "Separator, Line"]
    [Zp ,(+ 24 )                      "Separator, Paragraph"]
    [Cc ,(+ 25 )                      "Other, Control"]
    [Cf ,(+ 26 )                      "Other, Format"]
    [Cs ,(+ 27 )                      "Other, Surrogate"]
    [Co ,(+ 28 constituent-property)  "Other, Private Use"]
    [Cn ,(+ 29 )                      "Other, Not Assigned"]
  ))


(define (category-index x)
  (cond
    [(assq x categories) => cadr]
    [else (error 'category-index "invalid cat ~s" x)]))


(define (make-cats-table ls) 
  (let f ([i 1] [st (car ls)] [ls (cdr ls)] [ac '()])
    (cond
      [(null? ls) (reverse (cons (cons i st) ac))]
      [(equal? (cdar ls) (cdr st)) (f (add1 i) st (cdr ls) ac)]
      [else
       (f 1 (car ls) (cdr ls) (cons (cons i st) ac))])))


(define (merge-sequences ls) 
  (define (split ls) 
    (cond
      [(null? ls) (values '() '())]
      [(= (caar ls) 1) 
       (let-values ([(chain no-chain) (split (cdr ls))])
         (values (cons (cdar ls) chain) no-chain))]
      [else 
       (values '() ls)]))
  (define (mk-chain a chain) 
    (cond
      [(null? chain) a]
      [else 
       (cons (car a)
         (list->vector 
           (cons (cdr a) 
             (map cdr chain))))]))
  (cond
    [(null? ls) '()]
    [(= (caar ls) 1)
     (let-values ([(chain no-chain) (split (cdr ls))])
       (cons (mk-chain (cdar ls) chain)
         (merge-sequences no-chain)))]
    [else (cons (cdar ls) (merge-sequences (cdr ls)))]))

(define (iota i n)
  (let f ([i i] [n n] [ac '()])
    (cond
      [(= i n) ac]
      [else (f i (sub1 n) (cons (sub1 n) ac))])))

;;; first, make a big vector for all characters
;;; place all in category Cn, unless proven otherwise
(let ([v (make-vector (+ #x10FFFF 1) (category-index 'Cn))])
  (let ([ls (get-unicode-data "UNIDATA/UnicodeData.txt")])
    ;;; interesting parts of each element in ls are:
    ;;; field0: the character index, numeric
    ;;; field2: the category, symbolic
    ;;; field8: if set, then the char has the numeric property
    (define (setprop idx prop) 
      (vector-set! v idx prop))
    (let ([ls (map 
                (lambda (x) 
                  (let ([idx (hex-string->number (list-ref x 0))]
                        [cat (category-index (string->symbol (list-ref x 2)))]
                        [num? (list-ref x 8)])
                    (cons idx
                      (if (string=? num? "")
                          cat
                          (fxlogor cat numeric-property)))))
                ls)])
      (let f ([ls ls])
        (cond
          [(null? ls) (void)]
          [(null? (cdr ls)) (setprop (caar ls) (cdar ls))]
          [(or (= (+ 1 (caar ls)) (caadr ls))
               (not (= (cdar ls) (cdadr ls))))
           (setprop (caar ls) (cdar ls)) 
           (f (cdr ls))]
          [else
           (let f ([i (caar ls)] [j (caadr ls)] [p (cdar ls)])
             (unless (> i j) 
               (setprop i p)
               (f (add1 i) j p)))
           (f (cddr ls))]))))
  ;;; every element of v now maps to the category-index.
  (let ([ls (get-unicode-data "UNIDATA/PropList.txt")])
    ;;; field0 is a range
    ;;; field1 is a property name
    (for-each
      (lambda (x)
        (let ([range (extract-range (car x))]
              [name (cadr x)])
          (cond
            [(assoc name proplist-properties) =>
             (lambda (a) 
               (let ([n (cadr a)])
                 (let f ([i (car range)] [j (cdr range)])
                   (unless (> i j) 
                     (vector-set! v i (fxlogor (vector-ref v i) n))
                     (f (add1 i) j)))))])))
      ls))
  (let ([table 
         (merge-sequences 
           (make-cats-table
             (map cons 
                  (iota 0 (vector-length v))
                  (vector->list v))))])
    (with-output-to-file "unicode-charinfo.ss"
      (lambda () 
         (display license)
         (printf ";;; DO NOT EDIT\n")
         (printf ";;; automatically generated\n")
         (printf ";;; ~s elements in vectors\n\n" (length table))
         (pretty-print
           `(begin
              (define constituent-property  ,constituent-property ) 
              (define uppercase-property    ,uppercase-property   ) 
              (define lowercase-property    ,lowercase-property   ) 
              (define titlecase-property    ,titlecase-property   ) 
              (define alphabetic-property   ,alphabetic-property  ) 
              (define numeric-property      ,numeric-property     ) 
              (define whitespace-property   ,whitespace-property)))
         (pretty-print 
           `(define unicode-categories-lookup-vector 
              ',(list->vector (map car table))))
         (pretty-print 
           `(define unicode-categories-values-vector 
              ',(list->vector (map cdr table))))
         (pretty-print 
           `(define unicode-categories-name-vector 
              ',(list->vector (map car categories)))))
      'replace)))


(printf "Happy Happy Joy Joy\n")