2007-10-25 16:27:34 -04:00
|
|
|
;;; Ikarus Scheme -- A compiler for R6RS Scheme.
|
|
|
|
;;; Copyright (C) 2006,2007 Abdulaziz Ghuloum
|
|
|
|
;;;
|
|
|
|
;;; This program is free software: you can redistribute it and/or modify
|
|
|
|
;;; it under the terms of the GNU General Public License version 3 as
|
|
|
|
;;; published by the Free Software Foundation.
|
|
|
|
;;;
|
|
|
|
;;; This program is distributed in the hope that it will be useful, but
|
|
|
|
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
;;; General Public License for more details.
|
|
|
|
;;;
|
|
|
|
;;; You should have received a copy of the GNU General Public License
|
|
|
|
;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
2007-05-18 18:02:49 -04:00
|
|
|
|
|
|
|
(library (ikarus transcoders)
|
2007-12-03 04:28:41 -05:00
|
|
|
(export string->utf8 utf8->string string->utf16 string->utf32
|
|
|
|
utf16->string utf32->string)
|
2007-12-03 03:00:01 -05:00
|
|
|
(import (except (ikarus) string->utf8 utf8->string string->utf16
|
|
|
|
utf16->string string->utf32 utf32->string)
|
2007-05-18 18:02:49 -04:00
|
|
|
(ikarus system $strings)
|
|
|
|
(ikarus system $bytevectors)
|
|
|
|
(ikarus system $fx)
|
2007-05-18 20:18:55 -04:00
|
|
|
(ikarus system $chars))
|
2007-12-03 01:35:29 -05:00
|
|
|
|
2007-05-18 18:02:49 -04:00
|
|
|
;;; From http://en.wikipedia.org/wiki/UTF-8
|
|
|
|
;;; hexadecimal binary scalar value UTF8
|
|
|
|
;;; 000000-00007F 00000000_00000000_0zzzzzzz 0zzzzzzz
|
|
|
|
;;; 000080-0007FF 00000000_00000yyy_yyzzzzzz 110yyyyy 10zzzzzz
|
|
|
|
;;; 000800-00FFFF 00000000_xxxxyyyy_yyzzzzzz 1110xxxx 10yyyyyy 10zzzzzz
|
|
|
|
;;; 010000-10FFFF 000wwwxx_xxxxyyyy_yyzzzzzz 11110www 10xxxxxx 10yyyyyy 10zzzzzz
|
|
|
|
|
|
|
|
;;; valid ranges: [000000 - 00D7FF] \union [00E000 - 10FFFF]
|
|
|
|
;;; invalid hole: [00D800 - 00DFFF]
|
|
|
|
|
|
|
|
;;; handling-modes: ignore, replace, raise
|
|
|
|
;;; ignore: skips over the offending bytes
|
|
|
|
;;; replace: places a U+FFFD in place of the malformed bytes
|
|
|
|
;;; raise: raises an error
|
|
|
|
|
2007-12-03 11:06:29 -05:00
|
|
|
;;; It appears that utf-8 data can start with a #xEF #xBB #xBF BOM!
|
|
|
|
|
2007-12-03 04:28:41 -05:00
|
|
|
(define integer->char/invalid
|
|
|
|
(lambda (n)
|
|
|
|
(cond
|
|
|
|
[(not (fixnum? n)) #\xFFFD]
|
|
|
|
[($fx<= n #xD7FF) ($fixnum->char n)]
|
|
|
|
[($fx< n #xE000) #\xFFFD]
|
|
|
|
[($fx<= n #x10FFFF) ($fixnum->char n)]
|
|
|
|
[else #\xFFFD])))
|
|
|
|
|
2007-10-12 00:33:19 -04:00
|
|
|
(define string->utf8
|
2007-05-18 18:02:49 -04:00
|
|
|
(lambda (str)
|
2007-05-18 20:18:55 -04:00
|
|
|
(define (utf8-string-size str)
|
|
|
|
(let f ([str str] [i 0] [j ($string-length str)] [n 0])
|
|
|
|
(cond
|
|
|
|
[($fx= i j) n]
|
|
|
|
[else
|
|
|
|
(let ([c ($string-ref str i)])
|
|
|
|
(let ([b ($char->fixnum c)])
|
|
|
|
(f str ($fxadd1 i) j
|
|
|
|
($fx+ n
|
|
|
|
(cond
|
|
|
|
[($fx<= b #x7F) 1]
|
|
|
|
[($fx<= b #x7FF) 2]
|
|
|
|
[($fx<= b #xFFFF) 3]
|
|
|
|
[else 4])))))])))
|
|
|
|
(define (fill-utf8-bytevector bv str)
|
|
|
|
(let f ([bv bv] [str str] [i 0] [j 0] [n ($string-length str)])
|
|
|
|
(cond
|
|
|
|
[($fx= i n) bv]
|
|
|
|
[else
|
|
|
|
(let ([c ($string-ref str i)])
|
|
|
|
(let ([b ($char->fixnum c)])
|
|
|
|
(cond
|
|
|
|
[($fx<= b #x7F)
|
|
|
|
($bytevector-set! bv j b)
|
|
|
|
(f bv str ($fxadd1 i) ($fxadd1 j) n)]
|
|
|
|
[($fx<= b #x7FF)
|
|
|
|
($bytevector-set! bv j
|
|
|
|
($fxlogor #b11000000 ($fxsra b 6)))
|
|
|
|
($bytevector-set! bv ($fx+ j 1)
|
|
|
|
($fxlogor #b10000000 ($fxlogand b #b111111)))
|
|
|
|
(f bv str ($fxadd1 i) ($fx+ j 2) n)]
|
|
|
|
[($fx<= b #xFFFF)
|
|
|
|
($bytevector-set! bv j
|
|
|
|
($fxlogor #b11100000 ($fxsra b 12)))
|
|
|
|
($bytevector-set! bv ($fx+ j 1)
|
|
|
|
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
|
|
|
|
($bytevector-set! bv ($fx+ j 2)
|
|
|
|
($fxlogor #b10000000 ($fxlogand b #b111111)))
|
|
|
|
(f bv str ($fxadd1 i) ($fx+ j 3) n)]
|
|
|
|
[else
|
|
|
|
($bytevector-set! bv j
|
|
|
|
($fxlogor #b11110000 ($fxsra b 18)))
|
|
|
|
($bytevector-set! bv ($fx+ j 1)
|
|
|
|
($fxlogor #b10000000 ($fxlogand ($fxsra b 12) #b111111)))
|
|
|
|
($bytevector-set! bv ($fx+ j 2)
|
|
|
|
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
|
|
|
|
($bytevector-set! bv ($fx+ j 3)
|
|
|
|
($fxlogor #b10000000 ($fxlogand b #b111111)))
|
|
|
|
(f bv str ($fxadd1 i) ($fx+ j 4) n)])))])))
|
2007-05-18 18:02:49 -04:00
|
|
|
(unless (string? str)
|
2007-10-25 14:32:26 -04:00
|
|
|
(error 'string->utf8 "not a string" str))
|
2007-05-18 18:02:49 -04:00
|
|
|
(fill-utf8-bytevector
|
|
|
|
($make-bytevector (utf8-string-size str))
|
|
|
|
str)))
|
|
|
|
|
2007-12-02 23:43:22 -05:00
|
|
|
(define (utf8->string x)
|
|
|
|
(unless (bytevector? x)
|
|
|
|
(error 'utf8->string "not a bytevector" x))
|
|
|
|
(decode-utf8-bytevector x 'replace))
|
2007-10-12 00:33:19 -04:00
|
|
|
|
|
|
|
(define decode-utf8-bytevector
|
2007-05-18 20:18:55 -04:00
|
|
|
(let ()
|
2007-10-12 00:33:19 -04:00
|
|
|
(define who 'decode-utf8-bytevector)
|
2007-12-03 11:06:29 -05:00
|
|
|
(define (count bv i mode)
|
|
|
|
(let f ([x bv] [i i] [j ($bytevector-length bv)] [n 0] [mode mode])
|
2007-05-18 20:18:55 -04:00
|
|
|
(cond
|
|
|
|
[($fx= i j) n]
|
|
|
|
[else
|
|
|
|
(let ([b0 ($bytevector-u8-ref x i)])
|
|
|
|
(cond
|
|
|
|
[($fx<= b0 #x7F)
|
|
|
|
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
|
|
|
|
[($fx= ($fxsra b0 5) #b110)
|
|
|
|
(let ([i ($fxadd1 i)])
|
|
|
|
(cond
|
|
|
|
[($fx< i j)
|
|
|
|
(let ([b1 ($bytevector-u8-ref x i)])
|
|
|
|
(cond
|
2007-12-03 05:43:53 -05:00
|
|
|
[(and ($fx= ($fxsra b1 6) #b10)
|
|
|
|
;;; 000080-0007FF
|
|
|
|
(let ([n (fxlogor (fxsll (fxlogand b0 #x1F) 6)
|
|
|
|
(fxlogand b1 #x3F))])
|
|
|
|
(and (fx>= n #x80)
|
|
|
|
(fx<= n #x7FF))))
|
2007-05-18 20:18:55 -04:00
|
|
|
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
|
|
|
|
[(eq? mode 'ignore)
|
|
|
|
(f x i j n mode)]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
(f x i j ($fxadd1 n) mode)]
|
|
|
|
[else
|
2007-10-25 14:32:26 -04:00
|
|
|
(error who "invalid byte sequence at idx of bytevector"
|
|
|
|
b0 b1 i bv)]))]
|
2007-05-18 20:18:55 -04:00
|
|
|
[(eq? mode 'ignore) n]
|
|
|
|
[(eq? mode 'replace) ($fxadd1 n)]
|
|
|
|
[else
|
2007-10-25 14:32:26 -04:00
|
|
|
(error who "invalid byte near end of bytevector" b0)]))]
|
2007-05-18 20:18:55 -04:00
|
|
|
[($fx= ($fxsra b0 4) #b1110)
|
|
|
|
(cond
|
|
|
|
[($fx< ($fx+ i 2) j)
|
|
|
|
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
|
|
|
|
[b2 ($bytevector-u8-ref x ($fx+ i 2))])
|
|
|
|
(cond
|
2007-12-03 05:43:53 -05:00
|
|
|
[(and ($fx= ($fxsra ($fxlogor b1 b2) 6) #b10)
|
|
|
|
(let ([n (fx+ (fxsll (fxlogand b0 #xF) 12)
|
|
|
|
(fx+ (fxsll (fxlogand b1 #x3F) 6)
|
|
|
|
(fxlogand b2 #x3F)))])
|
|
|
|
;;; REVIEW LATER ; 000800-00FFFF
|
|
|
|
(and (fx>= n #x0000800) (fx<= n #x00FFFF))))
|
2007-05-18 20:18:55 -04:00
|
|
|
(f x ($fx+ i 3) j ($fxadd1 n) mode)]
|
|
|
|
[(eq? mode 'ignore)
|
|
|
|
(f x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
|
2007-10-25 14:32:26 -04:00
|
|
|
[else (error who "invalid sequence" b0 b1 b2)]))]
|
2007-05-18 20:18:55 -04:00
|
|
|
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
|
|
|
|
[else (error who "incomplete char sequence")])]
|
|
|
|
[($fx= ($fxsra b0 3) #b11110)
|
|
|
|
(cond
|
|
|
|
[($fx< ($fx+ i 3) j)
|
|
|
|
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
|
|
|
|
[b2 ($bytevector-u8-ref x ($fx+ i 2))]
|
|
|
|
[b3 ($bytevector-u8-ref x ($fx+ i 3))])
|
|
|
|
(cond
|
2007-12-03 05:43:53 -05:00
|
|
|
[(and ($fx= ($fxsra ($fxlogor b1 ($fxlogor b2 b3)) 6) #b10)
|
|
|
|
(let ([n
|
|
|
|
($fx+ ($fxlogand b3 #b111111)
|
|
|
|
($fx+ ($fxsll ($fxlogand b2 #b111111) 6)
|
|
|
|
($fx+ ($fxsll ($fxlogand b1 #b111111) 12)
|
|
|
|
($fxsll ($fxlogand b0 #b111) 18))))])
|
|
|
|
;;; 010000-10FFFF
|
|
|
|
(and (fx>= n #x10000) (fx<= n #x10FFFF))))
|
2007-05-18 20:18:55 -04:00
|
|
|
(f x ($fx+ i 4) j ($fxadd1 n) mode)]
|
|
|
|
[(eq? mode 'ignore)
|
|
|
|
(f x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
|
2007-10-25 14:32:26 -04:00
|
|
|
[else (error who "invalid sequence" b0 b1 b2 b3)]))]
|
2007-05-18 20:18:55 -04:00
|
|
|
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
|
|
|
|
[else (error who "incomplete char sequence")])]
|
|
|
|
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
|
2007-10-25 14:32:26 -04:00
|
|
|
[else (error who "invalid byte at index of bytevector" b0 i x)]))])))
|
2007-12-03 11:06:29 -05:00
|
|
|
(define (fill str bv i mode)
|
|
|
|
(let f ([str str] [x bv] [i i] [j ($bytevector-length bv)] [n 0] [mode mode])
|
2007-05-18 20:18:55 -04:00
|
|
|
(cond
|
|
|
|
[($fx= i j) str]
|
|
|
|
[else
|
|
|
|
(let ([b0 ($bytevector-u8-ref x i)])
|
|
|
|
(cond
|
|
|
|
[($fx<= b0 #x7F)
|
|
|
|
($string-set! str n ($fixnum->char b0))
|
|
|
|
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
|
|
|
|
[($fx= ($fxsra b0 5) #b110)
|
|
|
|
(let ([i ($fxadd1 i)])
|
|
|
|
(cond
|
|
|
|
[($fx< i j)
|
|
|
|
(let ([b1 ($bytevector-u8-ref x i)])
|
|
|
|
(cond
|
2007-12-03 05:43:53 -05:00
|
|
|
[(and ($fx= ($fxsra b1 6) #b10)
|
|
|
|
;;; 000080-0007FF
|
|
|
|
(let ([n (fxlogor (fxsll (fxlogand b0 #x1F) 6)
|
|
|
|
(fxlogand b1 #x3F))])
|
|
|
|
(and (fx>= n #x80)
|
|
|
|
(fx<= n #x7FF)
|
|
|
|
($fixnum->char n)))) =>
|
|
|
|
(lambda (c)
|
|
|
|
($string-set! str n c)
|
|
|
|
(f str x ($fxadd1 i) j ($fxadd1 n) mode))]
|
2007-05-18 20:18:55 -04:00
|
|
|
[(eq? mode 'ignore)
|
|
|
|
(f str x i j n mode)]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
($string-set! str n ($fixnum->char #xFFFD))
|
|
|
|
(f str x i j ($fxadd1 n) mode)]
|
|
|
|
[else (error who "BUG")]))]
|
|
|
|
[(eq? mode 'ignore) str]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
($string-set! str n ($fixnum->char #xFFFD))
|
|
|
|
str]
|
|
|
|
[else (error who "BUG")]))]
|
|
|
|
[($fx= ($fxsra b0 4) #b1110)
|
|
|
|
(cond
|
|
|
|
[($fx< ($fx+ i 2) j)
|
|
|
|
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
|
|
|
|
[b2 ($bytevector-u8-ref x ($fx+ i 2))])
|
|
|
|
(cond
|
2007-12-03 05:43:53 -05:00
|
|
|
[(and ($fx= ($fxsra ($fxlogor b1 b2) 6) #b10)
|
|
|
|
(let ([n (fx+ (fxsll (fxlogand b0 #xF) 12)
|
|
|
|
(fx+ (fxsll (fxlogand b1 #x3F) 6)
|
|
|
|
(fxlogand b2 #x3F)))])
|
|
|
|
;;; REVIEW LATER ; 000800-00FFFF
|
|
|
|
(and (and (fx>= n #x000800) (fx<= n #x00FFFF))
|
|
|
|
($fixnum->char n)))) =>
|
|
|
|
(lambda (c)
|
|
|
|
($string-set! str n c)
|
|
|
|
(f str x ($fx+ i 3) j ($fxadd1 n) mode))]
|
2007-05-18 20:18:55 -04:00
|
|
|
[(eq? mode 'ignore)
|
|
|
|
(f str x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
($string-set! str n ($fixnum->char #xFFFD))
|
|
|
|
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
|
|
|
|
[else (error who "BUG")]))]
|
|
|
|
[(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
($string-set! str n ($fixnum->char #xFFFD))
|
|
|
|
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
|
2007-05-18 20:59:10 -04:00
|
|
|
[else (error who "BUG")])]
|
2007-05-18 20:18:55 -04:00
|
|
|
[($fx= ($fxsra b0 3) #b11110)
|
|
|
|
(cond
|
|
|
|
[($fx< ($fx+ i 3) j)
|
|
|
|
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
|
|
|
|
[b2 ($bytevector-u8-ref x ($fx+ i 2))]
|
|
|
|
[b3 ($bytevector-u8-ref x ($fx+ i 3))])
|
|
|
|
(cond
|
2007-12-03 05:43:53 -05:00
|
|
|
[(and ($fx= ($fxsra ($fxlogor b1 ($fxlogor b2 b3)) 6) #b10)
|
|
|
|
(let ([n
|
|
|
|
($fx+ ($fxlogand b3 #b111111)
|
|
|
|
($fx+ ($fxsll ($fxlogand b2 #b111111) 6)
|
|
|
|
($fx+ ($fxsll ($fxlogand b1 #b111111) 12)
|
|
|
|
($fxsll ($fxlogand b0 #b111) 18))))])
|
|
|
|
;;; 010000-10FFFF
|
|
|
|
(and (fx>= n #x10000)
|
|
|
|
(fx<= n #x10FFFF)
|
|
|
|
($fixnum->char n)))) =>
|
|
|
|
(lambda (c)
|
|
|
|
($string-set! str n c)
|
|
|
|
(f str x ($fx+ i 4) j ($fxadd1 n) mode))]
|
2007-05-18 20:18:55 -04:00
|
|
|
[(eq? mode 'ignore)
|
|
|
|
(f str x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
($string-set! str n ($fixnum->char #xFFFD))
|
|
|
|
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
|
|
|
|
[else (error who "BUG")]))]
|
|
|
|
[(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
($string-set! str n ($fixnum->char #xFFFD))
|
|
|
|
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
|
|
|
|
[else (error who "BUG")])]
|
|
|
|
[(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
|
|
|
|
[(eq? mode 'replace)
|
|
|
|
($string-set! str n ($fixnum->char #xFFFD))
|
|
|
|
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
|
|
|
|
[else (error who "BUG")]))])))
|
2007-12-03 11:06:29 -05:00
|
|
|
(define (has-bom? bv)
|
|
|
|
(and (fx> (bytevector-length bv) 3)
|
|
|
|
(fx= (bytevector-u8-ref bv 0) #xEF)
|
|
|
|
(fx= (bytevector-u8-ref bv 1) #xBB)
|
|
|
|
(fx= (bytevector-u8-ref bv 2) #xBF)))
|
2007-05-18 20:18:55 -04:00
|
|
|
(define (convert bv mode)
|
2007-12-03 11:06:29 -05:00
|
|
|
(cond
|
|
|
|
[(has-bom? bv)
|
|
|
|
(fill ($make-string (count bv 3 mode)) bv 3 mode)]
|
|
|
|
[else
|
|
|
|
(fill ($make-string (count bv 0 mode)) bv 0 mode)]))
|
2007-05-18 20:18:55 -04:00
|
|
|
(case-lambda
|
2007-05-18 20:59:10 -04:00
|
|
|
[(bv) (convert bv 'raise)]
|
2007-05-18 20:18:55 -04:00
|
|
|
[(bv handling-mode)
|
|
|
|
(unless (memq handling-mode '(ignore replace raise))
|
2007-10-12 00:33:19 -04:00
|
|
|
(error 'decode-utf8-bytevector
|
2007-10-25 14:32:26 -04:00
|
|
|
"not a valid handling mode"
|
2007-05-18 20:18:55 -04:00
|
|
|
handling-mode))
|
|
|
|
(convert bv handling-mode)])))
|
2007-05-18 18:02:49 -04:00
|
|
|
|
2007-12-03 01:35:29 -05:00
|
|
|
|
|
|
|
;;; From: http://tools.ietf.org/html/rfc2781
|
|
|
|
;;;
|
|
|
|
;;; 2.1 Encoding UTF-16
|
|
|
|
;;;
|
|
|
|
;;; Encoding of a single character from an ISO 10646 character value
|
|
|
|
;;; to UTF-16 proceeds as follows. Let U be the character number, no
|
|
|
|
;;; greater than 0x10FFFF.
|
|
|
|
;;;
|
|
|
|
;;; 1) If U < 0x10000, encode U as a 16-bit unsigned integer and terminate.
|
|
|
|
;;;
|
|
|
|
;;; 2) Let U' = U - 0x10000. Because U is less than or equal to 0x10FFFF,
|
|
|
|
;;; U' must be less than or equal to 0xFFFFF. That is, U' can be
|
|
|
|
;;; represented in 20 bits.
|
|
|
|
;;;
|
|
|
|
;;; 3) Initialize two 16-bit unsigned integers, W1 and W2, to 0xD800 and
|
|
|
|
;;; 0xDC00, respectively. These integers each have 10 bits free to
|
|
|
|
;;; encode the character value, for a total of 20 bits.
|
|
|
|
;;;
|
|
|
|
;;; 4) Assign the 10 high-order bits of the 20-bit U' to the 10 low-order
|
|
|
|
;;; bits of W1 and the 10 low-order bits of U' to the 10 low-order
|
|
|
|
;;; bits of W2. Terminate.
|
|
|
|
;;;
|
|
|
|
;;; Graphically, steps 2 through 4 look like:
|
|
|
|
;;; U' = yyyyyyyyyyxxxxxxxxxx
|
|
|
|
;;; W1 = 110110yyyyyyyyyy
|
|
|
|
;;; W2 = 110111xxxxxxxxxx
|
2007-12-03 03:00:01 -05:00
|
|
|
;;;
|
|
|
|
;;; Decoding of a single character from UTF-16 to an ISO 10646 character
|
|
|
|
;;; value proceeds as follows. Let W1 be the next 16-bit integer in the
|
|
|
|
;;; sequence of integers representing the text. Let W2 be the (eventual)
|
|
|
|
;;; next integer following W1.
|
|
|
|
;;;
|
|
|
|
;;; 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
|
|
|
|
;;; of W1. Terminate.
|
|
|
|
;;;
|
|
|
|
;;; 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
|
|
|
|
;;; is in error and no valid character can be obtained using W1.
|
|
|
|
;;; Terminate.
|
|
|
|
;;;
|
|
|
|
;;; 3) If there is no W2 (that is, the sequence ends with W1), or if W2
|
|
|
|
;;; is not between 0xDC00 and 0xDFFF, the sequence is in error.
|
|
|
|
;;; Terminate.
|
|
|
|
;;;
|
|
|
|
;;; 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
|
|
|
|
;;; bits of W1 as its 10 high-order bits and the 10 low-order bits of
|
|
|
|
;;; W2 as its 10 low-order bits.
|
2007-12-03 04:28:41 -05:00
|
|
|
;;; 5) Add 0x10000 to U' to obtain the character value U.
|
|
|
|
;;; Terminate.
|
|
|
|
|
2007-12-03 03:00:01 -05:00
|
|
|
|
2007-12-03 01:35:29 -05:00
|
|
|
|
|
|
|
(module (string->utf16)
|
|
|
|
(define ($string->utf16 str endianness)
|
|
|
|
(define (count-surr* str len i n)
|
|
|
|
(cond
|
|
|
|
[(fx= i len) n]
|
|
|
|
[else
|
|
|
|
(let ([c (string-ref str i)])
|
|
|
|
(cond
|
|
|
|
[(char<? c #\x10000)
|
|
|
|
(count-surr* str len (fx+ i 1) n)]
|
|
|
|
[else
|
|
|
|
(count-surr* str len (fx+ i 1) (fx+ n 1))]))]))
|
|
|
|
(define (bvfill str bv i j len endianness)
|
|
|
|
(cond
|
|
|
|
[(fx= i len) bv]
|
|
|
|
[else
|
|
|
|
(let ([n (char->integer (string-ref str i))])
|
|
|
|
(cond
|
|
|
|
[(fx< n #x10000)
|
|
|
|
(bytevector-u16-set! bv j n endianness)
|
|
|
|
(bvfill str bv (fx+ i 1) (fx+ j 2) len endianness)]
|
|
|
|
[else
|
|
|
|
(let ([u^ (fx- n #x10000)])
|
|
|
|
(bytevector-u16-set! bv j
|
|
|
|
(fxlogor (fxsll #b110110 10) (fxsra u^ 10))
|
|
|
|
endianness)
|
|
|
|
(bytevector-u16-set! bv (fx+ j 2)
|
2007-12-03 04:28:41 -05:00
|
|
|
(fxlogor (fxsll #b110111 10) (fxlogand u^ #x3FF))
|
2007-12-03 01:35:29 -05:00
|
|
|
endianness))
|
|
|
|
(bvfill str bv (fx+ i 1) (fx+ j 4) len endianness)]))]))
|
|
|
|
(let ([len ($string-length str)])
|
|
|
|
(let ([n (count-surr* str len 0 0)])
|
|
|
|
;;; FIXME: maybe special case for n=0 later
|
|
|
|
(let ([bv (make-bytevector (fxsll (fx+ len n) 1))])
|
|
|
|
(bvfill str bv 0 0 len endianness)))))
|
|
|
|
(define string->utf16
|
|
|
|
(case-lambda
|
|
|
|
[(str)
|
|
|
|
(unless (string? str)
|
|
|
|
(error 'string->utf16 "not a string" str))
|
|
|
|
($string->utf16 str 'big)]
|
|
|
|
[(str endianness)
|
|
|
|
(unless (string? str)
|
|
|
|
(error 'string->utf16 "not a string" str))
|
|
|
|
(unless (memv endianness '(big little))
|
|
|
|
(error 'string->utf16 "invalid endianness" endianness))
|
|
|
|
($string->utf16 str endianness)])))
|
|
|
|
|
2007-12-03 03:00:01 -05:00
|
|
|
(module (utf16->string)
|
|
|
|
(define who 'utf16->string)
|
|
|
|
(define (count-size bv endianness i len n)
|
|
|
|
(cond
|
|
|
|
[(fx= i len)
|
|
|
|
(if (fx= len (bytevector-length bv))
|
|
|
|
n
|
|
|
|
(+ n 1))]
|
|
|
|
[else
|
|
|
|
(let ([w1 (bytevector-u16-ref bv i endianness)])
|
|
|
|
(cond
|
|
|
|
[(or (fx< w1 #xD800) (fx> w1 #xDFFF))
|
|
|
|
(count-size bv endianness (+ i 2) len (+ n 1))]
|
2007-12-03 04:28:41 -05:00
|
|
|
[(not (fx<= #xD800 w1 #xDBFF)) ;;; error sequence
|
2007-12-03 03:00:01 -05:00
|
|
|
(count-size bv endianness (+ i 2) len (+ n 1))]
|
|
|
|
[(<= (+ i 4) (bytevector-length bv))
|
|
|
|
(let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)])
|
|
|
|
(cond
|
|
|
|
[(not (<= #xDC00 w2 #xDFFF))
|
|
|
|
;;; do we skip w2 also?
|
|
|
|
;;; I won't. Just w1 is an error
|
|
|
|
(count-size bv endianness (+ i 2) len (+ n 1))]
|
|
|
|
[else
|
|
|
|
;;; 4-byte sequence is ok
|
|
|
|
(count-size bv endianness (+ i 4) len (+ n 1))]))]
|
|
|
|
[else
|
|
|
|
;;; error again
|
|
|
|
(count-size bv endianness (+ i 2) len (+ n 1))]))]))
|
|
|
|
(define (fill bv endianness str i len n)
|
|
|
|
(cond
|
|
|
|
[(fx= i len)
|
|
|
|
(unless (fx= len (bytevector-length bv))
|
|
|
|
(string-set! str n #\xFFFD))
|
|
|
|
str]
|
|
|
|
[else
|
|
|
|
(let ([w1 (bytevector-u16-ref bv i endianness)])
|
|
|
|
(cond
|
|
|
|
[(or (fx< w1 #xD800) (fx> w1 #xDFFF))
|
2007-12-03 04:28:41 -05:00
|
|
|
(string-set! str n (integer->char/invalid w1))
|
2007-12-03 03:00:01 -05:00
|
|
|
(fill bv endianness str (+ i 2) len (+ n 1))]
|
2007-12-03 04:28:41 -05:00
|
|
|
[(not (fx<= #xD800 w1 #xDBFF)) ;;; error sequence
|
2007-12-03 03:00:01 -05:00
|
|
|
(string-set! str n #\xFFFD)
|
|
|
|
(fill bv endianness str (+ i 2) len (+ n 1))]
|
|
|
|
[(<= (+ i 4) (bytevector-length bv))
|
|
|
|
(let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)])
|
|
|
|
(cond
|
|
|
|
[(not (<= #xDC00 w2 #xDFFF))
|
|
|
|
;;; do we skip w2 also?
|
|
|
|
;;; I won't. Just w1 is an error
|
|
|
|
(string-set! str n #\xFFFD)
|
|
|
|
(fill bv endianness str (+ i 2) len (+ n 1))]
|
|
|
|
[else
|
|
|
|
(string-set! str n
|
2007-12-03 04:28:41 -05:00
|
|
|
(integer->char/invalid
|
|
|
|
(+ #x10000
|
|
|
|
(fxlogor (fxsll (fxlogand w1 #x3FF) 10)
|
|
|
|
(fxlogand w2 #x3FF)))))
|
|
|
|
(fill bv endianness str (+ i 4) len (+ n 1))]))]
|
2007-12-03 03:00:01 -05:00
|
|
|
[else
|
|
|
|
;;; error again
|
|
|
|
(string-set! str n #\xFFFD)
|
2007-12-03 04:28:41 -05:00
|
|
|
(fill bv endianness str (+ i 2) len (+ n 1))]))]))
|
2007-12-03 03:00:01 -05:00
|
|
|
(define (decode bv endianness start)
|
|
|
|
(let ([len (fxand (bytevector-length bv) -2)])
|
|
|
|
(let ([n (count-size bv endianness start len 0)])
|
|
|
|
(let ([str (make-string n)])
|
|
|
|
(fill bv endianness str start len 0)))))
|
|
|
|
(define ($utf16->string bv endianness em?)
|
|
|
|
(define (bom-present bv)
|
|
|
|
(and (fx>= (bytevector-length bv) 2)
|
|
|
|
(let ([n (bytevector-u16-ref bv 0 'big)])
|
|
|
|
(cond
|
|
|
|
[(fx= n #xFEFF) 'big]
|
|
|
|
[(fx= n #xFFFE) 'little]
|
|
|
|
[else #f]))))
|
|
|
|
(unless (bytevector? bv)
|
|
|
|
(error who "not a bytevector" bv))
|
|
|
|
(unless (memv endianness '(big little))
|
|
|
|
(error who "invalid endianness" endianness))
|
|
|
|
(cond
|
|
|
|
[em? (decode bv endianness 0)]
|
|
|
|
[(bom-present bv) =>
|
|
|
|
(lambda (endianness)
|
|
|
|
(decode bv endianness 2))]
|
|
|
|
[else
|
|
|
|
(decode bv endianness 0)]))
|
|
|
|
(define utf16->string
|
|
|
|
(case-lambda
|
|
|
|
[(bv endianness)
|
|
|
|
($utf16->string bv endianness #f)]
|
|
|
|
[(bv endianness em?)
|
|
|
|
($utf16->string bv endianness em?)])))
|
|
|
|
|
|
|
|
|
|
|
|
|
2007-12-03 01:35:29 -05:00
|
|
|
(module (string->utf32)
|
|
|
|
(define who 'string->utf32)
|
|
|
|
(define (vfill str bv i len endianness)
|
|
|
|
(cond
|
|
|
|
[(fx= i len) bv]
|
|
|
|
[else
|
|
|
|
(bytevector-u32-set! bv (fxsll i 2)
|
|
|
|
(char->integer (string-ref str i))
|
|
|
|
endianness)
|
|
|
|
(vfill str bv (fx+ i 1) len endianness)]))
|
|
|
|
(define ($string->utf32 str endianness)
|
|
|
|
(let ([len (string-length str)])
|
|
|
|
(vfill str (make-bytevector (fxsll len 2)) 0 len endianness)))
|
|
|
|
(define string->utf32
|
|
|
|
(case-lambda
|
|
|
|
[(str)
|
|
|
|
(unless (string? str)
|
|
|
|
(error who "not a string" str))
|
|
|
|
($string->utf32 str 'big)]
|
|
|
|
[(str endianness)
|
|
|
|
(unless (string? str)
|
|
|
|
(error who "not a string" str))
|
|
|
|
(unless (memq endianness '(little big))
|
|
|
|
(error who "invalid endianness" endianness))
|
|
|
|
($string->utf32 str endianness)])))
|
|
|
|
|
|
|
|
|
|
|
|
|
2007-12-03 03:00:01 -05:00
|
|
|
(module (utf32->string)
|
|
|
|
(define who 'utf32->string)
|
|
|
|
(define (fill bv endianness str i j n)
|
|
|
|
(cond
|
|
|
|
[(fx= i j)
|
|
|
|
(unless (fx= n (string-length str))
|
|
|
|
(string-set! str n #\xFFFD))
|
|
|
|
str]
|
|
|
|
[else
|
|
|
|
(string-set! str n
|
2007-12-03 04:28:41 -05:00
|
|
|
(integer->char/invalid
|
|
|
|
(bytevector-u32-ref bv i endianness)))
|
2007-12-03 03:00:01 -05:00
|
|
|
(fill bv endianness str (fx+ i 4) j (fx+ n 1))]))
|
|
|
|
(define (decode bv endianness start)
|
|
|
|
(let ([bvlen (bytevector-length bv)])
|
|
|
|
(let ([strlen (fxsra (fx+ (fx- bvlen start) 3) 2)])
|
2007-12-03 11:06:29 -05:00
|
|
|
(fill bv endianness (make-string strlen)
|
|
|
|
start (fxand bvlen -4)
|
|
|
|
0))))
|
2007-12-03 03:00:01 -05:00
|
|
|
(define ($utf32->string bv endianness em?)
|
|
|
|
(define (bom-present bv)
|
|
|
|
(and (fx>= (bytevector-length bv) 4)
|
2007-12-03 11:06:29 -05:00
|
|
|
(let ([n (bytevector-u32-ref bv 0 'big)])
|
2007-12-03 03:00:01 -05:00
|
|
|
(cond
|
2007-12-03 04:28:41 -05:00
|
|
|
[(= n #x0000FEFF) 'big]
|
|
|
|
[(= n #xFFFE0000) 'little]
|
2007-12-03 03:00:01 -05:00
|
|
|
[else #f]))))
|
|
|
|
(unless (bytevector? bv)
|
|
|
|
(error who "not a bytevector" bv))
|
|
|
|
(unless (memv endianness '(big little))
|
|
|
|
(error who "invalid endianness" endianness))
|
|
|
|
(cond
|
2007-12-03 11:06:29 -05:00
|
|
|
[em? (decode bv endianness 0)]
|
2007-12-03 03:00:01 -05:00
|
|
|
[(bom-present bv) =>
|
|
|
|
(lambda (endianness)
|
|
|
|
(decode bv endianness 4))]
|
|
|
|
[else
|
|
|
|
(decode bv endianness 0)]))
|
|
|
|
(define utf32->string
|
|
|
|
(case-lambda
|
|
|
|
[(bv endianness)
|
|
|
|
($utf32->string bv endianness #f)]
|
|
|
|
[(bv endianness em?)
|
|
|
|
($utf32->string bv endianness em?)])))
|
2007-12-03 01:35:29 -05:00
|
|
|
|
2007-05-18 18:02:49 -04:00
|
|
|
)
|