string<->utf in progress
This commit is contained in:
parent
bdb73c70a8
commit
4efdebb3ac
|
@ -16,7 +16,8 @@
|
||||||
|
|
||||||
(library (ikarus transcoders)
|
(library (ikarus transcoders)
|
||||||
(export string->utf8 utf8->string string->utf16 string->utf32)
|
(export string->utf8 utf8->string string->utf16 string->utf32)
|
||||||
(import (except (ikarus) string->utf8 utf8->string)
|
(import (except (ikarus) string->utf8 utf8->string string->utf16
|
||||||
|
utf16->string string->utf32 utf32->string)
|
||||||
(ikarus system $strings)
|
(ikarus system $strings)
|
||||||
(ikarus system $bytevectors)
|
(ikarus system $bytevectors)
|
||||||
(ikarus system $fx)
|
(ikarus system $fx)
|
||||||
|
@ -292,6 +293,27 @@
|
||||||
;;; U' = yyyyyyyyyyxxxxxxxxxx
|
;;; U' = yyyyyyyyyyxxxxxxxxxx
|
||||||
;;; W1 = 110110yyyyyyyyyy
|
;;; W1 = 110110yyyyyyyyyy
|
||||||
;;; W2 = 110111xxxxxxxxxx
|
;;; W2 = 110111xxxxxxxxxx
|
||||||
|
;;;
|
||||||
|
;;; Decoding of a single character from UTF-16 to an ISO 10646 character
|
||||||
|
;;; value proceeds as follows. Let W1 be the next 16-bit integer in the
|
||||||
|
;;; sequence of integers representing the text. Let W2 be the (eventual)
|
||||||
|
;;; next integer following W1.
|
||||||
|
;;;
|
||||||
|
;;; 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
|
||||||
|
;;; of W1. Terminate.
|
||||||
|
;;;
|
||||||
|
;;; 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
|
||||||
|
;;; is in error and no valid character can be obtained using W1.
|
||||||
|
;;; Terminate.
|
||||||
|
;;;
|
||||||
|
;;; 3) If there is no W2 (that is, the sequence ends with W1), or if W2
|
||||||
|
;;; is not between 0xDC00 and 0xDFFF, the sequence is in error.
|
||||||
|
;;; Terminate.
|
||||||
|
;;;
|
||||||
|
;;; 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
|
||||||
|
;;; bits of W1 as its 10 high-order bits and the 10 low-order bits of
|
||||||
|
;;; W2 as its 10 low-order bits.
|
||||||
|
|
||||||
|
|
||||||
(module (string->utf16)
|
(module (string->utf16)
|
||||||
(define ($string->utf16 str endianness)
|
(define ($string->utf16 str endianness)
|
||||||
|
@ -341,6 +363,100 @@
|
||||||
(error 'string->utf16 "invalid endianness" endianness))
|
(error 'string->utf16 "invalid endianness" endianness))
|
||||||
($string->utf16 str endianness)])))
|
($string->utf16 str endianness)])))
|
||||||
|
|
||||||
|
(module (utf16->string)
|
||||||
|
(define who 'utf16->string)
|
||||||
|
(define (count-size bv endianness i len n)
|
||||||
|
(cond
|
||||||
|
[(fx= i len)
|
||||||
|
(if (fx= len (bytevector-length bv))
|
||||||
|
n
|
||||||
|
(+ n 1))]
|
||||||
|
[else
|
||||||
|
(let ([w1 (bytevector-u16-ref bv i endianness)])
|
||||||
|
(cond
|
||||||
|
[(or (fx< w1 #xD800) (fx> w1 #xDFFF))
|
||||||
|
(count-size bv endianness (+ i 2) len (+ n 1))]
|
||||||
|
[(not (fx<= #xD800 w1 #xDFFF)) ;;; error sequence
|
||||||
|
(count-size bv endianness (+ i 2) len (+ n 1))]
|
||||||
|
[(<= (+ i 4) (bytevector-length bv))
|
||||||
|
(let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)])
|
||||||
|
(cond
|
||||||
|
[(not (<= #xDC00 w2 #xDFFF))
|
||||||
|
;;; do we skip w2 also?
|
||||||
|
;;; I won't. Just w1 is an error
|
||||||
|
(count-size bv endianness (+ i 2) len (+ n 1))]
|
||||||
|
[else
|
||||||
|
;;; 4-byte sequence is ok
|
||||||
|
(count-size bv endianness (+ i 4) len (+ n 1))]))]
|
||||||
|
[else
|
||||||
|
;;; error again
|
||||||
|
(count-size bv endianness (+ i 2) len (+ n 1))]))]))
|
||||||
|
(define (fill bv endianness str i len n)
|
||||||
|
(cond
|
||||||
|
[(fx= i len)
|
||||||
|
(unless (fx= len (bytevector-length bv))
|
||||||
|
(string-set! str n #\xFFFD))
|
||||||
|
str]
|
||||||
|
[else
|
||||||
|
(let ([w1 (bytevector-u16-ref bv i endianness)])
|
||||||
|
(cond
|
||||||
|
[(or (fx< w1 #xD800) (fx> w1 #xDFFF))
|
||||||
|
(string-set! str n (integer->char w1))
|
||||||
|
(fill bv endianness str (+ i 2) len (+ n 1))]
|
||||||
|
[(not (fx<= #xD800 w1 #xDFFF)) ;;; error sequence
|
||||||
|
(string-set! str n #\xFFFD)
|
||||||
|
(fill bv endianness str (+ i 2) len (+ n 1))]
|
||||||
|
[(<= (+ i 4) (bytevector-length bv))
|
||||||
|
(let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)])
|
||||||
|
(cond
|
||||||
|
[(not (<= #xDC00 w2 #xDFFF))
|
||||||
|
;;; do we skip w2 also?
|
||||||
|
;;; I won't. Just w1 is an error
|
||||||
|
(string-set! str n #\xFFFD)
|
||||||
|
(fill bv endianness str (+ i 2) len (+ n 1))]
|
||||||
|
[else
|
||||||
|
(string-set! str n
|
||||||
|
(integer->char
|
||||||
|
(fxlogor (fxsll (fxlogand w1 #x3FF) 10)
|
||||||
|
(fxlogand w2 #x3FF))))
|
||||||
|
(count-size bv endianness (+ i 4) len (+ n 1))]))]
|
||||||
|
[else
|
||||||
|
;;; error again
|
||||||
|
(string-set! str n #\xFFFD)
|
||||||
|
(count-size bv endianness str (+ i 2) len (+ n 1))]))]))
|
||||||
|
(define (decode bv endianness start)
|
||||||
|
(let ([len (fxand (bytevector-length bv) -2)])
|
||||||
|
(let ([n (count-size bv endianness start len 0)])
|
||||||
|
(let ([str (make-string n)])
|
||||||
|
(fill bv endianness str start len 0)))))
|
||||||
|
(define ($utf16->string bv endianness em?)
|
||||||
|
(define (bom-present bv)
|
||||||
|
(and (fx>= (bytevector-length bv) 2)
|
||||||
|
(let ([n (bytevector-u16-ref bv 0 'big)])
|
||||||
|
(cond
|
||||||
|
[(fx= n #xFEFF) 'big]
|
||||||
|
[(fx= n #xFFFE) 'little]
|
||||||
|
[else #f]))))
|
||||||
|
(unless (bytevector? bv)
|
||||||
|
(error who "not a bytevector" bv))
|
||||||
|
(unless (memv endianness '(big little))
|
||||||
|
(error who "invalid endianness" endianness))
|
||||||
|
(cond
|
||||||
|
[em? (decode bv endianness 0)]
|
||||||
|
[(bom-present bv) =>
|
||||||
|
(lambda (endianness)
|
||||||
|
(decode bv endianness 2))]
|
||||||
|
[else
|
||||||
|
(decode bv endianness 0)]))
|
||||||
|
(define utf16->string
|
||||||
|
(case-lambda
|
||||||
|
[(bv endianness)
|
||||||
|
($utf16->string bv endianness #f)]
|
||||||
|
[(bv endianness em?)
|
||||||
|
($utf16->string bv endianness em?)])))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
(module (string->utf32)
|
(module (string->utf32)
|
||||||
(define who 'string->utf32)
|
(define who 'string->utf32)
|
||||||
(define (vfill str bv i len endianness)
|
(define (vfill str bv i len endianness)
|
||||||
|
@ -369,5 +485,48 @@
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
(module (utf32->string)
|
||||||
|
(define who 'utf32->string)
|
||||||
|
(define (fill bv endianness str i j n)
|
||||||
|
(cond
|
||||||
|
[(fx= i j)
|
||||||
|
(unless (fx= n (string-length str))
|
||||||
|
(string-set! str n #\xFFFD))
|
||||||
|
str]
|
||||||
|
[else
|
||||||
|
(string-set! str n
|
||||||
|
(let ([w (bytevector-u32-ref bv i endianness)])
|
||||||
|
(integer->char w)))
|
||||||
|
(fill bv endianness str (fx+ i 4) j (fx+ n 1))]))
|
||||||
|
(define (decode bv endianness start)
|
||||||
|
(let ([bvlen (bytevector-length bv)])
|
||||||
|
(let ([strlen (fxsra (fx+ (fx- bvlen start) 3) 2)])
|
||||||
|
(fill bv endianness (make-string strlen) start
|
||||||
|
(fxand bvlen -2) 0))))
|
||||||
|
(define ($utf32->string bv endianness em?)
|
||||||
|
(define (bom-present bv)
|
||||||
|
(and (fx>= (bytevector-length bv) 4)
|
||||||
|
(let ([n (bytevector-u16-ref bv 0 'big)])
|
||||||
|
(cond
|
||||||
|
[(fx= n #x0000FEFF) 'big]
|
||||||
|
[(fx= n #xFFFE0000) 'little]
|
||||||
|
[else #f]))))
|
||||||
|
(unless (bytevector? bv)
|
||||||
|
(error who "not a bytevector" bv))
|
||||||
|
(unless (memv endianness '(big little))
|
||||||
|
(error who "invalid endianness" endianness))
|
||||||
|
(cond
|
||||||
|
[em? (decode bv endianness 0)]
|
||||||
|
[(bom-present bv) =>
|
||||||
|
(lambda (endianness)
|
||||||
|
(decode bv endianness 4))]
|
||||||
|
[else
|
||||||
|
(decode bv endianness 0)]))
|
||||||
|
(define utf32->string
|
||||||
|
(case-lambda
|
||||||
|
[(bv endianness)
|
||||||
|
($utf32->string bv endianness #f)]
|
||||||
|
[(bv endianness em?)
|
||||||
|
($utf32->string bv endianness em?)])))
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
1176
|
1177
|
||||||
|
|
Loading…
Reference in New Issue