From 4efdebb3ac75eac851e3cbe746143b229dc30aa7 Mon Sep 17 00:00:00 2001 From: Abdulaziz Ghuloum Date: Mon, 3 Dec 2007 03:00:01 -0500 Subject: [PATCH] string<->utf in progress --- scheme/ikarus.unicode-conversion.ss | 161 +++++++++++++++++++++++++++- scheme/last-revision | 2 +- 2 files changed, 161 insertions(+), 2 deletions(-) diff --git a/scheme/ikarus.unicode-conversion.ss b/scheme/ikarus.unicode-conversion.ss index 49cb2b4..92c0944 100644 --- a/scheme/ikarus.unicode-conversion.ss +++ b/scheme/ikarus.unicode-conversion.ss @@ -16,7 +16,8 @@ (library (ikarus transcoders) (export string->utf8 utf8->string string->utf16 string->utf32) - (import (except (ikarus) string->utf8 utf8->string) + (import (except (ikarus) string->utf8 utf8->string string->utf16 + utf16->string string->utf32 utf32->string) (ikarus system $strings) (ikarus system $bytevectors) (ikarus system $fx) @@ -292,6 +293,27 @@ ;;; U' = yyyyyyyyyyxxxxxxxxxx ;;; W1 = 110110yyyyyyyyyy ;;; W2 = 110111xxxxxxxxxx +;;; +;;; Decoding of a single character from UTF-16 to an ISO 10646 character +;;; value proceeds as follows. Let W1 be the next 16-bit integer in the +;;; sequence of integers representing the text. Let W2 be the (eventual) +;;; next integer following W1. +;;; +;;; 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value +;;; of W1. Terminate. +;;; +;;; 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence +;;; is in error and no valid character can be obtained using W1. +;;; Terminate. +;;; +;;; 3) If there is no W2 (that is, the sequence ends with W1), or if W2 +;;; is not between 0xDC00 and 0xDFFF, the sequence is in error. +;;; Terminate. +;;; +;;; 4) Construct a 20-bit unsigned integer U', taking the 10 low-order +;;; bits of W1 as its 10 high-order bits and the 10 low-order bits of +;;; W2 as its 10 low-order bits. + (module (string->utf16) (define ($string->utf16 str endianness) @@ -341,6 +363,100 @@ (error 'string->utf16 "invalid endianness" endianness)) ($string->utf16 str endianness)]))) + (module (utf16->string) + (define who 'utf16->string) + (define (count-size bv endianness i len n) + (cond + [(fx= i len) + (if (fx= len (bytevector-length bv)) + n + (+ n 1))] + [else + (let ([w1 (bytevector-u16-ref bv i endianness)]) + (cond + [(or (fx< w1 #xD800) (fx> w1 #xDFFF)) + (count-size bv endianness (+ i 2) len (+ n 1))] + [(not (fx<= #xD800 w1 #xDFFF)) ;;; error sequence + (count-size bv endianness (+ i 2) len (+ n 1))] + [(<= (+ i 4) (bytevector-length bv)) + (let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)]) + (cond + [(not (<= #xDC00 w2 #xDFFF)) + ;;; do we skip w2 also? + ;;; I won't. Just w1 is an error + (count-size bv endianness (+ i 2) len (+ n 1))] + [else + ;;; 4-byte sequence is ok + (count-size bv endianness (+ i 4) len (+ n 1))]))] + [else + ;;; error again + (count-size bv endianness (+ i 2) len (+ n 1))]))])) + (define (fill bv endianness str i len n) + (cond + [(fx= i len) + (unless (fx= len (bytevector-length bv)) + (string-set! str n #\xFFFD)) + str] + [else + (let ([w1 (bytevector-u16-ref bv i endianness)]) + (cond + [(or (fx< w1 #xD800) (fx> w1 #xDFFF)) + (string-set! str n (integer->char w1)) + (fill bv endianness str (+ i 2) len (+ n 1))] + [(not (fx<= #xD800 w1 #xDFFF)) ;;; error sequence + (string-set! str n #\xFFFD) + (fill bv endianness str (+ i 2) len (+ n 1))] + [(<= (+ i 4) (bytevector-length bv)) + (let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)]) + (cond + [(not (<= #xDC00 w2 #xDFFF)) + ;;; do we skip w2 also? + ;;; I won't. Just w1 is an error + (string-set! str n #\xFFFD) + (fill bv endianness str (+ i 2) len (+ n 1))] + [else + (string-set! str n + (integer->char + (fxlogor (fxsll (fxlogand w1 #x3FF) 10) + (fxlogand w2 #x3FF)))) + (count-size bv endianness (+ i 4) len (+ n 1))]))] + [else + ;;; error again + (string-set! str n #\xFFFD) + (count-size bv endianness str (+ i 2) len (+ n 1))]))])) + (define (decode bv endianness start) + (let ([len (fxand (bytevector-length bv) -2)]) + (let ([n (count-size bv endianness start len 0)]) + (let ([str (make-string n)]) + (fill bv endianness str start len 0))))) + (define ($utf16->string bv endianness em?) + (define (bom-present bv) + (and (fx>= (bytevector-length bv) 2) + (let ([n (bytevector-u16-ref bv 0 'big)]) + (cond + [(fx= n #xFEFF) 'big] + [(fx= n #xFFFE) 'little] + [else #f])))) + (unless (bytevector? bv) + (error who "not a bytevector" bv)) + (unless (memv endianness '(big little)) + (error who "invalid endianness" endianness)) + (cond + [em? (decode bv endianness 0)] + [(bom-present bv) => + (lambda (endianness) + (decode bv endianness 2))] + [else + (decode bv endianness 0)])) + (define utf16->string + (case-lambda + [(bv endianness) + ($utf16->string bv endianness #f)] + [(bv endianness em?) + ($utf16->string bv endianness em?)]))) + + + (module (string->utf32) (define who 'string->utf32) (define (vfill str bv i len endianness) @@ -369,5 +485,48 @@ + (module (utf32->string) + (define who 'utf32->string) + (define (fill bv endianness str i j n) + (cond + [(fx= i j) + (unless (fx= n (string-length str)) + (string-set! str n #\xFFFD)) + str] + [else + (string-set! str n + (let ([w (bytevector-u32-ref bv i endianness)]) + (integer->char w))) + (fill bv endianness str (fx+ i 4) j (fx+ n 1))])) + (define (decode bv endianness start) + (let ([bvlen (bytevector-length bv)]) + (let ([strlen (fxsra (fx+ (fx- bvlen start) 3) 2)]) + (fill bv endianness (make-string strlen) start + (fxand bvlen -2) 0)))) + (define ($utf32->string bv endianness em?) + (define (bom-present bv) + (and (fx>= (bytevector-length bv) 4) + (let ([n (bytevector-u16-ref bv 0 'big)]) + (cond + [(fx= n #x0000FEFF) 'big] + [(fx= n #xFFFE0000) 'little] + [else #f])))) + (unless (bytevector? bv) + (error who "not a bytevector" bv)) + (unless (memv endianness '(big little)) + (error who "invalid endianness" endianness)) + (cond + [em? (decode bv endianness 0)] + [(bom-present bv) => + (lambda (endianness) + (decode bv endianness 4))] + [else + (decode bv endianness 0)])) + (define utf32->string + (case-lambda + [(bv endianness) + ($utf32->string bv endianness #f)] + [(bv endianness em?) + ($utf32->string bv endianness em?)]))) ) diff --git a/scheme/last-revision b/scheme/last-revision index 5ddd971..bacbcf4 100644 --- a/scheme/last-revision +++ b/scheme/last-revision @@ -1 +1 @@ -1176 +1177