From 0da61d51cb79a787790168d3d0509488843185ab Mon Sep 17 00:00:00 2001 From: Abdulaziz Ghuloum Date: Sun, 19 Oct 2008 18:43:42 -0400 Subject: [PATCH] - input ports can now handle utf-16 codecs. --- scheme/ikarus.io.ss | 189 ++++++++++++++++++++-------- scheme/ikarus.unicode-conversion.ss | 12 +- scheme/last-revision | 2 +- scheme/tests/io.ss | 148 ++++++++++++++++++++-- 4 files changed, 287 insertions(+), 64 deletions(-) diff --git a/scheme/ikarus.io.ss b/scheme/ikarus.io.ss index de42964..378a9e2 100644 --- a/scheme/ikarus.io.ss +++ b/scheme/ikarus.io.ss @@ -206,6 +206,8 @@ (define fast-char-text-tag #b00000000010000) (define fast-u7-text-tag #b00000000100000) (define fast-u8-text-tag #b00000001100000) + (define fast-u16be-text-tag #b00000010000000) + (define fast-u16le-text-tag #b00000100000000) (define r6rs-mode-tag #b01000000000000) (define closed-port-tag #b10000000000000) @@ -219,13 +221,15 @@ (define fast-get-char-tag #b00000000010101) (define fast-get-utf8-tag #b00000000100101) (define fast-get-latin-tag #b00000001100101) + (define fast-get-utf16be-tag #b00000010000101) + (define fast-get-utf16le-tag #b00000100000101) (define fast-put-byte-tag #b00000000001010) (define fast-put-char-tag #b00000000010110) (define fast-put-utf8-tag #b00000000100110) (define fast-put-latin-tag #b00000001100110) - (define fast-attrs-mask #xFFF) + (define fast-attrs-mask #b111111111111) (define-syntax $port-fast-attrs (identifier-syntax (lambda (x) @@ -750,11 +754,6 @@ (die 'close-output-port "not an output port" p)) ($close-port p)) - ;(define-rrr port-has-port-position?) - ;(define-rrr port-position) - ;(define-rrr port-has-set-port-position!?) - ;(define-rrr set-port-position!) - (define (refill-bv-buffer p who) (when ($port-closed? p) (die who "port is closed" p)) (let ([bv ($port-buffer p)] [i ($port-index p)] [j ($port-size p)]) @@ -1019,14 +1018,27 @@ (die who "not a textual port" p)) (case (transcoder-codec tr) [(utf-8-codec) - ;;; ($set-port-attrs! p (fxior textual-input-port-bits fast-u7-text-tag)) (eof-object? (advance-bom p who '(#xEF #xBB #xBF)))] + [(utf-16-codec) + (let ([be? (advance-bom p who '(#xFE #xFF))]) + (case be? + [(#t) + ($set-port-attrs! p + (fxior textual-input-port-bits fast-u16be-text-tag)) + #f] + [(#f) + (let ([le? (advance-bom p who '(#xFF #xFE))]) + (case le? + [(#t #f) ;;; little by default + ($set-port-attrs! p + (fxior textual-input-port-bits fast-u16le-text-tag)) + #f] + [else #t]))] + [else #t]))] [else - (die 'slow-get-char - "BUG: codec not handled" - (transcoder-codec tr))]))) + (die who "BUG: codec not handled" (transcoder-codec tr))]))) ;;; (define (lookahead-char-char-mode p who) (let ([str ($port-buffer p)] @@ -1073,6 +1085,8 @@ (bytevector-u8-ref ($port-buffer p) i))] [else (get-char-latin-mode p who 0)]))] + [(eq? m fast-get-utf16le-tag) (peek-utf16 p who 'little)] + [(eq? m fast-get-utf16be-tag) (peek-utf16 p who 'big)] [else (if (speedup-input-port p who) (eof-object) @@ -1094,47 +1108,120 @@ [else ($set-port-index! p 1) (string-ref str 0)])))) + (define (peek-utf16 p who endianness) + (define integer->char/invalid + (lambda (n) + (cond + [(fx<= n #xD7FF) (integer->char n)] + [(fx< n #xE000) #\xFFFD] + [(fx<= n #x10FFFF) (integer->char n)] + [else #\xFFFD]))) + (let ([i ($port-index p)]) + (cond + [(fx<= (fx+ i 2) ($port-size p)) + (let ([w1 (bytevector-u16-ref ($port-buffer p) i endianness)]) + (cond + [(or (fx< w1 #xD800) (fx> w1 #xDFFF)) + (integer->char/invalid w1)] + [(not (and (fx<= #xD800 w1) (fx<= w1 #xDBFF))) + #\xFFFD] + [(fx<= (+ i 4) ($port-size p)) + (let ([w2 (bytevector-u16-ref + ($port-buffer p) (+ i 2) endianness)]) + (cond + [(not (and (fx<= #xDC00 w2) (fx<= w2 #xDFFF))) + #\xFFFD] + [else + (integer->char/invalid + (fx+ #x10000 + (fxlogor + (fxsll (fxand w1 #x3FF) 10) + (fxand w2 #x3FF))))]))] + [else + (let ([bytes (refill-bv-buffer p who)]) + (cond + [(fx= bytes 0) + #\xFFFD] + [else + (peek-utf16 p who endianness)]))]))] + [(fx< i ($port-size p)) + (let ([bytes (refill-bv-buffer p who)]) + (cond + [(fx= bytes 0) + #\xFFFD] + [else (peek-utf16 p who endianness)]))] + [else + (let ([bytes (refill-bv-buffer p who)]) + (if (fx= bytes 0) + (eof-object) + (peek-utf16 p who endianness)))]))) + (define (get-utf16 p who endianness) + (define (invalid p who endianness n) + (case (transcoder-error-handling-mode (port-transcoder p)) + [(ignore) (do-get-char p who endianness)] + [(replace) #\xFFFD] + [(raise) + (raise (make-i/o-decoding-error p n))] + [else (die who "BUG: invalid error handling mode" p)])) + (define (integer->char/invalid p who endianness n) + (cond + [(fx<= n #xD7FF) (integer->char n)] + [(fx< n #xE000) (invalid p who endianness n)] + [(fx<= n #x10FFFF) (integer->char n)] + [else (invalid p who endianness n)])) + (let ([i ($port-index p)]) + (cond + [(fx<= (fx+ i 2) ($port-size p)) + (let ([w1 (bytevector-u16-ref ($port-buffer p) i endianness)]) + (cond + [(or (fx< w1 #xD800) (fx> w1 #xDFFF)) + ($set-port-index! p (fx+ i 2)) + (integer->char/invalid p who endianness w1)] + [(not (and (fx<= #xD800 w1) (fx<= w1 #xDBFF))) + ($set-port-index! p (fx+ i 2)) + (invalid p who endianness w1)] + [(fx<= (+ i 4) ($port-size p)) + (let ([w2 (bytevector-u16-ref + ($port-buffer p) (+ i 2) endianness)]) + (cond + [(not (and (fx<= #xDC00 w2) (fx<= w2 #xDFFF))) + ($set-port-index! p (fx+ i 2)) + (invalid p who endianness w1)] + [else + ($set-port-index! p (fx+ i 4)) + (integer->char/invalid p who endianness + (fx+ #x10000 + (fxlogor + (fxsll (fxand w1 #x3FF) 10) + (fxand w2 #x3FF))))]))] + [else + (let ([bytes (refill-bv-buffer p who)]) + (cond + [(fx= bytes 0) + ($set-port-index! p ($port-size p)) + (invalid p who endianness w1)] + [else + (get-utf16 p who endianness)]))]))] + [(fx< i ($port-size p)) + (let ([bytes (refill-bv-buffer p who)]) + (cond + [(fx= bytes 0) + ($set-port-index! p ($port-size p)) + (invalid p who endianness + (bytevector-u8-ref ($port-buffer p) ($port-index p)))] + [else (get-utf16 p who endianness)]))] + [else + (let ([bytes (refill-bv-buffer p who)]) + (if (fx= bytes 0) + (eof-object) + (get-utf16 p who endianness)))]))) + (define (get-char p) + (do-get-char p 'get-char)) (define read-char (case-lambda - [(p) - (define who 'read-char) - (let ([m ($port-fast-attrs p)]) - (cond - [(eq? m fast-get-utf8-tag) - (let ([i ($port-index p)]) - (cond - [(fx< i ($port-size p)) - (let ([b (bytevector-u8-ref ($port-buffer p) i)]) - (cond - [(fx< b 128) - ($set-port-index! p (fx+ i 1)) - (integer->char b)] - [else (get-char-utf8-mode p who)]))] - [else - (get-char-utf8-mode p who)]))] - [(eq? m fast-get-char-tag) - (let ([i ($port-index p)]) - (cond - [(fx< i ($port-size p)) - ($set-port-index! p (fx+ i 1)) - (string-ref ($port-buffer p) i)] - [else (get-char-char-mode p who)]))] - [(eq? m fast-get-latin-tag) - (let ([i ($port-index p)]) - (cond - [(fx< i ($port-size p)) - ($set-port-index! p (fx+ i 1)) - (integer->char - (bytevector-u8-ref ($port-buffer p) i))] - [else - (get-char-latin-mode p who 1)]))] - [else - (if (speedup-input-port p who) - (eof-object) - (get-char p))]))] - [() (read-char (current-input-port))])) - (define (get-char p) - (define who 'get-char) + [(p) (do-get-char p 'read-char)] + [() (do-get-char (current-input-port) 'read-char)])) + (define (do-get-char p who) (let ([m ($port-fast-attrs p)]) (cond [(eq? m fast-get-utf8-tag) @@ -1165,10 +1252,12 @@ (bytevector-u8-ref ($port-buffer p) i))] [else (get-char-latin-mode p who 1)]))] + [(eq? m fast-get-utf16le-tag) (get-utf16 p who 'little)] + [(eq? m fast-get-utf16be-tag) (get-utf16 p who 'big)] [else (if (speedup-input-port p who) (eof-object) - (get-char p))])))) + (do-get-char p who))])))) ;;; ---------------------------------------------------------- (define (assert-binary-input-port p who) diff --git a/scheme/ikarus.unicode-conversion.ss b/scheme/ikarus.unicode-conversion.ss index 162481f..ec04674 100644 --- a/scheme/ikarus.unicode-conversion.ss +++ b/scheme/ikarus.unicode-conversion.ss @@ -355,7 +355,7 @@ ;;; of W1. Terminate. ;;; ;;; 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence -;;; is in die and no valid character can be obtained using W1. +;;; is in error and no valid character can be obtained using W1. ;;; Terminate. ;;; ;;; 3) If there is no W2 (that is, the sequence ends with W1), or if W2 @@ -410,7 +410,7 @@ [(str) (unless (string? str) (die 'string->utf16 "not a string" str)) - ($string->utf16 str 'big)] + ($string->utf16 str (native-endianness))] [(str endianness) (unless (string? str) (die 'string->utf16 "not a string" str)) @@ -458,7 +458,7 @@ [(or (fx< w1 #xD800) (fx> w1 #xDFFF)) (string-set! str n (integer->char/invalid w1)) (fill bv endianness str (+ i 2) len (+ n 1))] - [(not (fx<= #xD800 w1 #xDBFF)) ;;; die sequence + [(not (fx<= #xD800 w1 #xDBFF)) ;;; error sequence (string-set! str n #\xFFFD) (fill bv endianness str (+ i 2) len (+ n 1))] [(<= (+ i 4) (bytevector-length bv)) @@ -466,7 +466,7 @@ (cond [(not (<= #xDC00 w2 #xDFFF)) ;;; do we skip w2 also? - ;;; I won't. Just w1 is an die + ;;; I won't. Just w1 is an error (string-set! str n #\xFFFD) (fill bv endianness str (+ i 2) len (+ n 1))] [else @@ -477,7 +477,7 @@ (fxlogand w2 #x3FF))))) (fill bv endianness str (+ i 4) len (+ n 1))]))] [else - ;;; die again + ;;; error again (string-set! str n #\xFFFD) (fill bv endianness str (+ i 2) len (+ n 1))]))])) (define (decode bv endianness start) @@ -531,7 +531,7 @@ [(str) (unless (string? str) (die who "not a string" str)) - ($string->utf32 str 'big)] + ($string->utf32 str (native-endianness))] [(str endianness) (unless (string? str) (die who "not a string" str)) diff --git a/scheme/last-revision b/scheme/last-revision index d0e5260..755d892 100644 --- a/scheme/last-revision +++ b/scheme/last-revision @@ -1 +1 @@ -1636 +1637 diff --git a/scheme/tests/io.ss b/scheme/tests/io.ss index 129b374..3ac8ab0 100755 --- a/scheme/tests/io.ss +++ b/scheme/tests/io.ss @@ -202,6 +202,16 @@ (make-transcoder (utf-8-codec) 'none 'raise)) 128))) +(define (make-utf8-bytevector-range1) + (u8-list->bytevector + (let f ([i 0] [j #x7F]) + (cond + [(> i j) '()] + [else + (cons* i (f (+ i 1) j))])))) + + + (define (make-utf8-bytevector-range2) (u8-list->bytevector (let f ([i #x80] [j #x7FF]) @@ -236,6 +246,15 @@ (fxior #b10000000 (fxand i #b111111)) (f (+ i 1) j))])))) +(define (make-utf8-string-range1) + (list->string + (let f ([i 0] [j #x7F]) + (cond + [(> i j) '()] + [else + (cons (integer->char i) + (f (+ i 1) j))])))) + (define (make-utf8-string-range2) (list->string (let f ([i #x80] [j #x7FF]) @@ -276,7 +295,16 @@ [(char=? x (string-ref str i)) (f (+ i 1))] [else - (error #f "mismatch" x (string-ref str i) i)])))) + (error #f + (format + "mismatch at index ~a, got char ~a (code #x~x), \ + expected char ~a (code #x~x)" + i + x + (char->integer x) + (string-ref str i) + (char->integer (string-ref str i))))])))) + (define (test-port-string-peeking-output p str) (let f ([i 0]) @@ -294,7 +322,66 @@ [else (error #f "mismatch" x (string-ref str i) i)])))) + +(define (invalid-code? n) (not (valid-code? n))) +(define (valid-code? n) + (cond + [(< n 0) #f] + [(<= n #xD7FF) #t] + [(< n #xE000) #f] + [(<= n #x10FFFF) #t] + [else (error 'valid-code? "out of range" n)])) + + + +(define (make-u16le-bv min max) + (u8-list->bytevector + (let f ([i min]) + (cond + [(> i max) '()] + [(invalid-code? i) (f (+ i 1))] + [(< i #x10000) + (cons* + (fxand i #xFF) + (fxsra i 8) + (f (+ i 1)))] + [else + (let ([ii (fx- i #x10000)]) + (let ([w1 (fxior #xD800 (fxand #x3FF (fxsra ii 10)))] + [w2 (fxior #xDC00 (fxand #x3FF ii))]) + (cons* + (fxand w1 #xFF) + (fxsra w1 8) + (fxand w2 #xFF) + (fxsra w2 8) + (f (+ i 1)))))])))) + +(define (make-string-slice min max) + (list->string + (let f ([i min]) + (cond + [(> i max) '()] + [(invalid-code? i) (f (+ i 1))] + [else (cons (integer->char i) (f (+ i 1)))])))) + + +(define (make-u16le-range1) + (make-u16le-bv 0 #x7FFF)) +(define (make-u16le-range2) + (make-u16le-bv #x8000 #x10FFFF)) +(define (make-utf16-string-range1) + (make-string-slice 0 #x7FFF)) +(define (make-utf16-string-range2) + (make-string-slice #x8000 #x10FFFF)) + (define (run-exhaustive-tests) + + (test "utf8 range 1" + (test-port-string-output + (open-bytevector-input-port (make-utf8-bytevector-range1) + (make-transcoder (utf-8-codec) 'none 'raise)) + (make-utf8-string-range1))) + (test "utf8 range 2" (test-port-string-output (open-bytevector-input-port (make-utf8-bytevector-range2) @@ -312,7 +399,28 @@ (open-bytevector-input-port (make-utf8-bytevector-range4) (make-transcoder (utf-8-codec) 'none 'raise)) (make-utf8-string-range4))) - + + + (test "utf16 range 1" + (test-port-string-output + (open-bytevector-input-port (make-u16le-range1) + (make-transcoder (utf-16-codec) 'none 'raise)) + (make-utf16-string-range1))) + + + (test "utf16 range 2" + (test-port-string-output + (open-bytevector-input-port (make-u16le-range2) + (make-transcoder (utf-16-codec) 'none 'raise)) + (make-utf16-string-range2))) + + + (test "utf8 peek range 1" + (test-port-string-peeking-output + (open-bytevector-input-port (make-utf8-bytevector-range1) + (make-transcoder (utf-8-codec) 'none 'raise)) + (make-utf8-string-range1))) + (test "utf8 peek range 2" (test-port-string-peeking-output (open-bytevector-input-port (make-utf8-bytevector-range2) @@ -330,7 +438,24 @@ (open-bytevector-input-port (make-utf8-bytevector-range4) (make-transcoder (utf-8-codec) 'none 'raise)) (make-utf8-string-range4))) + + (test "utf16 peek range 1" + (test-port-string-peeking-output + (open-bytevector-input-port (make-u16le-range1) + (make-transcoder (utf-16-codec) 'none 'raise)) + (make-utf16-string-range1))) + + (test "utf16 peek range 2" + (test-port-string-peeking-output + (open-bytevector-input-port (make-u16le-range2) + (make-transcoder (utf-16-codec) 'none 'raise)) + (make-utf16-string-range2))) + (test "utf8 range 1 string" + (test-port-string-output + (open-string-input-port (make-utf8-string-range1)) + (make-utf8-string-range1))) + (test "utf8 range 2 string" (test-port-string-output (open-string-input-port (make-utf8-string-range2)) @@ -509,18 +634,27 @@ 1])) #f #f #f) transcoder))) - (define (test name codec conv) + (define (test name codec s->bv bv->s) (printf "testing partial reads for ~s codec ... " name) (let ([s (make-test-string)]) + (assert (string=? s (bv->s (s->bv s)))) (let ([r (call-with-port - (make-slow-input-port (conv s) + (make-slow-input-port (s->bv s) (make-transcoder codec (eol-style none) (error-handling-mode raise))) get-string-all)]) - (assert (string=? r s)))) + (unless (string=? r s) + (if (= (string-length r) (string-length s)) + (error #f "test failed") + (error #f "length mismatch" + (string-length s) (string-length r)))))) (printf "ok\n")) - ;(test 'utf16 (utf-16-codec) string->utf16) - (test 'utf8 (utf-8-codec) string->utf8)) + (test 'utf8 (utf-8-codec) + string->utf8 + utf8->string) + (test 'utf16 (utf-16-codec) + (lambda (x) (string->utf16 x 'little)) + (lambda (x) (utf16->string x 'little)))) (define-tests test-input-ports [eof-object?