- input ports can now handle utf-16 codecs.

This commit is contained in:
Abdulaziz Ghuloum 2008-10-19 18:43:42 -04:00
parent 8cd9d6ef16
commit 0da61d51cb
4 changed files with 287 additions and 64 deletions

View File

@ -206,6 +206,8 @@
(define fast-char-text-tag #b00000000010000) (define fast-char-text-tag #b00000000010000)
(define fast-u7-text-tag #b00000000100000) (define fast-u7-text-tag #b00000000100000)
(define fast-u8-text-tag #b00000001100000) (define fast-u8-text-tag #b00000001100000)
(define fast-u16be-text-tag #b00000010000000)
(define fast-u16le-text-tag #b00000100000000)
(define r6rs-mode-tag #b01000000000000) (define r6rs-mode-tag #b01000000000000)
(define closed-port-tag #b10000000000000) (define closed-port-tag #b10000000000000)
@ -219,13 +221,15 @@
(define fast-get-char-tag #b00000000010101) (define fast-get-char-tag #b00000000010101)
(define fast-get-utf8-tag #b00000000100101) (define fast-get-utf8-tag #b00000000100101)
(define fast-get-latin-tag #b00000001100101) (define fast-get-latin-tag #b00000001100101)
(define fast-get-utf16be-tag #b00000010000101)
(define fast-get-utf16le-tag #b00000100000101)
(define fast-put-byte-tag #b00000000001010) (define fast-put-byte-tag #b00000000001010)
(define fast-put-char-tag #b00000000010110) (define fast-put-char-tag #b00000000010110)
(define fast-put-utf8-tag #b00000000100110) (define fast-put-utf8-tag #b00000000100110)
(define fast-put-latin-tag #b00000001100110) (define fast-put-latin-tag #b00000001100110)
(define fast-attrs-mask #xFFF) (define fast-attrs-mask #b111111111111)
(define-syntax $port-fast-attrs (define-syntax $port-fast-attrs
(identifier-syntax (identifier-syntax
(lambda (x) (lambda (x)
@ -750,11 +754,6 @@
(die 'close-output-port "not an output port" p)) (die 'close-output-port "not an output port" p))
($close-port p)) ($close-port p))
;(define-rrr port-has-port-position?)
;(define-rrr port-position)
;(define-rrr port-has-set-port-position!?)
;(define-rrr set-port-position!)
(define (refill-bv-buffer p who) (define (refill-bv-buffer p who)
(when ($port-closed? p) (die who "port is closed" p)) (when ($port-closed? p) (die who "port is closed" p))
(let ([bv ($port-buffer p)] [i ($port-index p)] [j ($port-size p)]) (let ([bv ($port-buffer p)] [i ($port-index p)] [j ($port-size p)])
@ -1019,14 +1018,27 @@
(die who "not a textual port" p)) (die who "not a textual port" p))
(case (transcoder-codec tr) (case (transcoder-codec tr)
[(utf-8-codec) [(utf-8-codec)
;;;
($set-port-attrs! p ($set-port-attrs! p
(fxior textual-input-port-bits fast-u7-text-tag)) (fxior textual-input-port-bits fast-u7-text-tag))
(eof-object? (advance-bom p who '(#xEF #xBB #xBF)))] (eof-object? (advance-bom p who '(#xEF #xBB #xBF)))]
[(utf-16-codec)
(let ([be? (advance-bom p who '(#xFE #xFF))])
(case be?
[(#t)
($set-port-attrs! p
(fxior textual-input-port-bits fast-u16be-text-tag))
#f]
[(#f)
(let ([le? (advance-bom p who '(#xFF #xFE))])
(case le?
[(#t #f) ;;; little by default
($set-port-attrs! p
(fxior textual-input-port-bits fast-u16le-text-tag))
#f]
[else #t]))]
[else #t]))]
[else [else
(die 'slow-get-char (die who "BUG: codec not handled" (transcoder-codec tr))])))
"BUG: codec not handled"
(transcoder-codec tr))])))
;;; ;;;
(define (lookahead-char-char-mode p who) (define (lookahead-char-char-mode p who)
(let ([str ($port-buffer p)] (let ([str ($port-buffer p)]
@ -1073,6 +1085,8 @@
(bytevector-u8-ref ($port-buffer p) i))] (bytevector-u8-ref ($port-buffer p) i))]
[else [else
(get-char-latin-mode p who 0)]))] (get-char-latin-mode p who 0)]))]
[(eq? m fast-get-utf16le-tag) (peek-utf16 p who 'little)]
[(eq? m fast-get-utf16be-tag) (peek-utf16 p who 'big)]
[else [else
(if (speedup-input-port p who) (if (speedup-input-port p who)
(eof-object) (eof-object)
@ -1094,47 +1108,120 @@
[else [else
($set-port-index! p 1) ($set-port-index! p 1)
(string-ref str 0)])))) (string-ref str 0)]))))
(define (peek-utf16 p who endianness)
(define integer->char/invalid
(lambda (n)
(cond
[(fx<= n #xD7FF) (integer->char n)]
[(fx< n #xE000) #\xFFFD]
[(fx<= n #x10FFFF) (integer->char n)]
[else #\xFFFD])))
(let ([i ($port-index p)])
(cond
[(fx<= (fx+ i 2) ($port-size p))
(let ([w1 (bytevector-u16-ref ($port-buffer p) i endianness)])
(cond
[(or (fx< w1 #xD800) (fx> w1 #xDFFF))
(integer->char/invalid w1)]
[(not (and (fx<= #xD800 w1) (fx<= w1 #xDBFF)))
#\xFFFD]
[(fx<= (+ i 4) ($port-size p))
(let ([w2 (bytevector-u16-ref
($port-buffer p) (+ i 2) endianness)])
(cond
[(not (and (fx<= #xDC00 w2) (fx<= w2 #xDFFF)))
#\xFFFD]
[else
(integer->char/invalid
(fx+ #x10000
(fxlogor
(fxsll (fxand w1 #x3FF) 10)
(fxand w2 #x3FF))))]))]
[else
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0)
#\xFFFD]
[else
(peek-utf16 p who endianness)]))]))]
[(fx< i ($port-size p))
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0)
#\xFFFD]
[else (peek-utf16 p who endianness)]))]
[else
(let ([bytes (refill-bv-buffer p who)])
(if (fx= bytes 0)
(eof-object)
(peek-utf16 p who endianness)))])))
(define (get-utf16 p who endianness)
(define (invalid p who endianness n)
(case (transcoder-error-handling-mode (port-transcoder p))
[(ignore) (do-get-char p who endianness)]
[(replace) #\xFFFD]
[(raise)
(raise (make-i/o-decoding-error p n))]
[else (die who "BUG: invalid error handling mode" p)]))
(define (integer->char/invalid p who endianness n)
(cond
[(fx<= n #xD7FF) (integer->char n)]
[(fx< n #xE000) (invalid p who endianness n)]
[(fx<= n #x10FFFF) (integer->char n)]
[else (invalid p who endianness n)]))
(let ([i ($port-index p)])
(cond
[(fx<= (fx+ i 2) ($port-size p))
(let ([w1 (bytevector-u16-ref ($port-buffer p) i endianness)])
(cond
[(or (fx< w1 #xD800) (fx> w1 #xDFFF))
($set-port-index! p (fx+ i 2))
(integer->char/invalid p who endianness w1)]
[(not (and (fx<= #xD800 w1) (fx<= w1 #xDBFF)))
($set-port-index! p (fx+ i 2))
(invalid p who endianness w1)]
[(fx<= (+ i 4) ($port-size p))
(let ([w2 (bytevector-u16-ref
($port-buffer p) (+ i 2) endianness)])
(cond
[(not (and (fx<= #xDC00 w2) (fx<= w2 #xDFFF)))
($set-port-index! p (fx+ i 2))
(invalid p who endianness w1)]
[else
($set-port-index! p (fx+ i 4))
(integer->char/invalid p who endianness
(fx+ #x10000
(fxlogor
(fxsll (fxand w1 #x3FF) 10)
(fxand w2 #x3FF))))]))]
[else
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0)
($set-port-index! p ($port-size p))
(invalid p who endianness w1)]
[else
(get-utf16 p who endianness)]))]))]
[(fx< i ($port-size p))
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0)
($set-port-index! p ($port-size p))
(invalid p who endianness
(bytevector-u8-ref ($port-buffer p) ($port-index p)))]
[else (get-utf16 p who endianness)]))]
[else
(let ([bytes (refill-bv-buffer p who)])
(if (fx= bytes 0)
(eof-object)
(get-utf16 p who endianness)))])))
(define (get-char p)
(do-get-char p 'get-char))
(define read-char (define read-char
(case-lambda (case-lambda
[(p) [(p) (do-get-char p 'read-char)]
(define who 'read-char) [() (do-get-char (current-input-port) 'read-char)]))
(let ([m ($port-fast-attrs p)]) (define (do-get-char p who)
(cond
[(eq? m fast-get-utf8-tag)
(let ([i ($port-index p)])
(cond
[(fx< i ($port-size p))
(let ([b (bytevector-u8-ref ($port-buffer p) i)])
(cond
[(fx< b 128)
($set-port-index! p (fx+ i 1))
(integer->char b)]
[else (get-char-utf8-mode p who)]))]
[else
(get-char-utf8-mode p who)]))]
[(eq? m fast-get-char-tag)
(let ([i ($port-index p)])
(cond
[(fx< i ($port-size p))
($set-port-index! p (fx+ i 1))
(string-ref ($port-buffer p) i)]
[else (get-char-char-mode p who)]))]
[(eq? m fast-get-latin-tag)
(let ([i ($port-index p)])
(cond
[(fx< i ($port-size p))
($set-port-index! p (fx+ i 1))
(integer->char
(bytevector-u8-ref ($port-buffer p) i))]
[else
(get-char-latin-mode p who 1)]))]
[else
(if (speedup-input-port p who)
(eof-object)
(get-char p))]))]
[() (read-char (current-input-port))]))
(define (get-char p)
(define who 'get-char)
(let ([m ($port-fast-attrs p)]) (let ([m ($port-fast-attrs p)])
(cond (cond
[(eq? m fast-get-utf8-tag) [(eq? m fast-get-utf8-tag)
@ -1165,10 +1252,12 @@
(bytevector-u8-ref ($port-buffer p) i))] (bytevector-u8-ref ($port-buffer p) i))]
[else [else
(get-char-latin-mode p who 1)]))] (get-char-latin-mode p who 1)]))]
[(eq? m fast-get-utf16le-tag) (get-utf16 p who 'little)]
[(eq? m fast-get-utf16be-tag) (get-utf16 p who 'big)]
[else [else
(if (speedup-input-port p who) (if (speedup-input-port p who)
(eof-object) (eof-object)
(get-char p))])))) (do-get-char p who))]))))
;;; ---------------------------------------------------------- ;;; ----------------------------------------------------------
(define (assert-binary-input-port p who) (define (assert-binary-input-port p who)

View File

@ -355,7 +355,7 @@
;;; of W1. Terminate. ;;; of W1. Terminate.
;;; ;;;
;;; 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence ;;; 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
;;; is in die and no valid character can be obtained using W1. ;;; is in error and no valid character can be obtained using W1.
;;; Terminate. ;;; Terminate.
;;; ;;;
;;; 3) If there is no W2 (that is, the sequence ends with W1), or if W2 ;;; 3) If there is no W2 (that is, the sequence ends with W1), or if W2
@ -410,7 +410,7 @@
[(str) [(str)
(unless (string? str) (unless (string? str)
(die 'string->utf16 "not a string" str)) (die 'string->utf16 "not a string" str))
($string->utf16 str 'big)] ($string->utf16 str (native-endianness))]
[(str endianness) [(str endianness)
(unless (string? str) (unless (string? str)
(die 'string->utf16 "not a string" str)) (die 'string->utf16 "not a string" str))
@ -458,7 +458,7 @@
[(or (fx< w1 #xD800) (fx> w1 #xDFFF)) [(or (fx< w1 #xD800) (fx> w1 #xDFFF))
(string-set! str n (integer->char/invalid w1)) (string-set! str n (integer->char/invalid w1))
(fill bv endianness str (+ i 2) len (+ n 1))] (fill bv endianness str (+ i 2) len (+ n 1))]
[(not (fx<= #xD800 w1 #xDBFF)) ;;; die sequence [(not (fx<= #xD800 w1 #xDBFF)) ;;; error sequence
(string-set! str n #\xFFFD) (string-set! str n #\xFFFD)
(fill bv endianness str (+ i 2) len (+ n 1))] (fill bv endianness str (+ i 2) len (+ n 1))]
[(<= (+ i 4) (bytevector-length bv)) [(<= (+ i 4) (bytevector-length bv))
@ -466,7 +466,7 @@
(cond (cond
[(not (<= #xDC00 w2 #xDFFF)) [(not (<= #xDC00 w2 #xDFFF))
;;; do we skip w2 also? ;;; do we skip w2 also?
;;; I won't. Just w1 is an die ;;; I won't. Just w1 is an error
(string-set! str n #\xFFFD) (string-set! str n #\xFFFD)
(fill bv endianness str (+ i 2) len (+ n 1))] (fill bv endianness str (+ i 2) len (+ n 1))]
[else [else
@ -477,7 +477,7 @@
(fxlogand w2 #x3FF))))) (fxlogand w2 #x3FF)))))
(fill bv endianness str (+ i 4) len (+ n 1))]))] (fill bv endianness str (+ i 4) len (+ n 1))]))]
[else [else
;;; die again ;;; error again
(string-set! str n #\xFFFD) (string-set! str n #\xFFFD)
(fill bv endianness str (+ i 2) len (+ n 1))]))])) (fill bv endianness str (+ i 2) len (+ n 1))]))]))
(define (decode bv endianness start) (define (decode bv endianness start)
@ -531,7 +531,7 @@
[(str) [(str)
(unless (string? str) (unless (string? str)
(die who "not a string" str)) (die who "not a string" str))
($string->utf32 str 'big)] ($string->utf32 str (native-endianness))]
[(str endianness) [(str endianness)
(unless (string? str) (unless (string? str)
(die who "not a string" str)) (die who "not a string" str))

View File

@ -1 +1 @@
1636 1637

View File

@ -202,6 +202,16 @@
(make-transcoder (utf-8-codec) 'none 'raise)) (make-transcoder (utf-8-codec) 'none 'raise))
128))) 128)))
(define (make-utf8-bytevector-range1)
(u8-list->bytevector
(let f ([i 0] [j #x7F])
(cond
[(> i j) '()]
[else
(cons* i (f (+ i 1) j))]))))
(define (make-utf8-bytevector-range2) (define (make-utf8-bytevector-range2)
(u8-list->bytevector (u8-list->bytevector
(let f ([i #x80] [j #x7FF]) (let f ([i #x80] [j #x7FF])
@ -236,6 +246,15 @@
(fxior #b10000000 (fxand i #b111111)) (fxior #b10000000 (fxand i #b111111))
(f (+ i 1) j))])))) (f (+ i 1) j))]))))
(define (make-utf8-string-range1)
(list->string
(let f ([i 0] [j #x7F])
(cond
[(> i j) '()]
[else
(cons (integer->char i)
(f (+ i 1) j))]))))
(define (make-utf8-string-range2) (define (make-utf8-string-range2)
(list->string (list->string
(let f ([i #x80] [j #x7FF]) (let f ([i #x80] [j #x7FF])
@ -276,7 +295,16 @@
[(char=? x (string-ref str i)) [(char=? x (string-ref str i))
(f (+ i 1))] (f (+ i 1))]
[else [else
(error #f "mismatch" x (string-ref str i) i)])))) (error #f
(format
"mismatch at index ~a, got char ~a (code #x~x), \
expected char ~a (code #x~x)"
i
x
(char->integer x)
(string-ref str i)
(char->integer (string-ref str i))))]))))
(define (test-port-string-peeking-output p str) (define (test-port-string-peeking-output p str)
(let f ([i 0]) (let f ([i 0])
@ -294,7 +322,66 @@
[else [else
(error #f "mismatch" x (string-ref str i) i)])))) (error #f "mismatch" x (string-ref str i) i)]))))
(define (invalid-code? n) (not (valid-code? n)))
(define (valid-code? n)
(cond
[(< n 0) #f]
[(<= n #xD7FF) #t]
[(< n #xE000) #f]
[(<= n #x10FFFF) #t]
[else (error 'valid-code? "out of range" n)]))
(define (make-u16le-bv min max)
(u8-list->bytevector
(let f ([i min])
(cond
[(> i max) '()]
[(invalid-code? i) (f (+ i 1))]
[(< i #x10000)
(cons*
(fxand i #xFF)
(fxsra i 8)
(f (+ i 1)))]
[else
(let ([ii (fx- i #x10000)])
(let ([w1 (fxior #xD800 (fxand #x3FF (fxsra ii 10)))]
[w2 (fxior #xDC00 (fxand #x3FF ii))])
(cons*
(fxand w1 #xFF)
(fxsra w1 8)
(fxand w2 #xFF)
(fxsra w2 8)
(f (+ i 1)))))]))))
(define (make-string-slice min max)
(list->string
(let f ([i min])
(cond
[(> i max) '()]
[(invalid-code? i) (f (+ i 1))]
[else (cons (integer->char i) (f (+ i 1)))]))))
(define (make-u16le-range1)
(make-u16le-bv 0 #x7FFF))
(define (make-u16le-range2)
(make-u16le-bv #x8000 #x10FFFF))
(define (make-utf16-string-range1)
(make-string-slice 0 #x7FFF))
(define (make-utf16-string-range2)
(make-string-slice #x8000 #x10FFFF))
(define (run-exhaustive-tests) (define (run-exhaustive-tests)
(test "utf8 range 1"
(test-port-string-output
(open-bytevector-input-port (make-utf8-bytevector-range1)
(make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range1)))
(test "utf8 range 2" (test "utf8 range 2"
(test-port-string-output (test-port-string-output
(open-bytevector-input-port (make-utf8-bytevector-range2) (open-bytevector-input-port (make-utf8-bytevector-range2)
@ -312,7 +399,28 @@
(open-bytevector-input-port (make-utf8-bytevector-range4) (open-bytevector-input-port (make-utf8-bytevector-range4)
(make-transcoder (utf-8-codec) 'none 'raise)) (make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range4))) (make-utf8-string-range4)))
(test "utf16 range 1"
(test-port-string-output
(open-bytevector-input-port (make-u16le-range1)
(make-transcoder (utf-16-codec) 'none 'raise))
(make-utf16-string-range1)))
(test "utf16 range 2"
(test-port-string-output
(open-bytevector-input-port (make-u16le-range2)
(make-transcoder (utf-16-codec) 'none 'raise))
(make-utf16-string-range2)))
(test "utf8 peek range 1"
(test-port-string-peeking-output
(open-bytevector-input-port (make-utf8-bytevector-range1)
(make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range1)))
(test "utf8 peek range 2" (test "utf8 peek range 2"
(test-port-string-peeking-output (test-port-string-peeking-output
(open-bytevector-input-port (make-utf8-bytevector-range2) (open-bytevector-input-port (make-utf8-bytevector-range2)
@ -330,7 +438,24 @@
(open-bytevector-input-port (make-utf8-bytevector-range4) (open-bytevector-input-port (make-utf8-bytevector-range4)
(make-transcoder (utf-8-codec) 'none 'raise)) (make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range4))) (make-utf8-string-range4)))
(test "utf16 peek range 1"
(test-port-string-peeking-output
(open-bytevector-input-port (make-u16le-range1)
(make-transcoder (utf-16-codec) 'none 'raise))
(make-utf16-string-range1)))
(test "utf16 peek range 2"
(test-port-string-peeking-output
(open-bytevector-input-port (make-u16le-range2)
(make-transcoder (utf-16-codec) 'none 'raise))
(make-utf16-string-range2)))
(test "utf8 range 1 string"
(test-port-string-output
(open-string-input-port (make-utf8-string-range1))
(make-utf8-string-range1)))
(test "utf8 range 2 string" (test "utf8 range 2 string"
(test-port-string-output (test-port-string-output
(open-string-input-port (make-utf8-string-range2)) (open-string-input-port (make-utf8-string-range2))
@ -509,18 +634,27 @@
1])) 1]))
#f #f #f) #f #f #f)
transcoder))) transcoder)))
(define (test name codec conv) (define (test name codec s->bv bv->s)
(printf "testing partial reads for ~s codec ... " name) (printf "testing partial reads for ~s codec ... " name)
(let ([s (make-test-string)]) (let ([s (make-test-string)])
(assert (string=? s (bv->s (s->bv s))))
(let ([r (call-with-port (let ([r (call-with-port
(make-slow-input-port (conv s) (make-slow-input-port (s->bv s)
(make-transcoder codec (make-transcoder codec
(eol-style none) (error-handling-mode raise))) (eol-style none) (error-handling-mode raise)))
get-string-all)]) get-string-all)])
(assert (string=? r s)))) (unless (string=? r s)
(if (= (string-length r) (string-length s))
(error #f "test failed")
(error #f "length mismatch"
(string-length s) (string-length r))))))
(printf "ok\n")) (printf "ok\n"))
;(test 'utf16 (utf-16-codec) string->utf16) (test 'utf8 (utf-8-codec)
(test 'utf8 (utf-8-codec) string->utf8)) string->utf8
utf8->string)
(test 'utf16 (utf-16-codec)
(lambda (x) (string->utf16 x 'little))
(lambda (x) (utf16->string x 'little))))
(define-tests test-input-ports (define-tests test-input-ports
[eof-object? [eof-object?