- input ports can now handle utf-16 codecs.
This commit is contained in:
parent
8cd9d6ef16
commit
0da61d51cb
|
@ -206,6 +206,8 @@
|
|||
(define fast-char-text-tag #b00000000010000)
|
||||
(define fast-u7-text-tag #b00000000100000)
|
||||
(define fast-u8-text-tag #b00000001100000)
|
||||
(define fast-u16be-text-tag #b00000010000000)
|
||||
(define fast-u16le-text-tag #b00000100000000)
|
||||
(define r6rs-mode-tag #b01000000000000)
|
||||
(define closed-port-tag #b10000000000000)
|
||||
|
||||
|
@ -219,13 +221,15 @@
|
|||
(define fast-get-char-tag #b00000000010101)
|
||||
(define fast-get-utf8-tag #b00000000100101)
|
||||
(define fast-get-latin-tag #b00000001100101)
|
||||
(define fast-get-utf16be-tag #b00000010000101)
|
||||
(define fast-get-utf16le-tag #b00000100000101)
|
||||
|
||||
(define fast-put-byte-tag #b00000000001010)
|
||||
(define fast-put-char-tag #b00000000010110)
|
||||
(define fast-put-utf8-tag #b00000000100110)
|
||||
(define fast-put-latin-tag #b00000001100110)
|
||||
|
||||
(define fast-attrs-mask #xFFF)
|
||||
(define fast-attrs-mask #b111111111111)
|
||||
(define-syntax $port-fast-attrs
|
||||
(identifier-syntax
|
||||
(lambda (x)
|
||||
|
@ -750,11 +754,6 @@
|
|||
(die 'close-output-port "not an output port" p))
|
||||
($close-port p))
|
||||
|
||||
;(define-rrr port-has-port-position?)
|
||||
;(define-rrr port-position)
|
||||
;(define-rrr port-has-set-port-position!?)
|
||||
;(define-rrr set-port-position!)
|
||||
|
||||
(define (refill-bv-buffer p who)
|
||||
(when ($port-closed? p) (die who "port is closed" p))
|
||||
(let ([bv ($port-buffer p)] [i ($port-index p)] [j ($port-size p)])
|
||||
|
@ -1019,14 +1018,27 @@
|
|||
(die who "not a textual port" p))
|
||||
(case (transcoder-codec tr)
|
||||
[(utf-8-codec)
|
||||
;;;
|
||||
($set-port-attrs! p
|
||||
(fxior textual-input-port-bits fast-u7-text-tag))
|
||||
(eof-object? (advance-bom p who '(#xEF #xBB #xBF)))]
|
||||
[(utf-16-codec)
|
||||
(let ([be? (advance-bom p who '(#xFE #xFF))])
|
||||
(case be?
|
||||
[(#t)
|
||||
($set-port-attrs! p
|
||||
(fxior textual-input-port-bits fast-u16be-text-tag))
|
||||
#f]
|
||||
[(#f)
|
||||
(let ([le? (advance-bom p who '(#xFF #xFE))])
|
||||
(case le?
|
||||
[(#t #f) ;;; little by default
|
||||
($set-port-attrs! p
|
||||
(fxior textual-input-port-bits fast-u16le-text-tag))
|
||||
#f]
|
||||
[else #t]))]
|
||||
[else #t]))]
|
||||
[else
|
||||
(die 'slow-get-char
|
||||
"BUG: codec not handled"
|
||||
(transcoder-codec tr))])))
|
||||
(die who "BUG: codec not handled" (transcoder-codec tr))])))
|
||||
;;;
|
||||
(define (lookahead-char-char-mode p who)
|
||||
(let ([str ($port-buffer p)]
|
||||
|
@ -1073,6 +1085,8 @@
|
|||
(bytevector-u8-ref ($port-buffer p) i))]
|
||||
[else
|
||||
(get-char-latin-mode p who 0)]))]
|
||||
[(eq? m fast-get-utf16le-tag) (peek-utf16 p who 'little)]
|
||||
[(eq? m fast-get-utf16be-tag) (peek-utf16 p who 'big)]
|
||||
[else
|
||||
(if (speedup-input-port p who)
|
||||
(eof-object)
|
||||
|
@ -1094,10 +1108,120 @@
|
|||
[else
|
||||
($set-port-index! p 1)
|
||||
(string-ref str 0)]))))
|
||||
(define (peek-utf16 p who endianness)
|
||||
(define integer->char/invalid
|
||||
(lambda (n)
|
||||
(cond
|
||||
[(fx<= n #xD7FF) (integer->char n)]
|
||||
[(fx< n #xE000) #\xFFFD]
|
||||
[(fx<= n #x10FFFF) (integer->char n)]
|
||||
[else #\xFFFD])))
|
||||
(let ([i ($port-index p)])
|
||||
(cond
|
||||
[(fx<= (fx+ i 2) ($port-size p))
|
||||
(let ([w1 (bytevector-u16-ref ($port-buffer p) i endianness)])
|
||||
(cond
|
||||
[(or (fx< w1 #xD800) (fx> w1 #xDFFF))
|
||||
(integer->char/invalid w1)]
|
||||
[(not (and (fx<= #xD800 w1) (fx<= w1 #xDBFF)))
|
||||
#\xFFFD]
|
||||
[(fx<= (+ i 4) ($port-size p))
|
||||
(let ([w2 (bytevector-u16-ref
|
||||
($port-buffer p) (+ i 2) endianness)])
|
||||
(cond
|
||||
[(not (and (fx<= #xDC00 w2) (fx<= w2 #xDFFF)))
|
||||
#\xFFFD]
|
||||
[else
|
||||
(integer->char/invalid
|
||||
(fx+ #x10000
|
||||
(fxlogor
|
||||
(fxsll (fxand w1 #x3FF) 10)
|
||||
(fxand w2 #x3FF))))]))]
|
||||
[else
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0)
|
||||
#\xFFFD]
|
||||
[else
|
||||
(peek-utf16 p who endianness)]))]))]
|
||||
[(fx< i ($port-size p))
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0)
|
||||
#\xFFFD]
|
||||
[else (peek-utf16 p who endianness)]))]
|
||||
[else
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(if (fx= bytes 0)
|
||||
(eof-object)
|
||||
(peek-utf16 p who endianness)))])))
|
||||
(define (get-utf16 p who endianness)
|
||||
(define (invalid p who endianness n)
|
||||
(case (transcoder-error-handling-mode (port-transcoder p))
|
||||
[(ignore) (do-get-char p who endianness)]
|
||||
[(replace) #\xFFFD]
|
||||
[(raise)
|
||||
(raise (make-i/o-decoding-error p n))]
|
||||
[else (die who "BUG: invalid error handling mode" p)]))
|
||||
(define (integer->char/invalid p who endianness n)
|
||||
(cond
|
||||
[(fx<= n #xD7FF) (integer->char n)]
|
||||
[(fx< n #xE000) (invalid p who endianness n)]
|
||||
[(fx<= n #x10FFFF) (integer->char n)]
|
||||
[else (invalid p who endianness n)]))
|
||||
(let ([i ($port-index p)])
|
||||
(cond
|
||||
[(fx<= (fx+ i 2) ($port-size p))
|
||||
(let ([w1 (bytevector-u16-ref ($port-buffer p) i endianness)])
|
||||
(cond
|
||||
[(or (fx< w1 #xD800) (fx> w1 #xDFFF))
|
||||
($set-port-index! p (fx+ i 2))
|
||||
(integer->char/invalid p who endianness w1)]
|
||||
[(not (and (fx<= #xD800 w1) (fx<= w1 #xDBFF)))
|
||||
($set-port-index! p (fx+ i 2))
|
||||
(invalid p who endianness w1)]
|
||||
[(fx<= (+ i 4) ($port-size p))
|
||||
(let ([w2 (bytevector-u16-ref
|
||||
($port-buffer p) (+ i 2) endianness)])
|
||||
(cond
|
||||
[(not (and (fx<= #xDC00 w2) (fx<= w2 #xDFFF)))
|
||||
($set-port-index! p (fx+ i 2))
|
||||
(invalid p who endianness w1)]
|
||||
[else
|
||||
($set-port-index! p (fx+ i 4))
|
||||
(integer->char/invalid p who endianness
|
||||
(fx+ #x10000
|
||||
(fxlogor
|
||||
(fxsll (fxand w1 #x3FF) 10)
|
||||
(fxand w2 #x3FF))))]))]
|
||||
[else
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0)
|
||||
($set-port-index! p ($port-size p))
|
||||
(invalid p who endianness w1)]
|
||||
[else
|
||||
(get-utf16 p who endianness)]))]))]
|
||||
[(fx< i ($port-size p))
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0)
|
||||
($set-port-index! p ($port-size p))
|
||||
(invalid p who endianness
|
||||
(bytevector-u8-ref ($port-buffer p) ($port-index p)))]
|
||||
[else (get-utf16 p who endianness)]))]
|
||||
[else
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(if (fx= bytes 0)
|
||||
(eof-object)
|
||||
(get-utf16 p who endianness)))])))
|
||||
(define (get-char p)
|
||||
(do-get-char p 'get-char))
|
||||
(define read-char
|
||||
(case-lambda
|
||||
[(p)
|
||||
(define who 'read-char)
|
||||
[(p) (do-get-char p 'read-char)]
|
||||
[() (do-get-char (current-input-port) 'read-char)]))
|
||||
(define (do-get-char p who)
|
||||
(let ([m ($port-fast-attrs p)])
|
||||
(cond
|
||||
[(eq? m fast-get-utf8-tag)
|
||||
|
@ -1128,47 +1252,12 @@
|
|||
(bytevector-u8-ref ($port-buffer p) i))]
|
||||
[else
|
||||
(get-char-latin-mode p who 1)]))]
|
||||
[(eq? m fast-get-utf16le-tag) (get-utf16 p who 'little)]
|
||||
[(eq? m fast-get-utf16be-tag) (get-utf16 p who 'big)]
|
||||
[else
|
||||
(if (speedup-input-port p who)
|
||||
(eof-object)
|
||||
(get-char p))]))]
|
||||
[() (read-char (current-input-port))]))
|
||||
(define (get-char p)
|
||||
(define who 'get-char)
|
||||
(let ([m ($port-fast-attrs p)])
|
||||
(cond
|
||||
[(eq? m fast-get-utf8-tag)
|
||||
(let ([i ($port-index p)])
|
||||
(cond
|
||||
[(fx< i ($port-size p))
|
||||
(let ([b (bytevector-u8-ref ($port-buffer p) i)])
|
||||
(cond
|
||||
[(fx< b 128)
|
||||
($set-port-index! p (fx+ i 1))
|
||||
(integer->char b)]
|
||||
[else (get-char-utf8-mode p who)]))]
|
||||
[else
|
||||
(get-char-utf8-mode p who)]))]
|
||||
[(eq? m fast-get-char-tag)
|
||||
(let ([i ($port-index p)])
|
||||
(cond
|
||||
[(fx< i ($port-size p))
|
||||
($set-port-index! p (fx+ i 1))
|
||||
(string-ref ($port-buffer p) i)]
|
||||
[else (get-char-char-mode p who)]))]
|
||||
[(eq? m fast-get-latin-tag)
|
||||
(let ([i ($port-index p)])
|
||||
(cond
|
||||
[(fx< i ($port-size p))
|
||||
($set-port-index! p (fx+ i 1))
|
||||
(integer->char
|
||||
(bytevector-u8-ref ($port-buffer p) i))]
|
||||
[else
|
||||
(get-char-latin-mode p who 1)]))]
|
||||
[else
|
||||
(if (speedup-input-port p who)
|
||||
(eof-object)
|
||||
(get-char p))]))))
|
||||
(do-get-char p who))]))))
|
||||
|
||||
;;; ----------------------------------------------------------
|
||||
(define (assert-binary-input-port p who)
|
||||
|
|
|
@ -355,7 +355,7 @@
|
|||
;;; of W1. Terminate.
|
||||
;;;
|
||||
;;; 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
|
||||
;;; is in die and no valid character can be obtained using W1.
|
||||
;;; is in error and no valid character can be obtained using W1.
|
||||
;;; Terminate.
|
||||
;;;
|
||||
;;; 3) If there is no W2 (that is, the sequence ends with W1), or if W2
|
||||
|
@ -410,7 +410,7 @@
|
|||
[(str)
|
||||
(unless (string? str)
|
||||
(die 'string->utf16 "not a string" str))
|
||||
($string->utf16 str 'big)]
|
||||
($string->utf16 str (native-endianness))]
|
||||
[(str endianness)
|
||||
(unless (string? str)
|
||||
(die 'string->utf16 "not a string" str))
|
||||
|
@ -458,7 +458,7 @@
|
|||
[(or (fx< w1 #xD800) (fx> w1 #xDFFF))
|
||||
(string-set! str n (integer->char/invalid w1))
|
||||
(fill bv endianness str (+ i 2) len (+ n 1))]
|
||||
[(not (fx<= #xD800 w1 #xDBFF)) ;;; die sequence
|
||||
[(not (fx<= #xD800 w1 #xDBFF)) ;;; error sequence
|
||||
(string-set! str n #\xFFFD)
|
||||
(fill bv endianness str (+ i 2) len (+ n 1))]
|
||||
[(<= (+ i 4) (bytevector-length bv))
|
||||
|
@ -466,7 +466,7 @@
|
|||
(cond
|
||||
[(not (<= #xDC00 w2 #xDFFF))
|
||||
;;; do we skip w2 also?
|
||||
;;; I won't. Just w1 is an die
|
||||
;;; I won't. Just w1 is an error
|
||||
(string-set! str n #\xFFFD)
|
||||
(fill bv endianness str (+ i 2) len (+ n 1))]
|
||||
[else
|
||||
|
@ -477,7 +477,7 @@
|
|||
(fxlogand w2 #x3FF)))))
|
||||
(fill bv endianness str (+ i 4) len (+ n 1))]))]
|
||||
[else
|
||||
;;; die again
|
||||
;;; error again
|
||||
(string-set! str n #\xFFFD)
|
||||
(fill bv endianness str (+ i 2) len (+ n 1))]))]))
|
||||
(define (decode bv endianness start)
|
||||
|
@ -531,7 +531,7 @@
|
|||
[(str)
|
||||
(unless (string? str)
|
||||
(die who "not a string" str))
|
||||
($string->utf32 str 'big)]
|
||||
($string->utf32 str (native-endianness))]
|
||||
[(str endianness)
|
||||
(unless (string? str)
|
||||
(die who "not a string" str))
|
||||
|
|
|
@ -1 +1 @@
|
|||
1636
|
||||
1637
|
||||
|
|
|
@ -202,6 +202,16 @@
|
|||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
128)))
|
||||
|
||||
(define (make-utf8-bytevector-range1)
|
||||
(u8-list->bytevector
|
||||
(let f ([i 0] [j #x7F])
|
||||
(cond
|
||||
[(> i j) '()]
|
||||
[else
|
||||
(cons* i (f (+ i 1) j))]))))
|
||||
|
||||
|
||||
|
||||
(define (make-utf8-bytevector-range2)
|
||||
(u8-list->bytevector
|
||||
(let f ([i #x80] [j #x7FF])
|
||||
|
@ -236,6 +246,15 @@
|
|||
(fxior #b10000000 (fxand i #b111111))
|
||||
(f (+ i 1) j))]))))
|
||||
|
||||
(define (make-utf8-string-range1)
|
||||
(list->string
|
||||
(let f ([i 0] [j #x7F])
|
||||
(cond
|
||||
[(> i j) '()]
|
||||
[else
|
||||
(cons (integer->char i)
|
||||
(f (+ i 1) j))]))))
|
||||
|
||||
(define (make-utf8-string-range2)
|
||||
(list->string
|
||||
(let f ([i #x80] [j #x7FF])
|
||||
|
@ -276,7 +295,16 @@
|
|||
[(char=? x (string-ref str i))
|
||||
(f (+ i 1))]
|
||||
[else
|
||||
(error #f "mismatch" x (string-ref str i) i)]))))
|
||||
(error #f
|
||||
(format
|
||||
"mismatch at index ~a, got char ~a (code #x~x), \
|
||||
expected char ~a (code #x~x)"
|
||||
i
|
||||
x
|
||||
(char->integer x)
|
||||
(string-ref str i)
|
||||
(char->integer (string-ref str i))))]))))
|
||||
|
||||
|
||||
(define (test-port-string-peeking-output p str)
|
||||
(let f ([i 0])
|
||||
|
@ -294,7 +322,66 @@
|
|||
[else
|
||||
(error #f "mismatch" x (string-ref str i) i)]))))
|
||||
|
||||
|
||||
(define (invalid-code? n) (not (valid-code? n)))
|
||||
(define (valid-code? n)
|
||||
(cond
|
||||
[(< n 0) #f]
|
||||
[(<= n #xD7FF) #t]
|
||||
[(< n #xE000) #f]
|
||||
[(<= n #x10FFFF) #t]
|
||||
[else (error 'valid-code? "out of range" n)]))
|
||||
|
||||
|
||||
|
||||
(define (make-u16le-bv min max)
|
||||
(u8-list->bytevector
|
||||
(let f ([i min])
|
||||
(cond
|
||||
[(> i max) '()]
|
||||
[(invalid-code? i) (f (+ i 1))]
|
||||
[(< i #x10000)
|
||||
(cons*
|
||||
(fxand i #xFF)
|
||||
(fxsra i 8)
|
||||
(f (+ i 1)))]
|
||||
[else
|
||||
(let ([ii (fx- i #x10000)])
|
||||
(let ([w1 (fxior #xD800 (fxand #x3FF (fxsra ii 10)))]
|
||||
[w2 (fxior #xDC00 (fxand #x3FF ii))])
|
||||
(cons*
|
||||
(fxand w1 #xFF)
|
||||
(fxsra w1 8)
|
||||
(fxand w2 #xFF)
|
||||
(fxsra w2 8)
|
||||
(f (+ i 1)))))]))))
|
||||
|
||||
(define (make-string-slice min max)
|
||||
(list->string
|
||||
(let f ([i min])
|
||||
(cond
|
||||
[(> i max) '()]
|
||||
[(invalid-code? i) (f (+ i 1))]
|
||||
[else (cons (integer->char i) (f (+ i 1)))]))))
|
||||
|
||||
|
||||
(define (make-u16le-range1)
|
||||
(make-u16le-bv 0 #x7FFF))
|
||||
(define (make-u16le-range2)
|
||||
(make-u16le-bv #x8000 #x10FFFF))
|
||||
(define (make-utf16-string-range1)
|
||||
(make-string-slice 0 #x7FFF))
|
||||
(define (make-utf16-string-range2)
|
||||
(make-string-slice #x8000 #x10FFFF))
|
||||
|
||||
(define (run-exhaustive-tests)
|
||||
|
||||
(test "utf8 range 1"
|
||||
(test-port-string-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range1)
|
||||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range1)))
|
||||
|
||||
(test "utf8 range 2"
|
||||
(test-port-string-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range2)
|
||||
|
@ -313,6 +400,27 @@
|
|||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range4)))
|
||||
|
||||
|
||||
(test "utf16 range 1"
|
||||
(test-port-string-output
|
||||
(open-bytevector-input-port (make-u16le-range1)
|
||||
(make-transcoder (utf-16-codec) 'none 'raise))
|
||||
(make-utf16-string-range1)))
|
||||
|
||||
|
||||
(test "utf16 range 2"
|
||||
(test-port-string-output
|
||||
(open-bytevector-input-port (make-u16le-range2)
|
||||
(make-transcoder (utf-16-codec) 'none 'raise))
|
||||
(make-utf16-string-range2)))
|
||||
|
||||
|
||||
(test "utf8 peek range 1"
|
||||
(test-port-string-peeking-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range1)
|
||||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range1)))
|
||||
|
||||
(test "utf8 peek range 2"
|
||||
(test-port-string-peeking-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range2)
|
||||
|
@ -331,6 +439,23 @@
|
|||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range4)))
|
||||
|
||||
(test "utf16 peek range 1"
|
||||
(test-port-string-peeking-output
|
||||
(open-bytevector-input-port (make-u16le-range1)
|
||||
(make-transcoder (utf-16-codec) 'none 'raise))
|
||||
(make-utf16-string-range1)))
|
||||
|
||||
(test "utf16 peek range 2"
|
||||
(test-port-string-peeking-output
|
||||
(open-bytevector-input-port (make-u16le-range2)
|
||||
(make-transcoder (utf-16-codec) 'none 'raise))
|
||||
(make-utf16-string-range2)))
|
||||
|
||||
(test "utf8 range 1 string"
|
||||
(test-port-string-output
|
||||
(open-string-input-port (make-utf8-string-range1))
|
||||
(make-utf8-string-range1)))
|
||||
|
||||
(test "utf8 range 2 string"
|
||||
(test-port-string-output
|
||||
(open-string-input-port (make-utf8-string-range2))
|
||||
|
@ -509,18 +634,27 @@
|
|||
1]))
|
||||
#f #f #f)
|
||||
transcoder)))
|
||||
(define (test name codec conv)
|
||||
(define (test name codec s->bv bv->s)
|
||||
(printf "testing partial reads for ~s codec ... " name)
|
||||
(let ([s (make-test-string)])
|
||||
(assert (string=? s (bv->s (s->bv s))))
|
||||
(let ([r (call-with-port
|
||||
(make-slow-input-port (conv s)
|
||||
(make-slow-input-port (s->bv s)
|
||||
(make-transcoder codec
|
||||
(eol-style none) (error-handling-mode raise)))
|
||||
get-string-all)])
|
||||
(assert (string=? r s))))
|
||||
(unless (string=? r s)
|
||||
(if (= (string-length r) (string-length s))
|
||||
(error #f "test failed")
|
||||
(error #f "length mismatch"
|
||||
(string-length s) (string-length r))))))
|
||||
(printf "ok\n"))
|
||||
;(test 'utf16 (utf-16-codec) string->utf16)
|
||||
(test 'utf8 (utf-8-codec) string->utf8))
|
||||
(test 'utf8 (utf-8-codec)
|
||||
string->utf8
|
||||
utf8->string)
|
||||
(test 'utf16 (utf-16-codec)
|
||||
(lambda (x) (string->utf16 x 'little))
|
||||
(lambda (x) (utf16->string x 'little))))
|
||||
|
||||
(define-tests test-input-ports
|
||||
[eof-object?
|
||||
|
|
Loading…
Reference in New Issue