decoding of utf8-transcoded ports is complete.
This commit is contained in:
parent
2575419665
commit
86d9c640b1
150
lab/io-spec.ss
150
lab/io-spec.ss
|
@ -3,6 +3,7 @@
|
|||
(export
|
||||
input-port? textual-port? port-eof?
|
||||
open-bytevector-input-port
|
||||
open-string-input-port
|
||||
make-custom-binary-input-port
|
||||
get-char lookahead-char get-u8 lookahead-u8 close-port
|
||||
transcoded-port)
|
||||
|
@ -10,6 +11,7 @@
|
|||
(except (ikarus)
|
||||
input-port? textual-port? port-eof?
|
||||
open-bytevector-input-port
|
||||
open-string-input-port
|
||||
make-custom-binary-input-port
|
||||
get-char lookahead-char get-u8 lookahead-u8 close-port
|
||||
transcoded-port))
|
||||
|
@ -219,21 +221,18 @@
|
|||
[buf ($port-buffer p)])
|
||||
(cond
|
||||
[(fx= i j) ;;; exhausted
|
||||
(let ([n (($port-read! p) buf 0 (bytevector-length buf))])
|
||||
(if (fx= n 0)
|
||||
(eof-object)
|
||||
(begin
|
||||
($set-port-index! p 0)
|
||||
($set-port-size! p n)
|
||||
(get-char p))))]
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0) (eof-object)]
|
||||
[else (get-char p)]))]
|
||||
[else
|
||||
(let ([b0 (bytevector-u8-ref p i)])
|
||||
(let ([b0 (bytevector-u8-ref buf i)])
|
||||
(cond
|
||||
[(fx= (fxsra b0 5) #b110) ;;; two-byte-encoding
|
||||
(let ([i (fx+ i 1)])
|
||||
(cond
|
||||
[(fx< i j)
|
||||
(let ([b1 (bytevector-u8-ref p i)])
|
||||
(let ([b1 (bytevector-u8-ref buf i)])
|
||||
(cond
|
||||
[(fx= (fxsra b1 6) #b10)
|
||||
($set-port-index! p (fx+ i 1))
|
||||
|
@ -241,7 +240,7 @@
|
|||
(fxior (fxand b1 #b111111)
|
||||
(fxsll (fxand b0 #b11111) 6)))]
|
||||
[else
|
||||
($set-port-index! p (fx+ i 1))
|
||||
($set-port-index! p i)
|
||||
(do-error p who)]))]
|
||||
[else
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
|
@ -250,8 +249,67 @@
|
|||
($set-port-index! p (fx+ ($port-index p) 1))
|
||||
(do-error p who)]
|
||||
[else (get-char-utf8-mode p who)]))]))]
|
||||
[else (error who
|
||||
"BUG: 3-byte encoding not implemented")]))])))
|
||||
[(fx= (fxsra b0 4) #b1110) ;;; three-byte-encoding
|
||||
(cond
|
||||
[(fx< (fx+ i 2) j)
|
||||
(let ([b1 (bytevector-u8-ref buf (fx+ i 1))]
|
||||
[b2 (bytevector-u8-ref buf (fx+ i 2))])
|
||||
(cond
|
||||
[(fx= (fxsra (fxlogor b1 b2) 6) #b10)
|
||||
(let ([n (fxlogor
|
||||
(fxsll (fxand b0 #b1111) 12)
|
||||
(fxsll (fxand b1 #b111111) 6)
|
||||
(fxand b2 #b111111))])
|
||||
(cond
|
||||
[(fx<= #xD800 n #xDFFF)
|
||||
($set-port-index! p (fx+ i 1))
|
||||
(do-error p who)]
|
||||
[else
|
||||
($set-port-index! p (fx+ i 3))
|
||||
(integer->char n)]))]
|
||||
[else
|
||||
($set-port-index! p (fx+ i 1))
|
||||
(do-error p who)]))]
|
||||
[else
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0)
|
||||
($set-port-index! p (fx+ ($port-index p) 1))
|
||||
(do-error p who)]
|
||||
[else (get-char-utf8-mode p who)]))])]
|
||||
[(fx= (fxsra b0 3) #b11110) ;;; four-byte-encoding
|
||||
(cond
|
||||
[(fx< (fx+ i 3) j)
|
||||
(let ([b1 (bytevector-u8-ref buf (fx+ i 1))]
|
||||
[b2 (bytevector-u8-ref buf (fx+ i 2))]
|
||||
[b3 (bytevector-u8-ref buf (fx+ i 3))])
|
||||
(cond
|
||||
[(fx= (fxsra (fxlogor b1 b2 b3) 6) #b10)
|
||||
(let ([n (fxlogor
|
||||
(fxsll (fxand b0 #b111) 18)
|
||||
(fxsll (fxand b1 #b111111) 12)
|
||||
(fxsll (fxand b2 #b111111) 6)
|
||||
(fxand b3 #b111111))])
|
||||
(cond
|
||||
[(fx<= #x10000 n #x10FFFF)
|
||||
($set-port-index! p (fx+ i 4))
|
||||
(integer->char n)]
|
||||
[else
|
||||
($set-port-index! p (fx+ i 1))
|
||||
(do-error p who)]))]
|
||||
[else
|
||||
($set-port-index! p (fx+ i 1))
|
||||
(do-error p who)]))]
|
||||
[else
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0)
|
||||
($set-port-index! p (fx+ ($port-index p) 1))
|
||||
(do-error p who)]
|
||||
[else (get-char-utf8-mode p who)]))])]
|
||||
[else
|
||||
($set-port-index! p (fx+ i 1))
|
||||
(do-error p who)]))])))
|
||||
|
||||
(define (lookahead-char-utf8-mode p who)
|
||||
(define (do-error p who)
|
||||
|
@ -266,21 +324,18 @@
|
|||
[buf ($port-buffer p)])
|
||||
(cond
|
||||
[(fx= i j) ;;; exhausted
|
||||
(let ([n (($port-read! p) buf 0 (bytevector-length buf))])
|
||||
(if (fx= n 0)
|
||||
(eof-object)
|
||||
(begin
|
||||
($set-port-index! p 0)
|
||||
($set-port-size! p n)
|
||||
(lookahead-char p))))]
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0) (eof-object)]
|
||||
[else (lookahead-char p)]))]
|
||||
[else
|
||||
(let ([b0 (bytevector-u8-ref p i)])
|
||||
(let ([b0 (bytevector-u8-ref buf i)])
|
||||
(cond
|
||||
[(fx= (fxsra b0 5) #b110) ;;; two-byte-encoding
|
||||
(let ([i (fx+ i 1)])
|
||||
(cond
|
||||
[(fx< i j)
|
||||
(let ([b1 (bytevector-u8-ref p i)])
|
||||
(let ([b1 (bytevector-u8-ref buf i)])
|
||||
(cond
|
||||
[(fx= (fxsra b1 6) #b10)
|
||||
(integer->char
|
||||
|
@ -293,9 +348,54 @@
|
|||
(cond
|
||||
[(fx= bytes 0) (do-error p who)]
|
||||
[else (lookahead-char-utf8-mode p who)]))]))]
|
||||
[else
|
||||
(error who
|
||||
"BUG: 3-byte encoding not implemented")]))])))
|
||||
[(fx= (fxsra b0 4) #b1110) ;;; three-byte-encoding
|
||||
(cond
|
||||
[(fx< (fx+ i 2) j)
|
||||
(let ([b1 (bytevector-u8-ref buf (fx+ i 1))]
|
||||
[b2 (bytevector-u8-ref buf (fx+ i 2))])
|
||||
(cond
|
||||
[(fx= (fxsra (fxlogor b1 b2) 6) #b10)
|
||||
(let ([n (fxlogor
|
||||
(fxsll (fxand b0 #b1111) 12)
|
||||
(fxsll (fxand b1 #b111111) 6)
|
||||
(fxand b2 #b111111))])
|
||||
(cond
|
||||
[(fx<= #xD800 n #xDFFF) (do-error p who)]
|
||||
[else (integer->char n)]))]
|
||||
[else (do-error p who)]))]
|
||||
[else
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0) (do-error p who)]
|
||||
[else (lookahead-char-utf8-mode p who)]))])]
|
||||
[(fx= (fxsra b0 3) #b11110) ;;; four-byte-encoding
|
||||
(cond
|
||||
[(fx< (fx+ i 3) j)
|
||||
(let ([b1 (bytevector-u8-ref buf (fx+ i 1))]
|
||||
[b2 (bytevector-u8-ref buf (fx+ i 2))]
|
||||
[b3 (bytevector-u8-ref buf (fx+ i 3))])
|
||||
(cond
|
||||
[(fx= (fxsra (fxlogor b1 b2 b3) 6) #b10)
|
||||
(let ([n (fxlogor
|
||||
(fxsll (fxand b0 #b111) 18)
|
||||
(fxsll (fxand b1 #b111111) 12)
|
||||
(fxsll (fxand b2 #b111111) 6)
|
||||
(fxand b3 #b111111))])
|
||||
(cond
|
||||
[(fx<= #x10000 n #x10FFFF)
|
||||
(integer->char n)]
|
||||
[else
|
||||
(do-error p who)]))]
|
||||
[else
|
||||
(do-error p who)]))]
|
||||
[else
|
||||
(let ([bytes (refill-bv-buffer p who)])
|
||||
(cond
|
||||
[(fx= bytes 0)
|
||||
(do-error p who)]
|
||||
[else (get-char-utf8-mode p who)]))])]
|
||||
[else (do-error p who)]))])))
|
||||
|
||||
(define-rrr get-char-char-mode)
|
||||
|
||||
(define (advance-utf8-bom p who)
|
||||
|
@ -347,7 +447,7 @@
|
|||
(let ([b (bytevector-u8-ref ($port-buffer p) i)])
|
||||
(cond
|
||||
[(fx< b 128) (integer->char b)]
|
||||
[else (lookahead-char-utf8-mode p)]))]
|
||||
[else (lookahead-char-utf8-mode p who)]))]
|
||||
[else
|
||||
(lookahead-char-utf8-mode p who)]))]
|
||||
[(eq? m fast-get-char-tag)
|
||||
|
|
130
lab/io-test.ss
130
lab/io-test.ss
|
@ -201,4 +201,134 @@
|
|||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
128))
|
||||
|
||||
(define (make-utf8-bytevector-range2)
|
||||
(u8-list->bytevector
|
||||
(let f ([i #x80] [j #x7FF])
|
||||
(cond
|
||||
[(> i j) '()]
|
||||
[else
|
||||
(cons* (fxior #b11000000 (fxsra i 6))
|
||||
(fxior #b10000000 (fxand i #b111111))
|
||||
(f (+ i 1) j))]))))
|
||||
|
||||
(define (make-utf8-bytevector-range3)
|
||||
(u8-list->bytevector
|
||||
(let f ([i #x800] [j #xFFFF])
|
||||
(cond
|
||||
[(> i j) '()]
|
||||
[(fx= i #xD800) (f #xE000 j)]
|
||||
[else
|
||||
(cons* (fxior #b11100000 (fxsra i 12))
|
||||
(fxior #b10000000 (fxand (fxsra i 6) #b111111))
|
||||
(fxior #b10000000 (fxand i #b111111))
|
||||
(f (+ i 1) j))]))))
|
||||
|
||||
(define (make-utf8-bytevector-range4)
|
||||
(u8-list->bytevector
|
||||
(let f ([i #x10000] [j #x10FFFF])
|
||||
(cond
|
||||
[(> i j) '()]
|
||||
[else
|
||||
(cons* (fxior #b11110000 (fxsra i 18))
|
||||
(fxior #b10000000 (fxand (fxsra i 12) #b111111))
|
||||
(fxior #b10000000 (fxand (fxsra i 6) #b111111))
|
||||
(fxior #b10000000 (fxand i #b111111))
|
||||
(f (+ i 1) j))]))))
|
||||
|
||||
(define (make-utf8-string-range2)
|
||||
(list->string
|
||||
(let f ([i #x80] [j #x7FF])
|
||||
(cond
|
||||
[(> i j) '()]
|
||||
[else
|
||||
(cons (integer->char i)
|
||||
(f (+ i 1) j))]))))
|
||||
|
||||
(define (make-utf8-string-range3)
|
||||
(list->string
|
||||
(let f ([i #x800] [j #xFFFF])
|
||||
(cond
|
||||
[(> i j) '()]
|
||||
[(fx= i #xD800) (f #xE000 j)]
|
||||
[else
|
||||
(cons (integer->char i)
|
||||
(f (+ i 1) j))]))))
|
||||
|
||||
(define (make-utf8-string-range4)
|
||||
(list->string
|
||||
(let f ([i #x10000] [j #x10FFFF])
|
||||
(cond
|
||||
[(> i j) '()]
|
||||
[else
|
||||
(cons (integer->char i)
|
||||
(f (+ i 1) j))]))))
|
||||
|
||||
(define (test-port-string-output p str)
|
||||
(let f ([i 0])
|
||||
(let ([x (get-char p)])
|
||||
(cond
|
||||
[(eof-object? x)
|
||||
(unless (= i (string-length str))
|
||||
(error #f "premature eof"))]
|
||||
[(= i (string-length str))
|
||||
(error #f "too many chars")]
|
||||
[(char=? x (string-ref str i))
|
||||
(f (+ i 1))]
|
||||
[else
|
||||
(error #f "mismatch" x (string-ref str i) i)]))))
|
||||
|
||||
(define (test-port-string-peeking-output p str)
|
||||
(let f ([i 0])
|
||||
(let ([x (lookahead-char p)])
|
||||
(cond
|
||||
[(eof-object? x)
|
||||
(unless (= i (string-length str))
|
||||
(error #f "premature eof"))]
|
||||
[(= i (string-length str))
|
||||
(error #f "too many chars")]
|
||||
[(not (char=? x (get-char p)))
|
||||
(error #f "peek not same as get")]
|
||||
[(char=? x (string-ref str i))
|
||||
(f (+ i 1))]
|
||||
[else
|
||||
(error #f "mismatch" x (string-ref str i) i)]))))
|
||||
|
||||
(test "utf8 range 2"
|
||||
(test-port-string-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range2)
|
||||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range2)))
|
||||
|
||||
(test "utf8 range 3"
|
||||
(test-port-string-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range3)
|
||||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range3)))
|
||||
|
||||
(test "utf8 range 4"
|
||||
(test-port-string-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range4)
|
||||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range4)))
|
||||
|
||||
(test "utf8 peek range 2"
|
||||
(test-port-string-peeking-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range2)
|
||||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range2)))
|
||||
|
||||
(test "utf8 peek range 3"
|
||||
(test-port-string-peeking-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range3)
|
||||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range3)))
|
||||
|
||||
(test "utf8 peek range 4"
|
||||
(test-port-string-peeking-output
|
||||
(open-bytevector-input-port (make-utf8-bytevector-range4)
|
||||
(make-transcoder (utf-8-codec) 'none 'raise))
|
||||
(make-utf8-string-range4)))
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue