decoding of utf8-transcoded ports is complete.

This commit is contained in:
Abdulaziz Ghuloum 2007-12-07 05:34:46 -05:00
parent 2575419665
commit 86d9c640b1
2 changed files with 255 additions and 25 deletions

View File

@ -3,6 +3,7 @@
(export
input-port? textual-port? port-eof?
open-bytevector-input-port
open-string-input-port
make-custom-binary-input-port
get-char lookahead-char get-u8 lookahead-u8 close-port
transcoded-port)
@ -10,6 +11,7 @@
(except (ikarus)
input-port? textual-port? port-eof?
open-bytevector-input-port
open-string-input-port
make-custom-binary-input-port
get-char lookahead-char get-u8 lookahead-u8 close-port
transcoded-port))
@ -219,21 +221,18 @@
[buf ($port-buffer p)])
(cond
[(fx= i j) ;;; exhausted
(let ([n (($port-read! p) buf 0 (bytevector-length buf))])
(if (fx= n 0)
(eof-object)
(begin
($set-port-index! p 0)
($set-port-size! p n)
(get-char p))))]
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0) (eof-object)]
[else (get-char p)]))]
[else
(let ([b0 (bytevector-u8-ref p i)])
(let ([b0 (bytevector-u8-ref buf i)])
(cond
[(fx= (fxsra b0 5) #b110) ;;; two-byte-encoding
(let ([i (fx+ i 1)])
(cond
[(fx< i j)
(let ([b1 (bytevector-u8-ref p i)])
(let ([b1 (bytevector-u8-ref buf i)])
(cond
[(fx= (fxsra b1 6) #b10)
($set-port-index! p (fx+ i 1))
@ -241,7 +240,7 @@
(fxior (fxand b1 #b111111)
(fxsll (fxand b0 #b11111) 6)))]
[else
($set-port-index! p (fx+ i 1))
($set-port-index! p i)
(do-error p who)]))]
[else
(let ([bytes (refill-bv-buffer p who)])
@ -250,8 +249,67 @@
($set-port-index! p (fx+ ($port-index p) 1))
(do-error p who)]
[else (get-char-utf8-mode p who)]))]))]
[else (error who
"BUG: 3-byte encoding not implemented")]))])))
[(fx= (fxsra b0 4) #b1110) ;;; three-byte-encoding
(cond
[(fx< (fx+ i 2) j)
(let ([b1 (bytevector-u8-ref buf (fx+ i 1))]
[b2 (bytevector-u8-ref buf (fx+ i 2))])
(cond
[(fx= (fxsra (fxlogor b1 b2) 6) #b10)
(let ([n (fxlogor
(fxsll (fxand b0 #b1111) 12)
(fxsll (fxand b1 #b111111) 6)
(fxand b2 #b111111))])
(cond
[(fx<= #xD800 n #xDFFF)
($set-port-index! p (fx+ i 1))
(do-error p who)]
[else
($set-port-index! p (fx+ i 3))
(integer->char n)]))]
[else
($set-port-index! p (fx+ i 1))
(do-error p who)]))]
[else
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0)
($set-port-index! p (fx+ ($port-index p) 1))
(do-error p who)]
[else (get-char-utf8-mode p who)]))])]
[(fx= (fxsra b0 3) #b11110) ;;; four-byte-encoding
(cond
[(fx< (fx+ i 3) j)
(let ([b1 (bytevector-u8-ref buf (fx+ i 1))]
[b2 (bytevector-u8-ref buf (fx+ i 2))]
[b3 (bytevector-u8-ref buf (fx+ i 3))])
(cond
[(fx= (fxsra (fxlogor b1 b2 b3) 6) #b10)
(let ([n (fxlogor
(fxsll (fxand b0 #b111) 18)
(fxsll (fxand b1 #b111111) 12)
(fxsll (fxand b2 #b111111) 6)
(fxand b3 #b111111))])
(cond
[(fx<= #x10000 n #x10FFFF)
($set-port-index! p (fx+ i 4))
(integer->char n)]
[else
($set-port-index! p (fx+ i 1))
(do-error p who)]))]
[else
($set-port-index! p (fx+ i 1))
(do-error p who)]))]
[else
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0)
($set-port-index! p (fx+ ($port-index p) 1))
(do-error p who)]
[else (get-char-utf8-mode p who)]))])]
[else
($set-port-index! p (fx+ i 1))
(do-error p who)]))])))
(define (lookahead-char-utf8-mode p who)
(define (do-error p who)
@ -266,21 +324,18 @@
[buf ($port-buffer p)])
(cond
[(fx= i j) ;;; exhausted
(let ([n (($port-read! p) buf 0 (bytevector-length buf))])
(if (fx= n 0)
(eof-object)
(begin
($set-port-index! p 0)
($set-port-size! p n)
(lookahead-char p))))]
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0) (eof-object)]
[else (lookahead-char p)]))]
[else
(let ([b0 (bytevector-u8-ref p i)])
(let ([b0 (bytevector-u8-ref buf i)])
(cond
[(fx= (fxsra b0 5) #b110) ;;; two-byte-encoding
(let ([i (fx+ i 1)])
(cond
[(fx< i j)
(let ([b1 (bytevector-u8-ref p i)])
(let ([b1 (bytevector-u8-ref buf i)])
(cond
[(fx= (fxsra b1 6) #b10)
(integer->char
@ -293,9 +348,54 @@
(cond
[(fx= bytes 0) (do-error p who)]
[else (lookahead-char-utf8-mode p who)]))]))]
[else
(error who
"BUG: 3-byte encoding not implemented")]))])))
[(fx= (fxsra b0 4) #b1110) ;;; three-byte-encoding
(cond
[(fx< (fx+ i 2) j)
(let ([b1 (bytevector-u8-ref buf (fx+ i 1))]
[b2 (bytevector-u8-ref buf (fx+ i 2))])
(cond
[(fx= (fxsra (fxlogor b1 b2) 6) #b10)
(let ([n (fxlogor
(fxsll (fxand b0 #b1111) 12)
(fxsll (fxand b1 #b111111) 6)
(fxand b2 #b111111))])
(cond
[(fx<= #xD800 n #xDFFF) (do-error p who)]
[else (integer->char n)]))]
[else (do-error p who)]))]
[else
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0) (do-error p who)]
[else (lookahead-char-utf8-mode p who)]))])]
[(fx= (fxsra b0 3) #b11110) ;;; four-byte-encoding
(cond
[(fx< (fx+ i 3) j)
(let ([b1 (bytevector-u8-ref buf (fx+ i 1))]
[b2 (bytevector-u8-ref buf (fx+ i 2))]
[b3 (bytevector-u8-ref buf (fx+ i 3))])
(cond
[(fx= (fxsra (fxlogor b1 b2 b3) 6) #b10)
(let ([n (fxlogor
(fxsll (fxand b0 #b111) 18)
(fxsll (fxand b1 #b111111) 12)
(fxsll (fxand b2 #b111111) 6)
(fxand b3 #b111111))])
(cond
[(fx<= #x10000 n #x10FFFF)
(integer->char n)]
[else
(do-error p who)]))]
[else
(do-error p who)]))]
[else
(let ([bytes (refill-bv-buffer p who)])
(cond
[(fx= bytes 0)
(do-error p who)]
[else (get-char-utf8-mode p who)]))])]
[else (do-error p who)]))])))
(define-rrr get-char-char-mode)
(define (advance-utf8-bom p who)
@ -347,7 +447,7 @@
(let ([b (bytevector-u8-ref ($port-buffer p) i)])
(cond
[(fx< b 128) (integer->char b)]
[else (lookahead-char-utf8-mode p)]))]
[else (lookahead-char-utf8-mode p who)]))]
[else
(lookahead-char-utf8-mode p who)]))]
[(eq? m fast-get-char-tag)

View File

@ -201,4 +201,134 @@
(make-transcoder (utf-8-codec) 'none 'raise))
128))
(define (make-utf8-bytevector-range2)
(u8-list->bytevector
(let f ([i #x80] [j #x7FF])
(cond
[(> i j) '()]
[else
(cons* (fxior #b11000000 (fxsra i 6))
(fxior #b10000000 (fxand i #b111111))
(f (+ i 1) j))]))))
(define (make-utf8-bytevector-range3)
(u8-list->bytevector
(let f ([i #x800] [j #xFFFF])
(cond
[(> i j) '()]
[(fx= i #xD800) (f #xE000 j)]
[else
(cons* (fxior #b11100000 (fxsra i 12))
(fxior #b10000000 (fxand (fxsra i 6) #b111111))
(fxior #b10000000 (fxand i #b111111))
(f (+ i 1) j))]))))
(define (make-utf8-bytevector-range4)
(u8-list->bytevector
(let f ([i #x10000] [j #x10FFFF])
(cond
[(> i j) '()]
[else
(cons* (fxior #b11110000 (fxsra i 18))
(fxior #b10000000 (fxand (fxsra i 12) #b111111))
(fxior #b10000000 (fxand (fxsra i 6) #b111111))
(fxior #b10000000 (fxand i #b111111))
(f (+ i 1) j))]))))
(define (make-utf8-string-range2)
(list->string
(let f ([i #x80] [j #x7FF])
(cond
[(> i j) '()]
[else
(cons (integer->char i)
(f (+ i 1) j))]))))
(define (make-utf8-string-range3)
(list->string
(let f ([i #x800] [j #xFFFF])
(cond
[(> i j) '()]
[(fx= i #xD800) (f #xE000 j)]
[else
(cons (integer->char i)
(f (+ i 1) j))]))))
(define (make-utf8-string-range4)
(list->string
(let f ([i #x10000] [j #x10FFFF])
(cond
[(> i j) '()]
[else
(cons (integer->char i)
(f (+ i 1) j))]))))
(define (test-port-string-output p str)
(let f ([i 0])
(let ([x (get-char p)])
(cond
[(eof-object? x)
(unless (= i (string-length str))
(error #f "premature eof"))]
[(= i (string-length str))
(error #f "too many chars")]
[(char=? x (string-ref str i))
(f (+ i 1))]
[else
(error #f "mismatch" x (string-ref str i) i)]))))
(define (test-port-string-peeking-output p str)
(let f ([i 0])
(let ([x (lookahead-char p)])
(cond
[(eof-object? x)
(unless (= i (string-length str))
(error #f "premature eof"))]
[(= i (string-length str))
(error #f "too many chars")]
[(not (char=? x (get-char p)))
(error #f "peek not same as get")]
[(char=? x (string-ref str i))
(f (+ i 1))]
[else
(error #f "mismatch" x (string-ref str i) i)]))))
(test "utf8 range 2"
(test-port-string-output
(open-bytevector-input-port (make-utf8-bytevector-range2)
(make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range2)))
(test "utf8 range 3"
(test-port-string-output
(open-bytevector-input-port (make-utf8-bytevector-range3)
(make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range3)))
(test "utf8 range 4"
(test-port-string-output
(open-bytevector-input-port (make-utf8-bytevector-range4)
(make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range4)))
(test "utf8 peek range 2"
(test-port-string-peeking-output
(open-bytevector-input-port (make-utf8-bytevector-range2)
(make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range2)))
(test "utf8 peek range 3"
(test-port-string-peeking-output
(open-bytevector-input-port (make-utf8-bytevector-range3)
(make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range3)))
(test "utf8 peek range 4"
(test-port-string-peeking-output
(open-bytevector-input-port (make-utf8-bytevector-range4)
(make-transcoder (utf-8-codec) 'none 'raise))
(make-utf8-string-range4)))