Minor bugs in utf32->string and utf8->string.

This commit is contained in:
Abdulaziz Ghuloum 2007-12-03 11:06:29 -05:00
parent b5ab25c1cf
commit 00ffe4fdfd
2 changed files with 22 additions and 10 deletions

View File

@ -39,6 +39,8 @@
;;; replace: places a U+FFFD in place of the malformed bytes ;;; replace: places a U+FFFD in place of the malformed bytes
;;; raise: raises an error ;;; raise: raises an error
;;; It appears that utf-8 data can start with a #xEF #xBB #xBF BOM!
(define integer->char/invalid (define integer->char/invalid
(lambda (n) (lambda (n)
(cond (cond
@ -113,8 +115,8 @@
(define decode-utf8-bytevector (define decode-utf8-bytevector
(let () (let ()
(define who 'decode-utf8-bytevector) (define who 'decode-utf8-bytevector)
(define (count bv mode) (define (count bv i mode)
(let f ([x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode]) (let f ([x bv] [i i] [j ($bytevector-length bv)] [n 0] [mode mode])
(cond (cond
[($fx= i j) n] [($fx= i j) n]
[else [else
@ -194,8 +196,8 @@
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)] [(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)] [(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "invalid byte at index of bytevector" b0 i x)]))]))) [else (error who "invalid byte at index of bytevector" b0 i x)]))])))
(define (fill str bv mode) (define (fill str bv i mode)
(let f ([str str] [x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode]) (let f ([str str] [x bv] [i i] [j ($bytevector-length bv)] [n 0] [mode mode])
(cond (cond
[($fx= i j) str] [($fx= i j) str]
[else [else
@ -294,8 +296,17 @@
($string-set! str n ($fixnum->char #xFFFD)) ($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)] (f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")]))]))) [else (error who "BUG")]))])))
(define (has-bom? bv)
(and (fx> (bytevector-length bv) 3)
(fx= (bytevector-u8-ref bv 0) #xEF)
(fx= (bytevector-u8-ref bv 1) #xBB)
(fx= (bytevector-u8-ref bv 2) #xBF)))
(define (convert bv mode) (define (convert bv mode)
(fill ($make-string (count bv mode)) bv mode)) (cond
[(has-bom? bv)
(fill ($make-string (count bv 3 mode)) bv 3 mode)]
[else
(fill ($make-string (count bv 0 mode)) bv 0 mode)]))
(case-lambda (case-lambda
[(bv) (convert bv 'raise)] [(bv) (convert bv 'raise)]
[(bv handling-mode) [(bv handling-mode)
@ -544,12 +555,13 @@
(define (decode bv endianness start) (define (decode bv endianness start)
(let ([bvlen (bytevector-length bv)]) (let ([bvlen (bytevector-length bv)])
(let ([strlen (fxsra (fx+ (fx- bvlen start) 3) 2)]) (let ([strlen (fxsra (fx+ (fx- bvlen start) 3) 2)])
(fill bv endianness (make-string strlen) start (fill bv endianness (make-string strlen)
(fxand bvlen -2) 0)))) start (fxand bvlen -4)
0))))
(define ($utf32->string bv endianness em?) (define ($utf32->string bv endianness em?)
(define (bom-present bv) (define (bom-present bv)
(and (fx>= (bytevector-length bv) 4) (and (fx>= (bytevector-length bv) 4)
(let ([n (bytevector-u16-ref bv 0 'big)]) (let ([n (bytevector-u32-ref bv 0 'big)])
(cond (cond
[(= n #x0000FEFF) 'big] [(= n #x0000FEFF) 'big]
[(= n #xFFFE0000) 'little] [(= n #xFFFE0000) 'little]
@ -559,7 +571,7 @@
(unless (memv endianness '(big little)) (unless (memv endianness '(big little))
(error who "invalid endianness" endianness)) (error who "invalid endianness" endianness))
(cond (cond
[em? (decode bv endianness 0)] [em? (decode bv endianness 0)]
[(bom-present bv) => [(bom-present bv) =>
(lambda (endianness) (lambda (endianness)
(decode bv endianness 4))] (decode bv endianness 4))]

View File

@ -1 +1 @@
1179 1180