utf16->string, utf32->string, string->utf16, string->utf32 are all

working now.
This commit is contained in:
Abdulaziz Ghuloum 2007-12-03 04:28:41 -05:00
parent 4efdebb3ac
commit d878bd3934
3 changed files with 32 additions and 17 deletions

View File

@ -15,7 +15,8 @@
(library (ikarus transcoders) (library (ikarus transcoders)
(export string->utf8 utf8->string string->utf16 string->utf32) (export string->utf8 utf8->string string->utf16 string->utf32
utf16->string utf32->string)
(import (except (ikarus) string->utf8 utf8->string string->utf16 (import (except (ikarus) string->utf8 utf8->string string->utf16
utf16->string string->utf32 utf32->string) utf16->string string->utf32 utf32->string)
(ikarus system $strings) (ikarus system $strings)
@ -38,6 +39,15 @@
;;; replace: places a U+FFFD in place of the malformed bytes ;;; replace: places a U+FFFD in place of the malformed bytes
;;; raise: raises an error ;;; raise: raises an error
(define integer->char/invalid
(lambda (n)
(cond
[(not (fixnum? n)) #\xFFFD]
[($fx<= n #xD7FF) ($fixnum->char n)]
[($fx< n #xE000) #\xFFFD]
[($fx<= n #x10FFFF) ($fixnum->char n)]
[else #\xFFFD])))
(define string->utf8 (define string->utf8
(lambda (str) (lambda (str)
(define (utf8-string-size str) (define (utf8-string-size str)
@ -313,6 +323,9 @@
;;; 4) Construct a 20-bit unsigned integer U', taking the 10 low-order ;;; 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
;;; bits of W1 as its 10 high-order bits and the 10 low-order bits of ;;; bits of W1 as its 10 high-order bits and the 10 low-order bits of
;;; W2 as its 10 low-order bits. ;;; W2 as its 10 low-order bits.
;;; 5) Add 0x10000 to U' to obtain the character value U.
;;; Terminate.
(module (string->utf16) (module (string->utf16)
@ -342,7 +355,7 @@
(fxlogor (fxsll #b110110 10) (fxsra u^ 10)) (fxlogor (fxsll #b110110 10) (fxsra u^ 10))
endianness) endianness)
(bytevector-u16-set! bv (fx+ j 2) (bytevector-u16-set! bv (fx+ j 2)
(fxlogor (fxsll #b110111 10) (fxlogand u^ #xFFFF)) (fxlogor (fxsll #b110111 10) (fxlogand u^ #x3FF))
endianness)) endianness))
(bvfill str bv (fx+ i 1) (fx+ j 4) len endianness)]))])) (bvfill str bv (fx+ i 1) (fx+ j 4) len endianness)]))]))
(let ([len ($string-length str)]) (let ([len ($string-length str)])
@ -376,7 +389,7 @@
(cond (cond
[(or (fx< w1 #xD800) (fx> w1 #xDFFF)) [(or (fx< w1 #xD800) (fx> w1 #xDFFF))
(count-size bv endianness (+ i 2) len (+ n 1))] (count-size bv endianness (+ i 2) len (+ n 1))]
[(not (fx<= #xD800 w1 #xDFFF)) ;;; error sequence [(not (fx<= #xD800 w1 #xDBFF)) ;;; error sequence
(count-size bv endianness (+ i 2) len (+ n 1))] (count-size bv endianness (+ i 2) len (+ n 1))]
[(<= (+ i 4) (bytevector-length bv)) [(<= (+ i 4) (bytevector-length bv))
(let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)]) (let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)])
@ -401,9 +414,9 @@
(let ([w1 (bytevector-u16-ref bv i endianness)]) (let ([w1 (bytevector-u16-ref bv i endianness)])
(cond (cond
[(or (fx< w1 #xD800) (fx> w1 #xDFFF)) [(or (fx< w1 #xD800) (fx> w1 #xDFFF))
(string-set! str n (integer->char w1)) (string-set! str n (integer->char/invalid w1))
(fill bv endianness str (+ i 2) len (+ n 1))] (fill bv endianness str (+ i 2) len (+ n 1))]
[(not (fx<= #xD800 w1 #xDFFF)) ;;; error sequence [(not (fx<= #xD800 w1 #xDBFF)) ;;; error sequence
(string-set! str n #\xFFFD) (string-set! str n #\xFFFD)
(fill bv endianness str (+ i 2) len (+ n 1))] (fill bv endianness str (+ i 2) len (+ n 1))]
[(<= (+ i 4) (bytevector-length bv)) [(<= (+ i 4) (bytevector-length bv))
@ -416,14 +429,15 @@
(fill bv endianness str (+ i 2) len (+ n 1))] (fill bv endianness str (+ i 2) len (+ n 1))]
[else [else
(string-set! str n (string-set! str n
(integer->char (integer->char/invalid
(fxlogor (fxsll (fxlogand w1 #x3FF) 10) (+ #x10000
(fxlogand w2 #x3FF)))) (fxlogor (fxsll (fxlogand w1 #x3FF) 10)
(count-size bv endianness (+ i 4) len (+ n 1))]))] (fxlogand w2 #x3FF)))))
(fill bv endianness str (+ i 4) len (+ n 1))]))]
[else [else
;;; error again ;;; error again
(string-set! str n #\xFFFD) (string-set! str n #\xFFFD)
(count-size bv endianness str (+ i 2) len (+ n 1))]))])) (fill bv endianness str (+ i 2) len (+ n 1))]))]))
(define (decode bv endianness start) (define (decode bv endianness start)
(let ([len (fxand (bytevector-length bv) -2)]) (let ([len (fxand (bytevector-length bv) -2)])
(let ([n (count-size bv endianness start len 0)]) (let ([n (count-size bv endianness start len 0)])
@ -495,8 +509,8 @@
str] str]
[else [else
(string-set! str n (string-set! str n
(let ([w (bytevector-u32-ref bv i endianness)]) (integer->char/invalid
(integer->char w))) (bytevector-u32-ref bv i endianness)))
(fill bv endianness str (fx+ i 4) j (fx+ n 1))])) (fill bv endianness str (fx+ i 4) j (fx+ n 1))]))
(define (decode bv endianness start) (define (decode bv endianness start)
(let ([bvlen (bytevector-length bv)]) (let ([bvlen (bytevector-length bv)])
@ -508,8 +522,8 @@
(and (fx>= (bytevector-length bv) 4) (and (fx>= (bytevector-length bv) 4)
(let ([n (bytevector-u16-ref bv 0 'big)]) (let ([n (bytevector-u16-ref bv 0 'big)])
(cond (cond
[(fx= n #x0000FEFF) 'big] [(= n #x0000FEFF) 'big]
[(fx= n #xFFFE0000) 'little] [(= n #xFFFE0000) 'little]
[else #f])))) [else #f]))))
(unless (bytevector? bv) (unless (bytevector? bv)
(error who "not a bytevector" bv)) (error who "not a bytevector" bv))

View File

@ -1 +1 @@
1177 1178

View File

@ -14,6 +14,7 @@
;;; You should have received a copy of the GNU General Public License ;;; You should have received a copy of the GNU General Public License
;;; along with this program. If not, see <http://www.gnu.org/licenses/>. ;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
;;; vim:syntax=scheme
(import (only (ikarus) import)) (import (only (ikarus) import))
(import (except (ikarus) assembler-output)) (import (except (ikarus) assembler-output))
(import (ikarus compiler)) (import (ikarus compiler))
@ -935,8 +936,8 @@
[u8-list->bytevector i r bv] [u8-list->bytevector i r bv]
[uint-list->bytevector i r bv] [uint-list->bytevector i r bv]
[utf8->string i r bv] [utf8->string i r bv]
[utf16->string r bv] [utf16->string i r bv]
[utf32->string r bv] [utf32->string i r bv]
[print-condition i] [print-condition i]
[condition? i r co] [condition? i r co]
[&assertion i r co] [&assertion i r co]