95 lines
3.5 KiB
Scheme
95 lines
3.5 KiB
Scheme
|
|
||
|
(library (ikarus transcoders)
|
||
|
(export string->utf8-bytevector
|
||
|
#;utf8-bytevector->string)
|
||
|
(import (except (ikarus) string->utf8-bytevector)
|
||
|
(ikarus system $strings)
|
||
|
(ikarus system $bytevectors)
|
||
|
(ikarus system $fx)
|
||
|
(ikarus system $chars)
|
||
|
)
|
||
|
|
||
|
;;; From http://en.wikipedia.org/wiki/UTF-8
|
||
|
;;; hexadecimal binary scalar value UTF8
|
||
|
;;; 000000-00007F 00000000_00000000_0zzzzzzz 0zzzzzzz
|
||
|
;;; 000080-0007FF 00000000_00000yyy_yyzzzzzz 110yyyyy 10zzzzzz
|
||
|
;;; 000800-00FFFF 00000000_xxxxyyyy_yyzzzzzz 1110xxxx 10yyyyyy 10zzzzzz
|
||
|
;;; 010000-10FFFF 000wwwxx_xxxxyyyy_yyzzzzzz 11110www 10xxxxxx 10yyyyyy 10zzzzzz
|
||
|
|
||
|
;;; valid ranges: [000000 - 00D7FF] \union [00E000 - 10FFFF]
|
||
|
;;; invalid hole: [00D800 - 00DFFF]
|
||
|
|
||
|
;;; handling-modes: ignore, replace, raise
|
||
|
;;; ignore: skips over the offending bytes
|
||
|
;;; replace: places a U+FFFD in place of the malformed bytes
|
||
|
;;; raise: raises an error
|
||
|
|
||
|
(define (utf8-string-size str)
|
||
|
(let f ([str str] [i 0] [j ($string-length str)] [n 0])
|
||
|
(cond
|
||
|
[($fx= i j) n]
|
||
|
[else
|
||
|
(let ([c ($string-ref str i)])
|
||
|
(let ([b ($char->fixnum c)])
|
||
|
(f str ($fxadd1 i) j
|
||
|
($fx+ n
|
||
|
(cond
|
||
|
[($fx<= b #x7F) 1]
|
||
|
[($fx<= b #x7FF) 2]
|
||
|
[($fx<= b #xFFFF) 3]
|
||
|
[else 4])))))])))
|
||
|
|
||
|
(define (fill-utf8-bytevector bv str)
|
||
|
(let f ([bv bv] [str str] [i 0] [j 0] [n ($string-length str)])
|
||
|
(cond
|
||
|
[($fx= i n) bv]
|
||
|
[else
|
||
|
(let ([c ($string-ref str i)])
|
||
|
(let ([b ($char->fixnum c)])
|
||
|
(cond
|
||
|
[($fx<= b #x7F)
|
||
|
($bytevector-set! bv j b)
|
||
|
(f bv str ($fxadd1 i) ($fxadd1 j) n)]
|
||
|
[($fx<= b #x7FF)
|
||
|
($bytevector-set! bv j
|
||
|
($fxlogor #b11000000 ($fxsra b 6)))
|
||
|
($bytevector-set! bv ($fx+ j 1)
|
||
|
($fxlogor #b10000000 ($fxlogand b #b111111)))
|
||
|
(f bv str ($fxadd1 i) ($fx+ j 2) n)]
|
||
|
[($fx<= b #xFFFF)
|
||
|
($bytevector-set! bv j
|
||
|
($fxlogor #b11100000 ($fxsra b 12)))
|
||
|
($bytevector-set! bv ($fx+ j 1)
|
||
|
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
|
||
|
($bytevector-set! bv ($fx+ j 2)
|
||
|
($fxlogor #b10000000 ($fxlogand b #b111111)))
|
||
|
(f bv str ($fxadd1 i) ($fx+ j 3) n)]
|
||
|
[else
|
||
|
($bytevector-set! bv j
|
||
|
($fxlogor #b11110000 ($fxsra b 18)))
|
||
|
($bytevector-set! bv ($fx+ j 1)
|
||
|
($fxlogor #b10000000 ($fxlogand ($fxsra b 12) #b111111)))
|
||
|
($bytevector-set! bv ($fx+ j 2)
|
||
|
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
|
||
|
($bytevector-set! bv ($fx+ j 3)
|
||
|
($fxlogor #b10000000 ($fxlogand b #b111111)))
|
||
|
(f bv str ($fxadd1 i) ($fx+ j 4) n)])))])))
|
||
|
|
||
|
|
||
|
(define string->utf8-bytevector
|
||
|
(lambda (str)
|
||
|
(unless (string? str)
|
||
|
(error 'string->utf8-bytevector "~s is not a string" str))
|
||
|
(fill-utf8-bytevector
|
||
|
($make-bytevector (utf8-string-size str))
|
||
|
str)))
|
||
|
|
||
|
|
||
|
)
|
||
|
|
||
|
|
||
|
#!eof
|
||
|
(unless (memq handling-mode '(ignore replace raise))
|
||
|
(error 'string->utf8-bytevector "~s is not a valid hanlding-mode, shound be one of ignore, replace, or raise" handling-mode))
|
||
|
|