ikarus/scheme/ikarus.unicode-conversion.ss

265 lines
12 KiB
Scheme

;;; Ikarus Scheme -- A compiler for R6RS Scheme.
;;; Copyright (C) 2006,2007 Abdulaziz Ghuloum
;;;
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License version 3 as
;;; published by the Free Software Foundation.
;;;
;;; This program is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program. If not, see <http://www.gnu.org/licenses/>.
(library (ikarus transcoders)
(export string->utf8 utf8->string)
(import (except (ikarus) string->utf8 utf8->string)
(ikarus system $strings)
(ikarus system $bytevectors)
(ikarus system $fx)
(ikarus system $chars))
;;; From http://en.wikipedia.org/wiki/UTF-8
;;; hexadecimal binary scalar value UTF8
;;; 000000-00007F 00000000_00000000_0zzzzzzz 0zzzzzzz
;;; 000080-0007FF 00000000_00000yyy_yyzzzzzz 110yyyyy 10zzzzzz
;;; 000800-00FFFF 00000000_xxxxyyyy_yyzzzzzz 1110xxxx 10yyyyyy 10zzzzzz
;;; 010000-10FFFF 000wwwxx_xxxxyyyy_yyzzzzzz 11110www 10xxxxxx 10yyyyyy 10zzzzzz
;;; valid ranges: [000000 - 00D7FF] \union [00E000 - 10FFFF]
;;; invalid hole: [00D800 - 00DFFF]
;;; handling-modes: ignore, replace, raise
;;; ignore: skips over the offending bytes
;;; replace: places a U+FFFD in place of the malformed bytes
;;; raise: raises an error
(define string->utf8
(lambda (str)
(define (utf8-string-size str)
(let f ([str str] [i 0] [j ($string-length str)] [n 0])
(cond
[($fx= i j) n]
[else
(let ([c ($string-ref str i)])
(let ([b ($char->fixnum c)])
(f str ($fxadd1 i) j
($fx+ n
(cond
[($fx<= b #x7F) 1]
[($fx<= b #x7FF) 2]
[($fx<= b #xFFFF) 3]
[else 4])))))])))
(define (fill-utf8-bytevector bv str)
(let f ([bv bv] [str str] [i 0] [j 0] [n ($string-length str)])
(cond
[($fx= i n) bv]
[else
(let ([c ($string-ref str i)])
(let ([b ($char->fixnum c)])
(cond
[($fx<= b #x7F)
($bytevector-set! bv j b)
(f bv str ($fxadd1 i) ($fxadd1 j) n)]
[($fx<= b #x7FF)
($bytevector-set! bv j
($fxlogor #b11000000 ($fxsra b 6)))
($bytevector-set! bv ($fx+ j 1)
($fxlogor #b10000000 ($fxlogand b #b111111)))
(f bv str ($fxadd1 i) ($fx+ j 2) n)]
[($fx<= b #xFFFF)
($bytevector-set! bv j
($fxlogor #b11100000 ($fxsra b 12)))
($bytevector-set! bv ($fx+ j 1)
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
($bytevector-set! bv ($fx+ j 2)
($fxlogor #b10000000 ($fxlogand b #b111111)))
(f bv str ($fxadd1 i) ($fx+ j 3) n)]
[else
($bytevector-set! bv j
($fxlogor #b11110000 ($fxsra b 18)))
($bytevector-set! bv ($fx+ j 1)
($fxlogor #b10000000 ($fxlogand ($fxsra b 12) #b111111)))
($bytevector-set! bv ($fx+ j 2)
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
($bytevector-set! bv ($fx+ j 3)
($fxlogor #b10000000 ($fxlogand b #b111111)))
(f bv str ($fxadd1 i) ($fx+ j 4) n)])))])))
(unless (string? str)
(error 'string->utf8 "not a string" str))
(fill-utf8-bytevector
($make-bytevector (utf8-string-size str))
str)))
(define (utf8->string x) (decode-utf8-bytevector x 'replace))
(define decode-utf8-bytevector
(let ()
(define who 'decode-utf8-bytevector)
(define (count bv mode)
(let f ([x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode])
(cond
[($fx= i j) n]
[else
(let ([b0 ($bytevector-u8-ref x i)])
(cond
[($fx<= b0 #x7F)
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
[($fx= ($fxsra b0 5) #b110)
(let ([i ($fxadd1 i)])
(cond
[($fx< i j)
(let ([b1 ($bytevector-u8-ref x i)])
(cond
[($fx= ($fxsra b1 6) #b10)
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f x i j n mode)]
[(eq? mode 'replace)
(f x i j ($fxadd1 n) mode)]
[else
(error who "invalid byte sequence at idx of bytevector"
b0 b1 i bv)]))]
[(eq? mode 'ignore) n]
[(eq? mode 'replace) ($fxadd1 n)]
[else
(error who "invalid byte near end of bytevector" b0)]))]
[($fx= ($fxsra b0 4) #b1110)
(cond
[($fx< ($fx+ i 2) j)
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
[b2 ($bytevector-u8-ref x ($fx+ i 2))])
(cond
[($fx= ($fxsra ($fxlogor b1 b2) 6) #b10)
(f x ($fx+ i 3) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "invalid sequence" b0 b1 b2)]))]
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "incomplete char sequence")])]
[($fx= ($fxsra b0 3) #b11110)
(cond
[($fx< ($fx+ i 3) j)
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
[b2 ($bytevector-u8-ref x ($fx+ i 2))]
[b3 ($bytevector-u8-ref x ($fx+ i 3))])
(cond
[($fx= ($fxsra ($fxlogor b1 ($fxlogor b2 b3)) 6) #b10)
(f x ($fx+ i 4) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "invalid sequence" b0 b1 b2 b3)]))]
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "incomplete char sequence")])]
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "invalid byte at index of bytevector" b0 i x)]))])))
(define (fill str bv mode)
(let f ([str str] [x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode])
(cond
[($fx= i j) str]
[else
(let ([b0 ($bytevector-u8-ref x i)])
(cond
[($fx<= b0 #x7F)
($string-set! str n ($fixnum->char b0))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[($fx= ($fxsra b0 5) #b110)
(let ([i ($fxadd1 i)])
(cond
[($fx< i j)
(let ([b1 ($bytevector-u8-ref x i)])
(cond
[($fx= ($fxsra b1 6) #b10)
($string-set! str n
($fixnum->char
($fx+ ($fxlogand b1 #b111111)
($fxsll ($fxlogand b0 #b11111) 6))))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f str x i j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x i j ($fxadd1 n) mode)]
[else (error who "BUG")]))]
[(eq? mode 'ignore) str]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
str]
[else (error who "BUG")]))]
[($fx= ($fxsra b0 4) #b1110)
(cond
[($fx< ($fx+ i 2) j)
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
[b2 ($bytevector-u8-ref x ($fx+ i 2))])
(cond
[($fx= ($fxsra ($fxlogor b1 b2) 6) #b10)
($string-set! str n
($fixnum->char
($fx+ ($fxlogand b2 #b111111)
($fx+ ($fxsll ($fxlogand b1 #b111111) 6)
($fxsll ($fxlogand b0 #b1111) 12)))))
(f str x ($fx+ i 3) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")]))]
[(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")])]
[($fx= ($fxsra b0 3) #b11110)
(cond
[($fx< ($fx+ i 3) j)
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
[b2 ($bytevector-u8-ref x ($fx+ i 2))]
[b3 ($bytevector-u8-ref x ($fx+ i 3))])
(cond
[($fx= ($fxsra ($fxlogor b1 ($fxlogor b2 b3)) 6) #b10)
($string-set! str n
($fixnum->char
($fx+ ($fxlogand b3 #b111111)
($fx+ ($fxsll ($fxlogand b2 #b111111) 6)
($fx+ ($fxsll ($fxlogand b1 #b111111) 12)
($fxsll ($fxlogand b0 #b111) 18))))))
(f str x ($fx+ i 4) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")]))]
[(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")])]
[(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")]))])))
(define (convert bv mode)
(fill ($make-string (count bv mode)) bv mode))
(case-lambda
[(bv) (convert bv 'raise)]
[(bv handling-mode)
(unless (memq handling-mode '(ignore replace raise))
(error 'decode-utf8-bytevector
"not a valid handling mode"
handling-mode))
(convert bv handling-mode)])))
)