;;; Ikarus Scheme -- A compiler for R6RS Scheme.
;;; Copyright (C) 2006,2007,2008  Abdulaziz Ghuloum
;;; 
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License version 3 as
;;; published by the Free Software Foundation.
;;; 
;;; This program is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; General Public License for more details.
;;; 
;;; You should have received a copy of the GNU General Public License
;;; along with this program.  If not, see <http://www.gnu.org/licenses/>.


(library (ikarus transcoders)
  (export string->utf8 utf8->string string->utf16 string->utf32
          utf16->string utf32->string)
  (import (except (ikarus) string->utf8 utf8->string string->utf16
                  utf16->string string->utf32 utf32->string)
          (ikarus system $strings)
          (ikarus system $bytevectors)
          (ikarus system $fx)
          (ikarus system $chars)) 

  ;;; From http://en.wikipedia.org/wiki/UTF-8
  ;;; hexadecimal      binary scalar value            UTF8
  ;;; 000000-00007F    00000000_00000000_0zzzzzzz     0zzzzzzz
  ;;; 000080-0007FF    00000000_00000yyy_yyzzzzzz     110yyyyy 10zzzzzz
  ;;; 000800-00FFFF    00000000_xxxxyyyy_yyzzzzzz     1110xxxx 10yyyyyy 10zzzzzz
  ;;; 010000-10FFFF    000wwwxx_xxxxyyyy_yyzzzzzz     11110www 10xxxxxx 10yyyyyy 10zzzzzz

  ;;; valid ranges:  [000000 - 00D7FF] \union [00E000 - 10FFFF]
  ;;; invalid hole:  [00D800 - 00DFFF]

  ;;; handling-modes: ignore, replace, raise
  ;;; ignore: skips over the offending bytes
  ;;; replace: places a U+FFFD in place of the malformed bytes
  ;;; raise: raises an die

  ;;; It appears that utf-8 data can start with a #xEF #xBB #xBF BOM!

  (define integer->char/invalid
    (lambda (n)
      (cond
        [(not (fixnum? n))  #\xFFFD]
        [($fx<= n #xD7FF)   ($fixnum->char n)]
        [($fx< n #xE000)    #\xFFFD]
        [($fx<= n #x10FFFF) ($fixnum->char n)]
        [else               #\xFFFD])))

  (define string->utf8
    (lambda (str)
      (define (utf8-string-size str)
        (let f ([str str] [i 0] [j ($string-length str)] [n 0])
          (cond
            [($fx= i j) n]
            [else
             (let ([c ($string-ref str i)])
               (let ([b ($char->fixnum c)])
                 (f str ($fxadd1 i) j
                    ($fx+ n
                      (cond
                        [($fx<= b #x7F)    1]
                        [($fx<= b #x7FF)   2]
                        [($fx<= b #xFFFF)  3]
                        [else              4])))))])))
      (define (fill-utf8-bytevector bv str)
        (let f ([bv bv] [str str] [i 0] [j 0] [n ($string-length str)])
          (cond
            [($fx= i n) bv]
            [else
             (let ([c ($string-ref str i)])
               (let ([b ($char->fixnum c)])
                 (cond
                  [($fx<= b #x7F)
                   ($bytevector-set! bv j b)
                   (f bv str ($fxadd1 i) ($fxadd1 j) n)]
                  [($fx<= b #x7FF)  
                   ($bytevector-set! bv j
                     ($fxlogor #b11000000 ($fxsra b 6)))
                   ($bytevector-set! bv ($fx+ j 1)
                     ($fxlogor #b10000000 ($fxlogand b #b111111)))
                   (f bv str ($fxadd1 i) ($fx+ j 2) n)]
                  [($fx<= b #xFFFF) 
                   ($bytevector-set! bv j
                     ($fxlogor #b11100000 ($fxsra b 12)))
                   ($bytevector-set! bv ($fx+ j 1)
                     ($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
                   ($bytevector-set! bv ($fx+ j 2)
                     ($fxlogor #b10000000 ($fxlogand b #b111111)))
                   (f bv str ($fxadd1 i) ($fx+ j 3) n)]
                  [else
                   ($bytevector-set! bv j
                     ($fxlogor #b11110000 ($fxsra b 18)))
                   ($bytevector-set! bv ($fx+ j 1)
                     ($fxlogor #b10000000 ($fxlogand ($fxsra b 12) #b111111)))
                   ($bytevector-set! bv ($fx+ j 2)
                     ($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
                   ($bytevector-set! bv ($fx+ j 3)
                     ($fxlogor #b10000000 ($fxlogand b #b111111)))
                   (f bv str ($fxadd1 i) ($fx+ j 4) n)])))])))
      (unless (string? str) 
        (die 'string->utf8 "not a string" str))
      (fill-utf8-bytevector
        ($make-bytevector (utf8-string-size str))
        str)))

  (define (utf8->string x) 
    (unless (bytevector? x) 
      (die 'utf8->string "not a bytevector" x))
    (decode-utf8-bytevector x 'replace))

  (define decode-utf8-bytevector
    (let ()
      (define who 'decode-utf8-bytevector)
      (define (count bv i mode)
        (let f ([x bv] [i i] [j ($bytevector-length bv)] [n 0] [mode mode])
          (cond
            [($fx= i j) n]
            [else
             (let ([b0 ($bytevector-u8-ref x i)])
               (cond
                 [($fx<= b0 #x7F) 
                  (f x ($fxadd1 i) j ($fxadd1 n) mode)]
                 [($fx= ($fxsra b0 5) #b110)
                  (let ([i ($fxadd1 i)])
                    (cond
                      [($fx< i j) 
                       (let ([b1 ($bytevector-u8-ref x i)])
                         (cond
                           [(and ($fx= ($fxsra b1 6) #b10)
                                 ;;; 000080-0007FF
                                 (let ([n (fxlogor (fxsll (fxlogand b0 #x1F) 6)
                                                   (fxlogand b1 #x3F))])
                                   (and (fx>= n #x80) 
                                        (fx<= n #x7FF))))
                            (f x ($fxadd1 i) j ($fxadd1 n) mode)]
                           [(eq? mode 'ignore) 
                            (f x i j n mode)]
                           [(eq? mode 'replace) 
                            (f x i j ($fxadd1 n) mode)]
                           [else
                            (die who "invalid byte sequence at idx of bytevector"
                                   b0 b1 i bv)]))]
                      [(eq? mode 'ignore) n]
                      [(eq? mode 'replace) ($fxadd1 n)]
                      [else
                       (die who "invalid byte near end of bytevector" b0)]))]
                 [($fx= ($fxsra b0 4) #b1110)
                  (cond
                    [($fx< ($fx+ i 2) j) 
                     (let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
                           [b2 ($bytevector-u8-ref x ($fx+ i 2))])
                       (cond
                         [(and ($fx= ($fxsra ($fxlogor b1 b2) 6) #b10)
                            (let ([n (fx+ (fxsll (fxlogand b0 #xF) 12)
                                       (fx+ (fxsll (fxlogand b1 #x3F) 6)
                                            (fxlogand b2 #x3F)))])
                              ;;; REVIEW LATER ; 000800-00FFFF
                               (and (fx>= n #x0000800) (fx<= n #x00FFFF))))
                          (f x ($fx+ i 3) j ($fxadd1 n) mode)]
                         [(eq? mode 'ignore) 
                          (f x ($fxadd1 i) j n mode)]
                         [(eq? mode 'replace)
                          (f x ($fxadd1 i) j ($fxadd1 n) mode)]
                         [else (die who "invalid sequence" b0 b1 b2)]))]
                    [(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
                    [(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
                    [else (die who "incomplete char sequence")])]
                 [($fx= ($fxsra b0 3) #b11110)
                  (cond
                    [($fx< ($fx+ i 3) j) 
                     (let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
                           [b2 ($bytevector-u8-ref x ($fx+ i 2))]
                           [b3 ($bytevector-u8-ref x ($fx+ i 3))])
                       (cond
                         [(and ($fx= ($fxsra ($fxlogor b1 ($fxlogor b2 b3)) 6) #b10)
                            (let ([n 
                                   ($fx+ ($fxlogand b3 #b111111)
                                    ($fx+ ($fxsll ($fxlogand b2 #b111111) 6)
                                     ($fx+ ($fxsll ($fxlogand b1 #b111111) 12)
                                      ($fxsll ($fxlogand b0 #b111) 18))))])
                              ;;; 010000-10FFFF  
                              (and (fx>= n #x10000) (fx<= n #x10FFFF))))
                          (f x ($fx+ i 4) j ($fxadd1 n) mode)]
                         [(eq? mode 'ignore) 
                          (f x ($fxadd1 i) j n mode)]
                         [(eq? mode 'replace)
                          (f x ($fxadd1 i) j ($fxadd1 n) mode)]
                         [else (die who "invalid sequence" b0 b1 b2 b3)]))]
                    [(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
                    [(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
                    [else (die who "incomplete char sequence")])]
                 [(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
                 [(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
                 [else (die who "invalid byte at index of bytevector" b0 i x)]))])))
      (define (fill str bv i mode)
        (let f ([str str] [x bv] [i i] [j ($bytevector-length bv)] [n 0] [mode mode])
          (cond
            [($fx= i j) str]
            [else
             (let ([b0 ($bytevector-u8-ref x i)])
               (cond
                 [($fx<= b0 #x7F) 
                  ($string-set! str n ($fixnum->char b0))
                  (f str x ($fxadd1 i) j ($fxadd1 n) mode)]
                 [($fx= ($fxsra b0 5) #b110)
                  (let ([i ($fxadd1 i)])
                    (cond
                      [($fx< i j) 
                       (let ([b1 ($bytevector-u8-ref x i)])
                         (cond
                           [(and ($fx= ($fxsra b1 6) #b10)
                              ;;; 000080-0007FF
                              (let ([n (fxlogor (fxsll (fxlogand b0 #x1F) 6)
                                                (fxlogand b1 #x3F))])
                                (and (fx>= n #x80) 
                                     (fx<= n #x7FF)
                                     ($fixnum->char n)))) =>
                            (lambda (c) 
                              ($string-set! str n c)
                              (f str x ($fxadd1 i) j ($fxadd1 n) mode))]
                           [(eq? mode 'ignore) 
                            (f str x i j n mode)]
                           [(eq? mode 'replace)
                            ($string-set! str n ($fixnum->char #xFFFD))
                            (f str x i j ($fxadd1 n) mode)]
                           [else (die who "BUG")]))]
                      [(eq? mode 'ignore) str]
                      [(eq? mode 'replace) 
                       ($string-set! str n ($fixnum->char #xFFFD))
                       str]
                      [else (die who "BUG")]))]
                 [($fx= ($fxsra b0 4) #b1110)
                  (cond
                    [($fx< ($fx+ i 2) j) 
                     (let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
                           [b2 ($bytevector-u8-ref x ($fx+ i 2))])
                       (cond
                         [(and ($fx= ($fxsra ($fxlogor b1 b2) 6) #b10)
                            (let ([n (fx+ (fxsll (fxlogand b0 #xF) 12)
                                       (fx+ (fxsll (fxlogand b1 #x3F) 6)
                                            (fxlogand b2 #x3F)))])
                              ;;; REVIEW LATER ; 000800-00FFFF
                              (and (and (fx>= n #x000800) (fx<= n #x00FFFF))
                                   ($fixnum->char n)))) =>
                          (lambda (c)
                            ($string-set! str n c)
                            (f str x ($fx+ i 3) j ($fxadd1 n) mode))]
                         [(eq? mode 'ignore) 
                          (f str x ($fxadd1 i) j n mode)]
                         [(eq? mode 'replace)
                          ($string-set! str n ($fixnum->char #xFFFD))
                          (f str x ($fxadd1 i) j ($fxadd1 n) mode)]
                         [else (die who "BUG")]))]
                    [(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
                    [(eq? mode 'replace) 
                     ($string-set! str n ($fixnum->char #xFFFD))
                     (f str x ($fxadd1 i) j ($fxadd1 n) mode)]
                    [else (die who "BUG")])]
                 [($fx= ($fxsra b0 3) #b11110)
                  (cond
                    [($fx< ($fx+ i 3) j) 
                     (let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
                           [b2 ($bytevector-u8-ref x ($fx+ i 2))]
                           [b3 ($bytevector-u8-ref x ($fx+ i 3))])
                       (cond
                         [(and ($fx= ($fxsra ($fxlogor b1 ($fxlogor b2 b3)) 6) #b10)
                            (let ([n 
                                   ($fx+ ($fxlogand b3 #b111111)
                                    ($fx+ ($fxsll ($fxlogand b2 #b111111) 6)
                                     ($fx+ ($fxsll ($fxlogand b1 #b111111) 12)
                                      ($fxsll ($fxlogand b0 #b111) 18))))])
                              ;;; 010000-10FFFF  
                              (and (fx>= n #x10000) 
                                   (fx<= n #x10FFFF)
                                   ($fixnum->char n)))) =>
                          (lambda (c)
                            ($string-set! str n c)
                            (f str x ($fx+ i 4) j ($fxadd1 n) mode))]
                         [(eq? mode 'ignore)
                          (f str x ($fxadd1 i) j n mode)]
                         [(eq? mode 'replace)
                          ($string-set! str n ($fixnum->char #xFFFD))
                          (f str x ($fxadd1 i) j ($fxadd1 n) mode)]
                         [else (die who "BUG")]))]
                    [(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
                    [(eq? mode 'replace) 
                     ($string-set! str n ($fixnum->char #xFFFD))
                     (f str x ($fxadd1 i) j ($fxadd1 n) mode)]
                    [else (die who "BUG")])]
                 [(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
                 [(eq? mode 'replace)
                  ($string-set! str n ($fixnum->char #xFFFD))
                  (f str x ($fxadd1 i) j ($fxadd1 n) mode)]
                 [else (die who "BUG")]))])))
      (define (has-bom? bv) 
        (and (fx> (bytevector-length bv) 3) 
             (fx= (bytevector-u8-ref bv 0) #xEF)
             (fx= (bytevector-u8-ref bv 1) #xBB)
             (fx= (bytevector-u8-ref bv 2) #xBF)))
      (define (convert bv mode)
        (cond
          [(has-bom? bv) 
           (fill ($make-string (count bv 3 mode)) bv 3 mode)]
          [else
           (fill ($make-string (count bv 0 mode)) bv 0 mode)]))
      (case-lambda
        [(bv) (convert bv 'raise)]
        [(bv handling-mode)
         (unless (memq handling-mode '(ignore replace raise)) 
           (die 'decode-utf8-bytevector
                  "not a valid handling mode"
                  handling-mode))
         (convert bv handling-mode)])))


;;; From: http://tools.ietf.org/html/rfc2781
;;;
;;; 2.1 Encoding UTF-16
;;;
;;;   Encoding of a single character from an ISO 10646 character value
;;;   to UTF-16 proceeds as follows. Let U be the character number, no
;;;   greater than 0x10FFFF.
;;;
;;;   1) If U < 0x10000, encode U as a 16-bit unsigned integer and terminate.
;;;
;;;   2) Let U' = U - 0x10000. Because U is less than or equal to 0x10FFFF,
;;;      U' must be less than or equal to 0xFFFFF. That is, U' can be
;;;      represented in 20 bits.
;;;
;;;   3) Initialize two 16-bit unsigned integers, W1 and W2, to 0xD800 and
;;;      0xDC00, respectively. These integers each have 10 bits free to
;;;      encode the character value, for a total of 20 bits.
;;;
;;;   4) Assign the 10 high-order bits of the 20-bit U' to the 10 low-order
;;;      bits of W1 and the 10 low-order bits of U' to the 10 low-order
;;;      bits of W2. Terminate.
;;;
;;;   Graphically, steps 2 through 4 look like:
;;;   U' = yyyyyyyyyyxxxxxxxxxx
;;;   W1 = 110110yyyyyyyyyy
;;;   W2 = 110111xxxxxxxxxx
;;;
;;;   Decoding of a single character from UTF-16 to an ISO 10646 character
;;;   value proceeds as follows. Let W1 be the next 16-bit integer in the
;;;   sequence of integers representing the text. Let W2 be the (eventual)
;;;   next integer following W1.
;;;
;;;   1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
;;;      of W1. Terminate.
;;;
;;;   2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
;;;      is in die and no valid character can be obtained using W1.
;;;      Terminate.
;;;
;;;   3) If there is no W2 (that is, the sequence ends with W1), or if W2
;;;      is not between 0xDC00 and 0xDFFF, the sequence is in error.
;;;      Terminate.
;;;
;;;   4) Construct a 20-bit unsigned integer U', taking the 10 low-order
;;;      bits of W1 as its 10 high-order bits and the 10 low-order bits of
;;;      W2 as its 10 low-order bits.
;;;   5) Add 0x10000 to U' to obtain the character value U.
;;;      Terminate.



  (module (string->utf16)
    (define ($string->utf16 str endianness)
      (define (count-surr* str len i n) 
        (cond
          [(fx= i len) n]
          [else
           (let ([c (string-ref str i)])
             (cond
               [(char<? c #\x10000) 
                (count-surr* str len (fx+ i 1) n)]
               [else 
                (count-surr* str len (fx+ i 1) (fx+ n 1))]))]))
      (define (bvfill str bv i j len endianness)
        (cond
          [(fx= i len) bv]
          [else
           (let ([n (char->integer (string-ref str i))])
             (cond
               [(fx< n #x10000)
                (bytevector-u16-set! bv j n endianness)
                (bvfill str bv (fx+ i 1) (fx+ j 2) len endianness)]
               [else
                (let ([u^ (fx- n #x10000)])
                  (bytevector-u16-set! bv j 
                    (fxlogor (fxsll #b110110 10) (fxsra u^ 10))
                    endianness)
                  (bytevector-u16-set! bv (fx+ j 2) 
                    (fxlogor (fxsll #b110111 10) (fxlogand u^ #x3FF))
                    endianness))
                (bvfill str bv (fx+ i 1) (fx+ j 4) len endianness)]))]))
      (let ([len ($string-length str)])
        (let ([n (count-surr* str len 0 0)])
          ;;; FIXME: maybe special case for n=0 later
          (let ([bv (make-bytevector (fxsll (fx+ len n) 1))])
             (bvfill str bv 0 0 len endianness)))))
    (define string->utf16
      (case-lambda
        [(str) 
         (unless (string? str)
           (die 'string->utf16 "not a string" str))
         ($string->utf16 str 'big)]
        [(str endianness)
         (unless (string? str)
           (die 'string->utf16 "not a string" str))
         (unless (memv endianness '(big little))
           (die 'string->utf16 "invalid endianness" endianness))
         ($string->utf16 str endianness)])))

  (module (utf16->string)
    (define who 'utf16->string)
    (define (count-size bv endianness i len n)
      (cond
        [(fx= i len) 
         (if (fx= len (bytevector-length bv)) 
             n 
             (+ n 1))]
        [else 
         (let ([w1 (bytevector-u16-ref bv i endianness)])
           (cond
             [(or (fx< w1 #xD800) (fx> w1 #xDFFF))
              (count-size bv endianness (+ i 2) len (+ n 1))]
             [(not (fx<= #xD800 w1 #xDBFF)) ;;; die sequence
              (count-size bv endianness (+ i 2) len (+ n 1))]
             [(<= (+ i 4) (bytevector-length bv))
              (let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)])
                (cond
                  [(not (<= #xDC00 w2 #xDFFF)) 
                   ;;; do we skip w2 also?
                   ;;; I won't.  Just w1 is an die
                   (count-size bv endianness (+ i 2) len (+ n 1))]
                  [else 
                   ;;; 4-byte sequence is ok
                   (count-size bv endianness (+ i 4) len (+ n 1))]))]
             [else 
              ;;; die again
              (count-size bv endianness (+ i 2) len (+ n 1))]))]))
    (define (fill bv endianness str i len n)
      (cond
        [(fx= i len)
         (unless (fx= len (bytevector-length bv)) 
           (string-set! str n #\xFFFD))
         str]
        [else 
         (let ([w1 (bytevector-u16-ref bv i endianness)])
           (cond
             [(or (fx< w1 #xD800) (fx> w1 #xDFFF))
              (string-set! str n (integer->char/invalid w1))
              (fill bv endianness str (+ i 2) len (+ n 1))]
             [(not (fx<= #xD800 w1 #xDBFF)) ;;; die sequence
              (string-set! str n #\xFFFD)
              (fill bv endianness str (+ i 2) len (+ n 1))]
             [(<= (+ i 4) (bytevector-length bv))
              (let ([w2 (bytevector-u16-ref bv (+ i 2) endianness)])
                (cond
                  [(not (<= #xDC00 w2 #xDFFF)) 
                   ;;; do we skip w2 also?
                   ;;; I won't.  Just w1 is an die
                   (string-set! str n #\xFFFD)
                   (fill bv endianness str (+ i 2) len (+ n 1))]
                  [else 
                   (string-set! str n 
                     (integer->char/invalid
                       (+ #x10000
                         (fxlogor (fxsll (fxlogand w1 #x3FF) 10)
                                  (fxlogand w2 #x3FF)))))
                   (fill bv endianness str (+ i 4) len (+ n 1))]))]
             [else 
              ;;; die again
              (string-set! str n #\xFFFD)
              (fill bv endianness str (+ i 2) len (+ n 1))]))]))
    (define (decode bv endianness start)
      (let ([len (fxand (bytevector-length bv) -2)])
        (let ([n (count-size bv endianness start len 0)])
          (let ([str (make-string n)])
            (fill bv endianness str start len 0)))))
    (define ($utf16->string bv endianness em?) 
      (define (bom-present bv)
        (and (fx>= (bytevector-length bv) 2)
             (let ([n (bytevector-u16-ref bv 0 'big)])
               (cond
                 [(fx= n #xFEFF) 'big]
                 [(fx= n #xFFFE) 'little]
                 [else #f]))))
      (unless (bytevector? bv)
        (die who "not a bytevector" bv))
      (unless (memv endianness '(big little))
        (die who "invalid endianness" endianness))
      (cond
        [em?  (decode bv endianness 0)]
        [(bom-present bv) =>
         (lambda (endianness)
           (decode bv endianness 2))]
        [else 
         (decode bv endianness 0)]))
    (define utf16->string
      (case-lambda
        [(bv endianness) 
         ($utf16->string bv endianness #f)]
        [(bv endianness em?)
         ($utf16->string bv endianness em?)])))

         

  (module (string->utf32)
    (define who 'string->utf32)
    (define (vfill str bv i len endianness) 
      (cond
        [(fx= i len) bv]
        [else
         (bytevector-u32-set! bv (fxsll i 2) 
            (char->integer (string-ref str i))              
            endianness)
         (vfill str bv (fx+ i 1) len endianness)]))
    (define ($string->utf32 str endianness)
      (let ([len (string-length str)])
        (vfill str (make-bytevector (fxsll len 2)) 0 len endianness)))
    (define string->utf32
      (case-lambda
        [(str) 
         (unless (string? str)
           (die who "not a string" str))
         ($string->utf32 str 'big)]
        [(str endianness)
         (unless (string? str)
           (die who "not a string" str))
         (unless (memq endianness '(little big))
           (die who "invalid endianness" endianness))
         ($string->utf32 str endianness)])))



  (module (utf32->string)
    (define who 'utf32->string)
    (define (fill bv endianness str i j n) 
      (cond
        [(fx= i j) 
         (unless (fx= n (string-length str)) 
           (string-set! str n #\xFFFD))
         str]
        [else 
         (string-set! str n 
           (integer->char/invalid 
             (bytevector-u32-ref bv i endianness)))
         (fill bv endianness str (fx+ i 4) j (fx+ n 1))]))
    (define (decode bv endianness start)
      (let ([bvlen (bytevector-length bv)])
        (let ([strlen (fxsra (fx+ (fx- bvlen start) 3) 2)])
          (fill bv endianness (make-string strlen) 
                start (fxand bvlen -4)
                0))))
    (define ($utf32->string bv endianness em?) 
      (define (bom-present bv)
        (and (fx>= (bytevector-length bv) 4)
             (let ([n (bytevector-u32-ref bv 0 'big)])
               (cond
                 [(= n #x0000FEFF) 'big]
                 [(= n #xFFFE0000) 'little]
                 [else #f]))))
      (unless (bytevector? bv)
        (die who "not a bytevector" bv))
      (unless (memv endianness '(big little))
        (die who "invalid endianness" endianness))
      (cond
        [em? (decode bv endianness 0)]
        [(bom-present bv) =>
         (lambda (endianness)
           (decode bv endianness 4))]
        [else 
         (decode bv endianness 0)]))
    (define utf32->string
      (case-lambda
        [(bv endianness) 
         ($utf32->string bv endianness #f)]
        [(bv endianness em?)
         ($utf32->string bv endianness em?)])))

  )