* Added utf8-bytevector->string decoder

This commit is contained in:
Abdulaziz Ghuloum 2007-05-18 20:18:55 -04:00
parent 10f5e53338
commit 85944a2cff
7 changed files with 244 additions and 61 deletions

Binary file not shown.

View File

@ -773,6 +773,29 @@ ikrt_strftime(ikp outstr, ikp fmtstr){
return fix(rv);
}
ikp
ikrt_bvftime(ikp outbv, ikp fmtbv){
time_t t;
struct tm* tmp;
t = time(NULL);
tmp = localtime(&t);
if(tmp == NULL){
fprintf(stderr, "Error in time: %s\n", strerror(errno));
}
int rv =
strftime((char*)outbv+off_bytevector_data,
unfix(ref(outbv, off_bytevector_length)) + 1,
(char*)fmtbv+off_bytevector_data,
tmp);
if(rv == 0){
fprintf(stderr, "Error in strftime: %s\n", strerror(errno));
}
return fix(rv);
}
ikp
ikrt_close_file(ikp fd, ikpcb* pcb){
int err = close(unfix(fd));

Binary file not shown.

View File

@ -1943,7 +1943,7 @@
$fx= $fx< $fx<= $fx> $fx>= $fxquotient $fxmodulo $fxsll $fxsra $fxlogxor $exit)
(andmap (check op fixnum?) rand*)]
[($fixnum->char)
(andmap (check op byte?) rand*)]
(andmap (check op fixnum?) rand*)]
[($char->fixnum $char= $char< $char<= $char> $char>= $string)
(andmap (check op char?) rand*)]
[($make-vector $make-string)

View File

@ -868,6 +868,8 @@
(vector-set! vec (fx+ reloc-idx 1) v)
(set! reloc-idx (fx+ reloc-idx 2))]
[(foreign-label)
;;; FIXME: converted strings should be memoized.
;;; wait for equal? hash tables.
(let ([name
(if (string? v)
(string->utf8-bytevector v)

View File

@ -1,14 +1,12 @@
(library (ikarus transcoders)
(export string->utf8-bytevector
#;utf8-bytevector->string)
(import (except (ikarus) string->utf8-bytevector)
utf8-bytevector->string)
(import (except (ikarus) string->utf8-bytevector utf8-bytevector->string)
(ikarus system $strings)
(ikarus system $bytevectors)
(ikarus system $fx)
(ikarus system $chars)
)
(ikarus system $chars))
;;; From http://en.wikipedia.org/wiki/UTF-8
;;; hexadecimal binary scalar value UTF8
;;; 000000-00007F 00000000_00000000_0zzzzzzz 0zzzzzzz
@ -24,71 +22,230 @@
;;; replace: places a U+FFFD in place of the malformed bytes
;;; raise: raises an error
(define (utf8-string-size str)
(let f ([str str] [i 0] [j ($string-length str)] [n 0])
(cond
[($fx= i j) n]
[else
(let ([c ($string-ref str i)])
(let ([b ($char->fixnum c)])
(f str ($fxadd1 i) j
($fx+ n
(cond
[($fx<= b #x7F) 1]
[($fx<= b #x7FF) 2]
[($fx<= b #xFFFF) 3]
[else 4])))))])))
(define (fill-utf8-bytevector bv str)
(let f ([bv bv] [str str] [i 0] [j 0] [n ($string-length str)])
(cond
[($fx= i n) bv]
[else
(let ([c ($string-ref str i)])
(let ([b ($char->fixnum c)])
(cond
[($fx<= b #x7F)
($bytevector-set! bv j b)
(f bv str ($fxadd1 i) ($fxadd1 j) n)]
[($fx<= b #x7FF)
($bytevector-set! bv j
($fxlogor #b11000000 ($fxsra b 6)))
($bytevector-set! bv ($fx+ j 1)
($fxlogor #b10000000 ($fxlogand b #b111111)))
(f bv str ($fxadd1 i) ($fx+ j 2) n)]
[($fx<= b #xFFFF)
($bytevector-set! bv j
($fxlogor #b11100000 ($fxsra b 12)))
($bytevector-set! bv ($fx+ j 1)
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
($bytevector-set! bv ($fx+ j 2)
($fxlogor #b10000000 ($fxlogand b #b111111)))
(f bv str ($fxadd1 i) ($fx+ j 3) n)]
[else
($bytevector-set! bv j
($fxlogor #b11110000 ($fxsra b 18)))
($bytevector-set! bv ($fx+ j 1)
($fxlogor #b10000000 ($fxlogand ($fxsra b 12) #b111111)))
($bytevector-set! bv ($fx+ j 2)
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
($bytevector-set! bv ($fx+ j 3)
($fxlogor #b10000000 ($fxlogand b #b111111)))
(f bv str ($fxadd1 i) ($fx+ j 4) n)])))])))
(define string->utf8-bytevector
(lambda (str)
(define (utf8-string-size str)
(let f ([str str] [i 0] [j ($string-length str)] [n 0])
(cond
[($fx= i j) n]
[else
(let ([c ($string-ref str i)])
(let ([b ($char->fixnum c)])
(f str ($fxadd1 i) j
($fx+ n
(cond
[($fx<= b #x7F) 1]
[($fx<= b #x7FF) 2]
[($fx<= b #xFFFF) 3]
[else 4])))))])))
(define (fill-utf8-bytevector bv str)
(let f ([bv bv] [str str] [i 0] [j 0] [n ($string-length str)])
(cond
[($fx= i n) bv]
[else
(let ([c ($string-ref str i)])
(let ([b ($char->fixnum c)])
(cond
[($fx<= b #x7F)
($bytevector-set! bv j b)
(f bv str ($fxadd1 i) ($fxadd1 j) n)]
[($fx<= b #x7FF)
($bytevector-set! bv j
($fxlogor #b11000000 ($fxsra b 6)))
($bytevector-set! bv ($fx+ j 1)
($fxlogor #b10000000 ($fxlogand b #b111111)))
(f bv str ($fxadd1 i) ($fx+ j 2) n)]
[($fx<= b #xFFFF)
($bytevector-set! bv j
($fxlogor #b11100000 ($fxsra b 12)))
($bytevector-set! bv ($fx+ j 1)
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
($bytevector-set! bv ($fx+ j 2)
($fxlogor #b10000000 ($fxlogand b #b111111)))
(f bv str ($fxadd1 i) ($fx+ j 3) n)]
[else
($bytevector-set! bv j
($fxlogor #b11110000 ($fxsra b 18)))
($bytevector-set! bv ($fx+ j 1)
($fxlogor #b10000000 ($fxlogand ($fxsra b 12) #b111111)))
($bytevector-set! bv ($fx+ j 2)
($fxlogor #b10000000 ($fxlogand ($fxsra b 6) #b111111)))
($bytevector-set! bv ($fx+ j 3)
($fxlogor #b10000000 ($fxlogand b #b111111)))
(f bv str ($fxadd1 i) ($fx+ j 4) n)])))])))
(unless (string? str)
(error 'string->utf8-bytevector "~s is not a string" str))
(fill-utf8-bytevector
($make-bytevector (utf8-string-size str))
str)))
(define utf8-bytevector->string
(let ()
(define who 'utf8-bytevector->string)
(define (count bv mode)
(let f ([x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode])
(cond
[($fx= i j) n]
[else
(let ([b0 ($bytevector-u8-ref x i)])
(cond
[($fx<= b0 #x7F)
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
[($fx= ($fxsra b0 5) #b110)
(let ([i ($fxadd1 i)])
(cond
[($fx< i j)
(let ([b1 ($bytevector-u8-ref x i)])
(cond
[($fx= ($fxsra b1 6) #b10)
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f x i j n mode)]
[(eq? mode 'replace)
(f x i j ($fxadd1 n) mode)]
[else
(error who "invalid byte sequence ~s ~s" b0 b1)]))]
[(eq? mode 'ignore) n]
[(eq? mode 'replace) ($fxadd1 n)]
[else
(error who "invalid byte ~s near end of bytevector" b0)]))]
[($fx= ($fxsra b0 4) #b1110)
(cond
[($fx< ($fx+ i 2) j)
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
[b2 ($bytevector-u8-ref x ($fx+ i 2))])
(cond
[($fx= ($fxsra ($fxlogor b1 b2) 6) #b10)
(f x ($fx+ i 3) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "invalid sequence ~s ~s ~s" b0 b1 b2)]))]
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "incomplete char sequence")])]
[($fx= ($fxsra b0 3) #b11110)
(cond
[($fx< ($fx+ i 3) j)
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
[b2 ($bytevector-u8-ref x ($fx+ i 2))]
[b3 ($bytevector-u8-ref x ($fx+ i 3))])
(cond
[($fx= ($fxsra ($fxlogor b1 ($fxlogor b2 b3)) 6) #b10)
(f x ($fx+ i 4) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
(f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "invalid sequence ~s ~s ~s ~s" b0 b1 b2 b3)]))]
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "incomplete char sequence")])]
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "invalid byte ~s" b0)]))])))
(define (fill str bv mode)
(let f ([str str] [x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode])
(cond
[($fx= i j) str]
[else
(let ([b0 ($bytevector-u8-ref x i)])
(cond
[($fx<= b0 #x7F)
($string-set! str n ($fixnum->char b0))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[($fx= ($fxsra b0 5) #b110)
(let ([i ($fxadd1 i)])
(cond
[($fx< i j)
(let ([b1 ($bytevector-u8-ref x i)])
(cond
[($fx= ($fxsra b1 6) #b10)
($string-set! str n
($fixnum->char
($fx+ ($fxlogand b1 #b111111)
($fxsll ($fxlogand b0 #b11111) 6))))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f str x i j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x i j ($fxadd1 n) mode)]
[else (error who "BUG")]))]
[(eq? mode 'ignore) str]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
str]
[else (error who "BUG")]))]
[($fx= ($fxsra b0 4) #b1110)
(cond
[($fx< ($fx+ i 2) j)
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
[b2 ($bytevector-u8-ref x ($fx+ i 2))])
(cond
[($fx= ($fxsra ($fxlogor b1 b2) 6) #b10)
($string-set! str n
($fixnum->char
($fx+ ($fxlogand b2 #b111111)
($fx+ ($fxsll ($fxlogand b1 #b111111) 6)
($fxsll ($fxlogand b0 #b1111) 12)))))
(f str x ($fx+ i 3) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")]))]
[(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG: incomplete char sequence")])]
[($fx= ($fxsra b0 3) #b11110)
(cond
[($fx< ($fx+ i 3) j)
(let ([b1 ($bytevector-u8-ref x ($fx+ i 1))]
[b2 ($bytevector-u8-ref x ($fx+ i 2))]
[b3 ($bytevector-u8-ref x ($fx+ i 3))])
(cond
[($fx= ($fxsra ($fxlogor b1 ($fxlogor b2 b3)) 6) #b10)
($string-set! str n
($fixnum->char
($fx+ ($fxlogand b3 #b111111)
($fx+ ($fxsll ($fxlogand b2 #b111111) 6)
($fx+ ($fxsll ($fxlogand b1 #b111111) 12)
($fxsll ($fxlogand b0 #b111) 18))))))
(f str x ($fx+ i 4) j ($fxadd1 n) mode)]
[(eq? mode 'ignore)
(f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")]))]
[(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")])]
[(eq? mode 'ignore) (f str x ($fxadd1 i) j n mode)]
[(eq? mode 'replace)
($string-set! str n ($fixnum->char #xFFFD))
(f str x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "BUG")]))])))
(define (convert bv mode)
(fill ($make-string (count bv mode)) bv mode))
(case-lambda
[(bv) (convert bv 'error)]
[(bv handling-mode)
(unless (memq handling-mode '(ignore replace raise))
(error 'utf8-bytevector->string
"~s is not a valid handling mode"
handling-mode))
(convert bv handling-mode)])))
)
#!eof
(unless (memq handling-mode '(ignore replace raise))
(error 'string->utf8-bytevector "~s is not a valid hanlding-mode, shound be one of ignore, replace, or raise" handling-mode))

View File

@ -39,6 +39,7 @@
"ikarus.chars.ss"
"ikarus.records.ss"
"ikarus.strings.ss"
"ikarus.transcoders.ss"
"ikarus.date-string.ss"
"ikarus.symbols.ss"
"ikarus.vectors.ss"
@ -69,7 +70,6 @@
"ikarus.posix.ss"
"ikarus.timer.ss"
"ikarus.bytevectors.ss"
"ikarus.transcoders.ss"
"ikarus.main.ss"))
(define ikarus-system-macros
@ -293,6 +293,7 @@
[uint-list->bytevector i]
[sint-list->bytevector i]
[string->utf8-bytevector i]
[utf8-bytevector->string i]
[for-each i r]
[map i r]