* Better handling of multi-byte read-char.

This commit is contained in:
Abdulaziz Ghuloum 2007-06-02 05:17:22 +03:00
parent ee738a9a62
commit af9798e9be
8 changed files with 113 additions and 26 deletions

Binary file not shown.

View File

@ -64,8 +64,8 @@
(cond
[(symbol? x)
(if (symbol-bound? x)
(error 'top-level-value "BUG in ~s" x)
(error 'top-level-value "~a is unbound" x))]
(error 'top-level-value-error "BUG in ~s" x)
(error #f "~a is unbound" x))]
[else
(error 'top-level-value "~s is not a symbol" x)])))

View File

@ -46,9 +46,32 @@
(close-input-port p)
(close-ports))])))
(define refill-buffer!
(lambda (p bytes)
(error 'refill-buffer! "not implemented")))
(define read-multibyte-char
(lambda (p)
(error 'read-multibyte-char "not implemented")))
(lambda (p b0)
(let ([idx ($port-input-index p)]
[size ($port-input-size p)])
(cond
[($fx= ($fxlogand b0 #b11100000) #b11000000)
;;; 2-byte utf8 sequence
(unless ($fx< ($fx+ idx 1) size)
(refill-buffer! p 1))
(let ([b1 ($bytevector-u8-ref
($port-input-buffer p)
($fxadd1 idx))])
(unless ($fx= ($fxlogand b1 #b11000000) #b10000000)
(error 'read-char "invalid utf8 sequence ~a ~a" b0 b1))
($set-port-input-index! p ($fx+ idx 2))
($fixnum->char
($fx+ ($fxsll ($fxlogand b0 #b11111) 6)
($fxlogand b1 #b111111))))]
[else
(error 'read-multibyte
"bytesequence ~a is not supported yet" b0)]))))
(define peek-multibyte-char
(lambda (p)
(error 'peek-multibyte-char "not implemented")))
@ -71,7 +94,7 @@
[($fx< b 128)
($set-port-input-index! p ($fxadd1 idx))
($fixnum->char b)]
[else (read-multibyte-char p)]))
[else (read-multibyte-char p b)]))
(if open?
(let ([bytes
(foreign-call "ikrt_read"

View File

@ -121,7 +121,7 @@
(let ([buff ($port-output-buffer p)])
(set! buffer-list (cons (bv-copy buff) buffer-list))
($bytevector-set! buff 0 b)
(set! idx 1))
($set-port-output-index! p 1))
(error 'write-byte "port ~s is closed" p))))
(error 'write-byte "~s is not an output-port" p))
(error 'write-byte "~s is not a byte" b))]
@ -129,7 +129,7 @@
(if (char? c)
(if (output-port? p)
(let ([b ($char->fixnum c)])
(if ($fx<= b 255)
(if ($fx<= b 127)
($write-byte b p)
(error 'write-char "multibyte write of ~s is not implemented" c)))
(error 'write-char "~s is not an output-port" p))

View File

@ -6,6 +6,7 @@
(ikarus system $fx)
(ikarus system $pairs)
(ikarus system $bytevectors)
(ikarus unicode-data)
(except (ikarus) read read-token comment-handler))
(define delimiter?
@ -20,7 +21,10 @@
(fx- ($char->fixnum c) ($char->fixnum #\0))))
(define initial?
(lambda (c)
(or (letter? c) (special-initial? c))))
(cond
[($char<= c ($fixnum->char 127))
(or (letter? c) (special-initial? c))]
[else (unicode-printable-char? c)])))
(define letter?
(lambda (c)
(or (and ($char<= #\a c) ($char<= c #\z))
@ -154,6 +158,29 @@
(tokenize-char-seq p "tab" '(datum . #\tab))]
[($char= #\r c)
(tokenize-char-seq p "return" '(datum . #\return))]
[($char= #\x c)
(let ([n (peek-char p)])
(cond
[(or (eof-object? n) (delimiter? n))
'(datum . #\x)]
[(hex n) =>
(lambda (v)
(read-char p)
(let f ([v v])
(let ([c (read-char p)])
(cond
[(eof-object? c)
(cons 'datum (integer->char v))]
[(delimiter? c)
(unread-char c p)
(cons 'datum (integer->char v))]
[(hex c) =>
(lambda (v0)
(f (+ (* v 16) v0)))]
[else
(error 'tokenize "invalid character sequence")]))))]
[else
(error 'tokenize "invalid character sequence #\\x~a" n)]))]
[else
(let ([n (peek-char p)])
(cond
@ -161,6 +188,17 @@
[(delimiter? n) (cons 'datum c)]
[else
(error 'tokenize "invalid syntax #\\~a~a" c n)]))]))))
(define (hex x)
(cond
[(and ($char<= #\0 x) ($char<= x #\9))
($fx- ($char->fixnum x) ($char->fixnum #\0))]
[(and ($char<= #\a x) ($char<= x #\z))
($fx- ($char->fixnum x)
($fx- ($char->fixnum #\a) 10))]
[(and ($char<= #\A x) ($char<= x #\Z))
($fx- ($char->fixnum x)
($fx- ($char->fixnum #\A) 10))]
[else #f]))
(define multiline-error
(lambda ()
(error 'tokenize
@ -486,6 +524,42 @@
[else (tokenize-bar p (cons c ac))]))]
[($char= #\| c) ac]
[else (tokenize-bar p (cons c ac))]))))
(define (tokenize-backslash p)
(let ([c (read-char p)])
(cond
[(eof-object? c)
(error 'tokenize "invalid eof after \\")]
[($char= #\x c)
(let ([c (read-char p)])
(cond
[(eof-object? c)
(error 'tokenize "invalid eof after \\x")]
[(hex c) =>
(lambda (v)
(let f ([v v] [ac `(,c #\x #\\)])
(let ([c (read-char p)])
(cond
[(eof-object? c)
(error 'tokenize "invalid eof after ~a"
(list->string (reverse ac)))]
[($char= #\; c)
(cons 'datum
(string->symbol
(list->string
(cons (integer->char v)
(reverse (tokenize-identifier '() p))))))]
[(hex c) =>
(lambda (v0)
(f (+ (* v 16) v0) (cons c ac)))]
[else
(error 'tokenize "invalid sequence ~a"
(list->string (cons c (reverse ac))))]))))]
[else
(unread-char c p)
(error 'tokenize "invalid sequence \\x~a" c)]))]
[else
(unread-char c p)
(error 'tokenize "invalid sequence \\~a" c)])))
(define tokenize/c
(lambda (c p)
(cond
@ -538,6 +612,10 @@
[($char= #\| c)
(let ([ls (reverse (tokenize-bar p '()))])
(cons 'datum (string->symbol (list->string ls))))]
[($char= #\\ c)
(tokenize-backslash p)]
[else
(unread-char c p)
(error 'tokenize "invalid syntax ~a" c)])))

View File

@ -144,7 +144,8 @@
[else (error who "incomplete char sequence")])]
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
[else (error who "invalid byte ~s" b0)]))])))
[else (error who "invalid byte ~s at index ~s of ~s"
b0 i x)]))])))
(define (fill str bv mode)
(let f ([str str] [x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode])
(cond

View File

@ -10,27 +10,11 @@
(ikarus system $pairs)
(ikarus system $symbols)
(ikarus system $bytevectors)
(ikarus unicode-data)
(except (ikarus) write display format printf print-error
error-handler error))
(include "unicode/unicode-constituents.ss")
(define (binary-search-on? n v)
(let ([k ($fx- ($vector-length v) 1)])
(let f ([i 0] [k k] [n n] [v v])
(cond
[($fx= i k) ($fx= ($fxlogand i 1) 1)]
[else
(let ([j ($fxsra ($fx+ i ($fx+ k 1)) 1)])
(cond
[($fx<= ($vector-ref v j) n) (f j k n v)]
[else (f i ($fx- j 1) n v)]))]))))
(define (unicode-printable-char? c)
(binary-search-on?
($char->fixnum c)
unicode-constituents-vector))
(define char-table ; first nonprintable chars
'#("nul" "x1" "x2" "x3" "x4" "x5" "x6" "alarm"

View File

@ -54,6 +54,7 @@
"ikarus.io.input-strings.ss"
"ikarus.io.output-strings.ss"
"ikarus.hash-tables.ss"
"ikarus.unicode-data.ss"
"ikarus.writer.ss"
"ikarus.reader.ss"
"ikarus.code-objects.ss"