* Better handling of multi-byte read-char.
This commit is contained in:
parent
ee738a9a62
commit
af9798e9be
BIN
src/ikarus.boot
BIN
src/ikarus.boot
Binary file not shown.
|
@ -64,8 +64,8 @@
|
|||
(cond
|
||||
[(symbol? x)
|
||||
(if (symbol-bound? x)
|
||||
(error 'top-level-value "BUG in ~s" x)
|
||||
(error 'top-level-value "~a is unbound" x))]
|
||||
(error 'top-level-value-error "BUG in ~s" x)
|
||||
(error #f "~a is unbound" x))]
|
||||
[else
|
||||
(error 'top-level-value "~s is not a symbol" x)])))
|
||||
|
||||
|
|
|
@ -46,9 +46,32 @@
|
|||
(close-input-port p)
|
||||
(close-ports))])))
|
||||
|
||||
(define refill-buffer!
|
||||
(lambda (p bytes)
|
||||
(error 'refill-buffer! "not implemented")))
|
||||
|
||||
(define read-multibyte-char
|
||||
(lambda (p)
|
||||
(error 'read-multibyte-char "not implemented")))
|
||||
(lambda (p b0)
|
||||
(let ([idx ($port-input-index p)]
|
||||
[size ($port-input-size p)])
|
||||
(cond
|
||||
[($fx= ($fxlogand b0 #b11100000) #b11000000)
|
||||
;;; 2-byte utf8 sequence
|
||||
(unless ($fx< ($fx+ idx 1) size)
|
||||
(refill-buffer! p 1))
|
||||
(let ([b1 ($bytevector-u8-ref
|
||||
($port-input-buffer p)
|
||||
($fxadd1 idx))])
|
||||
(unless ($fx= ($fxlogand b1 #b11000000) #b10000000)
|
||||
(error 'read-char "invalid utf8 sequence ~a ~a" b0 b1))
|
||||
($set-port-input-index! p ($fx+ idx 2))
|
||||
($fixnum->char
|
||||
($fx+ ($fxsll ($fxlogand b0 #b11111) 6)
|
||||
($fxlogand b1 #b111111))))]
|
||||
[else
|
||||
(error 'read-multibyte
|
||||
"bytesequence ~a is not supported yet" b0)]))))
|
||||
|
||||
(define peek-multibyte-char
|
||||
(lambda (p)
|
||||
(error 'peek-multibyte-char "not implemented")))
|
||||
|
@ -71,7 +94,7 @@
|
|||
[($fx< b 128)
|
||||
($set-port-input-index! p ($fxadd1 idx))
|
||||
($fixnum->char b)]
|
||||
[else (read-multibyte-char p)]))
|
||||
[else (read-multibyte-char p b)]))
|
||||
(if open?
|
||||
(let ([bytes
|
||||
(foreign-call "ikrt_read"
|
||||
|
|
|
@ -121,7 +121,7 @@
|
|||
(let ([buff ($port-output-buffer p)])
|
||||
(set! buffer-list (cons (bv-copy buff) buffer-list))
|
||||
($bytevector-set! buff 0 b)
|
||||
(set! idx 1))
|
||||
($set-port-output-index! p 1))
|
||||
(error 'write-byte "port ~s is closed" p))))
|
||||
(error 'write-byte "~s is not an output-port" p))
|
||||
(error 'write-byte "~s is not a byte" b))]
|
||||
|
@ -129,7 +129,7 @@
|
|||
(if (char? c)
|
||||
(if (output-port? p)
|
||||
(let ([b ($char->fixnum c)])
|
||||
(if ($fx<= b 255)
|
||||
(if ($fx<= b 127)
|
||||
($write-byte b p)
|
||||
(error 'write-char "multibyte write of ~s is not implemented" c)))
|
||||
(error 'write-char "~s is not an output-port" p))
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
(ikarus system $fx)
|
||||
(ikarus system $pairs)
|
||||
(ikarus system $bytevectors)
|
||||
(ikarus unicode-data)
|
||||
(except (ikarus) read read-token comment-handler))
|
||||
|
||||
(define delimiter?
|
||||
|
@ -20,7 +21,10 @@
|
|||
(fx- ($char->fixnum c) ($char->fixnum #\0))))
|
||||
(define initial?
|
||||
(lambda (c)
|
||||
(or (letter? c) (special-initial? c))))
|
||||
(cond
|
||||
[($char<= c ($fixnum->char 127))
|
||||
(or (letter? c) (special-initial? c))]
|
||||
[else (unicode-printable-char? c)])))
|
||||
(define letter?
|
||||
(lambda (c)
|
||||
(or (and ($char<= #\a c) ($char<= c #\z))
|
||||
|
@ -154,6 +158,29 @@
|
|||
(tokenize-char-seq p "tab" '(datum . #\tab))]
|
||||
[($char= #\r c)
|
||||
(tokenize-char-seq p "return" '(datum . #\return))]
|
||||
[($char= #\x c)
|
||||
(let ([n (peek-char p)])
|
||||
(cond
|
||||
[(or (eof-object? n) (delimiter? n))
|
||||
'(datum . #\x)]
|
||||
[(hex n) =>
|
||||
(lambda (v)
|
||||
(read-char p)
|
||||
(let f ([v v])
|
||||
(let ([c (read-char p)])
|
||||
(cond
|
||||
[(eof-object? c)
|
||||
(cons 'datum (integer->char v))]
|
||||
[(delimiter? c)
|
||||
(unread-char c p)
|
||||
(cons 'datum (integer->char v))]
|
||||
[(hex c) =>
|
||||
(lambda (v0)
|
||||
(f (+ (* v 16) v0)))]
|
||||
[else
|
||||
(error 'tokenize "invalid character sequence")]))))]
|
||||
[else
|
||||
(error 'tokenize "invalid character sequence #\\x~a" n)]))]
|
||||
[else
|
||||
(let ([n (peek-char p)])
|
||||
(cond
|
||||
|
@ -161,6 +188,17 @@
|
|||
[(delimiter? n) (cons 'datum c)]
|
||||
[else
|
||||
(error 'tokenize "invalid syntax #\\~a~a" c n)]))]))))
|
||||
(define (hex x)
|
||||
(cond
|
||||
[(and ($char<= #\0 x) ($char<= x #\9))
|
||||
($fx- ($char->fixnum x) ($char->fixnum #\0))]
|
||||
[(and ($char<= #\a x) ($char<= x #\z))
|
||||
($fx- ($char->fixnum x)
|
||||
($fx- ($char->fixnum #\a) 10))]
|
||||
[(and ($char<= #\A x) ($char<= x #\Z))
|
||||
($fx- ($char->fixnum x)
|
||||
($fx- ($char->fixnum #\A) 10))]
|
||||
[else #f]))
|
||||
(define multiline-error
|
||||
(lambda ()
|
||||
(error 'tokenize
|
||||
|
@ -486,6 +524,42 @@
|
|||
[else (tokenize-bar p (cons c ac))]))]
|
||||
[($char= #\| c) ac]
|
||||
[else (tokenize-bar p (cons c ac))]))))
|
||||
(define (tokenize-backslash p)
|
||||
(let ([c (read-char p)])
|
||||
(cond
|
||||
[(eof-object? c)
|
||||
(error 'tokenize "invalid eof after \\")]
|
||||
[($char= #\x c)
|
||||
(let ([c (read-char p)])
|
||||
(cond
|
||||
[(eof-object? c)
|
||||
(error 'tokenize "invalid eof after \\x")]
|
||||
[(hex c) =>
|
||||
(lambda (v)
|
||||
(let f ([v v] [ac `(,c #\x #\\)])
|
||||
(let ([c (read-char p)])
|
||||
(cond
|
||||
[(eof-object? c)
|
||||
(error 'tokenize "invalid eof after ~a"
|
||||
(list->string (reverse ac)))]
|
||||
[($char= #\; c)
|
||||
(cons 'datum
|
||||
(string->symbol
|
||||
(list->string
|
||||
(cons (integer->char v)
|
||||
(reverse (tokenize-identifier '() p))))))]
|
||||
[(hex c) =>
|
||||
(lambda (v0)
|
||||
(f (+ (* v 16) v0) (cons c ac)))]
|
||||
[else
|
||||
(error 'tokenize "invalid sequence ~a"
|
||||
(list->string (cons c (reverse ac))))]))))]
|
||||
[else
|
||||
(unread-char c p)
|
||||
(error 'tokenize "invalid sequence \\x~a" c)]))]
|
||||
[else
|
||||
(unread-char c p)
|
||||
(error 'tokenize "invalid sequence \\~a" c)])))
|
||||
(define tokenize/c
|
||||
(lambda (c p)
|
||||
(cond
|
||||
|
@ -538,6 +612,10 @@
|
|||
[($char= #\| c)
|
||||
(let ([ls (reverse (tokenize-bar p '()))])
|
||||
(cons 'datum (string->symbol (list->string ls))))]
|
||||
[($char= #\\ c)
|
||||
(tokenize-backslash p)]
|
||||
|
||||
|
||||
[else
|
||||
(unread-char c p)
|
||||
(error 'tokenize "invalid syntax ~a" c)])))
|
||||
|
|
|
@ -144,7 +144,8 @@
|
|||
[else (error who "incomplete char sequence")])]
|
||||
[(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
|
||||
[(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
|
||||
[else (error who "invalid byte ~s" b0)]))])))
|
||||
[else (error who "invalid byte ~s at index ~s of ~s"
|
||||
b0 i x)]))])))
|
||||
(define (fill str bv mode)
|
||||
(let f ([str str] [x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode])
|
||||
(cond
|
||||
|
|
|
@ -10,27 +10,11 @@
|
|||
(ikarus system $pairs)
|
||||
(ikarus system $symbols)
|
||||
(ikarus system $bytevectors)
|
||||
(ikarus unicode-data)
|
||||
(except (ikarus) write display format printf print-error
|
||||
error-handler error))
|
||||
|
||||
|
||||
(include "unicode/unicode-constituents.ss")
|
||||
|
||||
(define (binary-search-on? n v)
|
||||
(let ([k ($fx- ($vector-length v) 1)])
|
||||
(let f ([i 0] [k k] [n n] [v v])
|
||||
(cond
|
||||
[($fx= i k) ($fx= ($fxlogand i 1) 1)]
|
||||
[else
|
||||
(let ([j ($fxsra ($fx+ i ($fx+ k 1)) 1)])
|
||||
(cond
|
||||
[($fx<= ($vector-ref v j) n) (f j k n v)]
|
||||
[else (f i ($fx- j 1) n v)]))]))))
|
||||
|
||||
(define (unicode-printable-char? c)
|
||||
(binary-search-on?
|
||||
($char->fixnum c)
|
||||
unicode-constituents-vector))
|
||||
|
||||
(define char-table ; first nonprintable chars
|
||||
'#("nul" "x1" "x2" "x3" "x4" "x5" "x6" "alarm"
|
||||
|
|
|
@ -54,6 +54,7 @@
|
|||
"ikarus.io.input-strings.ss"
|
||||
"ikarus.io.output-strings.ss"
|
||||
"ikarus.hash-tables.ss"
|
||||
"ikarus.unicode-data.ss"
|
||||
"ikarus.writer.ss"
|
||||
"ikarus.reader.ss"
|
||||
"ikarus.code-objects.ss"
|
||||
|
|
Loading…
Reference in New Issue