* Better handling of multi-byte read-char.
This commit is contained in:
		
							parent
							
								
									ee738a9a62
								
							
						
					
					
						commit
						af9798e9be
					
				
							
								
								
									
										
											BIN
										
									
								
								src/ikarus.boot
								
								
								
								
							
							
						
						
									
										
											BIN
										
									
								
								src/ikarus.boot
								
								
								
								
							
										
											Binary file not shown.
										
									
								
							| 
						 | 
					@ -64,8 +64,8 @@
 | 
				
			||||||
      (cond
 | 
					      (cond
 | 
				
			||||||
        [(symbol? x)
 | 
					        [(symbol? x)
 | 
				
			||||||
         (if (symbol-bound? x)
 | 
					         (if (symbol-bound? x)
 | 
				
			||||||
             (error 'top-level-value "BUG in ~s" x)
 | 
					             (error 'top-level-value-error "BUG in ~s" x)
 | 
				
			||||||
             (error 'top-level-value "~a is unbound" x))]
 | 
					             (error #f "~a is unbound" x))]
 | 
				
			||||||
        [else
 | 
					        [else
 | 
				
			||||||
         (error 'top-level-value "~s is not a symbol" x)])))
 | 
					         (error 'top-level-value "~s is not a symbol" x)])))
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -46,9 +46,32 @@
 | 
				
			||||||
           (close-input-port p)
 | 
					           (close-input-port p)
 | 
				
			||||||
           (close-ports))])))
 | 
					           (close-ports))])))
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
 | 
					  (define refill-buffer!
 | 
				
			||||||
 | 
					    (lambda (p bytes)
 | 
				
			||||||
 | 
					      (error 'refill-buffer! "not implemented")))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  (define read-multibyte-char 
 | 
					  (define read-multibyte-char 
 | 
				
			||||||
    (lambda (p)
 | 
					    (lambda (p b0)
 | 
				
			||||||
      (error 'read-multibyte-char "not implemented")))
 | 
					      (let ([idx ($port-input-index p)] 
 | 
				
			||||||
 | 
					            [size ($port-input-size p)])
 | 
				
			||||||
 | 
					        (cond
 | 
				
			||||||
 | 
					          [($fx= ($fxlogand b0 #b11100000) #b11000000) 
 | 
				
			||||||
 | 
					           ;;; 2-byte utf8 sequence
 | 
				
			||||||
 | 
					           (unless ($fx< ($fx+ idx 1) size)
 | 
				
			||||||
 | 
					             (refill-buffer! p 1))
 | 
				
			||||||
 | 
					           (let ([b1 ($bytevector-u8-ref 
 | 
				
			||||||
 | 
					                       ($port-input-buffer p)
 | 
				
			||||||
 | 
					                       ($fxadd1 idx))])
 | 
				
			||||||
 | 
					             (unless ($fx= ($fxlogand b1 #b11000000) #b10000000)
 | 
				
			||||||
 | 
					               (error 'read-char "invalid utf8 sequence ~a ~a" b0 b1))
 | 
				
			||||||
 | 
					             ($set-port-input-index! p ($fx+ idx 2))
 | 
				
			||||||
 | 
					             ($fixnum->char 
 | 
				
			||||||
 | 
					               ($fx+ ($fxsll ($fxlogand b0 #b11111) 6)
 | 
				
			||||||
 | 
					                     ($fxlogand b1 #b111111))))]
 | 
				
			||||||
 | 
					          [else 
 | 
				
			||||||
 | 
					           (error 'read-multibyte
 | 
				
			||||||
 | 
					             "bytesequence ~a is not supported yet" b0)]))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  (define peek-multibyte-char 
 | 
					  (define peek-multibyte-char 
 | 
				
			||||||
    (lambda (p)
 | 
					    (lambda (p)
 | 
				
			||||||
      (error 'peek-multibyte-char "not implemented")))
 | 
					      (error 'peek-multibyte-char "not implemented")))
 | 
				
			||||||
| 
						 | 
					@ -71,7 +94,7 @@
 | 
				
			||||||
                       [($fx< b 128) 
 | 
					                       [($fx< b 128) 
 | 
				
			||||||
                        ($set-port-input-index! p ($fxadd1 idx))
 | 
					                        ($set-port-input-index! p ($fxadd1 idx))
 | 
				
			||||||
                        ($fixnum->char b)]
 | 
					                        ($fixnum->char b)]
 | 
				
			||||||
                       [else (read-multibyte-char p)]))
 | 
					                       [else (read-multibyte-char p b)]))
 | 
				
			||||||
                   (if open?
 | 
					                   (if open?
 | 
				
			||||||
                       (let ([bytes
 | 
					                       (let ([bytes
 | 
				
			||||||
                              (foreign-call "ikrt_read" 
 | 
					                              (foreign-call "ikrt_read" 
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -121,7 +121,7 @@
 | 
				
			||||||
                             (let ([buff ($port-output-buffer p)])
 | 
					                             (let ([buff ($port-output-buffer p)])
 | 
				
			||||||
                               (set! buffer-list (cons (bv-copy buff) buffer-list))
 | 
					                               (set! buffer-list (cons (bv-copy buff) buffer-list))
 | 
				
			||||||
                               ($bytevector-set! buff 0 b)
 | 
					                               ($bytevector-set! buff 0 b)
 | 
				
			||||||
                               (set! idx 1))
 | 
					                               ($set-port-output-index! p 1))
 | 
				
			||||||
                             (error 'write-byte "port ~s is closed" p))))
 | 
					                             (error 'write-byte "port ~s is closed" p))))
 | 
				
			||||||
                     (error 'write-byte "~s is not an output-port" p))
 | 
					                     (error 'write-byte "~s is not an output-port" p))
 | 
				
			||||||
                 (error 'write-byte "~s is not a byte" b))]
 | 
					                 (error 'write-byte "~s is not a byte" b))]
 | 
				
			||||||
| 
						 | 
					@ -129,7 +129,7 @@
 | 
				
			||||||
             (if (char? c)
 | 
					             (if (char? c)
 | 
				
			||||||
                 (if (output-port? p)
 | 
					                 (if (output-port? p)
 | 
				
			||||||
                     (let ([b ($char->fixnum c)])
 | 
					                     (let ([b ($char->fixnum c)])
 | 
				
			||||||
                       (if ($fx<= b 255)
 | 
					                       (if ($fx<= b 127)
 | 
				
			||||||
                           ($write-byte b p)
 | 
					                           ($write-byte b p)
 | 
				
			||||||
                           (error 'write-char "multibyte write of ~s is not implemented" c)))
 | 
					                           (error 'write-char "multibyte write of ~s is not implemented" c)))
 | 
				
			||||||
                     (error 'write-char "~s is not an output-port" p))
 | 
					                     (error 'write-char "~s is not an output-port" p))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -6,6 +6,7 @@
 | 
				
			||||||
    (ikarus system $fx)
 | 
					    (ikarus system $fx)
 | 
				
			||||||
    (ikarus system $pairs)
 | 
					    (ikarus system $pairs)
 | 
				
			||||||
    (ikarus system $bytevectors)
 | 
					    (ikarus system $bytevectors)
 | 
				
			||||||
 | 
					    (ikarus unicode-data)
 | 
				
			||||||
    (except (ikarus) read read-token comment-handler))
 | 
					    (except (ikarus) read read-token comment-handler))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  (define delimiter?
 | 
					  (define delimiter?
 | 
				
			||||||
| 
						 | 
					@ -20,7 +21,10 @@
 | 
				
			||||||
      (fx- ($char->fixnum c) ($char->fixnum #\0))))
 | 
					      (fx- ($char->fixnum c) ($char->fixnum #\0))))
 | 
				
			||||||
  (define initial?
 | 
					  (define initial?
 | 
				
			||||||
    (lambda (c)
 | 
					    (lambda (c)
 | 
				
			||||||
      (or (letter? c) (special-initial? c))))
 | 
					      (cond
 | 
				
			||||||
 | 
					        [($char<= c ($fixnum->char 127))
 | 
				
			||||||
 | 
					         (or (letter? c) (special-initial? c))]
 | 
				
			||||||
 | 
					        [else (unicode-printable-char? c)])))
 | 
				
			||||||
  (define letter? 
 | 
					  (define letter? 
 | 
				
			||||||
    (lambda (c)
 | 
					    (lambda (c)
 | 
				
			||||||
      (or (and ($char<= #\a c) ($char<= c #\z))
 | 
					      (or (and ($char<= #\a c) ($char<= c #\z))
 | 
				
			||||||
| 
						 | 
					@ -154,6 +158,29 @@
 | 
				
			||||||
           (tokenize-char-seq p "tab" '(datum . #\tab))]
 | 
					           (tokenize-char-seq p "tab" '(datum . #\tab))]
 | 
				
			||||||
          [($char= #\r c) 
 | 
					          [($char= #\r c) 
 | 
				
			||||||
           (tokenize-char-seq p "return" '(datum . #\return))]
 | 
					           (tokenize-char-seq p "return" '(datum . #\return))]
 | 
				
			||||||
 | 
					          [($char= #\x c) 
 | 
				
			||||||
 | 
					           (let ([n (peek-char p)])
 | 
				
			||||||
 | 
					             (cond
 | 
				
			||||||
 | 
					               [(or (eof-object? n) (delimiter? n))
 | 
				
			||||||
 | 
					                '(datum . #\x)]
 | 
				
			||||||
 | 
					               [(hex n) =>
 | 
				
			||||||
 | 
					                (lambda (v) 
 | 
				
			||||||
 | 
					                  (read-char p)
 | 
				
			||||||
 | 
					                  (let f ([v v])
 | 
				
			||||||
 | 
					                    (let ([c (read-char p)])
 | 
				
			||||||
 | 
					                      (cond
 | 
				
			||||||
 | 
					                        [(eof-object? c)
 | 
				
			||||||
 | 
					                         (cons 'datum (integer->char v))]
 | 
				
			||||||
 | 
					                        [(delimiter? c)
 | 
				
			||||||
 | 
					                         (unread-char c p)
 | 
				
			||||||
 | 
					                         (cons 'datum (integer->char v))]
 | 
				
			||||||
 | 
					                        [(hex c) =>
 | 
				
			||||||
 | 
					                         (lambda (v0)
 | 
				
			||||||
 | 
					                           (f (+ (* v 16) v0)))]
 | 
				
			||||||
 | 
					                        [else
 | 
				
			||||||
 | 
					                         (error 'tokenize "invalid character sequence")]))))]
 | 
				
			||||||
 | 
					               [else
 | 
				
			||||||
 | 
					                (error 'tokenize "invalid character sequence #\\x~a" n)]))]
 | 
				
			||||||
          [else
 | 
					          [else
 | 
				
			||||||
           (let ([n (peek-char p)])
 | 
					           (let ([n (peek-char p)])
 | 
				
			||||||
             (cond
 | 
					             (cond
 | 
				
			||||||
| 
						 | 
					@ -161,6 +188,17 @@
 | 
				
			||||||
               [(delimiter? n)  (cons 'datum c)]
 | 
					               [(delimiter? n)  (cons 'datum c)]
 | 
				
			||||||
               [else 
 | 
					               [else 
 | 
				
			||||||
                (error 'tokenize "invalid syntax #\\~a~a" c n)]))]))))
 | 
					                (error 'tokenize "invalid syntax #\\~a~a" c n)]))]))))
 | 
				
			||||||
 | 
					  (define (hex x)
 | 
				
			||||||
 | 
					    (cond
 | 
				
			||||||
 | 
					      [(and ($char<= #\0 x) ($char<= x #\9))
 | 
				
			||||||
 | 
					       ($fx- ($char->fixnum x) ($char->fixnum #\0))]
 | 
				
			||||||
 | 
					      [(and ($char<= #\a x) ($char<= x #\z))
 | 
				
			||||||
 | 
					       ($fx- ($char->fixnum x) 
 | 
				
			||||||
 | 
					             ($fx- ($char->fixnum #\a) 10))]
 | 
				
			||||||
 | 
					      [(and ($char<= #\A x) ($char<= x #\Z))
 | 
				
			||||||
 | 
					       ($fx- ($char->fixnum x) 
 | 
				
			||||||
 | 
					             ($fx- ($char->fixnum #\A) 10))]
 | 
				
			||||||
 | 
					      [else #f]))
 | 
				
			||||||
  (define multiline-error
 | 
					  (define multiline-error
 | 
				
			||||||
    (lambda ()
 | 
					    (lambda ()
 | 
				
			||||||
      (error 'tokenize
 | 
					      (error 'tokenize
 | 
				
			||||||
| 
						 | 
					@ -486,6 +524,42 @@
 | 
				
			||||||
               [else (tokenize-bar p (cons c ac))]))]
 | 
					               [else (tokenize-bar p (cons c ac))]))]
 | 
				
			||||||
          [($char= #\| c) ac]
 | 
					          [($char= #\| c) ac]
 | 
				
			||||||
          [else (tokenize-bar p (cons c ac))]))))
 | 
					          [else (tokenize-bar p (cons c ac))]))))
 | 
				
			||||||
 | 
					  (define (tokenize-backslash p)
 | 
				
			||||||
 | 
					    (let ([c (read-char p)])
 | 
				
			||||||
 | 
					      (cond
 | 
				
			||||||
 | 
					        [(eof-object? c) 
 | 
				
			||||||
 | 
					         (error 'tokenize "invalid eof after \\")]
 | 
				
			||||||
 | 
					        [($char= #\x c) 
 | 
				
			||||||
 | 
					         (let ([c (read-char p)])
 | 
				
			||||||
 | 
					           (cond
 | 
				
			||||||
 | 
					             [(eof-object? c) 
 | 
				
			||||||
 | 
					              (error 'tokenize "invalid eof after \\x")]
 | 
				
			||||||
 | 
					             [(hex c) => 
 | 
				
			||||||
 | 
					              (lambda (v)
 | 
				
			||||||
 | 
					                (let f ([v v] [ac `(,c #\x #\\)])
 | 
				
			||||||
 | 
					                  (let ([c (read-char p)])
 | 
				
			||||||
 | 
					                    (cond
 | 
				
			||||||
 | 
					                      [(eof-object? c) 
 | 
				
			||||||
 | 
					                       (error 'tokenize "invalid eof after ~a"
 | 
				
			||||||
 | 
					                         (list->string (reverse ac)))]
 | 
				
			||||||
 | 
					                      [($char= #\; c)
 | 
				
			||||||
 | 
					                       (cons 'datum 
 | 
				
			||||||
 | 
					                         (string->symbol 
 | 
				
			||||||
 | 
					                           (list->string 
 | 
				
			||||||
 | 
					                             (cons (integer->char v)
 | 
				
			||||||
 | 
					                               (reverse (tokenize-identifier '() p))))))]
 | 
				
			||||||
 | 
					                      [(hex c) =>
 | 
				
			||||||
 | 
					                       (lambda (v0)
 | 
				
			||||||
 | 
					                         (f (+ (* v 16) v0) (cons c ac)))]
 | 
				
			||||||
 | 
					                      [else 
 | 
				
			||||||
 | 
					                       (error 'tokenize "invalid sequence ~a"
 | 
				
			||||||
 | 
					                         (list->string (cons c (reverse ac))))]))))]
 | 
				
			||||||
 | 
					             [else
 | 
				
			||||||
 | 
					              (unread-char c p) 
 | 
				
			||||||
 | 
					              (error 'tokenize "invalid sequence \\x~a" c)]))]
 | 
				
			||||||
 | 
					        [else 
 | 
				
			||||||
 | 
					         (unread-char c p) 
 | 
				
			||||||
 | 
					         (error 'tokenize "invalid sequence \\~a" c)])))
 | 
				
			||||||
  (define tokenize/c
 | 
					  (define tokenize/c
 | 
				
			||||||
    (lambda (c p)
 | 
					    (lambda (c p)
 | 
				
			||||||
      (cond
 | 
					      (cond
 | 
				
			||||||
| 
						 | 
					@ -538,6 +612,10 @@
 | 
				
			||||||
        [($char= #\| c)
 | 
					        [($char= #\| c)
 | 
				
			||||||
         (let ([ls (reverse (tokenize-bar p '()))])
 | 
					         (let ([ls (reverse (tokenize-bar p '()))])
 | 
				
			||||||
           (cons 'datum (string->symbol (list->string ls))))]
 | 
					           (cons 'datum (string->symbol (list->string ls))))]
 | 
				
			||||||
 | 
					        [($char= #\\ c)
 | 
				
			||||||
 | 
					         (tokenize-backslash p)]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        [else
 | 
					        [else
 | 
				
			||||||
         (unread-char c p) 
 | 
					         (unread-char c p) 
 | 
				
			||||||
         (error 'tokenize "invalid syntax ~a" c)])))
 | 
					         (error 'tokenize "invalid syntax ~a" c)])))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -144,7 +144,8 @@
 | 
				
			||||||
                    [else (error who "incomplete char sequence")])]
 | 
					                    [else (error who "incomplete char sequence")])]
 | 
				
			||||||
                 [(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
 | 
					                 [(eq? mode 'ignore) (f x ($fxadd1 i) j n mode)]
 | 
				
			||||||
                 [(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
 | 
					                 [(eq? mode 'replace) (f x ($fxadd1 i) j ($fxadd1 n) mode)]
 | 
				
			||||||
                 [else (error who "invalid byte ~s" b0)]))])))
 | 
					                 [else (error who "invalid byte ~s at index ~s of ~s" 
 | 
				
			||||||
 | 
					                              b0 i x)]))])))
 | 
				
			||||||
      (define (fill str bv mode)
 | 
					      (define (fill str bv mode)
 | 
				
			||||||
        (let f ([str str] [x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode])
 | 
					        (let f ([str str] [x bv] [i 0] [j ($bytevector-length bv)] [n 0] [mode mode])
 | 
				
			||||||
          (cond
 | 
					          (cond
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -10,27 +10,11 @@
 | 
				
			||||||
    (ikarus system $pairs)
 | 
					    (ikarus system $pairs)
 | 
				
			||||||
    (ikarus system $symbols)
 | 
					    (ikarus system $symbols)
 | 
				
			||||||
    (ikarus system $bytevectors)
 | 
					    (ikarus system $bytevectors)
 | 
				
			||||||
 | 
					    (ikarus unicode-data)
 | 
				
			||||||
    (except (ikarus) write display format printf print-error
 | 
					    (except (ikarus) write display format printf print-error
 | 
				
			||||||
            error-handler error))
 | 
					            error-handler error))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  (include "unicode/unicode-constituents.ss")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  (define (binary-search-on? n v)
 | 
					 | 
				
			||||||
    (let ([k ($fx- ($vector-length v) 1)])
 | 
					 | 
				
			||||||
      (let f ([i 0] [k k] [n n] [v v])
 | 
					 | 
				
			||||||
        (cond
 | 
					 | 
				
			||||||
          [($fx= i k) ($fx= ($fxlogand i 1) 1)]
 | 
					 | 
				
			||||||
          [else
 | 
					 | 
				
			||||||
           (let ([j ($fxsra ($fx+ i ($fx+ k 1)) 1)])
 | 
					 | 
				
			||||||
             (cond
 | 
					 | 
				
			||||||
               [($fx<= ($vector-ref v j) n) (f j k n v)]
 | 
					 | 
				
			||||||
               [else (f i ($fx- j 1) n v)]))]))))
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  (define (unicode-printable-char? c)
 | 
					 | 
				
			||||||
    (binary-search-on?
 | 
					 | 
				
			||||||
      ($char->fixnum c) 
 | 
					 | 
				
			||||||
      unicode-constituents-vector))
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  (define char-table ; first nonprintable chars
 | 
					  (define char-table ; first nonprintable chars
 | 
				
			||||||
    '#("nul" "x1" "x2" "x3" "x4" "x5" "x6" "alarm"
 | 
					    '#("nul" "x1" "x2" "x3" "x4" "x5" "x6" "alarm"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -54,6 +54,7 @@
 | 
				
			||||||
    "ikarus.io.input-strings.ss"
 | 
					    "ikarus.io.input-strings.ss"
 | 
				
			||||||
    "ikarus.io.output-strings.ss"
 | 
					    "ikarus.io.output-strings.ss"
 | 
				
			||||||
    "ikarus.hash-tables.ss"
 | 
					    "ikarus.hash-tables.ss"
 | 
				
			||||||
 | 
					    "ikarus.unicode-data.ss"
 | 
				
			||||||
    "ikarus.writer.ss"
 | 
					    "ikarus.writer.ss"
 | 
				
			||||||
    "ikarus.reader.ss"
 | 
					    "ikarus.reader.ss"
 | 
				
			||||||
    "ikarus.code-objects.ss"
 | 
					    "ikarus.code-objects.ss"
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue