; -*- Mode: Scheme; Syntax: Scheme; Package: Scheme; -*- ; Copyright (c) 1993, 1994 Richard Kelsey and Jonathan Rees. See file COPYING. ; A tokenizer. ; Nonstandard things needed: ; record package ; char->ascii ; peek-char ; reverse-list->string ; error (define (reverse-list->string l n) (list->string (reverse l))) ; Tokenizer tables (define tokenizer-table-type (make-record-type 'tokenizer-table '(translation dispatch-vector terminating?-vector))) (define make-tokenizer-table (let () (define make (record-constructor tokenizer-table-type '(translation dispatch-vector terminating?-vector))) (define (make-tokenizer-table) (make (if (char=? (string-ref (symbol->string 't) 0) #\T) char-upcase char-downcase) (make-vector 256 (lambda (c port) (error "illegal character read" c))) (make-vector 256 #t))) make-tokenizer-table)) (define ttab-translation (record-accessor tokenizer-table-type 'translation)) (define ttab-dispatch-vector (record-accessor tokenizer-table-type 'dispatch-vector)) (define ttab-terminating?-vector (record-accessor tokenizer-table-type 'terminating?-vector)) (define set-tokenizer-table-translator! (record-modifier tokenizer-table-type 'translation)) (define (set-char-tokenization! ttab char reader term?) (vector-set! (ttab-dispatch-vector ttab) (char->ascii char) reader) (vector-set! (ttab-terminating?-vector ttab) (char->ascii char) term?)) ; Main dispatch (define (tokenize ttab port) (let ((c (read-char port))) (if (eof-object? c) c ((vector-ref (ttab-dispatch-vector ttab) (char->ascii c)) c port)))) ; Atoms (symbols and numbers) (define (scan-atom c ttab port) (let ((translate (ttab-translation ttab))) (let loop ((l (list (translate c))) (n 1)) (let ((c (peek-char port))) (cond ((or (eof-object? c) (vector-ref (ttab-terminating?-vector ttab) (char->ascii c))) (reverse-list->string l n)) (else (loop (cons (translate (read-char port)) l) (+ n 1)))))))) ; Allow ->foo, -v-, etc. (define (parse-atom string) (let ((c (string-ref string 0))) (cond ((char=? c #\+) (parse-possible-number string)) ((char=? c #\-) (parse-possible-number string)) ((char=? c #\.) (parse-possible-number string)) (else (if (char-numeric? c) (parse-number string) (string->symbol string)))))) ; First char is + - . (define (parse-possible-number string) (if (and (> (string-length string) 1) (char-numeric? (string-ref string 1))) (parse-number string) (string->symbol string))) (define (parse-number string) (or (string->number string 'e 'd) (error "unsupported number syntax" string))) ; Usual stuff (what you'd expect to be common to Scheme and ML syntax) (define (set-up-usual-tokenization! ttab) (define (tokenize-whitespace c port) c ;ignored (tokenize ttab port)) (define (tokenize-constituent c port) (parse-atom (scan-atom c ttab port))) (for-each (lambda (c) (set-char-tokenization! ttab (ascii->char c) tokenize-whitespace #t)) ascii-whitespaces) (for-each (lambda (c) (set-char-tokenization! ttab c tokenize-constituent #f)) (string->list (string-append ".0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz"))) (set-char-tokenization! ttab #\" tokenize-string #t) ) (define (make-constituent! c ttab) (set-char-tokenization! ttab c (lambda (c port) (parse-atom (scan-atom c ttab port))) #f)) (define (tokenize-string c port) c ;ignored (let loop ((l '()) (i 0)) (let ((c (read-char port))) (cond ((eof-object? c) (error "end of file within a string")) ((char=? c #\\) (let ((c (read-char port))) (if (or (char=? c #\\) (char=? c #\")) (loop (cons c l) (+ i 1)) (error "invalid escaped character in string" c)))) ((char=? c #\") (reverse-list->string l i)) (else (loop (cons c l) (+ i 1))))))) ; Auxiliary for parse-atom and tokenize-string ;(define (reverse-list->string l n) ;In microcode? ; (let ((s (make-string n))) ; (do ((l l (cdr l)) ; (i (- n 1) (- i 1))) ; ((< i 0) s) ; (string-set! s i (car l)))))