;; Token ::= Contents | Start-tag | End-tag | Eof (define read-comments (make-parameter #f)) (define collapse-whitespace (make-parameter #f)) ;; read-xml : [Input-port] -> Document (define (read-xml . maybe-port) (read-from-port (if (null? maybe-port) (current-input-port) (car maybe-port)))) ;; read-from-port : Input-port -> Document (define (read-from-port in) (let*-values (((in pos) (positionify in)) ((misc0 start) (read-misc in pos))) (make-document (make-prolog misc0 #f) (cond ((start-tag? start) (read-element start in pos)) ((element? start) start) (else (error 'read-xml "expected root element - received ~a" start))) (let-values (((misc1 end-of-file) (read-misc in pos))) (unless (eof-object? end-of-file) (error 'read-xml "extra stuff at end of document ~a" end-of-file)) misc1)))) ;; read-misc : Input-port (-> Location) -> (listof Misc) Token (define (read-misc in pos) (let read-more () (let ((x (lex in pos))) (cond ((or (pi? x) (comment? x)) (let-values (((lst next) (read-more))) (values (cons x lst) next))) ((and (pcdata? x) (andmap char-whitespace? (string->list (pcdata-string x)))) (read-more)) (else (values null x)))))) ;; read-element : Start-tag Input-port (-> Location) -> Element (define (read-element start in pos) (let ((name (start-tag-name start)) (a (source-start start)) (b (source-stop start))) (make-element a b name (start-tag-attrs start) (let read-content () (let ((x (lex in pos))) (cond ((eof-object? x) (error 'read-xml "unclosed ~a tag at [~a ~a]" name (format-source a) (format-source b))) ((start-tag? x) (cons (read-element x in pos) (read-content))) ((end-tag? x) (unless (eq? name (end-tag-name x)) (error 'read-xml "start tag ~a at [~a ~a] doesn't match end tag ~a at [~a ~a]" name (format-source a) (format-source b) (end-tag-name x) (format-source (source-start x)) (format-source (source-stop x)))) null) ((entity? x) (cons (expand-entity x) (read-content))) ((comment? x) (if (read-comments) (cons x (read-content)) (read-content))) (else (cons x (read-content))))))))) ;; expand-entity : Entity -> (U Entity Pcdata) ;; more here - allow expansion of user defined entities (define (expand-entity x) (let ((expanded (default-entity-table (entity-text x)))) (if expanded (make-pcdata (source-start x) (source-stop x) expanded) x))) ;; default-entity-table : Symbol -> (U #f String) (define (default-entity-table name) (case name ((amp) "&") ((lt) "<") ((gt) ">") ((quot) "\"") ((apos) "'") (else #f))) ;; lex : Input-port (-> Location) -> Token (define (lex in pos) (let ((c (peek-char in))) (cond ((eof-object? c) c) ((eq? c #\&) (lex-entity in pos)) ((eq? c #\<) (lex-tag-cdata-pi-comment in pos)) (else (lex-pcdata in pos))))) ;; lex-entity : Input-port (-> Location) -> Entity (define (lex-entity in pos) (let ((start (pos))) (read-char in) (let ((data (case (peek-char in) ((#\#) (read-char in) (let ((n (case (peek-char in) ((#\x) (read-char in) (string->number (read-until #\; in pos) 16)) (else (string->number (read-until #\; in pos)))))) (unless (number? n) (lex-error in pos "malformed numeric entity")) n)) (else (begin0 (lex-name in pos) (unless (eq? (read-char in) #\;) (lex-error in pos "expected ; at the end of an entity"))))))) (make-entity start (pos) data)))) ;; lex-tag-cdata-pi-comment : Input-port (-> Location) -> Start-tag | Element | End-tag | Pcdata | Pi | Comment (define (lex-tag-cdata-pi-comment in pos) (let ((start (pos))) (read-char in) (case (non-eof peek-char in pos) ((#\!) (read-char in) (case (non-eof peek-char in pos) ((#\-) (read-char in) (unless (eq? (read-char in) #\-) (lex-error in pos "expected second - after ) (lex-error in pos "expected > to end comment (\"--\" can't appear in comments)")) ;(make-comment start (pos) data) (make-comment data))) ((#\[) (read-char in) (unless (string=? (read-string 6 in) "CDATA[") (lex-error in pos "expected CDATA following <[")) (let ((data (lex-cdata-contents in pos))) (make-pcdata start (pos) data))) (else (skip-dtd in pos) (skip-space in) (unless (eq? (peek-char in) #\<) (lex-error in pos "expected pi, comment, or element after doctype")) (lex-tag-cdata-pi-comment in pos)))) ((#\?) (read-char in) (let ((name (lex-name in pos))) (skip-space in) (let ((data (lex-pi-data in pos))) (make-pi start (pos) name data)))) ((#\/) (read-char in) (let ((name (lex-name in pos))) (skip-space in) (unless (eq? (read-char in) #\>) (lex-error in pos "expected > to close ~a's end tag" name)) (make-end-tag start (pos) name))) (else (let ((name (lex-name in pos)) (attrs (lex-attributes in pos))) (skip-space in) (case (read-char in) ((#\/) (unless (eq? (read-char in) #\>) (lex-error in pos "expected > to close empty element ~a" name)) (make-element start (pos) name attrs null)) ((#\>) (make-start-tag start (pos) name attrs)) (else (lex-error in pos "expected / or > to close tag `~a'" name)))))))) ;; lex-attributes : Input-port (-> Location) -> (listof Attribute) (define (lex-attributes in pos) (quicksort (let loop () (skip-space in) (cond ((name-start? (peek-char in)) (cons (lex-attribute in pos) (loop))) (else null))) (lambda (a b) (let ((na (attribute-name a)) (nb (attribute-name b))) (cond ((eq? na nb) (lex-error in pos "duplicated attribute name ~a" na)) (else (stringstring na) (symbol->string nb)))))))) ;; lex-attribute : Input-port (-> Location) -> Attribute (define (lex-attribute in pos) (let ((start (pos)) (name (lex-name in pos))) (skip-space in) (unless (eq? (read-char in) #\=) (lex-error in pos "expected = in attribute ~a" name)) (skip-space in) ;; more here - handle entites and disallow "<" (let* ((delimiter (read-char in)) (value (case delimiter ((#\' #\") (list->string (let read-more () (let ((c (non-eof peek-char in pos))) (cond ((eq? c delimiter) (read-char in) null) ((eq? c #\&) (let ((entity (expand-entity (lex-entity in pos)))) (if (pcdata? entity) (append (string->list (pcdata-string entity)) (read-more)) ;; more here - do something with user defined entites (read-more)))) (else (read-char in) (cons c (read-more)))))))) (else (lex-error in pos "attribute values must be in ''s or in \"\"s"))))) (make-attribute start (pos) name value)))) ;; skip-space : Input-port -> Void ;; deviation - should sometimes insist on at least one space (define (skip-space in) (let loop () (let ((c (peek-char in))) (when (and (not (eof-object? c)) (char-whitespace? c)) (read-char in) (loop))))) ;; lex-pcdata : Input-port (-> Location) -> Pcdata ;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec (define (lex-pcdata in pos) (let ((start (pos)) (data (let loop () (let ((next (peek-char in))) (cond ((or (eof-object? next) (eq? next #\&) (eq? next #\<)) null) ((and (char-whitespace? next) (collapse-whitespace)) (skip-space in) (cons #\space (loop))) (else (cons (read-char in) (loop)))))))) (make-pcdata start (pos) (list->string data)))) ;; lex-name : Input-port (-> Location) -> Symbol (define (lex-name in pos) (let ((c (read-char in))) (unless (name-start? c) (lex-error in pos "expected name, received ~a" c)) (string->symbol (list->string (cons c (let lex-rest () (cond ((name-char? (peek-char in)) (cons (read-char in) (lex-rest))) (else null)))))))) ;; skip-dtd : Input-port (-> Location) -> Void (define (skip-dtd in pos) (let skip () (case (non-eof read-char in pos) ((#\') (read-until #\' in pos) (skip)) ((#\") (read-until #\" in pos) (skip)) ((#\<) (case (non-eof read-char in pos) ((#\!) (case (non-eof read-char in pos) ((#\-) (read-char in) (lex-comment-contents in pos) (read-char in) (skip)) (else (skip) (skip)))) ((#\?) (lex-pi-data in pos) (skip)) (else (skip) (skip)))) ((#\>) (void)) (else (skip))))) ;; name-start? : Char -> Bool (define (name-start? ch) (or (char-alphabetic? ch) (eq? ch #\_) (eq? ch #\:))) ;; name-char? : Char -> Bool (define (name-char? ch) (or (name-start? ch) (char-numeric? ch) (eq? ch #\.) (eq? ch #\-))) ;; read-until : Char Input-port (-> Location) -> String ;; discards the stop character, too (define (read-until char in pos) (list->string (let read-more () (let ((c (non-eof read-char in pos))) (cond ((eq? c char) null) (else (cons c (read-more)))))))) ;; non-eof : (Input-port -> (U Char Eof)) Input-port (-> Location) -> Char (define (non-eof f in pos) (let ((c (f in))) (cond ((eof-object? c) (lex-error in pos "unexpected eof")) (else c)))) ;; gen-read-until-string : String -> Input-port (-> Location) -> String ;; uses Knuth-Morris-Pratt from ;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876 ;; discards stop from input (define (gen-read-until-string stop) (let* ((len (string-length stop)) (prefix (make-vector len 0)) (fall-back (lambda (k c) (let ((k (let loop ((k k)) (cond ((and (> k 0) (not (eq? (string-ref stop k) c))) (loop (vector-ref prefix (sub1 k)))) (else k))))) (if (eq? (string-ref stop k) c) (add1 k) k))))) (let init ((k 0) (q 1)) (when (< q len) (let ((k (fall-back k (string-ref stop q)))) (vector-set! prefix q k) (init k (add1 q))))) ;; (vector-ref prefix x) = the longest suffix that matches a prefix of stop (lambda (in pos) (list->string (let/ec out (let loop ((matched 0) (out out)) (let* ((c (non-eof read-char in pos)) (matched (fall-back matched c))) (cond ((= matched len) (out null)) ((zero? matched) (cons c (let/ec out (loop matched out)))) (else (cons c (loop matched out))))))))))) ;; "-->" makes more sense, but "--" follows the spec. (define lex-comment-contents (gen-read-until-string "--")) (define lex-pi-data (gen-read-until-string "?>")) (define lex-cdata-contents (gen-read-until-string "]]>")) ;; positionify : Input-port -> Input-port (-> Location) ;; Well, this really depends on scsh-0.6 and should be replace by ;; big-scheme's more-port ;; For S48 you probably need to do something completely different (define (positionify in) (let ((line 1) (char 0) (offset 0) (old-handler (port-handler in))) (set-port-buffering in bufpol/block 1) (let ((handler (make-buffered-input-port-handler (port-handler-discloser old-handler) (port-handler-close old-handler) (lambda (data buffer start needed) (let ((res ((port-handler-buffer-proc old-handler) data buffer start needed))) (if (number? res) (begin (set! char (add1 char)) (set! offset (add1 offset)) (let ((c (byte-vector-ref buffer 0))) (when (= c 10) (set! line (+ line 1)) (set! char -1))))) res)) (port-handler-ready? old-handler) (port-handler-steal old-handler)))) (set-port-handler! in handler) (values in (lambda () (make-location line char offset)))))) ; (- n (- (port-limit in) (port-index in)))))))) ;; lex-error : Input-port String (-> Location) TST* -> alpha (define (lex-error in pos str . rest) (error 'lex-error " at position:" (format-source (pos)) str rest)) ;; format-source : Location -> string ;; to format the source location for an error message (define (format-source loc) (if (location? loc) (format #f "~a.~a/~a" (location-line loc) (location-char loc) (location-offset loc)) (format #f "~a" loc)))