sunet/scheme/xml/reader.scm

379 lines
14 KiB
Scheme

;; Token ::= Contents | Start-tag | End-tag | Eof
(define read-comments (make-parameter #f))
(define collapse-whitespace (make-parameter #f))
;; read-xml : [Input-port] -> Document
(define (read-xml . maybe-port)
(read-from-port (if (null? maybe-port) (current-input-port) (car maybe-port))))
;; read-from-port : Input-port -> Document
(define (read-from-port in)
(let*-values (((in pos) (positionify in))
((misc0 start) (read-misc in pos)))
(make-document (make-prolog misc0 #f)
(cond
((start-tag? start) (read-element start in pos))
((element? start) start)
(else (error 'read-xml "expected root element - received ~a" start)))
(let-values (((misc1 end-of-file) (read-misc in pos)))
(unless (eof-object? end-of-file)
(error 'read-xml "extra stuff at end of document ~a" end-of-file))
misc1))))
;; read-misc : Input-port (-> Location) -> (listof Misc) Token
(define (read-misc in pos)
(let read-more ()
(let ((x (lex in pos)))
(cond
((or (pi? x) (comment? x))
(let-values (((lst next) (read-more)))
(values (cons x lst) next)))
((and (pcdata? x) (andmap char-whitespace? (string->list (pcdata-string x))))
(read-more))
(else (values null x))))))
;; read-element : Start-tag Input-port (-> Location) -> Element
(define (read-element start in pos)
(let ((name (start-tag-name start))
(a (source-start start))
(b (source-stop start)))
(make-element
a b name (start-tag-attrs start)
(let read-content ()
(let ((x (lex in pos)))
(cond
((eof-object? x)
(error 'read-xml "unclosed ~a tag at [~a ~a]" name
(format-source a)
(format-source b)))
((start-tag? x) (cons (read-element x in pos) (read-content)))
((end-tag? x)
(unless (eq? name (end-tag-name x))
(error 'read-xml "start tag ~a at [~a ~a] doesn't match end tag ~a at [~a ~a]"
name
(format-source a)
(format-source b)
(end-tag-name x)
(format-source (source-start x))
(format-source (source-stop x))))
null)
((entity? x) (cons (expand-entity x) (read-content)))
((comment? x) (if (read-comments)
(cons x (read-content))
(read-content)))
(else (cons x (read-content)))))))))
;; expand-entity : Entity -> (U Entity Pcdata)
;; more here - allow expansion of user defined entities
(define (expand-entity x)
(let ((expanded (default-entity-table (entity-text x))))
(if expanded
(make-pcdata (source-start x) (source-stop x) expanded)
x)))
;; default-entity-table : Symbol -> (U #f String)
(define (default-entity-table name)
(case name
((amp) "&")
((lt) "<")
((gt) ">")
((quot) "\"")
((apos) "'")
(else #f)))
;; lex : Input-port (-> Location) -> Token
(define (lex in pos)
(let ((c (peek-char in)))
(cond
((eof-object? c) c)
((eq? c #\&) (lex-entity in pos))
((eq? c #\<) (lex-tag-cdata-pi-comment in pos))
(else (lex-pcdata in pos)))))
;; lex-entity : Input-port (-> Location) -> Entity
(define (lex-entity in pos)
(let ((start (pos)))
(read-char in)
(let ((data (case (peek-char in)
((#\#)
(read-char in)
(let ((n (case (peek-char in)
((#\x) (read-char in)
(string->number (read-until #\; in pos) 16))
(else (string->number (read-until #\; in pos))))))
(unless (number? n)
(lex-error in pos "malformed numeric entity"))
n))
(else
(begin0
(lex-name in pos)
(unless (eq? (read-char in) #\;)
(lex-error in pos "expected ; at the end of an entity")))))))
(make-entity start (pos) data))))
;; lex-tag-cdata-pi-comment : Input-port (-> Location) -> Start-tag | Element | End-tag | Pcdata | Pi | Comment
(define (lex-tag-cdata-pi-comment in pos)
(let ((start (pos)))
(read-char in)
(case (non-eof peek-char in pos)
((#\!)
(read-char in)
(case (non-eof peek-char in pos)
((#\-) (read-char in)
(unless (eq? (read-char in) #\-)
(lex-error in pos "expected second - after <!-"))
(let ((data (lex-comment-contents in pos)))
(unless (eq? (read-char in) #\>)
(lex-error in pos "expected > to end comment (\"--\" can't appear in comments)"))
;(make-comment start (pos) data)
(make-comment data)))
((#\[) (read-char in)
(unless (string=? (read-string 6 in) "CDATA[")
(lex-error in pos "expected CDATA following <["))
(let ((data (lex-cdata-contents in pos)))
(make-pcdata start (pos) data)))
(else (skip-dtd in pos)
(skip-space in)
(unless (eq? (peek-char in) #\<)
(lex-error in pos "expected pi, comment, or element after doctype"))
(lex-tag-cdata-pi-comment in pos))))
((#\?) (read-char in)
(let ((name (lex-name in pos)))
(skip-space in)
(let ((data (lex-pi-data in pos)))
(make-pi start (pos) name data))))
((#\/) (read-char in)
(let ((name (lex-name in pos)))
(skip-space in)
(unless (eq? (read-char in) #\>)
(lex-error in pos "expected > to close ~a's end tag" name))
(make-end-tag start (pos) name)))
(else
(let ((name (lex-name in pos))
(attrs (lex-attributes in pos)))
(skip-space in)
(case (read-char in)
((#\/)
(unless (eq? (read-char in) #\>)
(lex-error in pos "expected > to close empty element ~a" name))
(make-element start (pos) name attrs null))
((#\>) (make-start-tag start (pos) name attrs))
(else (lex-error in pos "expected / or > to close tag `~a'" name))))))))
;; lex-attributes : Input-port (-> Location) -> (listof Attribute)
(define (lex-attributes in pos)
(quicksort (let loop ()
(skip-space in)
(cond
((name-start? (peek-char in))
(cons (lex-attribute in pos) (loop)))
(else null)))
(lambda (a b)
(let ((na (attribute-name a))
(nb (attribute-name b)))
(cond
((eq? na nb) (lex-error in pos "duplicated attribute name ~a" na))
(else (string<? (symbol->string na) (symbol->string nb))))))))
;; lex-attribute : Input-port (-> Location) -> Attribute
(define (lex-attribute in pos)
(let ((start (pos))
(name (lex-name in pos)))
(skip-space in)
(unless (eq? (read-char in) #\=)
(lex-error in pos "expected = in attribute ~a" name))
(skip-space in)
;; more here - handle entites and disallow "<"
(let* ((delimiter (read-char in))
(value (case delimiter
((#\' #\")
(list->string
(let read-more ()
(let ((c (non-eof peek-char in pos)))
(cond
((eq? c delimiter) (read-char in) null)
((eq? c #\&)
(let ((entity (expand-entity (lex-entity in pos))))
(if (pcdata? entity)
(append (string->list (pcdata-string entity)) (read-more))
;; more here - do something with user defined entites
(read-more))))
(else (read-char in) (cons c (read-more))))))))
(else (lex-error in pos "attribute values must be in ''s or in \"\"s")))))
(make-attribute start (pos) name value))))
;; skip-space : Input-port -> Void
;; deviation - should sometimes insist on at least one space
(define (skip-space in)
(let loop ()
(let ((c (peek-char in)))
(when (and (not (eof-object? c)) (char-whitespace? c))
(read-char in)
(loop)))))
;; lex-pcdata : Input-port (-> Location) -> Pcdata
;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec
(define (lex-pcdata in pos)
(let ((start (pos))
(data (let loop ()
(let ((next (peek-char in)))
(cond
((or (eof-object? next) (eq? next #\&) (eq? next #\<))
null)
((and (char-whitespace? next) (collapse-whitespace))
(skip-space in)
(cons #\space (loop)))
(else (cons (read-char in) (loop))))))))
(make-pcdata start
(pos)
(list->string data))))
;; lex-name : Input-port (-> Location) -> Symbol
(define (lex-name in pos)
(let ((c (read-char in)))
(unless (name-start? c)
(lex-error in pos "expected name, received ~a" c))
(string->symbol
(list->string
(cons c (let lex-rest ()
(cond
((name-char? (peek-char in))
(cons (read-char in) (lex-rest)))
(else null))))))))
;; skip-dtd : Input-port (-> Location) -> Void
(define (skip-dtd in pos)
(let skip ()
(case (non-eof read-char in pos)
((#\') (read-until #\' in pos) (skip))
((#\") (read-until #\" in pos) (skip))
((#\<)
(case (non-eof read-char in pos)
((#\!) (case (non-eof read-char in pos)
((#\-) (read-char in) (lex-comment-contents in pos) (read-char in) (skip))
(else (skip) (skip))))
((#\?) (lex-pi-data in pos) (skip))
(else (skip) (skip))))
((#\>) (void))
(else (skip)))))
;; name-start? : Char -> Bool
(define (name-start? ch)
(or (char-alphabetic? ch)
(eq? ch #\_)
(eq? ch #\:)))
;; name-char? : Char -> Bool
(define (name-char? ch)
(or (name-start? ch)
(char-numeric? ch)
(eq? ch #\.)
(eq? ch #\-)))
;; read-until : Char Input-port (-> Location) -> String
;; discards the stop character, too
(define (read-until char in pos)
(list->string
(let read-more ()
(let ((c (non-eof read-char in pos)))
(cond
((eq? c char) null)
(else (cons c (read-more))))))))
;; non-eof : (Input-port -> (U Char Eof)) Input-port (-> Location) -> Char
(define (non-eof f in pos)
(let ((c (f in)))
(cond
((eof-object? c) (lex-error in pos "unexpected eof"))
(else c))))
;; gen-read-until-string : String -> Input-port (-> Location) -> String
;; uses Knuth-Morris-Pratt from
;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876
;; discards stop from input
(define (gen-read-until-string stop)
(let* ((len (string-length stop))
(prefix (make-vector len 0))
(fall-back
(lambda (k c)
(let ((k (let loop ((k k))
(cond
((and (> k 0) (not (eq? (string-ref stop k) c)))
(loop (vector-ref prefix (sub1 k))))
(else k)))))
(if (eq? (string-ref stop k) c)
(add1 k)
k)))))
(let init ((k 0) (q 1))
(when (< q len)
(let ((k (fall-back k (string-ref stop q))))
(vector-set! prefix q k)
(init k (add1 q)))))
;; (vector-ref prefix x) = the longest suffix that matches a prefix of stop
(lambda (in pos)
(list->string
(let/ec out
(let loop ((matched 0) (out out))
(let* ((c (non-eof read-char in pos))
(matched (fall-back matched c)))
(cond
((= matched len) (out null))
((zero? matched) (cons c (let/ec out (loop matched out))))
(else (cons c (loop matched out)))))))))))
;; "-->" makes more sense, but "--" follows the spec.
(define lex-comment-contents (gen-read-until-string "--"))
(define lex-pi-data (gen-read-until-string "?>"))
(define lex-cdata-contents (gen-read-until-string "]]>"))
;; positionify : Input-port -> Input-port (-> Location)
;; Well, this really depends on scsh-0.6 and should be replace by
;; big-scheme's more-port
;; For S48 you probably need to do something completely different
(define (positionify in)
(let ((line 1)
(char 0)
(offset 0)
(old-handler (port-handler in)))
(set-port-buffering in bufpol/block 1)
(let ((handler (make-buffered-input-port-handler
(port-handler-discloser old-handler)
(port-handler-close old-handler)
(lambda (data buffer start needed)
(let ((res
((port-handler-buffer-proc old-handler)
data buffer start needed)))
(if (number? res)
(begin
(set! char (add1 char))
(set! offset (add1 offset))
(let ((c (byte-vector-ref buffer 0)))
(when (= c 10)
(set! line (+ line 1))
(set! char -1)))))
res))
(port-handler-ready? old-handler)
(port-handler-steal old-handler))))
(set-port-handler! in handler)
(values in
(lambda ()
(make-location line char offset))))))
; (- n (- (port-limit in) (port-index in))))))))
;; lex-error : Input-port String (-> Location) TST* -> alpha
(define (lex-error in pos str . rest)
(error 'lex-error " at position:" (format-source (pos)) str rest))
;; format-source : Location -> string
;; to format the source location for an error message
(define (format-source loc)
(if (location? loc)
(format #f "~a.~a/~a" (location-line loc) (location-char loc) (location-offset loc))
(format #f "~a" loc)))