sunet/scheme/xml/reader.scm

469 lines
16 KiB
Scheme

; Taken directly from the SRFI document.
(define-syntax let-values
(syntax-rules ()
((let-values (?binding ...) ?body0 ?body1 ...)
(let-values "bind" (?binding ...) () (begin ?body0 ?body1 ...)))
((let-values "bind" () ?tmps ?body)
(let ?tmps ?body))
((let-values "bind" ((?b0 ?e0) ?binding ...) ?tmps ?body)
(let-values "mktmp" ?b0 ?e0 () (?binding ...) ?tmps ?body))
((let-values "mktmp" () ?e0 ?args ?bindings ?tmps ?body)
(call-with-values
(lambda () ?e0)
(lambda ?args
(let-values "bind" ?bindings ?tmps ?body))))
((let-values "mktmp" (?a . ?b) ?e0 (?arg ...) ?bindings (?tmp ...) ?body)
(let-values "mktmp" ?b ?e0 (?arg ... x) ?bindings (?tmp ... (?a x)) ?body))
((let-values "mktmp" ?a ?e0 (?arg ...) ?bindings (?tmp ...) ?body)
(call-with-values
(lambda () ?e0)
(lambda (?arg ... . x)
(let-values "bind" ?bindings (?tmp ... (?a x)) ?body))))))
(define-syntax let*-values
(syntax-rules ()
((let*-values () ?body0 ?body1 ...)
(begin ?body0 ?body1 ...))
((let*-values (?binding0 ?binding1 ...) ?body0 ?body1 ...)
(let-values (?binding0)
(let*-values (?binding1 ...) ?body0 ?body1 ...)))))
(define (add1 x) (+ x 1))
(define-syntax when
(syntax-rules
()
((when test expr ...)
(if test (begin expr ...)))))
(define-syntax unless
(syntax-rules
()
((unless test expr ...)
(if (not test) (begin expr ...)))))
(define (sub1 x)
(- x 1))
(define (void . a)
(if #f #f))
(define-syntax begin0
(syntax-rules
()
((begin0 expr1 expr ...)
(let ((r expr1))
(begin expr ...)
r))))
(define andmap
(lambda (f list0 . lists)
(if (null? list0)
(and)
(let loop ((lists (cons list0 lists)))
(if (null? (cdr (car lists)))
(apply f (map car lists))
(and (apply f (map car lists))
(loop (map cdr lists))))))))
(define null '())
; stolen from mzlib/functior.ss
(define (quicksort l less-than)
(let* ((v (list->vector l))
(count (vector-length v)))
(let loop ((min 0)(max count))
(if (< min (sub1 max))
(let ((pval (vector-ref v min)))
(let pivot-loop ((pivot min)
(pos (add1 min)))
(if (< pos max)
(let ((cval (vector-ref v pos)))
(if (less-than cval pval)
(begin
(vector-set! v pos (vector-ref v pivot))
(vector-set! v pivot cval)
(pivot-loop (add1 pivot) (add1 pos)))
(pivot-loop pivot (add1 pos))))
(if (= min pivot)
(loop (add1 pivot) max)
(begin
(loop min pivot)
(loop pivot max))))))))
(vector->list v)))
;;; HACK!
(define call/ec call-with-current-continuation)
(define-syntax let/ec
(syntax-rules
()
((let/ec k expr ...)
(call-with-current-continuation (lambda (k) expr ...)))))
;;; HACK!
(define (make-parameter val)
(lambda ()
val))
;;;;;;;;;;;
;; Token ::= Contents | Start-tag | End-tag | Eof
(define read-comments (make-parameter #f))
(define collapse-whitespace (make-parameter #f))
;; read-xml : [Input-port] -> Document
(define (read-xml . maybe-port)
(read-from-port (if (null? maybe-port) (current-input-port) (car maybe-port))))
;; read-from-port : Input-port -> Document
(define (read-from-port in)
(let*-values (((in pos) (positionify in))
((misc0 start) (read-misc in pos)))
(make-document (make-prolog misc0 #f)
(cond
((start-tag? start) (read-element start in pos))
((element? start) start)
(else (error 'read-xml "expected root element - received ~a" start)))
(let-values (((misc1 end-of-file) (read-misc in pos)))
(unless (eof-object? end-of-file)
(error 'read-xml "extra stuff at end of document ~a" end-of-file))
misc1))))
;; read-misc : Input-port (-> Nat) -> (listof Misc) Token
(define (read-misc in pos)
(let read-more ()
(let ((x (lex in pos)))
(cond
((or (pi? x) (comment? x))
(let-values (((lst next) (read-more)))
(values (cons x lst) next)))
((and (pcdata? x) (andmap char-whitespace? (string->list (pcdata-string x))))
(read-more))
(else (values null x))))))
;; read-element : Start-tag Input-port (-> Nat) -> Element
(define (read-element start in pos)
(let ((name (start-tag-name start))
(a (source-start start))
(b (source-stop start)))
(make-element
a b name (start-tag-attrs start)
(let read-content ()
(let ((x (lex in pos)))
(cond
((eof-object? x)
(error 'read-xml "unclosed ~a tag at [~a ~a]" name a b))
((start-tag? x) (cons (read-element x in pos) (read-content)))
((end-tag? x)
(unless (eq? name (end-tag-name x))
(error 'read-xml "start tag ~a at [~a ~a] doesn't match end tag ~a at [~a ~a]"
name a b (end-tag-name x) (source-start x) (source-stop x)))
null)
((entity? x) (cons (expand-entity x) (read-content)))
((comment? x) (if (read-comments)
(cons x (read-content))
(read-content)))
(else (cons x (read-content)))))))))
;; expand-entity : Entity -> (U Entity Pcdata)
;; more here - allow expansion of user defined entities
(define (expand-entity x)
(let ((expanded (default-entity-table (entity-text x))))
(if expanded
(make-pcdata (source-start x) (source-stop x) expanded)
x)))
;; default-entity-table : Symbol -> (U #f String)
(define (default-entity-table name)
(case name
((amp) "&")
((lt) "<")
((gt) ">")
((quot) "\"")
((apos) "'")
(else #f)))
;; lex : Input-port (-> Nat) -> Token
(define (lex in pos)
(let ((c (peek-char in)))
(cond
((eof-object? c) c)
((eq? c #\&) (lex-entity in pos))
((eq? c #\<) (lex-tag-cdata-pi-comment in pos))
(else (lex-pcdata in pos)))))
;; lex-entity : Input-port (-> Nat) -> Entity
(define (lex-entity in pos)
(let ((start (pos)))
(read-char in)
(let ((data (case (peek-char in)
((#\#)
(read-char in)
(let ((n (case (peek-char in)
((#\x) (read-char in)
(string->number (read-until #\; in pos) 16))
(else (string->number (read-until #\; in pos))))))
(unless (number? n)
(lex-error in pos "malformed numeric entity"))
n))
(else
(begin0
(lex-name in pos)
(unless (eq? (read-char in) #\;)
(lex-error in pos "expected ; at the end of an entity")))))))
(make-entity start (pos) data))))
;; lex-tag-cdata-pi-comment : Input-port (-> Nat) -> Start-tag | Element | End-tag | Pcdata | Pi | Comment
(define (lex-tag-cdata-pi-comment in pos)
(let ((start (pos)))
(read-char in)
(case (non-eof peek-char in pos)
((#\!)
(read-char in)
(case (non-eof peek-char in pos)
((#\-) (read-char in)
(unless (eq? (read-char in) #\-)
(lex-error in pos "expected second - after <!-"))
(let ((data (lex-comment-contents in pos)))
(unless (eq? (read-char in) #\>)
(lex-error in pos "expected > to end comment (\"--\" can't appear in comments)"))
;(make-comment start (pos) data)
(make-comment data)))
((#\[) (read-char in)
(unless (string=? (read-string 6 in) "CDATA[")
(lex-error in pos "expected CDATA following <["))
(let ((data (lex-cdata-contents in pos)))
(make-pcdata start (pos) data)))
(else (skip-dtd in pos)
(skip-space in)
(unless (eq? (peek-char in) #\<)
(lex-error in pos "expected pi, comment, or element after doctype"))
(lex-tag-cdata-pi-comment in pos))))
((#\?) (read-char in)
(let ((name (lex-name in pos)))
(skip-space in)
(let ((data (lex-pi-data in pos)))
(make-pi start (pos) name data))))
((#\/) (read-char in)
(let ((name (lex-name in pos)))
(skip-space in)
(unless (eq? (read-char in) #\>)
(lex-error in pos "expected > to close ~a's end tag" name))
(make-end-tag start (pos) name)))
(else
(let ((name (lex-name in pos))
(attrs (lex-attributes in pos)))
(skip-space in)
(case (read-char in)
((#\/)
(unless (eq? (read-char in) #\>)
(lex-error in pos "expected > to close empty element ~a" name))
(make-element start (pos) name attrs null))
((#\>) (make-start-tag start (pos) name attrs))
(else (lex-error in pos "expected / or > to close tag ~a" name))))))))
;; lex-attributes : Input-port (-> Nat) -> (listof Attribute)
(define (lex-attributes in pos)
(quicksort (let loop ()
(skip-space in)
(cond
((name-start? (peek-char in))
(cons (lex-attribute in pos) (loop)))
(else null)))
(lambda (a b)
(let ((na (attribute-name a))
(nb (attribute-name b)))
(cond
((eq? na nb) (lex-error in pos "duplicated attribute name ~a" na))
(else (string<? (symbol->string na) (symbol->string nb))))))))
;; lex-attribute : Input-port (-> Nat) -> Attribute
(define (lex-attribute in pos)
(let ((start (pos))
(name (lex-name in pos)))
(skip-space in)
(unless (eq? (read-char in) #\=)
(lex-error in pos "expected = in attribute ~a" name))
(skip-space in)
;; more here - handle entites and disallow "<"
(let* ((delimiter (read-char in))
(value (case delimiter
((#\' #\")
(list->string
(let read-more ()
(let ((c (non-eof peek-char in pos)))
(cond
((eq? c delimiter) (read-char in) null)
((eq? c #\&)
(let ((entity (expand-entity (lex-entity in pos))))
(if (pcdata? entity)
(append (string->list (pcdata-string entity)) (read-more))
;; more here - do something with user defined entites
(read-more))))
(else (read-char in) (cons c (read-more))))))))
(else (lex-error in pos "attribute values must be in ''s or in \"\"s")))))
(make-attribute start (pos) name value))))
;; skip-space : Input-port -> Void
;; deviation - should sometimes insist on at least one space
(define (skip-space in)
(let loop ()
(let ((c (peek-char in)))
(when (and (not (eof-object? c)) (char-whitespace? c))
(read-char in)
(loop)))))
;; lex-pcdata : Input-port (-> Nat) -> Pcdata
;; deviation - disallow ]]> "for compatability" with SGML, sec 2.4 XML spec
(define (lex-pcdata in pos)
(let ((start (pos))
(data (let loop ()
(let ((next (peek-char in)))
(cond
((or (eof-object? next) (eq? next #\&) (eq? next #\<))
null)
((and (char-whitespace? next) (collapse-whitespace))
(skip-space in)
(cons #\space (loop)))
(else (cons (read-char in) (loop))))))))
(make-pcdata start
(pos)
(list->string data))))
;; lex-name : Input-port (-> Nat) -> Symbol
(define (lex-name in pos)
(let ((c (read-char in)))
(unless (name-start? c)
(lex-error in pos "expected name, received ~a" c))
(string->symbol
(list->string
(cons c (let lex-rest ()
(cond
((name-char? (peek-char in))
(cons (read-char in) (lex-rest)))
(else null))))))))
;; skip-dtd : Input-port (-> Nat) -> Void
(define (skip-dtd in pos)
(let skip ()
(case (non-eof read-char in pos)
((#\') (read-until #\' in pos) (skip))
((#\") (read-until #\" in pos) (skip))
((#\<)
(case (non-eof read-char in pos)
((#\!) (case (non-eof read-char in pos)
((#\-) (read-char in) (lex-comment-contents in pos) (read-char in) (skip))
(else (skip) (skip))))
((#\?) (lex-pi-data in pos) (skip))
(else (skip) (skip))))
((#\>) (void))
(else (skip)))))
;; name-start? : Char -> Bool
(define (name-start? ch)
(or (char-alphabetic? ch)
(eq? ch #\_)
(eq? ch #\:)))
;; name-char? : Char -> Bool
(define (name-char? ch)
(or (name-start? ch)
(char-numeric? ch)
(eq? ch #\.)
(eq? ch #\-)))
;; read-until : Char Input-port (-> Nat) -> String
;; discards the stop character, too
(define (read-until char in pos)
(list->string
(let read-more ()
(let ((c (non-eof read-char in pos)))
(cond
((eq? c char) null)
(else (cons c (read-more))))))))
;; non-eof : (Input-port -> (U Char Eof)) Input-port (-> Nat) -> Char
(define (non-eof f in pos)
(let ((c (f in)))
(cond
((eof-object? c) (lex-error in pos "unexpected eof"))
(else c))))
;; gen-read-until-string : String -> Input-port (-> Nat) -> String
;; uses Knuth-Morris-Pratt from
;; Introduction to Algorithms, Cormen, Leiserson, and Rivest, pages 869-876
;; discards stop from input
(define (gen-read-until-string stop)
(let* ((len (string-length stop))
(prefix (make-vector len 0))
(fall-back
(lambda (k c)
(let ((k (let loop ((k k))
(cond
((and (> k 0) (not (eq? (string-ref stop k) c)))
(loop (vector-ref prefix (sub1 k))))
(else k)))))
(if (eq? (string-ref stop k) c)
(add1 k)
k)))))
(let init ((k 0) (q 1))
(when (< q len)
(let ((k (fall-back k (string-ref stop q))))
(vector-set! prefix q k)
(init k (add1 q)))))
;; (vector-ref prefix x) = the longest suffix that matches a prefix of stop
(lambda (in pos)
(list->string
(let/ec out
(let loop ((matched 0) (out out))
(let* ((c (non-eof read-char in pos))
(matched (fall-back matched c)))
(cond
((= matched len) (out null))
((zero? matched) (cons c (let/ec out (loop matched out))))
(else (cons c (loop matched out)))))))))))
;; "-->" makes more sense, but "--" follows the spec.
(define lex-comment-contents (gen-read-until-string "--"))
(define lex-pi-data (gen-read-until-string "?>"))
(define lex-cdata-contents (gen-read-until-string "]]>"))
;; positionify : Input-port -> Input-port (-> Nat)
;; Well, this really depends on scsh-0.6
;; For S48 you probably need to do something completely different
(define (positionify in)
(let ((n 0); port-limit as absolute value
(old-handler (port-handler in)))
(let ((handler (make-buffered-input-port-handler
(port-handler-discloser old-handler)
(port-handler-close old-handler)
(lambda (data buffer start needed)
(let ((res
((port-handler-buffer-proc old-handler)
data buffer start needed)))
(if (number? res)
(set! n (+ n res)))
res))
(port-handler-ready? old-handler)
(port-handler-steal old-handler))))
(set-port-handler! in handler)
(values in
(lambda ()
(- n (- (port-limit in) (port-index in))))))))
;; lex-error : Input-port String (-> Nat) TST* -> alpha
(define (lex-error in pos str . rest)
(error 'lex-error " at positon:" (pos) str rest))