ikarus/benchmarks/src/earley.scm

652 lines
27 KiB
Scheme

;;; EARLEY -- Earley's parser, written by Marc Feeley.
; (make-parser grammar lexer) is used to create a parser from the grammar
; description `grammar' and the lexer function `lexer'.
;
; A grammar is a list of definitions. Each definition defines a non-terminal
; by a set of rules. Thus a definition has the form: (nt rule1 rule2...).
; A given non-terminal can only be defined once. The first non-terminal
; defined is the grammar's goal. Each rule is a possibly empty list of
; non-terminals. Thus a rule has the form: (nt1 nt2...). A non-terminal
; can be any scheme value. Note that all grammar symbols are treated as
; non-terminals. This is fine though because the lexer will be outputing
; non-terminals.
;
; The lexer defines what a token is and the mapping between tokens and
; the grammar's non-terminals. It is a function of one argument, the input,
; that returns the list of tokens corresponding to the input. Each token is
; represented by a list. The first element is some `user-defined' information
; associated with the token and the rest represents the token's class(es) (as a
; list of non-terminals that this token corresponds to).
;
; The result of `make-parser' is a function that parses the single input it
; is given into the grammar's goal. The result is a `parse' which can be
; manipulated with the procedures: `parse->parsed?', `parse->trees'
; and `parse->nb-trees' (see below).
;
; Let's assume that we want a parser for the grammar
;
; S -> x = E
; E -> E + E | V
; V -> V y |
;
; and that the input to the parser is a string of characters. Also, assume we
; would like to map the characters `x', `y', `+' and `=' into the corresponding
; non-terminals in the grammar. Such a parser could be created with
;
; (make-parser
; '(
; (s (x = e))
; (e (e + e) (v))
; (v (v y) ())
; )
; (lambda (str)
; (map (lambda (char)
; (list char ; user-info = the character itself
; (case char
; ((#\x) 'x)
; ((#\y) 'y)
; ((#\+) '+)
; ((#\=) '=)
; (else (fatal-error "lexer error")))))
; (string->list str)))
; )
;
; An alternative definition (that does not check for lexical errors) is
;
; (make-parser
; '(
; (s (#\x #\= e))
; (e (e #\+ e) (v))
; (v (v #\y) ())
; )
; (lambda (str) (map (lambda (char) (list char char)) (string->list str)))
; )
;
; To help with the rest of the discussion, here are a few definitions:
;
; An input pointer (for an input of `n' tokens) is a value between 0 and `n'.
; It indicates a point between two input tokens (0 = beginning, `n' = end).
; For example, if `n' = 4, there are 5 input pointers:
;
; input token1 token2 token3 token4
; input pointers 0 1 2 3 4
;
; A configuration indicates the extent to which a given rule is parsed (this
; is the common `dot notation'). For simplicity, a configuration is
; represented as an integer, with successive configurations in the same
; rule associated with successive integers. It is assumed that the grammar
; has been extended with rules to aid scanning. These rules are of the
; form `nt ->', and there is one such rule for every non-terminal. Note
; that these rules are special because they only apply when the corresponding
; non-terminal is returned by the lexer.
;
; A configuration set is a configuration grouped with the set of input pointers
; representing where the head non-terminal of the configuration was predicted.
;
; Here are the rules and configurations for the grammar given above:
;
; S -> . \
; 0 |
; x -> . |
; 1 |
; = -> . |
; 2 |
; E -> . |
; 3 > special rules (for scanning)
; + -> . |
; 4 |
; V -> . |
; 5 |
; y -> . |
; 6 /
; S -> . x . = . E .
; 7 8 9 10
; E -> . E . + . E .
; 11 12 13 14
; E -> . V .
; 15 16
; V -> . V . y .
; 17 18 19
; V -> .
; 20
;
; Starters of the non-terminal `nt' are configurations that are leftmost
; in a non-special rule for `nt'. Enders of the non-terminal `nt' are
; configurations that are rightmost in any rule for `nt'. Predictors of the
; non-terminal `nt' are configurations that are directly to the left of `nt'
; in any rule.
;
; For the grammar given above,
;
; Starters of V = (17 20)
; Enders of V = (5 19 20)
; Predictors of V = (15 17)
(define (make-parser grammar lexer)
(define (non-terminals grammar) ; return vector of non-terminals in grammar
(define (add-nt nt nts)
(if (member nt nts) nts (cons nt nts))) ; use equal? for equality tests
(let def-loop ((defs grammar) (nts '()))
(if (pair? defs)
(let* ((def (car defs))
(head (car def)))
(let rule-loop ((rules (cdr def))
(nts (add-nt head nts)))
(if (pair? rules)
(let ((rule (car rules)))
(let loop ((l rule) (nts nts))
(if (pair? l)
(let ((nt (car l)))
(loop (cdr l) (add-nt nt nts)))
(rule-loop (cdr rules) nts))))
(def-loop (cdr defs) nts))))
(list->vector (reverse nts))))) ; goal non-terminal must be at index 0
(define (ind nt nts) ; return index of non-terminal `nt' in `nts'
(let loop ((i (- (vector-length nts) 1)))
(if (>= i 0)
(if (equal? (vector-ref nts i) nt) i (loop (- i 1)))
#f)))
(define (nb-configurations grammar) ; return nb of configurations in grammar
(let def-loop ((defs grammar) (nb-confs 0))
(if (pair? defs)
(let ((def (car defs)))
(let rule-loop ((rules (cdr def)) (nb-confs nb-confs))
(if (pair? rules)
(let ((rule (car rules)))
(let loop ((l rule) (nb-confs nb-confs))
(if (pair? l)
(loop (cdr l) (+ nb-confs 1))
(rule-loop (cdr rules) (+ nb-confs 1)))))
(def-loop (cdr defs) nb-confs))))
nb-confs)))
; First, associate a numeric identifier to every non-terminal in the
; grammar (with the goal non-terminal associated with 0).
;
; So, for the grammar given above we get:
;
; s -> 0 x -> 1 = -> 4 e ->3 + -> 4 v -> 5 y -> 6
(let* ((nts (non-terminals grammar)) ; id map = list of non-terms
(nb-nts (vector-length nts)) ; the number of non-terms
(nb-confs (+ (nb-configurations grammar) nb-nts)) ; the nb of confs
(starters (make-vector nb-nts '())) ; starters for every non-term
(enders (make-vector nb-nts '())) ; enders for every non-term
(predictors (make-vector nb-nts '())) ; predictors for every non-term
(steps (make-vector nb-confs #f)) ; what to do in a given conf
(names (make-vector nb-confs #f))) ; name of rules
(define (setup-tables grammar nts starters enders predictors steps names)
(define (add-conf conf nt nts class)
(let ((i (ind nt nts)))
(vector-set! class i (cons conf (vector-ref class i)))))
(let ((nb-nts (vector-length nts)))
(let nt-loop ((i (- nb-nts 1)))
(if (>= i 0)
(begin
(vector-set! steps i (- i nb-nts))
(vector-set! names i (list (vector-ref nts i) 0))
(vector-set! enders i (list i))
(nt-loop (- i 1)))))
(let def-loop ((defs grammar) (conf (vector-length nts)))
(if (pair? defs)
(let* ((def (car defs))
(head (car def)))
(let rule-loop ((rules (cdr def)) (conf conf) (rule-num 1))
(if (pair? rules)
(let ((rule (car rules)))
(vector-set! names conf (list head rule-num))
(add-conf conf head nts starters)
(let loop ((l rule) (conf conf))
(if (pair? l)
(let ((nt (car l)))
(vector-set! steps conf (ind nt nts))
(add-conf conf nt nts predictors)
(loop (cdr l) (+ conf 1)))
(begin
(vector-set! steps conf (- (ind head nts) nb-nts))
(add-conf conf head nts enders)
(rule-loop (cdr rules) (+ conf 1) (+ rule-num 1))))))
(def-loop (cdr defs) conf))))))))
; Now, for each non-terminal, compute the starters, enders and predictors and
; the names and steps tables.
(setup-tables grammar nts starters enders predictors steps names)
; Build the parser description
(let ((parser-descr (vector lexer
nts
starters
enders
predictors
steps
names)))
(lambda (input)
(define (ind nt nts) ; return index of non-terminal `nt' in `nts'
(let loop ((i (- (vector-length nts) 1)))
(if (>= i 0)
(if (equal? (vector-ref nts i) nt) i (loop (- i 1)))
#f)))
(define (comp-tok tok nts) ; transform token to parsing format
(let loop ((l1 (cdr tok)) (l2 '()))
(if (pair? l1)
(let ((i (ind (car l1) nts)))
(if i
(loop (cdr l1) (cons i l2))
(loop (cdr l1) l2)))
(cons (car tok) (reverse l2)))))
(define (input->tokens input lexer nts)
(list->vector (map (lambda (tok) (comp-tok tok nts)) (lexer input))))
(define (make-states nb-toks nb-confs)
(let ((states (make-vector (+ nb-toks 1) #f)))
(let loop ((i nb-toks))
(if (>= i 0)
(let ((v (make-vector (+ nb-confs 1) #f)))
(vector-set! v 0 -1)
(vector-set! states i v)
(loop (- i 1)))
states))))
(define (conf-set-get state conf)
(vector-ref state (+ conf 1)))
(define (conf-set-get* state state-num conf)
(let ((conf-set (conf-set-get state conf)))
(if conf-set
conf-set
(let ((conf-set (make-vector (+ state-num 6) #f)))
(vector-set! conf-set 1 -3) ; old elems tail (points to head)
(vector-set! conf-set 2 -1) ; old elems head
(vector-set! conf-set 3 -1) ; new elems tail (points to head)
(vector-set! conf-set 4 -1) ; new elems head
(vector-set! state (+ conf 1) conf-set)
conf-set))))
(define (conf-set-merge-new! conf-set)
(vector-set! conf-set
(+ (vector-ref conf-set 1) 5)
(vector-ref conf-set 4))
(vector-set! conf-set 1 (vector-ref conf-set 3))
(vector-set! conf-set 3 -1)
(vector-set! conf-set 4 -1))
(define (conf-set-head conf-set)
(vector-ref conf-set 2))
(define (conf-set-next conf-set i)
(vector-ref conf-set (+ i 5)))
(define (conf-set-member? state conf i)
(let ((conf-set (vector-ref state (+ conf 1))))
(if conf-set
(conf-set-next conf-set i)
#f)))
(define (conf-set-adjoin state conf-set conf i)
(let ((tail (vector-ref conf-set 3))) ; put new element at tail
(vector-set! conf-set (+ i 5) -1)
(vector-set! conf-set (+ tail 5) i)
(vector-set! conf-set 3 i)
(if (< tail 0)
(begin
(vector-set! conf-set 0 (vector-ref state 0))
(vector-set! state 0 conf)))))
(define (conf-set-adjoin* states state-num l i)
(let ((state (vector-ref states state-num)))
(let loop ((l1 l))
(if (pair? l1)
(let* ((conf (car l1))
(conf-set (conf-set-get* state state-num conf)))
(if (not (conf-set-next conf-set i))
(begin
(conf-set-adjoin state conf-set conf i)
(loop (cdr l1)))
(loop (cdr l1))))))))
(define (conf-set-adjoin** states states* state-num conf i)
(let ((state (vector-ref states state-num)))
(if (conf-set-member? state conf i)
(let* ((state* (vector-ref states* state-num))
(conf-set* (conf-set-get* state* state-num conf)))
(if (not (conf-set-next conf-set* i))
(conf-set-adjoin state* conf-set* conf i))
#t)
#f)))
(define (conf-set-union state conf-set conf other-set)
(let loop ((i (conf-set-head other-set)))
(if (>= i 0)
(if (not (conf-set-next conf-set i))
(begin
(conf-set-adjoin state conf-set conf i)
(loop (conf-set-next other-set i)))
(loop (conf-set-next other-set i))))))
(define (forw states state-num starters enders predictors steps nts)
(define (predict state state-num conf-set conf nt starters enders)
; add configurations which start the non-terminal `nt' to the
; right of the dot
(let loop1 ((l (vector-ref starters nt)))
(if (pair? l)
(let* ((starter (car l))
(starter-set (conf-set-get* state state-num starter)))
(if (not (conf-set-next starter-set state-num))
(begin
(conf-set-adjoin state starter-set starter state-num)
(loop1 (cdr l)))
(loop1 (cdr l))))))
; check for possible completion of the non-terminal `nt' to the
; right of the dot
(let loop2 ((l (vector-ref enders nt)))
(if (pair? l)
(let ((ender (car l)))
(if (conf-set-member? state ender state-num)
(let* ((next (+ conf 1))
(next-set (conf-set-get* state state-num next)))
(conf-set-union state next-set next conf-set)
(loop2 (cdr l)))
(loop2 (cdr l)))))))
(define (reduce states state state-num conf-set head preds)
; a non-terminal is now completed so check for reductions that
; are now possible at the configurations `preds'
(let loop1 ((l preds))
(if (pair? l)
(let ((pred (car l)))
(let loop2 ((i head))
(if (>= i 0)
(let ((pred-set (conf-set-get (vector-ref states i) pred)))
(if pred-set
(let* ((next (+ pred 1))
(next-set (conf-set-get* state state-num next)))
(conf-set-union state next-set next pred-set)))
(loop2 (conf-set-next conf-set i)))
(loop1 (cdr l))))))))
(let ((state (vector-ref states state-num))
(nb-nts (vector-length nts)))
(let loop ()
(let ((conf (vector-ref state 0)))
(if (>= conf 0)
(let* ((step (vector-ref steps conf))
(conf-set (vector-ref state (+ conf 1)))
(head (vector-ref conf-set 4)))
(vector-set! state 0 (vector-ref conf-set 0))
(conf-set-merge-new! conf-set)
(if (>= step 0)
(predict state state-num conf-set conf step starters enders)
(let ((preds (vector-ref predictors (+ step nb-nts))))
(reduce states state state-num conf-set head preds)))
(loop)))))))
(define (forward starters enders predictors steps nts toks)
(let* ((nb-toks (vector-length toks))
(nb-confs (vector-length steps))
(states (make-states nb-toks nb-confs))
(goal-starters (vector-ref starters 0)))
(conf-set-adjoin* states 0 goal-starters 0) ; predict goal
(forw states 0 starters enders predictors steps nts)
(let loop ((i 0))
(if (< i nb-toks)
(let ((tok-nts (cdr (vector-ref toks i))))
(conf-set-adjoin* states (+ i 1) tok-nts i) ; scan token
(forw states (+ i 1) starters enders predictors steps nts)
(loop (+ i 1)))))
states))
(define (produce conf i j enders steps toks states states* nb-nts)
(let ((prev (- conf 1)))
(if (and (>= conf nb-nts) (>= (vector-ref steps prev) 0))
(let loop1 ((l (vector-ref enders (vector-ref steps prev))))
(if (pair? l)
(let* ((ender (car l))
(ender-set (conf-set-get (vector-ref states j)
ender)))
(if ender-set
(let loop2 ((k (conf-set-head ender-set)))
(if (>= k 0)
(begin
(and (>= k i)
(conf-set-adjoin** states states* k prev i)
(conf-set-adjoin** states states* j ender k))
(loop2 (conf-set-next ender-set k)))
(loop1 (cdr l))))
(loop1 (cdr l)))))))))
(define (back states states* state-num enders steps nb-nts toks)
(let ((state* (vector-ref states* state-num)))
(let loop1 ()
(let ((conf (vector-ref state* 0)))
(if (>= conf 0)
(let* ((conf-set (vector-ref state* (+ conf 1)))
(head (vector-ref conf-set 4)))
(vector-set! state* 0 (vector-ref conf-set 0))
(conf-set-merge-new! conf-set)
(let loop2 ((i head))
(if (>= i 0)
(begin
(produce conf i state-num enders steps
toks states states* nb-nts)
(loop2 (conf-set-next conf-set i)))
(loop1)))))))))
(define (backward states enders steps nts toks)
(let* ((nb-toks (vector-length toks))
(nb-confs (vector-length steps))
(nb-nts (vector-length nts))
(states* (make-states nb-toks nb-confs))
(goal-enders (vector-ref enders 0)))
(let loop1 ((l goal-enders))
(if (pair? l)
(let ((conf (car l)))
(conf-set-adjoin** states states* nb-toks conf 0)
(loop1 (cdr l)))))
(let loop2 ((i nb-toks))
(if (>= i 0)
(begin
(back states states* i enders steps nb-nts toks)
(loop2 (- i 1)))))
states*))
(define (parsed? nt i j nts enders states)
(let ((nt* (ind nt nts)))
(if nt*
(let ((nb-nts (vector-length nts)))
(let loop ((l (vector-ref enders nt*)))
(if (pair? l)
(let ((conf (car l)))
(if (conf-set-member? (vector-ref states j) conf i)
#t
(loop (cdr l))))
#f)))
#f)))
(define (deriv-trees conf i j enders steps names toks states nb-nts)
(let ((name (vector-ref names conf)))
(if name ; `conf' is at the start of a rule (either special or not)
(if (< conf nb-nts)
(list (list name (car (vector-ref toks i))))
(list (list name)))
(let ((prev (- conf 1)))
(let loop1 ((l1 (vector-ref enders (vector-ref steps prev)))
(l2 '()))
(if (pair? l1)
(let* ((ender (car l1))
(ender-set (conf-set-get (vector-ref states j)
ender)))
(if ender-set
(let loop2 ((k (conf-set-head ender-set)) (l2 l2))
(if (>= k 0)
(if (and (>= k i)
(conf-set-member? (vector-ref states k)
prev i))
(let ((prev-trees
(deriv-trees prev i k enders steps names
toks states nb-nts))
(ender-trees
(deriv-trees ender k j enders steps names
toks states nb-nts)))
(let loop3 ((l3 ender-trees) (l2 l2))
(if (pair? l3)
(let ((ender-tree (list (car l3))))
(let loop4 ((l4 prev-trees) (l2 l2))
(if (pair? l4)
(loop4 (cdr l4)
(cons (append (car l4)
ender-tree)
l2))
(loop3 (cdr l3) l2))))
(loop2 (conf-set-next ender-set k) l2))))
(loop2 (conf-set-next ender-set k) l2))
(loop1 (cdr l1) l2)))
(loop1 (cdr l1) l2)))
l2))))))
(define (deriv-trees* nt i j nts enders steps names toks states)
(let ((nt* (ind nt nts)))
(if nt*
(let ((nb-nts (vector-length nts)))
(let loop ((l (vector-ref enders nt*)) (trees '()))
(if (pair? l)
(let ((conf (car l)))
(if (conf-set-member? (vector-ref states j) conf i)
(loop (cdr l)
(append (deriv-trees conf i j enders steps names
toks states nb-nts)
trees))
(loop (cdr l) trees)))
trees)))
#f)))
(define (nb-deriv-trees conf i j enders steps toks states nb-nts)
(let ((prev (- conf 1)))
(if (or (< conf nb-nts) (< (vector-ref steps prev) 0))
1
(let loop1 ((l (vector-ref enders (vector-ref steps prev)))
(n 0))
(if (pair? l)
(let* ((ender (car l))
(ender-set (conf-set-get (vector-ref states j)
ender)))
(if ender-set
(let loop2 ((k (conf-set-head ender-set)) (n n))
(if (>= k 0)
(if (and (>= k i)
(conf-set-member? (vector-ref states k)
prev i))
(let ((nb-prev-trees
(nb-deriv-trees prev i k enders steps
toks states nb-nts))
(nb-ender-trees
(nb-deriv-trees ender k j enders steps
toks states nb-nts)))
(loop2 (conf-set-next ender-set k)
(+ n (* nb-prev-trees nb-ender-trees))))
(loop2 (conf-set-next ender-set k) n))
(loop1 (cdr l) n)))
(loop1 (cdr l) n)))
n)))))
(define (nb-deriv-trees* nt i j nts enders steps toks states)
(let ((nt* (ind nt nts)))
(if nt*
(let ((nb-nts (vector-length nts)))
(let loop ((l (vector-ref enders nt*)) (nb-trees 0))
(if (pair? l)
(let ((conf (car l)))
(if (conf-set-member? (vector-ref states j) conf i)
(loop (cdr l)
(+ (nb-deriv-trees conf i j enders steps
toks states nb-nts)
nb-trees))
(loop (cdr l) nb-trees)))
nb-trees)))
#f)))
(let* ((lexer (vector-ref parser-descr 0))
(nts (vector-ref parser-descr 1))
(starters (vector-ref parser-descr 2))
(enders (vector-ref parser-descr 3))
(predictors (vector-ref parser-descr 4))
(steps (vector-ref parser-descr 5))
(names (vector-ref parser-descr 6))
(toks (input->tokens input lexer nts)))
(vector nts
starters
enders
predictors
steps
names
toks
(backward (forward starters enders predictors steps nts toks)
enders steps nts toks)
parsed?
deriv-trees*
nb-deriv-trees*))))))
(define (parse->parsed? parse nt i j)
(let* ((nts (vector-ref parse 0))
(enders (vector-ref parse 2))
(states (vector-ref parse 7))
(parsed? (vector-ref parse 8)))
(parsed? nt i j nts enders states)))
(define (parse->trees parse nt i j)
(let* ((nts (vector-ref parse 0))
(enders (vector-ref parse 2))
(steps (vector-ref parse 4))
(names (vector-ref parse 5))
(toks (vector-ref parse 6))
(states (vector-ref parse 7))
(deriv-trees* (vector-ref parse 9)))
(deriv-trees* nt i j nts enders steps names toks states)))
(define (parse->nb-trees parse nt i j)
(let* ((nts (vector-ref parse 0))
(enders (vector-ref parse 2))
(steps (vector-ref parse 4))
(toks (vector-ref parse 6))
(states (vector-ref parse 7))
(nb-deriv-trees* (vector-ref parse 10)))
(nb-deriv-trees* nt i j nts enders steps toks states)))
(define (test)
(let ((p (make-parser '( (s (a) (s s)) )
(lambda (l) (map (lambda (x) (list x x)) l)))))
(let ((x (p '(a a a a a a a a a))))
(length (parse->trees x 's 0 9)))))
(define (main . args)
(run-benchmark
"earley"
earley-iters
(lambda (result) (equal? result 1430))
(lambda () (lambda () (test)))))