scsh-0.5/scsh/rx/simp.scm

404 lines
15 KiB
Scheme
Raw Normal View History

;;; Olin Shivers, June 1998
;;; Copyright (c) 1998 by the Scheme Underground.
;;; One export: (simplify-regexp re) -> re
;;; Regexp simplifier
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; (| (in c1 ...) (in c2 ...) re ...) => (| (in c1 ... c2 ...) re ...)
;;; (| (not-in c1 ...) (not-in c2 ...)) => (| (not-in [intersect (c1 ...)
;;; (c2 ...)])
;;; A run of BOS's or a run of EOS's in a sequence may be elided.
;;; Nested exponents can be collapsed (*, +, ?) -- multiply the "from's"
;;; together; multiply the "to's" together.
;;; Exponent range [1,1] simplifies, as does [0,0].
;;; Uniquify branches
;;; Adjacent literals in a sequence can be collapsed
;;; A singleton-char char class can be collapsed into a constant
;;; Nested choices can be collapsed
;;; Nested sequences can be collapsed
;;; An empty sequence (:) can be turned into an empty-string match "".
;;; Singleton choices and sequences can be reduced to their body.
;;;
;;; The simplifier is carefully written so that it won't blow up
;;; when applied to a dynamic regexp -- that is,
;;; - a chunk of Scheme code that produces a regexp instead of
;;; an actual regexp value;
;;; - a repeat regexp whose FROM or TO fields are chunks of Scheme code
;;; rather than integers;
;;; - a char-set regexp whose CSET field is a chunk of Scheme code rather
;;; than an actual char-set value.
;;; This is useful because the RX macro can build such a regexp as part
;;; of its expansion process.
(define (simplify-regexp re)
(receive (simp-re pre-dsm) (simp-re re)
(re-dsm simp-re pre-dsm (- (re-tsm re) (+ (re-tsm simp-re) pre-dsm)))))
(define (simp-re re)
(? ((re-string? re) (values re 0))
((re-seq? re) (simp-seq re))
((re-choice? re) (simp-choice re))
;; Singleton char-sets reduce to the character.
;; Bear in mind the cset field might be Scheme code instead
;; of an actual char set if the regexp is dynamic.
((re-char-set? re)
(values (let ((cs (re-char-set:cset re)))
(if (and (char-set? cs)
(= 1 (char-set-size cs)))
(make-re-string (string (car (char-set->list cs))))
re))
0))
((re-repeat? re) (simp-repeat re))
((re-submatch? re) (simp-submatch re))
((re-dsm? re) (simp-dsm re))
(else (values re 0))))
;;; If the body of a submatch is the empty re, reduce it to the empty re.
(define (simp-submatch re)
(let ((tsm (re-submatch:tsm re))
(pre-dsm (re-submatch:pre-dsm re)))
(receive (body1 pre-dsm1) (simp-re (re-submatch:body re))
(if (re-empty? body1)
(values re-empty tsm)
(values (%make-re-submatch body1 (+ pre-dsm pre-dsm1) tsm)
0)))))
;;; - Flatten nested DSM's.
;;; - Return pre-dsm field and body field as the two return values.
(define (simp-dsm re)
(receive (body pre-dsm1) (simp-re (re-dsm:body re))
(values body (+ (re-dsm:pre-dsm re) pre-dsm1))))
;;; Simplifying sequences
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; - Collapse nested sequences and DSM's.
;;; - Merge adjacent strings, identical adjacent anchors (bos, eos, etc.).
;;; - Bubble DSM's forwards past elts that don't contain live submatches.
;;; (Going past live submatches would switch the submatch indexes around,
;;; which would be an error). This helps to coalesce DSMs and if we bring
;;; them all the way to the front, we can pop them off and make them a
;;; pre-dsm for the entire seq record.
;;; - If an elt is the re-empty, reduce the whole re to the empty re.
;;; - Reduce singleton and empty seq.
(define (simp-seq re)
(let ((tsm (re-seq:tsm re))
(elts (map simplify-regexp (re-seq:elts re))))
(if (pair? elts)
(call-with-current-continuation
(lambda (abort)
(receive (pre-dsm head tail) (simp-seq1 elts abort tsm)
(values (if (pair? tail)
(%make-re-seq (cons head tail) (- tsm pre-dsm))
head) ; Singleton seq
pre-dsm))))
(values re-trivial 0)))) ; Empty seq
;;; Simplify the non-empty sequence ELTS.
;;; - Return the result split out into three values:
;;; [head-elt-pre-dsm, head-elt, tail].
;;; - If any elt is the empty (impossible) re, abort by calling
;;; (abort elt tsm). TSM is otherwise unused.
(define (simp-seq1 elts abort tsm)
(let recur ((elt (car elts)) (elts (cdr elts)))
(receive (elt pre-dsm) (open-dsm elt)
(? ((re-seq? elt) ; Flatten nested seqs.
(let ((sub-elts (re-seq:elts elt)))
(recur (re-dsm (car sub-elts) pre-dsm 0)
(append (cdr sub-elts) elts))))
((re-empty? elt) (abort elt tsm)) ; Bomb out on the empty
; (impossible) re.
((pair? elts)
(receive (next-pre-dsm next tail) ; Simplify the tail,
(recur (car elts) (cdr elts)) ; then think about
; the head:
;; This guy is called when we couldn't find any other
;; simplification. If ELT contains live submatches, then
;; there really is nothing to be done at this step -- just
;; assemble the pieces together and return them. If ELT
;; *doesn't* contain any live submatches, do the same, but
;; bubble its following next-pre-dsm submatches forwards.
(define (no-simp)
(if (has-live-submatches? elt)
(values pre-dsm elt (cons (re-dsm next next-pre-dsm 0) tail))
(values (+ pre-dsm next-pre-dsm) elt (cons next tail))))
;; Coalesces two adjacent bol's, two adjacent eol's, etc.
(define (coalesce-anchor anchor?)
(if (and (anchor? elt) (anchor? next))
(values (+ pre-dsm next-pre-dsm) elt tail)
(no-simp)))
(? ((re-trivial? elt) ; Drop trivial re's.
(values (+ pre-dsm next-pre-dsm) next tail))
;; Coalesce adjacent strings
((re-string? elt)
(if (re-string? next)
(values (+ pre-dsm next-pre-dsm)
(make-re-string (string-append (re-string:chars elt)
(re-string:chars next)))
tail)
(no-simp)))
;; Coalesce adjacent bol/eol/bos/eos/bow/eow's.
((re-bol? elt) (coalesce-anchor re-bol?))
((re-eol? elt) (coalesce-anchor re-eol?))
((re-bos? elt) (coalesce-anchor re-bos?))
((re-eos? elt) (coalesce-anchor re-eos?))
((re-bow? elt) (coalesce-anchor re-bow?))
((re-eow? elt) (coalesce-anchor re-eow?))
(else (no-simp)))))
(else (values pre-dsm elt '()))))))
;;; Simplifying choices
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; - Collapse nested choices and DSM's.
;;; - Delete re-empty's.
;;; - Merge sets; merge identical anchors (bos, eos, etc.).
;;; But you can't merge across an element that contains a live submatch,
;;; see below.
;;; - A singleton string "c" is included into the char-set merge as a
;;; singleton set.
;;; - Bubble DSM's forwards past elts that don't contain live submatches.
;;; (Going past live submatches would switch the submatch indexes around,
;;; which would be an error). This helps to coalesce DSMs and if we bring
;;; them all the way to the front, we can pop them off and make them a
;;; pre-dsm for the entire seq record.
;;; - Reduce singleton and empty choice.
;;;
;;; You have to be careful simplifying choices -- you can't merge two sets
;;; that appear on different sides of an element containing a live submatch.
;;; The problem is that the assignment of submatches breaks ties left-to-right.
;;; So these aren't the same:
;;; (| (submatch "x") any) (| any (submatch "x"))
;;; The first assigns the submatch, the second doesn't -- the ANY gets credit.
;;; We want to collapse multiple char-sets, bos's, and eos's, but we have
;;; to deal with this issue. So
;;; - When we coalesce anchors, we retain the *leftmost* one.
;;; - We coalesce sets that appear between live-submatch boundaries.
;;; When we do this, we subtract from the set any char that was in
;;; an earlier coalesced char-set. If this gets us down to the empty set,
;;; we drop it. If it gets us down to a singleton set, we convert it into
;;; a singleton string.
;;; Whew. I had to think about this one.
(define (simp-choice re)
(let ((tsm (re-choice:tsm re)))
(receive (pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
(simp-choice1 (map simplify-regexp (re-choice:elts re)))
(let ((tail (assemble-boundary-tail char-set:empty cset
bos? eos? bol? eol? bow? eow?
#f #f #f #f #f #f
tail)))
(values (if (pair? tail)
(if (pair? (cdr tail))
(%make-re-choice tail (- tsm pre-dsm))
(car tail)) ; Singleton choice
re-empty) ; Empty choice
pre-dsm)))))
;;; Given the return values from simp-choice1, this tacks all
;;; the various pieces (CSET, BOS?, EOS?, etc.) onto the front of
;;; TAIL. However, elements are not added onto TAIL that are already
;;; described by PREV-CSET, PREV-BOS?, etc. -- they will be added onto
;;; some earlier bit of the final result.
(define (assemble-boundary-tail prev-cset cset
bos? eos? bol? eol? bow? eow?
prev-bos? prev-eos?
prev-bol? prev-eol?
prev-bow? prev-eow?
tail)
(let* ((cset (char-set-difference cset prev-cset))
(numchars (char-set-size cset))
(tail (if (and eos? (not prev-eos?)) (cons re-eos tail) tail))
(tail (if (and eol? (not prev-eol?)) (cons re-eol tail) tail))
(tail (if (and eow? (not prev-eow?)) (cons re-eow tail) tail))
(tail (if (and bow? (not prev-bow?)) (cons re-bow tail) tail))
(tail (if (and bol? (not prev-bol?)) (cons re-bol tail) tail))
(tail (if (and bos? (not prev-bos?)) (cons re-bos tail) tail))
(tail (? ((zero? numchars) tail) ; Drop empty char set.
((= 1 numchars) ; {c} => "c"
(cons (make-re-string (string (car (char-set->list cset))))
tail))
(else (cons (make-re-char-set cset) tail)))))
tail))
;;; Simplify the non-empty list of choices ELTS.
;;; Return the result split out into the values
;;; [pre-dsm, cset, bos?, eos?, bol?, eol?, bow?, eow?, tail]
(define (simp-choice1 elts)
(let recur ((elts elts)
(prev-cset char-set:empty) ; Chars we've already seen.
(prev-bos? #f) (prev-eos? #f) ; These flags say if we've
(prev-bol? #f) (prev-eol? #f) ; already seen one of these
(prev-bow? #f) (prev-eow? #f)) ; anchors.
(if (pair? elts)
(let ((elt (car elts))
(elts (cdr elts)))
(receive (elt pre-dsm) (open-dsm elt)
(if (re-choice? elt)
;; Flatten nested choices.
(let ((sub-elts (re-seq:elts elt)))
(receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
(recur (append sub-elts elts)
prev-cset
prev-bos? prev-eos?
prev-bol? prev-eol?
prev-bow? prev-eow?)
(values (+ pre-dsm tail-pre-dsm)
cset bos? eos? bol? eol? bow? eow? tail)))
;; Simplify the tail, then think about the head.
(receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
(recur elts
(? ((and (re-string? elt)
(= 1 (string-length (re-string:chars elt))))
(char-set-union prev-cset
(string->char-set (re-string:chars elt))))
;; The cset might be a Scheme exp.
((and (re-char-set? elt)
(char-set? (re-char-set:cset elt)))
(char-set-union prev-cset
(re-char-set:cset elt)))
(else prev-cset))
(or prev-bos? (re-bos? elt))
(or prev-eos? (re-eos? elt))
(or prev-bol? (re-bol? elt))
(or prev-eol? (re-eol? elt))
(or prev-bow? (re-bow? elt))
(or prev-eow? (re-eow? elt)))
;; This guy is called when we couldn't find any other
;; simplification. If ELT contains live submatches, then we
;; are at a merge boundary, and have to take all the
;; TAIL-PRE-DSM, CSET, BOS?, EOS?, ... stuff we've collected
;; and tack them onto TAIL as elements, then put ELT on
;; front. Otherwise, we can commute TAIL-PRE-DSM, CSET,
;; BOS?, etc. with ELT, since it contains no live
;; submatches, so just tack ELT onto TAIL.
(define (no-simp)
(if (has-live-submatches? elt)
(let ((tail (assemble-boundary-tail prev-cset cset
bos? eos?
bol? eol?
bow? eow?
prev-bos? prev-eos?
prev-bol? prev-eol?
prev-bow? prev-eow?
tail)))
(values pre-dsm char-set:empty #f #f #f #f #f #f
(if (pair? tail)
;; Tack tail-pre-dsm onto
;; TAIL's first elt.
(cons elt
(cons (re-dsm (car tail)
tail-pre-dsm 0)
(cdr tail)))
;; Squirrel case: TAIL is empty, so use
;; TAIL-PRE-DSM as ELT's post-dsm.
(list (re-dsm elt 0 tail-pre-dsm)))))
;; ELT has no live submatches, so we can commute all
;; the recursion state forwards past it.
(values (+ pre-dsm tail-pre-dsm)
cset bos? eos? bol? eol? bow? eow?
(cons elt tail))))
(? ((and (re-char-set? elt)
(char-set? (re-char-set:cset elt))) ; Might be Scheme code
(values (+ pre-dsm tail-pre-dsm)
(char-set-union cset (re-char-set:cset elt))
bos? eos? bol? eol? bow? eow? tail))
;; Treat a singleton string "c" as a singleton set {c}.
((and (re-string? elt) (= 1 (string-length (re-string:chars elt))))
(values (+ pre-dsm tail-pre-dsm)
(char-set-union cset (string->char-set (re-string:chars elt)))
bos? eos? bol? eol? bow? eow? tail))
;; Coalesce bol/eol/bos/eos/bow/eow's.
((re-bos? elt) (values (+ pre-dsm tail-pre-dsm) cset
#t eos? bol? eol? bow? eow? tail))
((re-eos? elt) (values (+ pre-dsm tail-pre-dsm) cset
bos? #t bol? eol? bow? eow? tail))
((re-bol? elt) (values (+ pre-dsm tail-pre-dsm) cset
bos? eos? #t eol? bow? eow? tail))
((re-eol? elt) (values (+ pre-dsm tail-pre-dsm) cset
bos? eos? bol? #t bow? eow? tail))
((re-bow? elt) (values (+ pre-dsm tail-pre-dsm) cset
bos? eos? bol? eol? #t eow? tail))
((re-eow? elt) (values (+ pre-dsm tail-pre-dsm) cset
bos? eos? bol? eol? bow? #t tail))
(else (no-simp)))))))
(values 0 char-set:empty #f #f #f #f #f #f '()))))
(define (simp-repeat re)
(let ((from (re-repeat:from re))
(to (re-repeat:to re))
(body (re-repeat:body re)))
(receive (simp-body pre-dsm) (simp-re body) ; Simplify body.
;; The fancy reductions are all handled by REDUCE-REPEAT.
(reduce-repeat from to simp-body pre-dsm))))
;;; Does RE contain a live submatch?
;;; If RE is dynamic, we can't tell, so we err conservatively,
;;; which means we say "yes."
(define (has-live-submatches? re)
(or (re-submatch? re)
(? ((re-seq? re) (every has-live-submatches? (re-seq:elts re)))
((re-choice? re) (every has-live-submatches? (re-choice:elts re)))
((re-repeat? re) (has-live-submatches? (re-repeat:body re)))
((re-dsm? re) (has-live-submatches? (re-dsm:body re)))
;; If it's not one of these things, then this isn't a regexp -- it's
;; a chunk of Scheme code producing a regexp, and we conservatively
;; return #T -- the expression *might* produce a regexp containing
;; a live submatch:
(else (not (or (re-char-set? re) (re-string? re)
(re-bos? re) (re-eos? re)
(re-bol? re) (re-eol? re)
(re-bow? re) (re-eow? re)))))))