scsh-0.5/scsh/rx/simp.scm

;;; Olin Shivers, June 1998
;;; Copyright (c) 1998 by the Scheme Underground.

;;; One export: (simplify-regexp re) -> re

;;; Regexp simplifier
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; (| (in c1 ...) (in c2 ...) re ...) => (| (in c1 ... c2 ...) re ...)
;;; (| (not-in c1 ...) (not-in c2 ...)) => (| (not-in [intersect (c1 ...)
;;; 							     (c2 ...)])
;;; A run of BOS's or a run of EOS's in a sequence may be elided.
;;; Nested exponents can be collapsed (*, +, ?) -- multiply the "from's"
;;;   together; multiply the "to's" together.
;;; Exponent range [1,1] simplifies, as does [0,0].
;;; Uniquify branches
;;; Adjacent literals in a sequence can be collapsed
;;; A singleton-char char class can be collapsed into a constant
;;; Nested choices can be collapsed
;;; Nested sequences can be collapsed
;;; An empty sequence (:) can be turned into an empty-string match "".
;;; Singleton choices and sequences can be reduced to their body.
;;;
;;; The simplifier is carefully written so that it won't blow up
;;; when applied to a dynamic regexp -- that is,
;;; - a chunk of Scheme code that produces a regexp instead of
;;;   an actual regexp value;
;;; - a repeat regexp whose FROM or TO fields are chunks of Scheme code
;;;   rather than integers;
;;; - a char-set regexp whose CSET field is a chunk of Scheme code rather
;;;   than an actual char-set value.
;;; This is useful because the RX macro can build such a regexp as part
;;; of its expansion process.

(define (simplify-regexp re)
  (receive (simp-re pre-dsm) (simp-re re)
    (re-dsm simp-re pre-dsm (- (re-tsm re) (+ (re-tsm simp-re) pre-dsm)))))

(define (simp-re re)
  (? ((re-string? re) (values re 0))
     ((re-seq? re)    (simp-seq re))
     ((re-choice? re) (simp-choice re))

     ;; Singleton char-sets reduce to the character.
     ;; Bear in mind the cset field might be Scheme code instead
     ;; of an actual char set if the regexp is dynamic.
     ((re-char-set? re)
      (values (let ((cs (re-char-set:cset re)))
		(if (and (char-set? cs)
			 (= 1 (char-set-size cs)))
		    (make-re-string (string (car (char-set-members cs))))
		    re))
	      0))

     ((re-repeat? re) (simp-repeat re))

     ((re-submatch? re) (simp-submatch re))
     ((re-dsm?      re) (simp-dsm      re))

     (else (values re 0))))


;;; If the body of a submatch is the empty re, reduce it to the empty re.

(define (simp-submatch re)
  (let ((tsm     (re-submatch:tsm     re))
	(pre-dsm (re-submatch:pre-dsm re)))
    (receive (body1 pre-dsm1) (simp-re (re-submatch:body re))
      (if (empty-re? body1)
	  (values empty-re tsm)
	  (values (%make-re-submatch body1 (+ pre-dsm pre-dsm1) tsm)
		  0)))))

;;; - Flatten nested DSM's.
;;; - Return pre-dsm field and body field as the two return values.

(define (simp-dsm re)
  (receive (body pre-dsm1) (simp-re (re-dsm:body re))
    (values body (+ (re-dsm:pre-dsm re) pre-dsm1))))


;;; Simplifying sequences
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; - Collapse nested sequences and DSM's.
;;; - Merge adjacent strings, identical adjacent anchors (bos, eos, etc.).
;;; - Bubble DSM's forwards past elts that don't contain live submatches.
;;;   (Going past live submatches would switch the submatch indexes around,
;;;   which would be an error). This helps to coalesce DSMs and if we bring
;;;   them all the way to the front, we can pop them off and make them a
;;;   pre-dsm for the entire seq record.
;;; - If an elt is the empty-re, reduce the whole re to the empty re.
;;; - Reduce singleton and empty seq.

(define (simp-seq re)
  (let ((tsm (re-seq:tsm re))
	(elts (map simplify-regexp (re-seq:elts re))))
    (if (pair? elts)

	(call-with-current-continuation
	 (lambda (abort)
	   (receive (pre-dsm head tail) (simp-seq1 elts abort tsm)
	     (values (if (pair? tail)
			 (%make-re-seq (cons head tail) (- tsm pre-dsm))
			 head)					; Singleton seq
		     pre-dsm))))

	(values trivial-re 0))))				; Empty seq


;;; Simplify the non-empty sequence ELTS.
;;; - Return the result split out into three values:
;;;   [head-elt-pre-dsm, head-elt, tail].
;;; - If any elt is the empty (impossible) re, abort by calling
;;;   (abort elt tsm). TSM is otherwise unused.

(define (simp-seq1 elts abort tsm)
  (let recur ((elt (car elts)) (elts (cdr elts)))
    (receive (elt pre-dsm) (open-dsm elt)
      (? ((re-seq? elt)					; Flatten nested seqs.
	  (let ((sub-elts (re-seq:elts elt)))
	    (recur (re-dsm (car sub-elts) pre-dsm 0)
		   (append (cdr sub-elts) elts))))

	 ((empty-re? elt) (abort elt tsm))		; Bomb out on the empty
							; (impossible) re.
	 ((pair? elts)
	  (receive (next-pre-dsm next tail)		; Simplify the tail,
	           (recur (car elts) (cdr elts))	; then think about
							; the head:
	    ;; This guy is called when we couldn't find any other
	    ;; simplification. If ELT contains live submatches, then
	    ;; there really is nothing to be done at this step -- just
	    ;; assemble the pieces together and return them. If ELT
	    ;; *doesn't* contain any live submatches, do the same, but
	    ;; bubble its following next-pre-dsm submatches forwards.
	    (define (no-simp)
	      (if (has-live-submatches? elt)
		  (values pre-dsm elt (cons (re-dsm next next-pre-dsm 0) tail))
		  (values (+ pre-dsm next-pre-dsm) elt (cons next tail))))

	    ;; Coalesces two adjacent bol's, two adjacent eol's, etc.
	    (define (coalesce-anchor anchor?)
	      (if (and (anchor? elt) (anchor? next))
		  (values (+ pre-dsm next-pre-dsm) elt tail)
		  (no-simp)))

	    (? ((trivial-re? elt)	; Drop trivial re's.
		(values (+ pre-dsm next-pre-dsm) next tail))

	       ;; Coalesce adjacent strings
	       ((re-string? elt)
		(if (re-string? next)
		    (values (+ pre-dsm next-pre-dsm)
			    (make-re-string (string-append (re-string:chars elt)
							   (re-string:chars next)))
			    tail)
		    (no-simp)))

	       ;; Coalesce adjacent bol/eol/bos/eos/bow/eow's.
	       ((re-bol? elt) (coalesce-anchor re-bol?))
	       ((re-eol? elt) (coalesce-anchor re-eol?))
	       ((re-bos? elt) (coalesce-anchor re-bos?))
	       ((re-eos? elt) (coalesce-anchor re-eos?))
	       ((re-bow? elt) (coalesce-anchor re-bow?))
	       ((re-eow? elt) (coalesce-anchor re-eow?))
	       (else (no-simp)))))

	 (else (values pre-dsm elt '()))))))


;;; Simplifying choices
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; - Collapse nested choices and DSM's.
;;; - Delete empty-re's.
;;; - Merge sets; merge identical anchors (bos, eos, etc.).
;;;   But you can't merge across an element that contains a live submatch,
;;;   see below.
;;; - A singleton string "c" is included into the char-set merge as a
;;;   singleton set.
;;; - Bubble DSM's forwards past elts that don't contain live submatches.
;;;   (Going past live submatches would switch the submatch indexes around,
;;;   which would be an error). This helps to coalesce DSMs and if we bring
;;;   them all the way to the front, we can pop them off and make them a
;;;   pre-dsm for the entire seq record.
;;; - Reduce singleton and empty choice.
;;;
;;; You have to be careful simplifying choices -- you can't merge two sets
;;; that appear on different sides of an element containing a live submatch.
;;; The problem is that the assignment of submatches breaks ties left-to-right.
;;; So these aren't the same:
;;;     (| (submatch "x") any)    (| any (submatch "x"))
;;; The first assigns the submatch, the second doesn't -- the ANY gets credit.
;;; We want to collapse multiple char-sets, bos's, and eos's, but we have
;;; to deal with this issue. So
;;; - When we coalesce anchors, we retain the *leftmost* one.
;;; - We coalesce sets that appear between live-submatch boundaries.
;;;   When we do this, we subtract from the set any char that was in
;;;   an earlier coalesced char-set. If this gets us down to the empty set,
;;;   we drop it. If it gets us down to a singleton set, we convert it into
;;;   a singleton string.
;;; Whew. I had to think about this one.

(define (simp-choice re)
  (let ((tsm (re-choice:tsm re)))

    (receive (pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
	     (simp-choice1 (map simplify-regexp (re-choice:elts re)))

      (let ((tail (assemble-boundary-tail char-set:empty cset
					  bos? eos? bol? eol? bow? eow?
					  #f #f #f #f #f #f
					  tail)))
	(values (if (pair? tail)
		    (if (pair? (cdr tail))
			(%make-re-choice tail (- tsm pre-dsm))
			(car tail))		; Singleton choice
		    empty-re)			; Empty choice
		pre-dsm)))))


;;; Given the return values from simp-choice1, this tacks all
;;; the various pieces (CSET, BOS?, EOS?, etc.) onto the front of
;;; TAIL. However, elements are not added onto TAIL that are already
;;; described by PREV-CSET, PREV-BOS?, etc. -- they will be added onto
;;; some earlier bit of the final result.

(define (assemble-boundary-tail prev-cset cset
				bos? eos? bol? eol? bow? eow?
				prev-bos? prev-eos?
				prev-bol? prev-eol?
				prev-bow? prev-eow?
				tail)
  (let* ((cset (char-set-difference cset prev-cset))
	 (numchars (char-set-size cset))
	 (tail (if (and eos? (not prev-eos?)) (cons re-eos tail) tail))
	 (tail (if (and eol? (not prev-eol?)) (cons re-eol tail) tail))
	 (tail (if (and eow? (not prev-eow?)) (cons re-eow tail) tail))
	 (tail (if (and bow? (not prev-bow?)) (cons re-bow tail) tail))
	 (tail (if (and bol? (not prev-bol?)) (cons re-bol tail) tail))
	 (tail (if (and bos? (not prev-bos?)) (cons re-bos tail) tail))
	 (tail (? ((zero? numchars) tail)	; Drop empty char set.
		  ((= 1 numchars)		; {c} => "c"
		   (cons (make-re-string (string (car (char-set-members cset))))
			 tail))
		  (else (cons (make-re-char-set cset) tail)))))
    tail))


;;; Simplify the non-empty list of choices ELTS.
;;; Return the result split out into the values
;;;     [pre-dsm, cset, bos?, eos?, bol?, eol?, bow?, eow?, tail]

(define (simp-choice1 elts)
  (let recur ((elts elts)

	      (prev-cset char-set:empty)	; Chars we've already seen.

	      (prev-bos? #f) (prev-eos? #f)	; These flags say if we've
	      (prev-bol? #f) (prev-eol? #f)	; already seen one of these
	      (prev-bow? #f) (prev-eow? #f))	; anchors.


    (if (pair? elts)
	(let ((elt  (car elts))
	      (elts (cdr elts)))
	  (receive (elt pre-dsm) (open-dsm elt)
	    (if (re-choice? elt)

		;; Flatten nested choices.
		(let ((sub-elts (re-seq:elts elt)))
		  (receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
		           (recur (append sub-elts elts)
				  prev-cset
				  prev-bos? prev-eos?
				  prev-bol? prev-eol?
				  prev-bow? prev-eow?)
		    (values (+ pre-dsm tail-pre-dsm)
			    cset bos? eos? bol? eol? bow? eow? tail)))

		;; Simplify the tail, then think about the head.
		(receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
		         (recur elts
				(? ((and (re-string? elt)
					 (= 1 (string-length (re-string:chars elt))))
				    (char-set-union prev-cset
						    (string->char-set (re-string:chars elt))))

				   ;; The cset might be a Scheme exp.
				   ((and (re-char-set? elt)
					 (char-set? (re-char-set:cset elt)))
				    (char-set-union prev-cset
						    (re-char-set:cset elt)))

				   (else prev-cset))
				(or prev-bos? (re-bos? elt))
				(or prev-eos? (re-eos? elt))
				(or prev-bol? (re-bol? elt))
				(or prev-eol? (re-eol? elt))
				(or prev-bow? (re-bow? elt))
				(or prev-eow? (re-eow? elt)))

		  ;; This guy is called when we couldn't find any other
		  ;; simplification. If ELT contains live submatches, then we
		  ;; are at a merge boundary, and have to take all the
		  ;; TAIL-PRE-DSM, CSET, BOS?, EOS?, ... stuff we've collected
		  ;; and tack them onto TAIL as elements, then put ELT on
		  ;; front.  Otherwise, we can commute TAIL-PRE-DSM, CSET,
		  ;; BOS?, etc. with ELT, since it contains no live
		  ;; submatches, so just tack ELT onto TAIL.

		  (define (no-simp)
		    (if (has-live-submatches? elt)
			(let ((tail (assemble-boundary-tail prev-cset cset
							    bos? eos?
							    bol? eol?
							    bow? eow?
							    prev-bos? prev-eos?
							    prev-bol? prev-eol?
							    prev-bow? prev-eow?
							    tail)))
			  (values pre-dsm char-set:empty #f #f #f #f #f #f
				  (if (pair? tail)
				      ;; Tack tail-pre-dsm onto
				      ;; TAIL's first elt.
				      (cons elt
					    (cons (re-dsm (car tail)
							  tail-pre-dsm 0)
						  (cdr tail)))

				      ;; Squirrel case: TAIL is empty, so use
				      ;; TAIL-PRE-DSM as ELT's post-dsm.
				      (list (re-dsm elt 0 tail-pre-dsm)))))

			;; ELT has no live submatches, so we can commute all
			;; the recursion state forwards past it.
			(values (+ pre-dsm tail-pre-dsm)
				cset bos? eos? bol? eol? bow? eow?
				(cons elt tail))))

	    (? ((and (re-char-set? elt)
		     (char-set? (re-char-set:cset elt))) ; Might be Scheme code
		(values (+ pre-dsm tail-pre-dsm)
			(char-set-union cset (re-char-set:cset elt))
			bos? eos? bol? eol? bow? eow? tail))

	       ;; Treat a singleton string "c" as a singleton set {c}.
	       ((and (re-string? elt) (= 1 (string-length (re-string:chars elt))))
		(values (+ pre-dsm tail-pre-dsm)
			(char-set-union cset (string->char-set (re-string:chars elt)))
			bos? eos? bol? eol? bow? eow? tail))

	       ;; Coalesce bol/eol/bos/eos/bow/eow's.
	       ((re-bos? elt) (values (+ pre-dsm tail-pre-dsm) cset
				      #t   eos? bol? eol? bow? eow? tail))
	       ((re-eos? elt) (values (+ pre-dsm tail-pre-dsm) cset
				      bos? #t   bol? eol? bow? eow? tail))
	       ((re-bol? elt) (values (+ pre-dsm tail-pre-dsm) cset
				      bos? eos? #t   eol? bow? eow? tail))
	       ((re-eol? elt) (values (+ pre-dsm tail-pre-dsm) cset
				      bos? eos? bol? #t   bow? eow? tail))
	       ((re-bow? elt) (values (+ pre-dsm tail-pre-dsm) cset
				      bos? eos? bol? eol? #t   eow? tail))
	       ((re-eow? elt) (values (+ pre-dsm tail-pre-dsm) cset
				      bos? eos? bol? eol? bow? #t   tail))

	       (else (no-simp)))))))

	(values 0 char-set:empty #f #f #f #f #f #f '()))))


(define (simp-repeat re)
  (let ((from (re-repeat:from re))
	(to   (re-repeat:to   re))
	(body (re-repeat:body re)))
    (receive (simp-body pre-dsm) (simp-re body) ; Simplify body.
      ;; The fancy reductions are all handled by REDUCE-REPEAT.
      (reduce-repeat from to simp-body pre-dsm))))


;;; Does RE contain a live submatch?
;;; If RE is dynamic, we can't tell, so we err conservatively,
;;; which means we say "yes."

(define (has-live-submatches? re)
  (or (re-submatch? re)
      (? ((re-seq?    re) (every has-live-submatches? (re-seq:elts    re)))
	 ((re-choice? re) (every has-live-submatches? (re-choice:elts re)))
	 ((re-repeat? re) (has-live-submatches? (re-repeat:body re)))
	 ((re-dsm?    re) (has-live-submatches? (re-dsm:body    re)))

	 ;; If it's not one of these things, then this isn't a regexp -- it's
	 ;; a chunk of Scheme code producing a regexp, and we conservatively
	 ;; return #T -- the expression *might* produce a regexp containing
	 ;; a live submatch:
	 (else (not (or (re-char-set? re) (re-string? re)
			(re-bos? re) (re-eos? re)
			(re-bol? re) (re-eol? re)
			(re-bow? re) (re-eow? re)))))))