scsh-0.6/scsh/rx/re.scm

;;; The regexp data type
;;;     Olin Shivers, January 1997, May 1998.

;;;       A DSM around a choice gets absorbed into the choice's first elt.
;;;         But this prevents it from being moved out into a containing
;;;         choice or seq elt, or outer DSM. Fix.

;;; A regexp is a: dsm, submatch, seq, choice, repeat,
;;;                char-set, string, bos, eos, bol, eol

;;; Deleted sub-match regexp
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; This stands for a regexp containing TSM submatches, of which
;;; PRE-DSM come first as dead submatches, then the regexp BODY with its
;;; submatches, then POST-DSM as dead submatches.

;;; Record type for a regexp BODY surrounded by dead (deleted) submatches.
;;; Only the POSIX field has a setter; the other fields are fixed at
;;; construction time.
(define-record-type re-dsm :re-dsm
  (really-make-re-dsm body pre-dsm tsm posix)
  re-dsm?
  (body     re-dsm:body)		; A Regexp
  (pre-dsm  re-dsm:pre-dsm)		; Integer -- initial dead submatches
  (tsm      re-dsm:tsm)					; Total submatch count (pre + body + post)
  (posix    re-dsm:posix set-re-dsm:posix))		; Posix bits: #f, or a NEW-CRE result

;;; Raw DSM constructor taking the total submatch count TSM directly.
;;; The posix field starts out as #f.
(define (make-re-dsm/tsm body pre-dsm tsm)
  (let ((no-posix #f))
    (really-make-re-dsm body pre-dsm tsm no-posix)))

;;; Raw DSM constructor that also fills in the posix field, with the value
;;; NEW-CRE builds from POSIX-STR and TVEC. This is only used in code that
;;; the (RX ...) macro produces for static regexps.
(define (make-re-dsm/posix body pre-dsm tsm posix-str tvec)
  (let ((cre (new-cre posix-str tvec)))
    (really-make-re-dsm body pre-dsm tsm cre)))

;;; Dumb DSM constructor: takes the dead-submatch counts on either side of
;;; BODY and derives the total from them and BODY's own count.
(define (make-re-dsm body pre-dsm post-dsm)
  (let* ((body-tsm (re-tsm body))
         (total    (+ pre-dsm body-tsm post-dsm)))
    (make-re-dsm/tsm body pre-dsm total)))

;;; "Virtual field" for the RE-DSM record -- how many dead submatches
;;; come after the body:

;;; post-dsm = total submatches - pre-body dead submatches - body submatches.
(define (re-dsm:post-dsm re)
  (let ((total    (re-dsm:tsm re))
        (leading  (re-dsm:pre-dsm re))
        (body-tsm (re-tsm (re-dsm:body re))))
    (- total leading body-tsm)))

;;; Slightly smart DSM constructor:
;;; - Absorb this DSM into an inner dsm.
;;; - Punt unnecessary DSM's.

;;; Returns BODY's innermost non-DSM regexp unwrapped when the requested
;;; total equals that regexp's own submatch count; otherwise returns a
;;; single DSM record around it, with the leading dead counts summed.
(define (re-dsm body pre-dsm post-dsm)
  (let ((total (+ pre-dsm (re-tsm body) post-dsm)))
    (call-with-values
        (lambda () (open-dsm body))
      (lambda (core inner-pre)
        (cond ((= total (re-tsm core)) core)            ; Trivial DSM
              (else                                      ; Non-trivial DSM
               (make-re-dsm/tsm core (+ pre-dsm inner-pre) total)))))))

;;; Take a regexp RE and return an equivalent (re', pre-dsm) pair of values.
;;; Recurses into DSM records. It is the case that
;;;   (<= (+ pre-dsm (re-tsm re')) (re-tsm re))
;;; The post-dsm value is (- (re-tsm re) (re-tsm re') pre-dsm).

;;; Strip every enclosing DSM record off RE. Returns two values: the first
;;; non-DSM body found, and the sum of the PRE-DSM fields of the records
;;; that were removed.
(define (open-dsm re)
  (do ((inner re (re-dsm:body inner))
       (dead  0  (+ dead (re-dsm:pre-dsm inner))))
      ((not (re-dsm? inner))
       (values inner dead))))


;;; Sequence: (: re ...)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Record type for a sequence of regexps. Only the POSIX field has a
;;; setter.
(define-record-type re-seq :re-seq
  (really-make-re-seq elts tsm posix)
  re-seq?
  (elts   re-seq:elts)				; Regexp list
  (tsm    re-seq:tsm)				; Total submatch count
  (posix  re-seq:posix set-re-seq:posix))	; Posix record: #f, or a NEW-CRE result

;;; Raw sequence constructor taking the submatch count TSM directly.
;;; The posix field starts out as #f.
(define (make-re-seq/tsm elts tsm)
  (let ((no-posix #f))
    (really-make-re-seq elts tsm no-posix)))

;;; Raw sequence constructor that also fills in the posix field, with the
;;; value NEW-CRE builds from POSIX-STR and TVEC. This is only used in code
;;; that the (RX ...) macro produces for static regexps.
(define (make-re-seq/posix elts tsm posix-str tvec)
  (let ((cre (new-cre posix-str tvec)))
    (really-make-re-seq elts tsm cre)))

;;; Dumb sequence constructor: wraps the list RES as is. The submatch count
;;; is the sum of the elements' counts; once RE-TSM yields a non-number for
;;; some element, the count becomes (and stays) the unspecific value.
(define (make-re-seq res)
  (let loop ((rest res) (count 0))
    (if (pair? rest)
        (let ((elt-tsm (re-tsm (car rest))))
          (loop (cdr rest)
                (if (and (number? elt-tsm) (number? count))
                    (+ elt-tsm count)
                    (unspecific))))
        (make-re-seq/tsm res count))))

;;; Slightly smart sequence constructor:
;;; - Flattens nested sequences
;;; - Drops trivial "" elements
;;; - Empty sequence => ""
;;; - Singleton sequence is reduced to its one element.
;;; - We don't descend into DSM's; too much work for this routine.

(define (re-seq res)
  ;; Splice the elements of nested seq records in place and leave out
  ;; every element that is RE-TRIVIAL.
  (define (flatten elts)
    (if (not (pair? elts))
        '()
        (let* ((head (car elts))
               (rest (flatten (cdr elts))))
          (cond ((re-seq? head)     (append (flatten (re-seq:elts head)) rest))
                ((re-trivial? head) rest)
                (else               (cons head rest))))))

  (let ((flat (flatten res)))
    (cond ((not (pair? flat))  re-trivial)          ; Nothing left -- ""
          ((pair? (cdr flat))  (make-re-seq flat))  ; Two or more elements
          (else                (car flat)))))       ; Exactly one element


;;; Choice: (| re ...)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Record type for a choice between regexps. Only the POSIX field has a
;;; setter.
(define-record-type re-choice :re-choice
  (really-make-re-choice elts tsm posix)
  re-choice?
  (elts re-choice:elts)				; List of rel-items
  (tsm  re-choice:tsm)				; Total submatch count
  (posix  re-choice:posix set-re-choice:posix))	; Posix field: #f, or a NEW-CRE result

;;; Raw choice constructor taking the submatch count TSM directly.
;;; The posix field starts out as #f.
(define (make-re-choice/tsm elts tsm)
  (let ((no-posix #f))
    (really-make-re-choice elts tsm no-posix)))

;;; Raw choice constructor that also fills in the posix field, with the
;;; value NEW-CRE builds from POSIX-STR and TVEC. This is only used in code
;;; that the (RX ...) macro produces for static regexps.
(define (make-re-choice/posix elts tsm posix-str tvec)
  (let ((cre (new-cre posix-str tvec)))
    (really-make-re-choice elts tsm cre)))

;;; Dumb choice constructor.
;;; - If every element of RES is a char-set regexp whose cset really is a
;;;   char-set, the result is one char-set regexp over their union.
;;; - Otherwise RES is wrapped as is in a choice record. Its submatch count
;;;   is the sum of the elements' counts, or the unspecific value once
;;;   RE-TSM yields a non-number for some element.
;;;
;;; The cset field of a char-set regexp can hold code instead of a char-set
;;; (RE-EMPTY? and RE-ANY? guard against that as well), and CHAR-SET-UNION
;;; must not be applied to code -- hence the CHAR-SET? test below.
(define (make-re-choice res)
  (if (every (lambda (re)
	       (and (re-char-set? re)
		    (char-set? (re-char-set:cset re)))) ; Might be code...
	     res)
      (make-re-char-set (apply char-set-union (map re-char-set:cset res)))
      (make-re-choice/tsm res
                          (fold (lambda (re sm-count)
                                  (let ((maybe-tsm (re-tsm re)))
                                    (if (and (number? maybe-tsm)
                                             (number? sm-count))
                                        (+ maybe-tsm sm-count)
                                        (unspecific))))
                                0 res))))

;;; Slightly smart choice constructor:
;;; - Flattens nested choices
;;; - Drops empty (impossible) elements
;;; - Empty choice => empty-match
;;; - Singleton choice is reduced to its one element.
;;; - We don't descend into DSM's; too much work for this routine.
;;;
;;; This routine guarantees to preserve char-classness -- if it is applied
;;; to a list of char-class regexps (char-set and singleton-string re's),
;;; it will return a char-class regexp.

(define (re-choice res)
  ;; Splice the elements of nested choice records in place and leave out
  ;; every element satisfying RE-EMPTY?.
  (define (flatten elts)
    (if (not (pair? elts))
        '()
        (let* ((head (car elts))
               (rest (flatten (cdr elts))))
          (cond ((re-choice? head)
                 (append (flatten (re-choice:elts head)) rest))
                ((re-empty? head) rest)
                (else (cons head rest))))))

  ;; Char-set denoted by a static char-class regexp: the cset of a
  ;; char-set regexp, or the set of chars in a string regexp.
  (define (elt->cset elt)
    (if (re-char-set? elt)
        (re-char-set:cset elt)
        (string->char-set (re-string:chars elt))))

  (let ((alts (flatten res)))
    (cond
     ;; If all alternatives are static char-class re's, fold them together.
     ((every static-char-class? alts)
      (let ((cset (apply char-set-union (map elt->cset alts))))
        (if (= 1 (char-set-size cset))
            (make-re-string (apply string (char-set->list cset)))
            (make-re-char-set cset))))

     ((not (pair? alts))  re-empty)                ; Empty choice
     ((pair? (cdr alts))  (make-re-choice alts))   ; General case
     (else                (car alts)))))           ; Singleton choice

;;; Repetition (*,?,+,=,>=,**)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; The repeat record's body contains all of the repeat record's submatches --
;;; there is no pre-dsm field allowing for initial & trailing dead submatches.
;;; This is not a limit on expressiveness because repeat commutes with dsm --
;;; we can always move submatches that come before and after body to an outer
;;; DSM. Hence
;;;     (= (re-repeat:tsm re) (re-tsm (re-repeat:body re)))

;;; Record type for a repetition of BODY between FROM and TO times. Only
;;; the POSIX field has a setter.
(define-record-type re-repeat :re-repeat
  (really-make-re-repeat from to body tsm posix)
  re-repeat?
  (from  re-repeat:from)	; Integer    (Macro expander abuses.)
  (to    re-repeat:to)		; Integer or #f for infinite	(Macro expander abuses.)
  (body  re-repeat:body)	; Regexp
  (tsm   re-repeat:tsm)		; Total submatch count
  (posix re-repeat:posix set-re-repeat:posix))		; Posix record: #f, or a NEW-CRE result

;;; Raw repeat constructor taking the submatch count TSM directly. The
;;; arguments are not checked; the posix field starts out as #f.
(define (make-re-repeat/tsm from to body tsm)
  (let ((no-posix #f))
    (really-make-re-repeat from to body tsm no-posix)))

;;; Raw repeat constructor that also fills in the posix field, with the
;;; value NEW-CRE builds from POSIX-STR and TVEC. This is only used in code
;;; that the (RX ...) macro produces for static regexps.
(define (make-re-repeat/posix from to body tsm posix-str tvec)
  (let ((cre (new-cre posix-str tvec)))
    (really-make-re-repeat from to body tsm cre)))

;;; Dumb repeat constructor. FROM and TO are each passed through CHECK-ARG
;;; with a predicate that accepts any non-integer (#f or dynamic code) and
;;; any integer >= 0. The record's submatch count is that of BODY.
(define (make-re-repeat from to body)
  (let* ((bound-ok?    (lambda (n)
                         (or (not (integer? n)) ; #f or dynamic
                             (>= n 0))))
         (checked-from (check-arg bound-ok? from make-re-repeat))
         (checked-to   (check-arg bound-ok? to   make-re-repeat)))
    (make-re-repeat/tsm checked-from checked-to body (re-tsm body))))

;;; Slightly smart repeat constructor
;;; - Flattens nested repeats.
;;; - re{1,1}, re{0,0}, and re{m,n} where m>n reduced.
;;; - If re is empty-match: from=0 => "", from>0 => empty-match.
;;; - If re is eos, bos, or "", and to <= from, reduce to simply re.
;;; - Commutes into DSM records.

;;; Reduce the repeat with REDUCE-REPEAT, then hand the reduced regexp to
;;; RE-DSM together with the leading dead count and whatever is needed to
;;; bring the total back up to BODY's submatch count.
(define (re-repeat from to body)
  (call-with-values
      (lambda () (reduce-repeat from to body 0))
    (lambda (reduced lead-dsm)
      (let ((trail-dsm (- (re-tsm body) lead-dsm (re-tsm reduced))))
        (re-dsm reduced lead-dsm trail-dsm)))))

;;; This guy does all the work (and is also called by the repeat simplifier)

;;; (reduce-repeat from to body pre-dsm)
;;; Simplify the repetition BODY{FROM,TO}. Returns two values: the reduced
;;; regexp, and PRE-DSM plus the number of dead submatches that the
;;; reduction places in front of it.
;;;
;;; FROM is an integer or code; TO is an integer, #f (infinity) or code.
;;; Nothing is collapsed when a bound is code.
;;;
;;; A nested repeat (re{a,b}){m,n} is rewritten as re{m*a,n*b} only when
;;; that is exact. k copies of re{a,b} can contain between k*a and k*b
;;; copies of re, so the rewrite needs the intervals for k = m..n to leave
;;; no gaps between them. For example (re{2,2}){0,1} allows 0 or 2 copies
;;; but not 1, so it must not become re{0,2}.

(define (reduce-repeat from to body pre-dsm)

  ;; Is (re{a,b}){m,n} equivalent to re{m*a,n*b}?
  ;; M and A are integers; N and B are integers or #f (infinity).
  (define (collapsible? m n a b)
    (cond ((and b (< b a)) #f)		; Inner repeat can never match.
	  ((and n (< n m)) #f)		; Outer repeat can never match.
	  ((eqv? m n) #t)		; Exactly M copies: a single interval.
	  ;; Intervals [k*a,k*b] and [(k+1)*a,(k+1)*b] touch or overlap
	  ;; iff a <= k*(b-a)+1. The tightest case is the smallest k, M.
	  ((not b) (or (> m 0) (<= a 1)))
	  (else (<= a (+ 1 (* m (- b a)))))))

  (receive (from to body1 pre-dsm)
           ;; Collapse nested repeats and dsm's:
           (let iter ((from from) (to to) (body body) (dsm0 pre-dsm))
	     (receive (body body-dsm0) (open-dsm body)
	       (let ((dsm0 (+ dsm0 body-dsm0)))
		 (if (and (integer? from)		; Stop if FROM or TO
			  (or (not to) (integer? to))	; are code.
			  (re-repeat? body))
		     (let ((bfrom (re-repeat:from body))
			   (bto (re-repeat:to body))
			   (bbody (re-repeat:body body)))
		       (if (and (integer? bfrom)	; Stop if bfrom or
				(or (not bto) (integer? bto)) ; bto are code,
				(collapsible? from to bfrom bto)) ; or unsafe.
			   (iter (* from bfrom)
				 ;; An upper bound of 0 on either level wins
				 ;; over an infinite bound on the other one.
				 (cond ((or (eqv? to 0) (eqv? bto 0)) 0)
				       ((and to bto) (* to bto))
				       (else #f))
				 bbody
				 dsm0)
			   (values from to body dsm0)))
		     (values from to body dsm0)))))

    (cond
     ((and (eqv? from 1) (eqv? to 1))	; re{1,1} => re
      (values body1 pre-dsm))

     ((and (eqv? from 0) (eqv? to 0))	; re{0,0} => ""
      (values re-trivial (+ (re-tsm body1) pre-dsm)))

     ;; re{m,n} => re-empty when m>n:
     ((and (integer? from) (integer? to) (> from to))
      (values re-empty (+ (re-tsm body1) pre-dsm)))

     ;; Reduce the body = re-empty case.
     ((and (re-empty? body1) (integer? from)) ; (+ (in)) => (in)
      (values (if (> from 0) re-empty re-trivial) ; (* (in)) => ""
	      pre-dsm))

     ;; If BODY1 is eos, bos, or "", and m<=n, reduce to simply BODY1.
     ((and (integer? from)
	   (or (and (integer? to) (<= from to)) (not to))
	   (or (re-eos? body1)
	       (re-bos? body1)
	       (and (re-string? body1)
		    (string=? "" (re-string:chars body1)))))
      (values body1 pre-dsm))

     (else (values (make-re-repeat from to body1) ; general case
		   pre-dsm)))))


;;; Submatch
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; A submatch record introduces a new submatch. This is followed by
;;; PRE-DSM dead submatches (caused by simplifying the body), then the
;;; BODY, then perhaps more dead submatches, all for a total of TSM
;;; submatches.

;;; Record type for a submatch-binding regexp. Only the POSIX field has a
;;; setter.
(define-record-type re-submatch :re-submatch
  (really-make-re-submatch body pre-dsm tsm posix)
  re-submatch?
  (body re-submatch:body)	; Regexp
  (pre-dsm re-submatch:pre-dsm)	; Deleted submatches preceding the body
  (tsm  re-submatch:tsm)	; Total submatch count for the record
  (posix re-submatch:posix set-re-submatch:posix)) ; Posix field: #f, or a NEW-CRE result

;;; Raw submatch constructor taking the total submatch count TSM directly.
;;; The posix field starts out as #f.
(define (make-re-submatch/tsm body pre-dsm tsm)
  (let ((no-posix #f))
    (really-make-re-submatch body pre-dsm tsm no-posix)))

;;; Raw submatch constructor that also fills in the posix field, with the
;;; value NEW-CRE builds from POSIX-STR and TVEC. This is only used in code
;;; that the (RX ...) macro produces for static regexps.
(define (make-re-submatch/posix body pre-dsm tsm posix-str tvec)
  (let ((cre (new-cre posix-str tvec)))
    (really-make-re-submatch body pre-dsm tsm cre)))


;;; "Virtual field" for the RE-SUBMATCH record -- how many dead submatches
;;; come after the body:

;;; post-dsm = total submatches - 1 (for *this* submatch)
;;;            - pre-body dead submatches - body submatches.
(define (re-submatch:post-dsm re)
  (let ((total    (re-submatch:tsm re))
        (leading  (re-submatch:pre-dsm re))
        (body-tsm (re-tsm (re-submatch:body re))))
    (- total 1 leading body-tsm)))

;;; Dumb submatch constructor. The optional PRE-DSM and POST-DSM arguments
;;; default to 0. The total counts this submatch itself, both dead counts
;;; and BODY's own submatches.
(define (make-re-submatch body . maybe-pre+post-dsm)
  (let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
    (let ((total (+ 1 pre-dsm post-dsm (re-tsm body))))
      (make-re-submatch/tsm body pre-dsm total))))

;;; Slightly smart submatch constructor
;;; - DSM's unpacked
;;; - If BODY is the re-empty, we'll never match, so just produce a DSM.

;;; The optional PRE-DSM and POST-DSM arguments default to 0. DSM wrappers
;;; on BODY are opened and their leading dead counts added to PRE-DSM. If
;;; the opened body satisfies RE-EMPTY?, the result is RE-DSM applied to
;;; RE-EMPTY with the whole total as leading dead submatches.
(define (re-submatch body . maybe-pre+post-dsm)
  (let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
    (let ((total (+ 1 pre-dsm (re-tsm body) post-dsm)))
      (call-with-values
          (lambda () (open-dsm body))
        (lambda (core inner-pre)
          (cond ((re-empty? core)
                 (re-dsm re-empty total 0))
                (else
                 (make-re-submatch/tsm core (+ pre-dsm inner-pre) total))))))))


;;; Other regexps : string, char-set, bos & eos
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Also, re-empty and re-trivial.

;;; Record type for a literal-string regexp. Both fields have setters.
(define-record-type re-string :re-string
  (really-make-re-string chars posix)
  re-string?
  (chars re-string:chars set-re-string:chars)	; The string to match
  (posix re-string:posix set-re-string:posix))	; #f, or a NEW-CRE result

;;; Disclose a string regexp as the list (re-string <chars>).
(define-record-discloser :re-string
  (lambda (str-re)
    (let ((chars (re-string:chars str-re)))
      (list 're-string chars))))

;; Kludge: POSIX wants "()" for "the empty string".

;;; String regexp constructor. The empty string is never wrapped in a
;;; fresh record; RE-TRIVIAL is returned for it instead.
(define (make-re-string chars)
  (cond ((string=? "" chars) re-trivial)
        (else (really-make-re-string chars #f))))

;;; RE-STRING is the same procedure as MAKE-RE-STRING.
(define re-string make-re-string)	; For consistency w/other re makers.

;;; String regexp constructor that also fills in the posix field, with the
;;; value NEW-CRE builds from POSIX-STR and TVEC. For the empty string it
;;; returns RE-TRIVIAL and never calls NEW-CRE. This is only used in code
;;; that the (RX ...) macro produces for static regexps.
(define (make-re-string/posix chars posix-str tvec)
  (cond ((string=? "" chars) re-trivial)
        (else
         (let ((cre (new-cre posix-str tvec)))
           (really-make-re-string chars cre)))))

;;; A string regexp holding "". It is built with the raw constructor,
;;; because MAKE-RE-STRING returns RE-TRIVIAL for "".
(define re-empty-string (really-make-re-string "" #f))

;;; Matches the empty string anywhere.
;;; This is a DSM record around RE-EMPTY-STRING whose posix field is built
;;; from the string "()" (see the kludge note on MAKE-RE-STRING).
;;; NOTE(review): the arguments below are pre-dsm = 1 and tsm = 0, so
;;; RE-DSM:POST-DSM yields -1 for this record -- presumably the 1 accounts
;;; for the paren pair in "()"; confirm this is intended.
(define re-trivial (make-re-dsm/posix re-empty-string
				      1 0 "()" '#()))

;;; True iff RE is the RE-TRIVIAL object itself (identity test, not a
;;; structural one).
(define (re-trivial? re)
  (eq? re-trivial re))

;;; Record type for a character-set regexp. The posix field is given the
;;; value #f here.
(define-record re-char-set
  cset			; A character set	(Macro expander abuses.)
  (posix    #f))	; Posix record

;;; RE-CHAR-SET is the same procedure as MAKE-RE-CHAR-SET.
(define re-char-set make-re-char-set)	; For consistency w/other re makers.

;;; Char-set regexp constructor that also fills in the posix field, with
;;; the value NEW-CRE builds from POSIX-STR and TVEC. This is only used in
;;; code that the (RX ...) macro produces for static regexps.
(define (make-re-char-set/posix cs posix-str tvec)
  (let* ((cs-re (make-re-char-set cs))
         (cre   (new-cre posix-str tvec)))
    (set-re-char-set:posix cs-re cre)
    cs-re))

;;; Never matches: a char-set regexp over the empty character set.
;;; NEED TO OPTIMIZE - PRE-SET POSIX FIELD.
(define re-empty (make-re-char-set char-set:empty))

;;; True iff RE is a char-set regexp whose cset is an actual char-set with
;;; no members. This is a test on contents, not on identity with RE-EMPTY.
(define (re-empty? re)
  (if (re-char-set? re)
      (let ((cset (re-char-set:cset re)))
        (if (char-set? cset)            ; Might be code...
            (char-set-empty? cset)
            #f))
      #f))

;;; Anchor regexps. Each is a record type without fields, with one
;;; instance created here under the type's own name.
;;; NOTE(review): presumably bos/eos anchor at the beginning/end of the
;;; string and bol/eol at the beginning/end of a line -- nothing in this
;;; file spells that out; confirm against the matcher.
(define-record re-bos)	(define re-bos (make-re-bos))
(define-record re-eos)  (define re-eos (make-re-eos))

(define-record re-bol)  (define re-bol (make-re-bol))
(define-record re-eol)  (define re-eol (make-re-eol))

;;; Char-set regexp over CHAR-SET:FULL, with its posix field built from
;;; the string ".".
(define re-any (make-re-char-set/posix char-set:full "." '#()))

;;; True iff RE is a char-set regexp whose cset is an actual char-set that
;;; satisfies CHAR-SET-FULL?. This is a test on contents, not on identity
;;; with RE-ANY.
(define (re-any? re)
  (if (re-char-set? re)
      (let ((cset (re-char-set:cset re)))
        (if (char-set? cset)            ; Might be code...
            (char-set-full? cset)
            #f))
      #f))

;;; Char-set regexp over every character except #\newline, with its posix
;;; field built from the string below.
(define re-nonl
  (make-re-char-set/posix (char-set-complement (char-set #\newline))
			  "[^\n]"
			  '#()))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; True iff X is an instance of one of the regexp record types defined in
;;; this file.
(define (regexp? x)
  (or
   ;; Leaf regexps
   (re-char-set? x)
   (re-string? x)
   (re-bos? x)
   (re-eos? x)
   (re-bol? x)
   (re-eol? x)
   ;; Compound regexps
   (re-seq? x)
   (re-choice? x)
   (re-repeat? x)
   (re-submatch? x)
   (re-dsm? x)))


;;; Return the total number of submatches bound in RE.

;;; Leaf regexps bind no submatches; every compound record carries its
;;; count in a TSM field. There is deliberately no ELSE clause, so the
;;; result is unspecified when RE is none of the regexp record types
;;; (MAKE-RE-SEQ and MAKE-RE-CHOICE test the result with NUMBER?).
(define (re-tsm re)
  (cond
   ((or (re-char-set? re)
        (re-string? re)
        (re-bos? re)
        (re-eos? re)
        (re-bol? re)
        (re-eol? re))
    0)
   ((re-dsm? re)      (re-dsm:tsm re))
   ((re-submatch? re) (re-submatch:tsm re))
   ((re-repeat? re)   (re-repeat:tsm re))
   ((re-choice? re)   (re-choice:tsm re))
   ((re-seq? re)      (re-seq:tsm re))))

;;; (flush-submatches re)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Return regular expression RE with all submatch-binding elements
;;; stripped out -- (= 0 (re-tsm (flush-submatches re))).

(define (flush-submatches re)
  (if (zero? (re-tsm re))
      re                                ; Nothing to strip.
      (cond
       ;; Containers are rebuilt with the smart constructors around their
       ;; flushed parts.
       ((re-seq? re)
        (re-seq (map flush-submatches (re-seq:elts re))))
       ((re-choice? re)
        (re-choice (map flush-submatches (re-choice:elts re))))
       ((re-repeat? re)
        (let ((lo   (re-repeat:from re))
              (hi   (re-repeat:to re))
              (body (flush-submatches (re-repeat:body re))))
          (re-repeat lo hi body)))

       ;; Submatch and DSM wrappers are dropped; only the flushed body
       ;; remains.
       ((re-submatch? re) (flush-submatches (re-submatch:body re)))
       ((re-dsm? re)      (flush-submatches (re-dsm:body re)))

       (else re))))


;;; Map F over ELTS. (F x) returns two values -- the "real" return value,
;;; and a "changed?" flag. If CHANGED? is false, then the "real" return value
;;; should be identical to the original argument X. MAP/CHANGED constructs
;;; the mapped list sharing as long an unchanged tail as possible with the
;;; list ELTS; if F changes no argument, MAP/CHANGED returns exactly the list
;;; ELTS. MAP/CHANGED returns two values: the mapped list, and a changed?
;;; flag for the entire list.

(define (map/changed f elts)
  (let walk ((lis elts))
    (if (not (pair? lis))
        (values '() #f)
        ;; The tail is processed before F is applied to the head.
        (call-with-values
            (lambda () (walk (cdr lis)))
          (lambda (new-tail tail-changed?)
            (call-with-values
                (lambda () (f (car lis)))
              (lambda (new-head head-changed?)
                (if (or tail-changed? head-changed?)
                    (values (cons new-head new-tail) #t)
                    (values lis #f)))))))))     ; Share the original list.


;;; (uncase re)
;;; Return a case-folded version of RE: char-sets are replaced by their
;;; UNCASE-CHAR-SET closure and strings by the result of UNCASE-STRING.
;;; Seq, choice, repeat, submatch and DSM records are descended into. A
;;; node is rebuilt only when something inside it changed; otherwise the
;;; original object is returned.
;;;
;;; Char-set fields must hold real char-sets here, not code.

(define (uncase re)
  (receive (new-re changed?)
      (let recur ((re re))
	(cond
	 ;; RE-TRIVIAL is a DSM record that is recognised by identity
	 ;; (RE-TRIVIAL? uses EQ?), so it must come back untouched rather
	 ;; than be rebuilt by the DSM clause below.
	 ((re-trivial? re) (values re #f))

	 ((re-seq? re)
	  (let ((elts (re-seq:elts re)))
	    (receive (new-elts elts-changed?)
		(map/changed recur elts)
	      (if elts-changed?
		  (values (make-re-seq/tsm new-elts (re-seq:tsm re)) #t)
		  (values re #f)))))

	 ((re-choice? re)
	  (let ((elts (re-choice:elts re)))
	    (receive (new-elts elts-changed?)
		(map/changed recur elts)
	      (if elts-changed?
		  (values (re-choice new-elts) #t)
		  (values re #f)))))

	 ((re-char-set? re)
	  (let* ((cs (re-char-set:cset re))
		 (new-cs (uncase-char-set cs))) ; Better not be code.
	    (if (char-set= cs new-cs)
		(values re #f)
		(values (make-re-char-set new-cs) #t))))

	 ((re-repeat? re)
	  (receive (new-body body-changed?) (recur (re-repeat:body re))
	    (if body-changed?
		(values (re-repeat (re-repeat:from re)
				   (re-repeat:to re)
				   new-body)
			#t)
		(values re #f))))

	 ((re-submatch? re)
	  (receive (new-body body-changed?) (recur (re-submatch:body re))
	    (if body-changed?
		(values (make-re-submatch/tsm new-body
					      (re-submatch:pre-dsm re)
					      (re-submatch:tsm     re))
			#t)
		(values re #f))))

	 ;; A DSM only adds dead submatches around its body, so the body
	 ;; has to be case-folded like any other sub-regexp. The pre-dsm
	 ;; and tsm fields are carried over unchanged.
	 ((re-dsm? re)
	  (receive (new-body body-changed?) (recur (re-dsm:body re))
	    (if body-changed?
		(values (make-re-dsm/tsm new-body
					 (re-dsm:pre-dsm re)
					 (re-dsm:tsm     re))
			#t)
		(values re #f))))

	 ((re-string? re)
	  (let ((cf-re (uncase-string (re-string:chars re))))
	    (if (re-string? cf-re)	; Still a plain string: no cased chars.
		(values re    #f)
		(values cf-re #t))))

	 (else (values re #f))))
    new-re))


;;; (uncase-char-set cs)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Return a char-set cs' such that cs' contains every char c in cs in both
;;; its upcase and downcase form.

;;; Builds the result in a fresh copy of the empty char-set, so CS itself
;;; is never modified.
(define (uncase-char-set cs)
  (let ((add-both-cases (lambda (c acc)
                          (char-set-adjoin! acc
                                            (char-downcase c)
                                            (char-upcase c))))
        (fresh (char-set-copy char-set:empty)))
    (char-set-fold add-both-cases fresh cs)))


;;; I actually make an effort to keep this a re-string
;;; if possible (if the string contains no case-sensitive
;;; characters). Returns a regexp matching the string in
;;; a case-insensitive fashion.

(define (uncase-string s)
  ;; A cased char becomes the doubleton char-set of its two case forms;
  ;; any other char stands for itself.
  (define (char->item c)
    (cond ((char-lower-case? c) (char-set c (char-upcase   c)))
          ((char-upper-case? c) (char-set c (char-downcase c)))
          (else c)))

  ;; PENDING is a reversed run of plain chars. If it is non-empty, push it
  ;; onto RES as one string regexp.
  (define (flush pending res)
    (if (pair? pending)
        (cons (make-re-string (list->string (reverse pending))) res)
        res))

  ;; Turn the item list into a list of regexps, coalescing adjacent plain
  ;; chars into a single string.
  (define (coalesce items pending)
    (cond ((not (pair? items))
           (flush pending '()))
          ((char? (car items))
           (coalesce (cdr items) (cons (car items) pending)))
          (else
           (flush pending
                  (cons (make-re-char-set (car items))
                        (coalesce (cdr items) '()))))))

  (let* ((items (string-fold-right (lambda (c acc) (cons (char->item c) acc))
                                   '() s))
         (res   (coalesce items '())))
    (if (and (pair? res) (null? (cdr res)))
        (car res)                       ; Exactly one piece: return it bare.
        (make-re-seq res))))


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; True iff CS has as many members as CHAR-SET:FULL. The size of the full
;;; set is computed once, when this definition is evaluated.
(define char-set-full?
  (let ((full-size (char-set-size char-set:full)))
    (lambda (cs)
      (= (char-set-size cs) full-size))))

;;; True iff CS has no members.
(define (char-set-empty? cs)
  (= 0 (char-set-size cs)))


;;; A "char-class" re is either a char-set re or a string re whose string
;;; has only one character.

;;; True for any char-set regexp, and for a string regexp of length 1.
(define (re-char-class? re)
  (cond ((re-char-set? re) #t)
        ((re-string? re)   (= 1 (string-length (re-string:chars re))))
        (else              #f)))

;;; Like RE-CHAR-CLASS?, but a char-set regexp only counts when its cset is
;;; an actual char-set rather than code.
(define (static-char-class? re)
  (cond ((re-char-set? re)
         (char-set? (re-char-set:cset re)))     ; This might be code.
        ((re-string? re)                        ; But never this, so no check.
         (= 1 (string-length (re-string:chars re))))
        (else #f)))