1999-07-05 23:45:37 -04:00
|
|
|
;;; The regexp data type
|
|
|
|
;;; Olin Shivers, January 1997, May 1998.
|
|
|
|
|
|
|
|
;;; A DSM around a choice gets absorbed into the choice's first elt.
|
|
|
|
;;; But this prevents it from being moved out into a containing
|
|
|
|
;;; choice or seq elt, or outer DSM. Fix.
|
|
|
|
|
|
|
|
;;; A regexp is a: dsm, submatch, seq, choice, repeat,
|
|
|
|
;;; char-set, string, bos, eos
|
|
|
|
|
|
|
|
;;; Deleted sub-match regexp
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;; This stands for a regexp containing TSM submatches, of which
|
|
|
|
;;; PRE-DSM come first as dead submatches, then the regexp BODY with its
|
|
|
|
;;; submatches, then POST-DSM as dead submatches.
|
|
|
|
|
|
|
|
(define-record-type re-dsm :re-dsm
|
|
|
|
(%%make-re-dsm body pre-dsm tsm posix)
|
|
|
|
re-dsm?
|
|
|
|
(body re-dsm:body) ; A Regexp
|
|
|
|
(pre-dsm re-dsm:pre-dsm) ; Integer -- initial dead submatches
|
|
|
|
(tsm re-dsm:tsm) ; Total submatch count
|
|
|
|
(posix re-dsm:posix set-re-dsm:posix)) ; Posix bits
|
|
|
|
|
|
|
|
(define (%make-re-dsm body pre-dsm tsm) (%%make-re-dsm body pre-dsm tsm #f))
|
|
|
|
|
|
|
|
;;; This is only used in code that the (RX ...) macro produces
|
|
|
|
;;; for static regexps.
|
|
|
|
(define (%make-re-dsm/posix body pre-dsm tsm posix-str tvec)
|
|
|
|
(%%make-re-dsm body pre-dsm tsm (new-cre posix-str tvec)))
|
|
|
|
|
|
|
|
(define (make-re-dsm body pre-dsm post-dsm)
|
|
|
|
(%make-re-dsm body pre-dsm (+ post-dsm pre-dsm (re-tsm body))))
|
|
|
|
|
|
|
|
;;; "Virtual field" for the RE-DSM record -- how many dead submatches
|
|
|
|
;;; come after the body:
|
|
|
|
|
|
|
|
(define (re-dsm:post-dsm re) ; Number of post-body DSM's =
|
|
|
|
(- (re-dsm:tsm re) ; total submatches
|
|
|
|
(+ (re-dsm:pre-dsm re) ; minus pre-body dead submatches
|
|
|
|
(re-tsm (re-dsm:body re))))) ; minus body's submatches.
|
|
|
|
|
|
|
|
;;; Slightly smart DSM constructor:
|
|
|
|
;;; - Absorb this DSM into an inner dsm, or submatch.
|
|
|
|
;;; - Punt unnecessary DSM's.
|
|
|
|
|
|
|
|
(define (re-dsm body pre-dsm post-dsm)
|
|
|
|
(let ((tsm (+ pre-dsm (re-tsm body) post-dsm)))
|
|
|
|
(receive (body1 pre-dsm1) (open-dsm body)
|
|
|
|
(let ((pre-dsm (+ pre-dsm pre-dsm1)))
|
|
|
|
|
|
|
|
(? ((= tsm (re-tsm body1)) body1) ; Trivial DSM
|
|
|
|
|
|
|
|
((re-submatch? body1) ; Absorb into submatch.
|
|
|
|
(%make-re-submatch (re-submatch:body body1)
|
|
|
|
(+ pre-dsm (re-submatch:pre-dsm body1))
|
|
|
|
tsm))
|
|
|
|
|
|
|
|
(else (%make-re-dsm body1 pre-dsm tsm))))))) ; Non-trivial DSM
|
|
|
|
|
|
|
|
;;; Take a regexp RE and return an equivalent (re', pre-dsm) pair of values.
|
|
|
|
;;; Recurses into DSM records. It is the case that
|
|
|
|
;;; (<= (+ pre-dsm (re-tsm re')) (re-tsm re))
|
|
|
|
;;; The post-dsm value is (- (re-tsm re) (re-tsm re') pre-dsm).
|
|
|
|
|
|
|
|
(define (open-dsm re)
|
|
|
|
(let lp ((re re) (pre-dsm 0))
|
|
|
|
(if (re-dsm? re)
|
|
|
|
(lp (re-dsm:body re) (+ pre-dsm (re-dsm:pre-dsm re)))
|
|
|
|
(values re pre-dsm))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; Sequence: (& re ...)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
(define-record-type re-seq :re-seq
|
|
|
|
(%%make-re-seq elts tsm posix)
|
|
|
|
re-seq?
|
|
|
|
(elts re-seq:elts) ; Regexp list
|
|
|
|
(tsm re-seq:tsm) ; Total submatch count
|
|
|
|
(posix re-seq:posix set-re-seq:posix)) ; Posix record
|
|
|
|
|
|
|
|
(define (%make-re-seq elts tsm) (%%make-re-seq elts tsm #f))
|
|
|
|
|
|
|
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
|
|
|
(define (%make-re-seq/posix elts tsm posix-str tvec)
|
|
|
|
(%%make-re-seq elts tsm (new-cre posix-str tvec)))
|
|
|
|
|
|
|
|
(define (make-re-seq res)
|
|
|
|
(%make-re-seq res
|
1999-07-11 16:41:27 -04:00
|
|
|
(fold (lambda (re sm-count) (+ (re-tsm re) sm-count))
|
|
|
|
0 res)))
|
1999-07-05 23:45:37 -04:00
|
|
|
|
|
|
|
;;; Slightly smart sequence constructor:
|
|
|
|
;;; - Flattens nested sequences
|
|
|
|
;;; - Drops trivial "" elements
|
|
|
|
;;; - Empty sequence => ""
|
|
|
|
;;; - Singleton sequence is reduced to its one element.
|
|
|
|
;;; - We don't descend into DSM's; too much work for this routine.
|
|
|
|
|
|
|
|
(define (re-seq res)
|
|
|
|
(let ((res (let recur ((res res)) ; Flatten nested seqs & drop ""'s.
|
|
|
|
(if (pair? res)
|
|
|
|
(let* ((re (car res))
|
|
|
|
(tail (recur (cdr res))))
|
|
|
|
(? ((re-seq? re) ; Flatten nested seqs
|
|
|
|
(append (recur (re-seq:elts re)) tail))
|
|
|
|
((trivial-re? re) tail) ; Drop trivial elts
|
|
|
|
(else (cons re tail))))
|
|
|
|
'()))))
|
|
|
|
|
|
|
|
(if (pair? res)
|
|
|
|
(if (pair? (cdr res))
|
|
|
|
(make-re-seq res) ; General case
|
|
|
|
(car res)) ; Singleton sequence
|
|
|
|
trivial-re))) ; Empty seq -- ""
|
|
|
|
|
|
|
|
|
|
|
|
;;; Choice: (| re ...)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
(define-record-type re-choice :re-choice
|
|
|
|
(%%make-re-choice elts tsm posix)
|
|
|
|
re-choice?
|
|
|
|
(elts re-choice:elts) ; List of rel-items
|
|
|
|
(tsm re-choice:tsm) ; Total submatch count
|
|
|
|
(posix re-choice:posix set-re-choice:posix)) ; Posix string
|
|
|
|
|
|
|
|
(define (%make-re-choice elts tsm) (%%make-re-choice elts tsm #f))
|
|
|
|
|
|
|
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
|
|
|
(define (%make-re-choice/posix elts tsm posix-str tvec)
|
|
|
|
(%%make-re-choice elts tsm (new-cre posix-str tvec)))
|
|
|
|
|
|
|
|
(define (make-re-choice res)
|
|
|
|
(%make-re-choice res
|
1999-07-11 16:41:27 -04:00
|
|
|
(fold (lambda (re sm-count) (+ (re-tsm re) sm-count))
|
|
|
|
0 res)))
|
1999-07-05 23:45:37 -04:00
|
|
|
|
|
|
|
;;; Slightly smart choice constructor:
|
|
|
|
;;; - Flattens nested choices
|
|
|
|
;;; - Drops empty (impossible) elements
|
|
|
|
;;; - Empty choice => empty-match
|
|
|
|
;;; - Singleton choice is reduced to its one element.
|
|
|
|
;;; - We don't descend into DSM's; too much work for this routine.
|
|
|
|
;;;
|
|
|
|
;;; This routine guarantees to preserve char-classness -- if it is applied
|
|
|
|
;;; to a list of char-class regexps (char-set and singleton-string re's),
|
|
|
|
;;; it will return a char-class regexp.
|
|
|
|
|
|
|
|
(define (re-choice res)
|
|
|
|
(let ((res (let recur ((res res)) ; Flatten nested choices
|
|
|
|
(if (pair? res) ; & drop empty re's.
|
|
|
|
(let* ((re (car res))
|
|
|
|
(tail (recur (cdr res))))
|
|
|
|
(? ((re-choice? re) ; Flatten nested choices
|
|
|
|
(append (recur (re-choice:elts re)) tail))
|
|
|
|
((empty-re? re) tail) ; Drop empty re's.
|
|
|
|
(else (cons re tail))))
|
|
|
|
'()))))
|
|
|
|
;; If all elts are char-class re's, fold them together.
|
1999-07-11 16:41:27 -04:00
|
|
|
(if (every static-char-class? res)
|
1999-07-05 23:45:37 -04:00
|
|
|
(let ((cset (apply char-set-union
|
|
|
|
(map (lambda (elt)
|
|
|
|
(if (re-char-set? elt)
|
|
|
|
(re-char-set:cset elt)
|
|
|
|
(string->char-set (re-string:chars elt))))
|
|
|
|
res))))
|
|
|
|
(if (= 1 (char-set-size cset))
|
|
|
|
(make-re-string (apply string (char-set-members cset)))
|
|
|
|
(make-re-char-set cset)))
|
|
|
|
|
|
|
|
(if (pair? res)
|
|
|
|
(if (pair? (cdr res))
|
|
|
|
(make-re-choice res) ; General case
|
|
|
|
(car res)) ; Singleton sequence
|
|
|
|
empty-re)))) ; Empty choice = ("")
|
|
|
|
|
|
|
|
;;; Repetition (*,?,+,=,>=,**)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;; The repeat record's body contains all of the repeat record's submatches --
|
|
|
|
;;; there is no pre-dsm field allowing for initial & trailing dead submatches.
|
|
|
|
;;; This is not a limit on expressiveness because repeat commutes with dsm --
|
|
|
|
;;; we can always move submatches that come before and after body to an outer
|
|
|
|
;;; DSM. Hence
|
|
|
|
;;; (= (re-repeat:tsm re) (re-tsm (re-repeat:body re)))
|
|
|
|
|
|
|
|
(define-record-type re-repeat :re-repeat
|
|
|
|
(%%make-re-repeat from to body tsm posix)
|
|
|
|
re-repeat?
|
|
|
|
(from re-repeat:from) ; Integer (Macro expander abuses.)
|
|
|
|
(to re-repeat:to) ; Integer or #f for infinite (Macro expander abuses.)
|
|
|
|
(body re-repeat:body) ; Regexp
|
|
|
|
(tsm re-repeat:tsm) ; Total submatch count
|
|
|
|
(posix re-repeat:posix set-re-repeat:posix)) ; Posix record
|
|
|
|
|
|
|
|
(define (%make-re-repeat from to body tsm)
|
|
|
|
(%%make-re-repeat from to body tsm #f))
|
|
|
|
|
|
|
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
|
|
|
(define (%make-re-repeat/posix from to body tsm posix-str tvec)
|
|
|
|
(%%make-re-repeat from to body tsm (new-cre posix-str tvec)))
|
|
|
|
|
|
|
|
(define (make-re-repeat from to body)
|
|
|
|
(%make-re-repeat (check-arg (lambda (from)
|
|
|
|
(or (not (integer? from)) ; Dynamic
|
|
|
|
(>= from 0)))
|
|
|
|
from
|
|
|
|
make-re-repeat)
|
|
|
|
(check-arg (lambda (to)
|
|
|
|
(or (not (integer? to)) ; #f or dynamic
|
|
|
|
(and (integer? to) (>= to 0))))
|
|
|
|
to
|
|
|
|
make-re-repeat)
|
|
|
|
body
|
|
|
|
(re-tsm body)))
|
|
|
|
|
|
|
|
;;; Slightly smart repeat constructor
|
|
|
|
;;; - Flattens nested repeats.
|
|
|
|
;;; - re{1,1}, re{0,0}, and re{m,n} where m>n reduced.
|
|
|
|
;;; - If re is empty-match: from=0 => "", from>0 => empty-match.
|
|
|
|
;;; - If re is eos, bos, or "", and to <= from, reduce to simply re.
|
|
|
|
;;; - Commutes into DSM records.
|
|
|
|
|
|
|
|
(define (re-repeat from to body)
|
|
|
|
(receive (re pre-dsm) (reduce-repeat from to body 0)
|
|
|
|
(re-dsm re pre-dsm (- (re-tsm body) (+ pre-dsm (re-tsm re))))))
|
|
|
|
|
|
|
|
;;; This guy does all the work (and is also called by the repeat simplifier)
|
|
|
|
|
|
|
|
(define (reduce-repeat from to body pre-dsm)
|
|
|
|
(receive (from to body1 pre-dsm)
|
|
|
|
;; Collapse nested repeats and dsm's:
|
|
|
|
(let iter ((from from) (to to) (body body) (dsm0 pre-dsm))
|
|
|
|
(receive (body body-dsm0) (open-dsm body)
|
|
|
|
(let ((dsm0 (+ dsm0 body-dsm0)))
|
|
|
|
(if (and (integer? from) ; Stop if FROM or TO
|
|
|
|
(or (not to) (integer? to)) ; are code.
|
|
|
|
(re-repeat? body))
|
|
|
|
(let ((bfrom (re-repeat:from body))
|
|
|
|
(bto (re-repeat:to body))
|
|
|
|
(bbody (re-repeat:body body)))
|
|
|
|
(if (or (not (integer? bfrom)) ; Stop if bfrom or
|
|
|
|
(and bto (not (integer? bto)))) ; bto are code.
|
|
|
|
(values from to body dsm0)
|
|
|
|
(iter (* from bfrom)
|
|
|
|
(and to bto (* to bto))
|
|
|
|
bbody
|
|
|
|
dsm0)))
|
|
|
|
(values from to body dsm0)))))
|
|
|
|
|
|
|
|
(? ((and (eqv? from 1) (eqv? to 1)) ; re{1,1} => re
|
|
|
|
(values body1 pre-dsm))
|
|
|
|
|
|
|
|
((and (eqv? from 0) (eqv? to 0)) ; re{0,0} => ""
|
|
|
|
(values trivial-re (+ (re-tsm body1) pre-dsm)))
|
|
|
|
|
|
|
|
;; re{m,n} => empty-re when m>n:
|
|
|
|
((and (integer? from) (integer? to) (> from to))
|
|
|
|
(values empty-re (+ (re-tsm body1) pre-dsm)))
|
|
|
|
|
|
|
|
;; Reduce the body = empty-re case.
|
|
|
|
((and (empty-re? body1) (integer? from)) ; (+ (in)) => (in)
|
|
|
|
(values (if (> from 0) empty-re trivial-re) ; (* (in)) => ""
|
|
|
|
pre-dsm))
|
|
|
|
|
|
|
|
;; If BODY1 is eos, bos, or "", and m<=n, reduce to simply BODY1.
|
|
|
|
((and (integer? from)
|
|
|
|
(or (and (integer? to) (<= from to)) (not to))
|
|
|
|
(or (re-eos? body1)
|
|
|
|
(re-bos? body1)
|
|
|
|
(and (re-string? body1)
|
|
|
|
(string=? "" (re-string:chars body1)))))
|
|
|
|
(values body1 pre-dsm))
|
|
|
|
|
|
|
|
(else (values (make-re-repeat from to body1) ; general case
|
|
|
|
pre-dsm)))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; Submatch
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;; A submatch record introduces a new submatch. This is followed by
|
|
|
|
;;; PRE-DSM dead submatches (caused by simplifying the body), then the
|
|
|
|
;;; BODY, then perhaps more dead submatches, all for a total of TSM
|
|
|
|
;;; submatches.
|
|
|
|
|
|
|
|
(define-record-type re-submatch :re-submatch
|
|
|
|
(%%make-re-submatch body pre-dsm tsm posix)
|
|
|
|
re-submatch?
|
|
|
|
(body re-submatch:body) ; Regexp
|
|
|
|
(pre-dsm re-submatch:pre-dsm) ; Deleted submatches preceding the body
|
|
|
|
(tsm re-submatch:tsm) ; Total submatch count for the record
|
|
|
|
(posix re-submatch:posix set-re-submatch:posix)) ; Posix string
|
|
|
|
|
|
|
|
(define (%make-re-submatch body pre-dsm tsm)
|
|
|
|
(%%make-re-submatch body pre-dsm tsm #f))
|
|
|
|
|
|
|
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
|
|
|
(define (%make-re-submatch/posix body pre-dsm tsm posix-str tvec)
|
|
|
|
(%%make-re-submatch body pre-dsm tsm (new-cre posix-str tvec)))
|
|
|
|
|
|
|
|
|
|
|
|
;;; "Virtual field" for the RE-SUBMATCH record -- how many dead submatches
|
|
|
|
;;; come after the body:
|
|
|
|
|
|
|
|
(define (re-submatch:post-dsm re) ; Number of post-body DSM's =
|
|
|
|
(- (re-submatch:tsm re) ; total submatches
|
|
|
|
(+ 1 ; minus *this* submatch
|
|
|
|
(re-submatch:pre-dsm re) ; minus pre-body dead submatches
|
|
|
|
(re-tsm (re-submatch:body re))))); minus body's submatches.
|
|
|
|
|
|
|
|
(define (make-re-submatch body . maybe-pre+post-dsm)
|
|
|
|
(let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
|
|
|
|
(%make-re-submatch body pre-dsm (+ pre-dsm 1 (re-tsm body) post-dsm))))
|
|
|
|
|
|
|
|
;;; Slightly smart submatch constructor
|
|
|
|
;;; - DSM's unpacked
|
|
|
|
;;; - If BODY is the empty-re, we'll never match, so just produce a DSM.
|
|
|
|
|
|
|
|
(define (re-submatch body . maybe-pre+post-dsm)
|
|
|
|
(let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
|
|
|
|
(let ((tsm (+ 1 pre-dsm (re-tsm body) post-dsm)))
|
|
|
|
(receive (body1 pre-dsm1) (open-dsm body)
|
|
|
|
(if (empty-re? body1)
|
|
|
|
(re-dsm empty-re tsm 0)
|
|
|
|
(%make-re-submatch body1 (+ pre-dsm pre-dsm1) tsm))))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; Other regexps : string, char-set, bos & eos
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;; Also, empty-re and trivial-re.
|
|
|
|
|
|
|
|
(define-record re-string
|
|
|
|
chars ; String
|
|
|
|
(posix #f) ; Posix record
|
|
|
|
((disclose self) (list "re-string" (re-string:chars self))))
|
|
|
|
|
|
|
|
(define re-string make-re-string) ; For consistency w/other re makers.
|
|
|
|
|
|
|
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
|
|
|
(define (make-re-string/posix chars posix-str tvec)
|
|
|
|
(let ((re (make-re-string chars)))
|
|
|
|
(set-re-string:posix re (new-cre posix-str tvec))
|
|
|
|
re))
|
|
|
|
|
|
|
|
;;; Matches the empty string anywhere.
|
|
|
|
(define trivial-re (make-re-string/posix "" "" '#()))
|
|
|
|
|
|
|
|
(define (trivial-re? re)
|
|
|
|
(and (re-string? re) (zero? (string-length (re-string:chars re)))))
|
|
|
|
|
|
|
|
(define-record re-char-set
|
|
|
|
cset ; A character set (Macro expander abuses.)
|
|
|
|
(posix #f)) ; Posix record
|
|
|
|
|
|
|
|
(define re-char-set make-re-char-set) ; For consistency w/other re makers.
|
|
|
|
|
|
|
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
|
|
|
(define (make-re-char-set/posix cs posix-str tvec)
|
|
|
|
(let ((re (make-re-char-set cs)))
|
|
|
|
(set-re-char-set:posix re (new-cre posix-str tvec))
|
|
|
|
re))
|
|
|
|
|
|
|
|
;;; Never matches
|
|
|
|
;;; NEED TO OPTIMIZE - PRE-SET POSIX FIELD.
|
|
|
|
(define empty-re (make-re-char-set char-set:empty))
|
|
|
|
|
|
|
|
(define (empty-re? re)
|
|
|
|
(and (re-char-set? re)
|
|
|
|
(let ((cs (re-char-set:cset re)))
|
|
|
|
(and (char-set? cs) ; Might be code...
|
|
|
|
(char-set-empty? cs)))))
|
|
|
|
|
|
|
|
(define-record re-bos) (define re-bos (make-re-bos))
|
|
|
|
(define-record re-eos) (define re-eos (make-re-eos))
|
|
|
|
|
|
|
|
(define-record re-bol) (define re-bol (make-re-bol))
|
|
|
|
(define-record re-eol) (define re-eol (make-re-eol))
|
|
|
|
|
|
|
|
(define-record re-bow) (define re-bow (make-re-bow))
|
|
|
|
(define-record re-eow) (define re-eow (make-re-eow))
|
|
|
|
|
|
|
|
|
|
|
|
(define re-any (make-re-char-set/posix char-set:full "." '#()))
|
|
|
|
|
|
|
|
(define (re-any? re)
|
|
|
|
(and (re-char-set? re)
|
|
|
|
(let ((cs (re-char-set:cset re)))
|
|
|
|
(and (char-set? cs) ; Might be code...
|
|
|
|
(char-set-full? cs)))))
|
|
|
|
|
|
|
|
(define re-nonl (make-re-char-set/posix (char-set #\newline) "[^\n]" '#()))
|
|
|
|
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
(define (regexp? x)
|
|
|
|
(or (re-seq? x) (re-choice? x) (re-repeat? x)
|
|
|
|
(re-char-set? x) (re-string? x)
|
|
|
|
(re-bos? x) (re-eos? x)
|
|
|
|
(re-bol? x) (re-eol? x)
|
|
|
|
(re-bow? x) (re-eow? x)
|
|
|
|
(re-submatch? x) (re-dsm? x)))
|
|
|
|
|
|
|
|
|
|
|
|
;;; Return the total number of submatches bound in RE.
|
|
|
|
|
|
|
|
(define (re-tsm re)
|
|
|
|
(? ((re-seq? re) (re-seq:tsm re))
|
|
|
|
((re-choice? re) (re-choice:tsm re))
|
|
|
|
((re-repeat? re) (re-repeat:tsm re))
|
|
|
|
((re-dsm? re) (re-dsm:tsm re))
|
|
|
|
((re-submatch? re) (re-submatch:tsm re))
|
|
|
|
(else 0)))
|
|
|
|
|
|
|
|
|
|
|
|
(define re-word
|
|
|
|
(let ((wcs (char-set-union char-set:alphanumeric ; Word chars
|
|
|
|
(char-set #\_))))
|
|
|
|
(make-re-seq (list re-bow
|
|
|
|
(make-re-repeat 1 #f (make-re-char-set wcs))
|
|
|
|
re-eow))))
|
|
|
|
|
|
|
|
|
|
|
|
;;; (flush-submatches re)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;; Return regular expression RE with all submatch-binding elements
|
|
|
|
;;; stripped out -- (= 0 (re-tsm (flush-submatches re))).
|
|
|
|
|
|
|
|
(define (flush-submatches re)
|
|
|
|
(? ((zero? (re-tsm re)) re) ; RE has no submatches.
|
|
|
|
|
|
|
|
((re-seq? re) (re-seq (map flush-submatches (re-seq:elts re))))
|
|
|
|
((re-choice? re) (re-choice (map flush-submatches (re-choice:elts re))))
|
|
|
|
|
|
|
|
((re-repeat? re) (re-repeat (re-repeat:from re)
|
|
|
|
(re-repeat:to re)
|
|
|
|
(flush-submatches (re-repeat:body re))))
|
|
|
|
|
|
|
|
((re-submatch? re) (flush-submatches (re-submatch:body re)))
|
|
|
|
((re-dsm? re) (flush-submatches (re-dsm:body re)))
|
|
|
|
|
|
|
|
(else re)))
|
|
|
|
|
|
|
|
|
|
|
|
;;; Map F over ELTS. (F x) returns two values -- the "real" return value,
|
|
|
|
;;; and a "changed?" flag. If CHANGED? is false, then the "real" return value
|
|
|
|
;;; should be identical to the original argument X. MAP/CHANGED constructs
|
|
|
|
;;; the mapped list sharing as long an unchanged tail as possible with the
|
|
|
|
;;; list ELTS; if F changes no argument, MAP/CHANGED returns exactly the list
|
|
|
|
;;; ELTS. MAP/CHANGED returns two values: the mapped list, and a changed?
|
|
|
|
;;; flag for the entire list.
|
|
|
|
|
|
|
|
(define (map/changed f elts)
|
|
|
|
(let recur ((elts elts))
|
|
|
|
(if (pair? elts)
|
|
|
|
(let ((elt (car elts)))
|
|
|
|
(receive (new-elts elts-changed?) (recur (cdr elts))
|
|
|
|
(receive (new-elt elt-changed?) (f elt)
|
|
|
|
(if (or elts-changed? elt-changed?)
|
|
|
|
(values (cons new-elt new-elts) #t)
|
|
|
|
(values elts #f)))))
|
|
|
|
(values '() #f))))
|
|
|
|
|
|
|
|
|
|
|
|
(define (uncase re)
|
|
|
|
(receive (new-re changed?)
|
|
|
|
(let recur ((re re))
|
|
|
|
(? ((re-seq? re)
|
|
|
|
(let ((elts (re-seq:elts re)))
|
|
|
|
(receive (new-elts elts-changed?)
|
|
|
|
(map/changed recur elts)
|
|
|
|
(if elts-changed?
|
|
|
|
(values (%make-re-seq new-elts (re-seq:tsm re)) #t)
|
|
|
|
(values re #f)))))
|
|
|
|
|
|
|
|
((re-choice? re)
|
|
|
|
(let ((elts (re-choice:elts re)))
|
|
|
|
(receive (new-elts elts-changed?)
|
|
|
|
(map/changed recur elts)
|
|
|
|
(if elts-changed?
|
|
|
|
(values (re-choice new-elts) #t)
|
|
|
|
(values re #f)))))
|
|
|
|
|
|
|
|
((re-char-set? re)
|
|
|
|
(let* ((cs (re-char-set:cset re))
|
|
|
|
(new-cs (uncase-char-set cs))) ; Better not be code.
|
|
|
|
(if (char-set= cs new-cs)
|
|
|
|
(values re #f)
|
|
|
|
(values (make-re-char-set new-cs) #t))))
|
|
|
|
|
|
|
|
((re-repeat? re)
|
|
|
|
(receive (new-body body-changed?) (recur (re-repeat:body re))
|
|
|
|
(if body-changed?
|
|
|
|
(values (re-repeat (re-repeat:from re)
|
|
|
|
(re-repeat:to re)
|
|
|
|
new-body)
|
|
|
|
#t)
|
|
|
|
(values re #f))))
|
|
|
|
|
|
|
|
((re-submatch? re)
|
|
|
|
(receive (new-body body-changed?) (recur (re-submatch? re))
|
|
|
|
(if body-changed?
|
|
|
|
(values (%make-re-submatch new-body
|
|
|
|
(re-submatch:pre-dsm re)
|
|
|
|
(re-submatch:tsm re))
|
|
|
|
#t)
|
|
|
|
(values re #f))))
|
|
|
|
|
|
|
|
((re-string? re)
|
|
|
|
(let ((cf-re (uncase-string (re-string:chars re))))
|
|
|
|
(if (re-string? cf-re)
|
|
|
|
(values re #f)
|
|
|
|
(values cf-re #t))))
|
|
|
|
|
|
|
|
(else (values re #f))))
|
|
|
|
new-re))
|
|
|
|
|
|
|
|
|
|
|
|
;;; (uncase-char-set cs)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;; Return a char-set cs' such that cs' contains every char c in cs in both
|
|
|
|
;;; its upcase and downcase form.
|
|
|
|
|
|
|
|
(define (uncase-char-set cs)
|
|
|
|
(char-set-fold (lambda (c new-cset)
|
|
|
|
(char-set-adjoin! new-cset
|
|
|
|
(char-downcase c)
|
|
|
|
(char-upcase c)))
|
|
|
|
(char-set-copy char-set:empty)
|
|
|
|
cs))
|
|
|
|
|
|
|
|
|
|
|
|
;;; I actually make an effort to keep this a re-string
|
|
|
|
;;; if possible (if the string contains no case-sensitive
|
|
|
|
;;; characters). Returns a regexp matching the string in
|
|
|
|
;;; a case-insensitive fashion.
|
|
|
|
|
|
|
|
(define (uncase-string s)
|
|
|
|
;; SEQ is a list of chars and doubleton char-sets.
|
1999-07-11 16:41:27 -04:00
|
|
|
(let* ((seq (string-fold-right (lambda (c lis)
|
|
|
|
(cons (? ((char-lower-case? c) (char-set c (char-upcase c)))
|
|
|
|
((char-upper-case? c) (char-set c (char-downcase c)))
|
|
|
|
(else c))
|
|
|
|
lis))
|
1999-07-05 23:45:37 -04:00
|
|
|
'() s))
|
|
|
|
|
|
|
|
;; Coalesce adjacent chars together into a string.
|
|
|
|
(fixup (lambda (chars seq)
|
|
|
|
(if (pair? chars)
|
|
|
|
(cons (make-re-string (list->string (reverse chars)))
|
|
|
|
seq)
|
|
|
|
seq)))
|
|
|
|
|
|
|
|
(new-seq (let recur ((seq seq) (chars '()))
|
|
|
|
(if (pair? seq)
|
|
|
|
(let ((elt (car seq))
|
|
|
|
(seq (cdr seq)))
|
|
|
|
(if (char? elt)
|
|
|
|
(recur seq (cons elt chars))
|
|
|
|
(fixup chars (cons (make-re-char-set elt)
|
|
|
|
(recur seq '())))))
|
|
|
|
(fixup chars '())))))
|
|
|
|
|
|
|
|
(if (= 1 (length new-seq)) (car new-seq)
|
|
|
|
(make-re-seq new-seq))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
(define char-set-full?
|
|
|
|
(let ((allchars-nchars (char-set-size char-set:full)))
|
|
|
|
(lambda (cs) (= allchars-nchars (char-set-size cs)))))
|
|
|
|
|
|
|
|
(define (char-set-empty? cs) (zero? (char-set-size cs)))
|
|
|
|
|
|
|
|
|
|
|
|
;;; A "char-class" re is either a char-set re or a string re whose string
|
|
|
|
;;; has only one character.
|
|
|
|
|
|
|
|
(define (re-char-class? re)
|
|
|
|
(or (re-char-set? re)
|
|
|
|
(and (re-string? re)
|
|
|
|
(= 1 (string-length (re-string:chars re))))))
|
|
|
|
|
|
|
|
(define (static-char-class? re)
|
|
|
|
(or (and (re-char-set? re)
|
|
|
|
(char-set? (re-char-set:cset re))) ; This might be code.
|
|
|
|
(and (re-string? re) ; But never this, so no check.
|
|
|
|
(= 1 (string-length (re-string:chars re))))))
|