scsh-0.5/scsh/rx/re.scm

596 lines
20 KiB
Scheme

;;; The regexp data type
;;; Olin Shivers, January 1997, May 1998.
;;; A DSM around a choice gets absorbed into the choice's first elt.
;;; But this prevents it from being moved out into a containing
;;; choice or seq elt, or outer DSM. Fix.
;;; A regexp is a: dsm, submatch, seq, choice, repeat,
;;; char-set, string, bos, eos
;;; Deleted sub-match regexp
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; This stands for a regexp containing TSM submatches, of which
;;; PRE-DSM come first as dead submatches, then the regexp BODY with its
;;; submatches, then POST-DSM as dead submatches.
(define-record-type re-dsm :re-dsm
(%%make-re-dsm body pre-dsm tsm posix)
re-dsm?
(body re-dsm:body) ; A Regexp
(pre-dsm re-dsm:pre-dsm) ; Integer -- initial dead submatches
(tsm re-dsm:tsm) ; Total submatch count
(posix re-dsm:posix set-re-dsm:posix)) ; Posix bits
(define (%make-re-dsm body pre-dsm tsm) (%%make-re-dsm body pre-dsm tsm #f))
;;; This is only used in code that the (RX ...) macro produces
;;; for static regexps.
(define (%make-re-dsm/posix body pre-dsm tsm posix-str tvec)
(%%make-re-dsm body pre-dsm tsm (new-cre posix-str tvec)))
(define (make-re-dsm body pre-dsm post-dsm)
(%make-re-dsm body pre-dsm (+ post-dsm pre-dsm (re-tsm body))))
;;; "Virtual field" for the RE-DSM record -- how many dead submatches
;;; come after the body:
(define (re-dsm:post-dsm re) ; Number of post-body DSM's =
(- (re-dsm:tsm re) ; total submatches
(+ (re-dsm:pre-dsm re) ; minus pre-body dead submatches
(re-tsm (re-dsm:body re))))) ; minus body's submatches.
;;; Slightly smart DSM constructor:
;;; - Absorb this DSM into an inner dsm, or submatch.
;;; - Punt unnecessary DSM's.
(define (re-dsm body pre-dsm post-dsm)
(let ((tsm (+ pre-dsm (re-tsm body) post-dsm)))
(receive (body1 pre-dsm1) (open-dsm body)
(let ((pre-dsm (+ pre-dsm pre-dsm1)))
(? ((= tsm (re-tsm body1)) body1) ; Trivial DSM
((re-submatch? body1) ; Absorb into submatch.
(%make-re-submatch (re-submatch:body body1)
(+ pre-dsm (re-submatch:pre-dsm body1))
tsm))
(else (%make-re-dsm body1 pre-dsm tsm))))))) ; Non-trivial DSM
;;; Take a regexp RE and return an equivalent (re', pre-dsm) pair of values.
;;; Recurses into DSM records. It is the case that
;;; (<= (+ pre-dsm (re-tsm re')) (re-tsm re))
;;; The post-dsm value is (- (re-tsm re) (re-tsm re') pre-dsm).
(define (open-dsm re)
(let lp ((re re) (pre-dsm 0))
(if (re-dsm? re)
(lp (re-dsm:body re) (+ pre-dsm (re-dsm:pre-dsm re)))
(values re pre-dsm))))
;;; Sequence: (& re ...)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define-record-type re-seq :re-seq
(%%make-re-seq elts tsm posix)
re-seq?
(elts re-seq:elts) ; Regexp list
(tsm re-seq:tsm) ; Total submatch count
(posix re-seq:posix set-re-seq:posix)) ; Posix record
(define (%make-re-seq elts tsm) (%%make-re-seq elts tsm #f))
;;; This is only used in code that (RE ...) macro produces for static regexps.
(define (%make-re-seq/posix elts tsm posix-str tvec)
(%%make-re-seq elts tsm (new-cre posix-str tvec)))
(define (make-re-seq res)
(%make-re-seq res
(fold (lambda (re sm-count) (+ (re-tsm re) sm-count))
0 res)))
;;; Slightly smart sequence constructor:
;;; - Flattens nested sequences
;;; - Drops trivial "" elements
;;; - Empty sequence => ""
;;; - Singleton sequence is reduced to its one element.
;;; - We don't descend into DSM's; too much work for this routine.
(define (re-seq res)
(let ((res (let recur ((res res)) ; Flatten nested seqs & drop ""'s.
(if (pair? res)
(let* ((re (car res))
(tail (recur (cdr res))))
(? ((re-seq? re) ; Flatten nested seqs
(append (recur (re-seq:elts re)) tail))
((re-trivial? re) tail) ; Drop trivial elts
(else (cons re tail))))
'()))))
(if (pair? res)
(if (pair? (cdr res))
(make-re-seq res) ; General case
(car res)) ; Singleton sequence
re-trivial))) ; Empty seq -- ""
;;; Choice: (| re ...)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define-record-type re-choice :re-choice
(%%make-re-choice elts tsm posix)
re-choice?
(elts re-choice:elts) ; List of rel-items
(tsm re-choice:tsm) ; Total submatch count
(posix re-choice:posix set-re-choice:posix)) ; Posix string
(define (%make-re-choice elts tsm) (%%make-re-choice elts tsm #f))
;;; This is only used in code that (RE ...) macro produces for static regexps.
(define (%make-re-choice/posix elts tsm posix-str tvec)
(%%make-re-choice elts tsm (new-cre posix-str tvec)))
(define (make-re-choice res)
(%make-re-choice res
(fold (lambda (re sm-count) (+ (re-tsm re) sm-count))
0 res)))
;;; Slightly smart choice constructor:
;;; - Flattens nested choices
;;; - Drops empty (impossible) elements
;;; - Empty choice => empty-match
;;; - Singleton choice is reduced to its one element.
;;; - We don't descend into DSM's; too much work for this routine.
;;;
;;; This routine guarantees to preserve char-classness -- if it is applied
;;; to a list of char-class regexps (char-set and singleton-string re's),
;;; it will return a char-class regexp.
(define (re-choice res)
(let ((res (let recur ((res res)) ; Flatten nested choices
(if (pair? res) ; & drop empty re's.
(let* ((re (car res))
(tail (recur (cdr res))))
(? ((re-choice? re) ; Flatten nested choices
(append (recur (re-choice:elts re)) tail))
((re-empty? re) tail) ; Drop empty re's.
(else (cons re tail))))
'()))))
;; If all elts are char-class re's, fold them together.
(if (every static-char-class? res)
(let ((cset (apply char-set-union
(map (lambda (elt)
(if (re-char-set? elt)
(re-char-set:cset elt)
(string->char-set (re-string:chars elt))))
res))))
(if (= 1 (char-set-size cset))
(make-re-string (apply string (char-set-members cset)))
(make-re-char-set cset)))
(if (pair? res)
(if (pair? (cdr res))
(make-re-choice res) ; General case
(car res)) ; Singleton sequence
re-empty)))) ; Empty choice = ("")
;;; Repetition (*,?,+,=,>=,**)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; The repeat record's body contains all of the repeat record's submatches --
;;; there is no pre-dsm field allowing for initial & trailing dead submatches.
;;; This is not a limit on expressiveness because repeat commutes with dsm --
;;; we can always move submatches that come before and after body to an outer
;;; DSM. Hence
;;; (= (re-repeat:tsm re) (re-tsm (re-repeat:body re)))
(define-record-type re-repeat :re-repeat
(%%make-re-repeat from to body tsm posix)
re-repeat?
(from re-repeat:from) ; Integer (Macro expander abuses.)
(to re-repeat:to) ; Integer or #f for infinite (Macro expander abuses.)
(body re-repeat:body) ; Regexp
(tsm re-repeat:tsm) ; Total submatch count
(posix re-repeat:posix set-re-repeat:posix)) ; Posix record
(define (%make-re-repeat from to body tsm)
(%%make-re-repeat from to body tsm #f))
;;; This is only used in code that (RE ...) macro produces for static regexps.
(define (%make-re-repeat/posix from to body tsm posix-str tvec)
(%%make-re-repeat from to body tsm (new-cre posix-str tvec)))
(define (make-re-repeat from to body)
(%make-re-repeat (check-arg (lambda (from)
(or (not (integer? from)) ; Dynamic
(>= from 0)))
from
make-re-repeat)
(check-arg (lambda (to)
(or (not (integer? to)) ; #f or dynamic
(and (integer? to) (>= to 0))))
to
make-re-repeat)
body
(re-tsm body)))
;;; Slightly smart repeat constructor
;;; - Flattens nested repeats.
;;; - re{1,1}, re{0,0}, and re{m,n} where m>n reduced.
;;; - If re is empty-match: from=0 => "", from>0 => empty-match.
;;; - If re is eos, bos, or "", and to <= from, reduce to simply re.
;;; - Commutes into DSM records.
(define (re-repeat from to body)
(receive (re pre-dsm) (reduce-repeat from to body 0)
(re-dsm re pre-dsm (- (re-tsm body) (+ pre-dsm (re-tsm re))))))
;;; This guy does all the work (and is also called by the repeat simplifier)
(define (reduce-repeat from to body pre-dsm)
(receive (from to body1 pre-dsm)
;; Collapse nested repeats and dsm's:
(let iter ((from from) (to to) (body body) (dsm0 pre-dsm))
(receive (body body-dsm0) (open-dsm body)
(let ((dsm0 (+ dsm0 body-dsm0)))
(if (and (integer? from) ; Stop if FROM or TO
(or (not to) (integer? to)) ; are code.
(re-repeat? body))
(let ((bfrom (re-repeat:from body))
(bto (re-repeat:to body))
(bbody (re-repeat:body body)))
(if (or (not (integer? bfrom)) ; Stop if bfrom or
(and bto (not (integer? bto)))) ; bto are code.
(values from to body dsm0)
(iter (* from bfrom)
(and to bto (* to bto))
bbody
dsm0)))
(values from to body dsm0)))))
(? ((and (eqv? from 1) (eqv? to 1)) ; re{1,1} => re
(values body1 pre-dsm))
((and (eqv? from 0) (eqv? to 0)) ; re{0,0} => ""
(values re-trivial (+ (re-tsm body1) pre-dsm)))
;; re{m,n} => re-empty when m>n:
((and (integer? from) (integer? to) (> from to))
(values re-empty (+ (re-tsm body1) pre-dsm)))
;; Reduce the body = re-empty case.
((and (re-empty? body1) (integer? from)) ; (+ (in)) => (in)
(values (if (> from 0) re-empty re-trivial) ; (* (in)) => ""
pre-dsm))
;; If BODY1 is eos, bos, or "", and m<=n, reduce to simply BODY1.
((and (integer? from)
(or (and (integer? to) (<= from to)) (not to))
(or (re-eos? body1)
(re-bos? body1)
(and (re-string? body1)
(string=? "" (re-string:chars body1)))))
(values body1 pre-dsm))
(else (values (make-re-repeat from to body1) ; general case
pre-dsm)))))
;;; Submatch
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; A submatch record introduces a new submatch. This is followed by
;;; PRE-DSM dead submatches (caused by simplifying the body), then the
;;; BODY, then perhaps more dead submatches, all for a total of TSM
;;; submatches.
(define-record-type re-submatch :re-submatch
(%%make-re-submatch body pre-dsm tsm posix)
re-submatch?
(body re-submatch:body) ; Regexp
(pre-dsm re-submatch:pre-dsm) ; Deleted submatches preceding the body
(tsm re-submatch:tsm) ; Total submatch count for the record
(posix re-submatch:posix set-re-submatch:posix)) ; Posix string
(define (%make-re-submatch body pre-dsm tsm)
(%%make-re-submatch body pre-dsm tsm #f))
;;; This is only used in code that (RE ...) macro produces for static regexps.
(define (%make-re-submatch/posix body pre-dsm tsm posix-str tvec)
(%%make-re-submatch body pre-dsm tsm (new-cre posix-str tvec)))
;;; "Virtual field" for the RE-SUBMATCH record -- how many dead submatches
;;; come after the body:
(define (re-submatch:post-dsm re) ; Number of post-body DSM's =
(- (re-submatch:tsm re) ; total submatches
(+ 1 ; minus *this* submatch
(re-submatch:pre-dsm re) ; minus pre-body dead submatches
(re-tsm (re-submatch:body re))))); minus body's submatches.
(define (make-re-submatch body . maybe-pre+post-dsm)
(let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
(%make-re-submatch body pre-dsm (+ pre-dsm 1 (re-tsm body) post-dsm))))
;;; Slightly smart submatch constructor
;;; - DSM's unpacked
;;; - If BODY is the re-empty, we'll never match, so just produce a DSM.
(define (re-submatch body . maybe-pre+post-dsm)
(let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
(let ((tsm (+ 1 pre-dsm (re-tsm body) post-dsm)))
(receive (body1 pre-dsm1) (open-dsm body)
(if (re-empty? body1)
(re-dsm re-empty tsm 0)
(%make-re-submatch body1 (+ pre-dsm pre-dsm1) tsm))))))
;;; Other regexps : string, char-set, bos & eos
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Also, re-empty and re-trivial.
(define-record re-string
chars ; String
(posix #f) ; Posix record
((disclose self) (list "re-string" (re-string:chars self))))
(define re-string make-re-string) ; For consistency w/other re makers.
;;; This is only used in code that (RE ...) macro produces for static regexps.
(define (make-re-string/posix chars posix-str tvec)
(let ((re (make-re-string chars)))
(set-re-string:posix re (new-cre posix-str tvec))
re))
;;; Matches the empty string anywhere.
(define re-trivial (make-re-string/posix "" "" '#()))
(define (re-trivial? re)
(and (re-string? re) (zero? (string-length (re-string:chars re)))))
(define-record re-char-set
cset ; A character set (Macro expander abuses.)
(posix #f)) ; Posix record
(define re-char-set make-re-char-set) ; For consistency w/other re makers.
;;; This is only used in code that (RE ...) macro produces for static regexps.
(define (make-re-char-set/posix cs posix-str tvec)
(let ((re (make-re-char-set cs)))
(set-re-char-set:posix re (new-cre posix-str tvec))
re))
;;; Never matches
;;; NEED TO OPTIMIZE - PRE-SET POSIX FIELD.
(define re-empty (make-re-char-set char-set:empty))
(define (re-empty? re)
(and (re-char-set? re)
(let ((cs (re-char-set:cset re)))
(and (char-set? cs) ; Might be code...
(char-set-empty? cs)))))
(define-record re-bos) (define re-bos (make-re-bos))
(define-record re-eos) (define re-eos (make-re-eos))
(define-record re-bol) (define re-bol (make-re-bol))
(define-record re-eol) (define re-eol (make-re-eol))
(define-record re-bow) (define re-bow (make-re-bow))
(define-record re-eow) (define re-eow (make-re-eow))
(define re-any (make-re-char-set/posix char-set:full "." '#()))
(define (re-any? re)
(and (re-char-set? re)
(let ((cs (re-char-set:cset re)))
(and (char-set? cs) ; Might be code...
(char-set-full? cs)))))
(define re-nonl
(make-re-char-set/posix (char-set-invert (char-set #\newline))
"[^\n]"
'#()))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (regexp? x)
(or (re-seq? x) (re-choice? x) (re-repeat? x)
(re-char-set? x) (re-string? x)
(re-bos? x) (re-eos? x)
(re-bol? x) (re-eol? x)
(re-bow? x) (re-eow? x)
(re-submatch? x) (re-dsm? x)))
;;; Return the total number of submatches bound in RE.
(define (re-tsm re)
(? ((re-seq? re) (re-seq:tsm re))
((re-choice? re) (re-choice:tsm re))
((re-repeat? re) (re-repeat:tsm re))
((re-dsm? re) (re-dsm:tsm re))
((re-submatch? re) (re-submatch:tsm re))
(else 0)))
(define re-word
(let ((wcs (char-set-union char-set:alphanumeric ; Word chars
(char-set #\_))))
(make-re-seq (list re-bow
(make-re-repeat 1 #f (make-re-char-set wcs))
re-eow))))
;;; (flush-submatches re)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Return regular expression RE with all submatch-binding elements
;;; stripped out -- (= 0 (re-tsm (flush-submatches re))).
(define (flush-submatches re)
(? ((zero? (re-tsm re)) re) ; RE has no submatches.
((re-seq? re) (re-seq (map flush-submatches (re-seq:elts re))))
((re-choice? re) (re-choice (map flush-submatches (re-choice:elts re))))
((re-repeat? re) (re-repeat (re-repeat:from re)
(re-repeat:to re)
(flush-submatches (re-repeat:body re))))
((re-submatch? re) (flush-submatches (re-submatch:body re)))
((re-dsm? re) (flush-submatches (re-dsm:body re)))
(else re)))
;;; Map F over ELTS. (F x) returns two values -- the "real" return value,
;;; and a "changed?" flag. If CHANGED? is false, then the "real" return value
;;; should be identical to the original argument X. MAP/CHANGED constructs
;;; the mapped list sharing as long an unchanged tail as possible with the
;;; list ELTS; if F changes no argument, MAP/CHANGED returns exactly the list
;;; ELTS. MAP/CHANGED returns two values: the mapped list, and a changed?
;;; flag for the entire list.
(define (map/changed f elts)
(let recur ((elts elts))
(if (pair? elts)
(let ((elt (car elts)))
(receive (new-elts elts-changed?) (recur (cdr elts))
(receive (new-elt elt-changed?) (f elt)
(if (or elts-changed? elt-changed?)
(values (cons new-elt new-elts) #t)
(values elts #f)))))
(values '() #f))))
(define (uncase re)
(receive (new-re changed?)
(let recur ((re re))
(? ((re-seq? re)
(let ((elts (re-seq:elts re)))
(receive (new-elts elts-changed?)
(map/changed recur elts)
(if elts-changed?
(values (%make-re-seq new-elts (re-seq:tsm re)) #t)
(values re #f)))))
((re-choice? re)
(let ((elts (re-choice:elts re)))
(receive (new-elts elts-changed?)
(map/changed recur elts)
(if elts-changed?
(values (re-choice new-elts) #t)
(values re #f)))))
((re-char-set? re)
(let* ((cs (re-char-set:cset re))
(new-cs (uncase-char-set cs))) ; Better not be code.
(if (char-set= cs new-cs)
(values re #f)
(values (make-re-char-set new-cs) #t))))
((re-repeat? re)
(receive (new-body body-changed?) (recur (re-repeat:body re))
(if body-changed?
(values (re-repeat (re-repeat:from re)
(re-repeat:to re)
new-body)
#t)
(values re #f))))
((re-submatch? re)
(receive (new-body body-changed?) (recur (re-submatch? re))
(if body-changed?
(values (%make-re-submatch new-body
(re-submatch:pre-dsm re)
(re-submatch:tsm re))
#t)
(values re #f))))
((re-string? re)
(let ((cf-re (uncase-string (re-string:chars re))))
(if (re-string? cf-re)
(values re #f)
(values cf-re #t))))
(else (values re #f))))
new-re))
;;; (uncase-char-set cs)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Return a char-set cs' such that cs' contains every char c in cs in both
;;; its upcase and downcase form.
(define (uncase-char-set cs)
(char-set-fold (lambda (c new-cset)
(char-set-adjoin! new-cset
(char-downcase c)
(char-upcase c)))
(char-set-copy char-set:empty)
cs))
;;; I actually make an effort to keep this a re-string
;;; if possible (if the string contains no case-sensitive
;;; characters). Returns a regexp matching the string in
;;; a case-insensitive fashion.
(define (uncase-string s)
;; SEQ is a list of chars and doubleton char-sets.
(let* ((seq (string-fold-right (lambda (c lis)
(cons (? ((char-lower-case? c) (char-set c (char-upcase c)))
((char-upper-case? c) (char-set c (char-downcase c)))
(else c))
lis))
'() s))
;; Coalesce adjacent chars together into a string.
(fixup (lambda (chars seq)
(if (pair? chars)
(cons (make-re-string (list->string (reverse chars)))
seq)
seq)))
(new-seq (let recur ((seq seq) (chars '()))
(if (pair? seq)
(let ((elt (car seq))
(seq (cdr seq)))
(if (char? elt)
(recur seq (cons elt chars))
(fixup chars (cons (make-re-char-set elt)
(recur seq '())))))
(fixup chars '())))))
(if (= 1 (length new-seq)) (car new-seq)
(make-re-seq new-seq))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define char-set-full?
(let ((allchars-nchars (char-set-size char-set:full)))
(lambda (cs) (= allchars-nchars (char-set-size cs)))))
(define (char-set-empty? cs) (zero? (char-set-size cs)))
;;; A "char-class" re is either a char-set re or a string re whose string
;;; has only one character.
(define (re-char-class? re)
(or (re-char-set? re)
(and (re-string? re)
(= 1 (string-length (re-string:chars re))))))
(define (static-char-class? re)
(or (and (re-char-set? re)
(char-set? (re-char-set:cset re))) ; This might be code.
(and (re-string? re) ; But never this, so no check.
(= 1 (string-length (re-string:chars re))))))