scsh-0.5/scsh/rx/posixstr.scm

624 lines
22 KiB
Scheme

;;; Regexp-ADT -> Posix-string translator.
;;; Olin Shivers January 1997, May 1998.
;;; - If the regexp value contains nul character constants, or character sets
;;; that contain the nul character, they will show up in the Posix string
;;; we produce. Spencer's C regexp engine can handle regexp strings that
;;; contain nul bytes, but this might blow up other implementations -- that
;;; is, the nul byte might prematurely terminate the C string passed to the
;;; regexp engine.
;;;
;;; - The code is ASCII-specific in only one place: the expression for
;;; a regexp that matches nothing is the 6-char pattern "[^\000-\177]",
;;; which assumes a 7-bit character code. Note that the static simplifier
;;; can remove *all* occurences of this "empty regexp" except for the
;;; un-simplifiable case of a single, top-level empty regexp, e.g.
;;; (rx (in))
;;; We can handle this one special case specially, so we shouldn't *ever*
;;; have to produce this ASCII-specific pattern.
;;; Exports: regexp->posix-string
;;; Todo: A dumb, simple char-set renderer.
;;; These functions translate static regular expressions into Posix regexp
;;; strings. They generally return four values:
;;; - string (regexp)
;;;
;;; - syntax level: 0 parenthesized exp, 1 piece, 2 branch, 3 top
;;; ("piece", "branch" and "top" are Spencer's terms):
;;; + A parenthesized exp is syntactically equivalent to a piece.
;;; (But it's useful to know when an exp is parenthesized for
;;; eliminating redundant submatch-generated parens.)
;;; + A piece is something that would bind to a following *
;;; ("a" but not "aa").
;;; + A branch is a sequence of pieces -- something that would bind to a |
;;; ("ab*d" but not "ab*|d"). That is, a branch is not allowed to contain
;;; top-level |'s.
;;; + Top is for a sequence of branches -- "a|b*c|d".
;;;
;;; - paren count in the returned string.
;;;
;;; [This is a newer description; is it correct?]
;;; - A vector mapping submatches (vector index 0 is submatch 1)
;;; to the paren for that submatch (the first paren is paren #1).
;;;
;;; [This is my original description.]
;;; - Vector of parens numbers used for submatching. The first paren is
;;; numbered 1. #F means a dead submatch -- one we can tell statically
;;; will never match anything.
;;; Non-R4RS imports:
;;; ? = COND
;;; Multiple-value return: VALUES RECEIVE CALL-WITH-VALUES
;;; SORT-LIST
;;; Useful little utility -- pad vector V with
;;; PRE initial and POST following #f's.
(define (pad-vector pre post v)
(if (= pre post 0) v
(let* ((vlen (vector-length v))
(alen (+ pre post vlen))
(ans (make-vector alen #f)))
(do ((from (- vlen 1) (- from 1))
(to (+ pre vlen -1) (- to 1)))
((< from 0))
(vector-set! ans to (vector-ref v from)))
ans)))
(define (n-falses n) (make-vector n #f))
;;; There's no representation for regexps that never match anything (e.g.,
;;; (|)) in strict Posix notation. When we get one of these, we treat it
;;; specially, producing [#f #f #f #f].
;;;
;;; We can always detect these empty regexps, because they always simplify
;;; to one of these two values:
;;; - (make-re-char-set char-set:empty)
;;; - (dsm m n (make-re-char-set char-set:empty))
(define (simple-empty-re? re)
(or (and (re-char-set? re)
(char-set-empty? (re-char-set:cset re)))
(and (re-dsm? re)
(simple-empty-re? (re-dsm:body re)))))
;;; Top-level
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (regexp->posix-string re)
;; We *must* simplify, to guarantee correct translation.
(let ((re (simplify-regexp re)))
(if (simple-empty-re? re) (values #f #f #f #f)
(translate-regexp re))))
(define (translate-regexp re)
(? ((re-string? re) (translate-string (re-string:chars re)))
((re-repeat? re) (translate-repeat re))
((re-choice? re) (translate-choice re))
((re-seq? re) (translate-seq re))
((re-char-set? re) (translate-char-set (re-char-set:cset re)))
((re-submatch? re) (translate-submatch re))
((re-bos? re) (values "^" 1 0 '#()))
((re-eos? re) (values "$" 1 0 '#()))
((re-bol? re) (error "Beginning-of-line regexp not supported in this implementation."))
((re-eol? re) (error "End-of-line regexp not supported in this implementation."))
((re-bow? re) (values "[[:<:]]" 1 0 '#())) ; These two are
((re-eow? re) (values "[[:>:]]" 1 0 '#())) ; Spencer-specific.
((re-dsm? re) (let ((pre-dsm (re-dsm:pre-dsm re))
(body (re-dsm:body re)))
(translate-dsm body pre-dsm
(- (re-dsm:tsm re)
(+ pre-dsm (re-tsm body))))))
(else (error "Illegal regular expression" re))))
;;; Translate reloc-elt ELT = (N . RE) from a sequence or choice
;;; into a Posix string.
;;; - Relocate the submatch indices by PREV-PCOUNT.
;;; (That is, assume rendering preceding elts used PREV-PCOUNT parens.)
;;; - Assume preceding elements allocated PREV-SMCOUNT submatches
;;; (we may have to pad our returned submatches string with some
;;; initial #F's to account for dead submatches PREV-SMCOUNT through N.)
;;; - If SUB-LEV3? is true, the result string is guaranteed to be < level 3.
;;; This is used by the & and | translators.
;;; - Returns the usual 4 values plus the final submatch count including
;;; this regexp.
(define (translate-elt elt prev-pcount prev-smcount sub-lev3?)
(let ((offset (car elt))
(re (cdr elt)))
(receive (s level pcount submatches) (translate-regexp re)
;; Relocate submatch indices by OFFSET and force level <3, if needed:
(receive (s level pcount submatches)
(if (and sub-lev3? (= level 3))
(values (string-append "(" s ")")
0
(+ pcount 1)
(mapv (lambda (sm) (and sm (+ prev-pcount 1 sm)))
submatches))
(values s level pcount
(mapv (lambda (sm) (and sm (+ prev-pcount sm)))
submatches)))
;; Tack onto submatches as many initial #F's as needed to bump
;; the previous submatches count from PREV-SMCOUNT to OFFSET.
(values s level pcount
(pad-vector (- offset prev-smcount) 0 submatches)
(+ offset (re-tsm re)))))))
;;; Force the string to be level < 3 by parenthesizing it if necessary.
(define (paren-if-necessary s lev pcount submatches)
(if (< lev 3)
(values s lev pcount submatches)
(values (string-append "(" s ")")
0
(+ pcount 1)
(mapv (lambda (sm) (and sm (+ 1 sm)))
submatches))))
;;; (: re1 ... ren)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (translate-seq re)
(let ((elts (re-seq:elts re))
(tsm (re-seq:tsm re)))
(let recur ((elts elts) (prev-pcount 0) (prev-smcount 0))
;; Render a sequence tail ELTS, assuming the previous elements translated
;; to a string with PREV-PCOUNT parens, and allocated PREV-SMCOUNT
;; submatches.
(if (pair? elts)
(let* ((elt (car elts))
(elts (cdr elts)))
(receive (s1 level1 pcount1 submatches1)
(translate-regexp elt)
(receive (s1 level1 pcount1 submatches1)
(paren-if-necessary s1 level1 pcount1 submatches1)
(receive (s level pcount submatches)
(recur elts
(+ pcount1 prev-pcount)
(+ prev-smcount (re-tsm elt)))
(values (string-append s1 s)
2
(+ pcount1 pcount)
(vector-append (mapv (lambda (p) (and p (+ p prev-pcount)))
submatches1)
submatches))))))
(values "" 2 0 '#()))))) ; Empty seq
;;; (| re1 ... ren)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (translate-choice re)
(let ((elts (re-choice:elts re))
(tsm (re-choice:tsm re)))
(if (pair? elts)
(let recur ((elts elts) (prev-pcount 0) (prev-smcount 0))
;; ELTS is a non-empty choice tail. Render it, assuming the
;; previous elements translated to a string with PREV-PCOUNT parens,
;; and allocated PREV-SMCOUNT submatches.
(let ((elt (car elts)) (tail (cdr elts)))
(receive (s1 level1 pcount1 submatches1) (translate-regexp elt)
(let ((submatches1 (mapv (lambda (sm) (and sm (+ sm prev-pcount)))
submatches1)))
(if (pair? tail)
(receive (s level pcount submatches)
(recur tail
(+ pcount1 prev-pcount)
(+ prev-smcount (re-tsm elt)))
(values (string-append s1 "|" s) 3
(+ pcount1 pcount)
(vector-append submatches1 submatches)))
(values s1 level1 pcount1 submatches1))))))
(values "[^\000-\377]" 1 0 (n-falses tsm))))) ; Empty choice.
;;; Repeated cases: * + ? and {n,m} ranges.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (translate-repeat re)
(let ((from (re-repeat:from re))
(to (re-repeat:to re))
(body (re-repeat:body re))
(tsm (re-repeat:tsm re)))
(? ((and to (> from to)) ; Unsatisfiable
(values "[^\000-\377]" 1 0 (n-falses tsm)))
((and to (= from to 1)) (translate-seq body)) ; RE{1,1} => RE
((and to (= to 0)) ; RE{0,0} => ""
(values "" 2 0 (n-falses tsm)))
(else ; General case
(receive (s level pcount submatches) (translate-regexp body)
(receive (s level pcount submatches) ; Coerce S to level <2.
(if (> level 1)
(values (string-append "(" s ")")
0
(+ pcount 1)
(mapv (lambda (i) (and i (+ i 1))) submatches))
(values s level pcount submatches))
(values (if to
(? ((and (= from 0) (= to 1)) (string-append s "?"))
((= from to)
(string-append s "{" (number->string to) "}"))
(else
(string-append s "{" (number->string from)
"," (number->string to) "}")))
(? ((= from 0) (string-append s "*"))
((= from 1) (string-append s "+"))
(else (string-append s "{" (number->string from) ",}"))))
1 pcount submatches)))))))
;;; Submatch
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (translate-submatch re)
(let ((body (re-submatch:body re))
(pre-dsm (re-submatch:pre-dsm re)))
;; Translate the body, along with any leading or trailing dead submatches.
(receive (s level pcount submatches)
(translate-dsm body
pre-dsm
(- (re-submatch:tsm re)
(+ 1 pre-dsm (re-tsm body))))
;; If the whole expression isn't already wrapped in a paren, wrap it.
;; This outer paren becomes the new submatch -- add to submatches list.
(if (= level 0)
(values s 0 pcount (vector-append '#(1) submatches))
(values (string-append "(" s ")")
0
(+ pcount 1)
(mapv! (lambda (i) (and i (+ i 1))) ; Excuse me.
(vector-append '#(0) submatches)))))))
;;; Translating DSM
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Translate the body, and paste enough #F's before and after the submatches
;;; list to account for extra dead submatches.
(define (translate-dsm body pre-dsm post-dsm)
(receive (s level pcount submatches) (translate-regexp body)
(values s level pcount (pad-vector pre-dsm post-dsm submatches))))
;;; Constant regexps
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Convert a string into a regexp pattern that matches that string exactly --
;;; quote the special chars with backslashes.
(define translate-string
(let ((specials (string->char-set "[.*?()|\\$^+")))
(lambda (s)
(let ((len (string-length s)))
(if (zero? len)
(values "()" 0 1 '#()) ; Special case ""
(let* ((len2 (string-fold (lambda (c len) ; Length of answer str
(+ len (if (char-set-contains? specials c) 2 1)))
0 s))
(s2 (make-string len2))) ; Answer string
;; Copy the chars over to S2.
(string-fold (lambda (c i)
;; Write char C at index I, return the next index.
(let ((i (cond ((char-set-contains? specials c)
(string-set! s2 i #\\)
(+ i 1))
(else i))))
(string-set! s2 i c)
(+ i 1)))
0 s)
(values s2 (if (= len 1) 1 2)
0 '#())))))))
;;; Translating char-sets to [...] strings
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; This is the nastiest code in the system. We make an effort to return
;;; succinct encodings of the char-sets, in the event these encodings are
;;; being shown to humans.
;;; - A singleton set is rendered as that char.
;;; - A full set is rendered as "."
;;; - An empty set is rendered as [^\000-\177].
;;; - Otherwise, render it both as a [...] and as a [^...] spec, and
;;; take whichever is shortest.
;;; Take a char set, and return the standard
;;; [regexp-string, level, pcount, submatches]
;;; quadruple.
;;;
(define (translate-char-set cset)
(if (char-set-full? cset) (values "." 1 0 '#()) ; Full set
(let ((nchars (char-set-size cset))
(->bracket-string (lambda (cset in?)
(receive (loose ranges) (char-set->in-pair cset)
(hack-bracket-spec loose ranges in?)))))
(? ((= 0 nchars) (values "[^\000-\177]" 1 0 '#())) ; Empty set
((= 1 nchars) ; Singleton set
(translate-string (string (car (char-set->list cset)))))
;; General case. Try both [...] and [^...].
(else (let ((s- (->bracket-string cset #t))
(s+ (->bracket-string (char-set-complement cset) #f)))
(values (if (< (string-length s-) (string-length s+))
s- s+)
1 0 '#())))))))
;;; Commentary
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Hacking special chars in character-class strings:
;;; ] - ^ ]...^-
;;; ] - ]...-
;;; ] ^ ]...^
;;; ] ]...
;;; - ^ ...^- (or doubleton screw-case)
;;; - ...-
;;; ^ ...^ (or singleton screw-case)
;;;
;;; Two screw cases:
;;; "^-" must be converted to "-^" for IN.
;;; "^" must be converted to non-class "^" for IN.
;;; Rendering a general char-set into a correct Posix [...] bracket expression
;;; is a complete mess.
;;;
;;; The rules on bracket expressions:
;;; - ] terminates the exp unless it is the first char
;;; (after an optional leading ^).
;;; - .*[\ are not special in bracket expressions.
;;; - However, [. [= and [: *are* special, so you can't follow an
;;; open bracket by one of .=: -- argh. See below.
;;; - ^ isn't special unless it's the first char.
;;; - - is special unless it's first (after an optional ^), last,
;;; or as the ending char in a range (e.g., a--).
;;; This means:
;;; - You must ensure that ] doesn't begin or terminate a range.
;;; - You must ensure that .=: don't follow [
;;; + This can happen in the loose char list;
;;; + This can happen in the range list -- consider the pair of
;;; ranges "x-[.-%" Handle this by prohibiting [ as a range-terminator.
;;; + It can happen at the loose/range boundary: %[:-?
;;; First, run-length encode the set into loose and range-pairs.
;;; If the set is a singleton set, then punt the whole [...] effort,
;;; and do it as a simple char.
;;; Repeat until stable:
;;; - Sort the ranges in this order:
;;; 1. other ranges;
;;; 2. ranges that begin with ^ (not priority)
;;; 3. ranges that begin with .=: (priority)
;;; 4. ranges that end with [ (priority)
;;; This eliminates [. [= [: problems in the ranges, and
;;; minimises the chances of the problem at the loose/range boundary.
;;; and problems with initial ^ chars.
;;; - Sort the loose chars so that ] is first, then -, then .=:, then [,
;;; then others, then ^. This eliminates [. [= [: problems in the loose
;;; chars, and minimises the chances of the problem at the loose/range
;;; boundary.
;;; - Shrink ranges by moving an opening or closing range char into the
;;; loose-char set:
;;; + If ] opens or closes a range, shrink it out.
;;; + If any range opens with -, shrink it out.
;;; + If the first range opens with .=:, and the last loose char is [,
;;; shrink it out.
;;; + If there are no loose chars, the first range begins with ^, and
;;; we're doing an IN range, shrink out the ^.
;;; + Shrinking a range down to <3 chars means move it's elts into the
;;; loose char set.
;;; - If both [ and - are in the loose char set,
;;; pull - out as special end-hypen.
;;; Finally, we have to hack things so that ^ doesn't begin an IN sequence.
;;; - If it's a NOT-IN sequence, no worries.
;;; - If ^ is the opening loose char, then it's the only loose char.
;;; If there are ranges, move it to the end of the string.
;;; If there are no ranges, then just punt the char-class and convert
;;; it to a singleton ^. In fact, do this up-front, for any singleton
;;; set.
;;;
;;; If the special end-hyphen flag is set, add - to the end of the string.
;;; This general approach -- starting out with maximal ranges, and then
;;; shrinking them to avoid other syntax violations -- has the advantage
;;; of not relying on the details of the ASCII encodings.
;;; Ordering ranges:
;;; 1. other ranges (ordered by start char)
;;; 2. ranges that begin with ^ (not priority)
;;; 3. ranges that begin with .=:
;;; 4. ranges that end with [ (priority over #2 & #3)
(define (range< r1 r2)
(let ((r1-start (car r1)) (r1-end (cdr r1))
(r2-start (car r2)) (r2-end (cdr r2)))
(or (char=? r2-end #\[) ; Range ending with [ comes last.
(and (not (char=? r1-end #\[))
;; Range begin with one of .=: comes next-to-last
(or (char=? r2-start #\.) (char=? r2-start #\=) (char=? r2-start #\:)
(and (not (char=? r1-start #\.))
(not (char=? r1-start #\=))
(not (char=? r1-start #\:))
;; Range beginning with ^ comes before that.
(or (char=? r1-start #\^)
(and (not (char=? r2-start #\^))
;; Other ranges are ordered by start char.
(< (char->ascii r1-start)
(char->ascii r2-start))))))))))
;;; Order loose chars:
;;; ] is first,
;;; - is next,
;;; .=: are next,
;;; [ is next,
;;; then others (ordered by ascii val)
;;; ^ is last.
(define (loose<= c1 c2)
(or (char=? c1 #\]) ; ] is first,
(and (not (char=? c2 #\]))
(or (char=? c1 #\-) ; - is next,
(and (not (char=? c2 #\-))
;; .=: are next,
(or (char=? c1 #\.) (char=? c1 #\=) (char=? c1 #\:)
(and (not (char=? c2 #\.))
(not (char=? c2 #\=))
(not (char=? c2 #\:))
(or (char=? c1 #\[) ; [ is next,
(and (not (char=? c2 #\[))
(or (char=? c2 #\^) ; ^ is last,
(and (not (char=? c1 #\^))
;; other chars by ASCII.
(<= (char->ascii c1)
(char->ascii c2)))))))))))))
;;; Returns (1) a list of 0-3 loose chars, (2) a list of 0 or 1 ranges.
(define (shrink-range-start r)
(let ((start (char->ascii (car r)))
(end (char->ascii (cdr r))))
(shrink-range-finish-up start (+ start 1) end)))
(define (shrink-range-end r)
(let ((start (char->ascii (car r)))
(end (char->ascii (cdr r))))
(shrink-range-finish-up end start (- end 1))))
(define (shrink-range-finish-up c start end)
(? ((> start end) (values (list (ascii->char c)) '())) ; Empty range
((= start end) ; Collapse singleton range.
(values (list (ascii->char c) (ascii->char start))
'()))
((= (+ start 1) end) ; Collapse doubleton range.
(values (list (ascii->char c) (ascii->char start) (ascii->char end))
'()))
(else (values (list (ascii->char c))
(list (cons (ascii->char start) (ascii->char end)))))))
;;; We assume the bracket-spec is not a singleton, not empty, and not complete.
;;; (These cases get rendered as the letter, [^\000-\177], and ".",
;;; respectively.) We assume the loose chars and the ranges are all disjoint.
(define (hack-bracket-spec loose ranges in?)
(let lp ((loose0 loose) (ranges0 ranges) (end-hyphen? #f))
;; Repeat until stable:
(let ((loose (sort-list loose0 loose<=)) ; Sort loose chars and ranges.
(ranges (sort-list ranges0 range<)))
;; If ] opens or closes a range, shrink it out.
;; If - opens a range, shrink it out.
(receive (loose ranges)
(let recur ((ranges ranges))
(if (pair? ranges)
(let* ((range (car ranges))
(start (car range))
(end (cdr range))
(ranges (cdr ranges)))
(receive (new-loose new-ranges) (recur ranges)
(receive (new-loose0 new-ranges0)
(? ((char=? #\] start)
(shrink-range-start range))
((char=? #\] end)
(shrink-range-end range))
((char=? #\- start)
(shrink-range-start range))
(else (values '() (list range))))
(values (append new-loose0 new-loose)
(append new-ranges0 new-ranges)))))
(values loose '())))
(? ((or (not (equal? loose0 loose)) ; Loop if anything changed.
(not (equal? ranges0 ranges)))
(lp loose ranges end-hyphen?))
;; If the first range opens with .=:, and the last loose char is [,
;; shrink it out & loop.
((and (pair? ranges)
(memv (caar ranges) '(#\. #\= #\:))
(pair? loose)
(char=? #\[ (car (reverse loose))))
(receive (new-loose new-ranges)
(shrink-range-start (car ranges))
(lp (append new-loose loose) (append new-ranges (cdr ranges)) end-hyphen?)))
;; If there are no loose chars, the first range begins with ^, and
;; we're doing an IN range, shrink out the ^.
((and in? (null? loose) (pair? ranges) (char=? #\^ (caar ranges)))
(receive (new-loose new-ranges) (shrink-range-start (car ranges))
(lp (append new-loose loose) (append new-ranges ranges) end-hyphen?)))
;; If both [ and - are in the loose char set,
;; pull - out as special end-hypen.
((and (pair? loose)
(pair? (cdr loose))
(char=? (car loose) #\[)
(char=? (car loose) #\-))
(lp (cons (car loose) (cddr loose)) ranges #t))
;; No change! Build the answer...
(else (string-append (if in? "[" "[^")
(list->string loose)
(apply string-append
(map (lambda (r) (string (car r) #\- (cdr r)))
ranges))
"]")))))))