1999-07-05 23:45:37 -04:00
|
|
|
;;; Regexp-ADT -> Posix-string translator.
|
|
|
|
;;; Olin Shivers January 1997, May 1998.
|
|
|
|
|
|
|
|
;;; - If the regexp value contains nul character constants, or character sets
|
|
|
|
;;; that contain the nul character, they will show up in the Posix string
|
|
|
|
;;; we produce. Spencer's C regexp engine can handle regexp strings that
|
|
|
|
;;; contain nul bytes, but this might blow up other implementations -- that
|
|
|
|
;;; is, the nul byte might prematurely terminate the C string passed to the
|
|
|
|
;;; regexp engine.
|
|
|
|
;;;
|
|
|
|
;;; - The code is ASCII-specific in only one place: the expression for
|
|
|
|
;;; a regexp that matches nothing is the 6-char pattern "[^\000-\177]",
|
|
|
|
;;; which assumes a 7-bit character code. Note that the static simplifier
|
|
|
|
;;; can remove *all* occurences of this "empty regexp" except for the
|
|
|
|
;;; un-simplifiable case of a single, top-level empty regexp, e.g.
|
|
|
|
;;; (rx (in))
|
|
|
|
;;; We can handle this one special case specially, so we shouldn't *ever*
|
|
|
|
;;; have to produce this ASCII-specific pattern.
|
|
|
|
|
|
|
|
;;; Exports: regexp->posix-string
|
|
|
|
|
|
|
|
;;; Todo: A dumb, simple char-set renderer.
|
|
|
|
|
|
|
|
;;; These functions translate static regular expressions into Posix regexp
|
|
|
|
;;; strings. They generally return four values:
|
|
|
|
;;; - string (regexp)
|
|
|
|
;;;
|
|
|
|
;;; - syntax level: 0 parenthesized exp, 1 piece, 2 branch, 3 top
|
|
|
|
;;; ("piece", "branch" and "top" are Spencer's terms):
|
|
|
|
;;; + A parenthesized exp is syntactically equivalent to a piece.
|
|
|
|
;;; (But it's useful to know when an exp is parenthesized for
|
|
|
|
;;; eliminating redundant submatch-generated parens.)
|
|
|
|
;;; + A piece is something that would bind to a following *
|
|
|
|
;;; ("a" but not "aa").
|
|
|
|
;;; + A branch is a sequence of pieces -- something that would bind to a |
|
|
|
|
;;; ("ab*d" but not "ab*|d"). That is, a branch is not allowed to contain
|
|
|
|
;;; top-level |'s.
|
|
|
|
;;; + Top is for a sequence of branches -- "a|b*c|d".
|
|
|
|
;;;
|
|
|
|
;;; - paren count in the returned string.
|
|
|
|
;;;
|
|
|
|
;;; - Vector of parens numbers used for submatching. The first paren is
|
|
|
|
;;; numbered 1. #F means a dead submatch -- one we can tell statically
|
|
|
|
;;; will never match anything.
|
|
|
|
|
|
|
|
;;; Non-R4RS imports:
|
|
|
|
;;; ? = COND
|
|
|
|
;;; Multiple-value return: VALUES RECEIVE CALL-WITH-VALUES
|
|
|
|
;;; SORT-LIST
|
|
|
|
|
|
|
|
|
|
|
|
;;; Useful little utility -- pad vector V with
|
|
|
|
;;; PRE initial and POST following #f's.
|
|
|
|
|
|
|
|
(define (pad-vector pre post v)
|
|
|
|
(if (= pre post 0) v
|
|
|
|
(let* ((vlen (vector-length v))
|
|
|
|
(alen (+ pre post vlen))
|
|
|
|
(ans (make-vector alen #f)))
|
|
|
|
(do ((from (- vlen 1) (- from 1))
|
|
|
|
(to (+ pre vlen -1) (- to 1)))
|
|
|
|
((< from 0))
|
|
|
|
(vector-set! ans to (vector-ref v from)))
|
|
|
|
ans)))
|
|
|
|
|
|
|
|
(define (n-falses n) (make-vector n #f))
|
|
|
|
|
|
|
|
|
|
|
|
;;; There's no representation for regexps that never match anything (e.g.,
|
|
|
|
;;; (|)) in strict Posix notation. When we get one of these, we treat it
|
|
|
|
;;; specially, producing [#f #f #f #f].
|
|
|
|
;;;
|
|
|
|
;;; We can always detect these empty regexps, because they always simplify
|
|
|
|
;;; to one of these two values:
|
|
|
|
;;; - (make-re-char-set char-set:empty)
|
|
|
|
;;; - (dsm m n (make-re-char-set char-set:empty))
|
|
|
|
|
|
|
|
(define (simple-empty-re? re)
|
|
|
|
(or (and (re-char-set? re)
|
|
|
|
(char-set-empty? (re-char-set:cset re)))
|
|
|
|
(and (re-dsm? re)
|
|
|
|
(simple-empty-re? (re-dsm:body re)))))
|
|
|
|
|
|
|
|
|
|
|
|
;;; Top-level
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
(define (regexp->posix-string re)
|
|
|
|
;; We *must* simplify, to guarantee correct translation.
|
|
|
|
(let ((re (simplify-regexp re)))
|
|
|
|
(if (simple-empty-re? re) (values #f #f #f #f)
|
|
|
|
(translate-regexp re))))
|
|
|
|
|
|
|
|
|
|
|
|
(define (translate-regexp re)
|
|
|
|
(? ((re-string? re) (translate-string (re-string:chars re)))
|
|
|
|
|
|
|
|
((re-repeat? re) (translate-repeat re))
|
|
|
|
((re-choice? re) (translate-choice re))
|
|
|
|
((re-seq? re) (translate-seq re))
|
|
|
|
((re-char-set? re) (translate-char-set (re-char-set:cset re)))
|
|
|
|
|
|
|
|
((re-submatch? re) (translate-submatch re))
|
|
|
|
|
|
|
|
((re-bos? re) (values "^" 1 0 '#()))
|
|
|
|
((re-eos? re) (values "$" 1 0 '#()))
|
|
|
|
|
|
|
|
((re-bol? re) (error "Beginning-of-line regexp not supported in this implementation."))
|
|
|
|
((re-eol? re) (error "End-of-line regexp not supported in this implementation."))
|
|
|
|
|
|
|
|
((re-bow? re) (values "[[:<:]]" 1 0 '#())) ; These two are
|
|
|
|
((re-eow? re) (values "[[:>:]]" 1 0 '#())) ; Spencer-specific.
|
|
|
|
|
|
|
|
((re-dsm? re) (let ((pre-dsm (re-dsm:pre-dsm re))
|
|
|
|
(body (re-dsm:body re)))
|
|
|
|
(translate-dsm body pre-dsm
|
|
|
|
(- (re-dsm:tsm re)
|
|
|
|
(+ pre-dsm (re-tsm body))))))
|
|
|
|
|
|
|
|
(else (error "Illegal regular expression" re))))
|
|
|
|
|
|
|
|
|
|
|
|
;;; Translate reloc-elt ELT = (N . RE) from a sequence or choice
|
|
|
|
;;; into a Posix string.
|
|
|
|
;;; - Relocate the submatch indices by PREV-PCOUNT.
|
|
|
|
;;; (That is, assume rendering preceding elts used PREV-PCOUNT parens.)
|
|
|
|
;;; - Assume preceding elements allocated PREV-SMCOUNT submatches
|
|
|
|
;;; (we may have to pad our returned submatches string with some
|
|
|
|
;;; initial #F's to account for dead submatches PREV-SMCOUNT through N.)
|
|
|
|
;;; - If SUB-LEV3? is true, the result string is guaranteed to be < level 3.
|
|
|
|
;;; This is used by the & and | translators.
|
|
|
|
;;; - Returns the usual 4 values plus the final submatch count including
|
|
|
|
;;; this regexp.
|
|
|
|
|
|
|
|
(define (translate-elt elt prev-pcount prev-smcount sub-lev3?)
|
|
|
|
(let ((offset (car elt))
|
|
|
|
(re (cdr elt)))
|
|
|
|
|
|
|
|
(receive (s level pcount submatches) (translate-regexp re)
|
|
|
|
|
|
|
|
;; Relocate submatch indices by OFFSET and force level <3, if needed:
|
|
|
|
(receive (s level pcount submatches)
|
|
|
|
(if (and sub-lev3? (= level 3))
|
|
|
|
(values (string-append "(" s ")")
|
|
|
|
0
|
|
|
|
(+ pcount 1)
|
|
|
|
(mapv (lambda (sm) (and sm (+ prev-pcount 1 sm)))
|
|
|
|
submatches))
|
|
|
|
(values s level pcount
|
|
|
|
(mapv (lambda (sm) (and sm (+ prev-pcount sm)))
|
|
|
|
submatches)))
|
|
|
|
|
|
|
|
;; Tack onto submatches as many initial #F's as needed to bump
|
|
|
|
;; the previous submatches count from PREV-SMCOUNT to OFFSET.
|
|
|
|
(values s level pcount
|
|
|
|
(pad-vector (- offset prev-smcount) 0 submatches)
|
|
|
|
(+ offset (re-tsm re)))))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; Force the string to be level < 3 by parenthesizing it if necessary.
|
|
|
|
|
|
|
|
(define (paren-if-necessary s lev pcount submatches)
|
|
|
|
(if (< lev 3)
|
|
|
|
(values s lev pcount submatches)
|
|
|
|
(values (string-append "(" s ")")
|
|
|
|
0
|
|
|
|
(+ pcount 1)
|
|
|
|
(mapv (lambda (sm) (and sm (+ 1 sm)))
|
|
|
|
submatches))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; (: re1 ... ren)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
(define (translate-seq re)
|
|
|
|
(let ((elts (re-seq:elts re))
|
|
|
|
(tsm (re-seq:tsm re)))
|
|
|
|
(let recur ((elts elts) (prev-pcount 0) (prev-smcount 0))
|
|
|
|
;; Render a sequence tail ELTS, assuming the previous elements translated
|
|
|
|
;; to a string with PREV-PCOUNT parens, and allocated PREV-SMCOUNT
|
|
|
|
;; submatches.
|
|
|
|
(if (pair? elts)
|
|
|
|
(let* ((elt (car elts))
|
|
|
|
(elts (cdr elts)))
|
|
|
|
|
|
|
|
(receive (s1 level1 pcount1 submatches1)
|
|
|
|
(translate-regexp elt)
|
|
|
|
|
|
|
|
(receive (s1 level1 pcount1 submatches1)
|
|
|
|
(paren-if-necessary s1 level1 pcount1 submatches1)
|
|
|
|
|
|
|
|
(receive (s level pcount submatches)
|
|
|
|
(recur elts
|
|
|
|
(+ pcount1 prev-pcount)
|
|
|
|
(+ prev-smcount (re-tsm elt)))
|
|
|
|
|
|
|
|
(values (string-append s1 s)
|
|
|
|
2
|
|
|
|
(+ pcount1 pcount)
|
|
|
|
(vector-append (mapv (lambda (sm) (+ sm prev-smcount))
|
|
|
|
submatches1)
|
|
|
|
submatches))))))
|
|
|
|
|
|
|
|
(values "" 2 0 '#()))))) ; Empty seq
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; (| re1 ... ren)
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
(define (translate-choice re)
|
|
|
|
(let ((elts (re-choice:elts re))
|
|
|
|
(tsm (re-choice:tsm re)))
|
|
|
|
(if (pair? elts)
|
|
|
|
(let recur ((elts elts) (prev-pcount 0) (prev-smcount 0))
|
|
|
|
;; ELTS is a non-empty choice tail. Render it, assuming the
|
|
|
|
;; previous elements translated to a string with PREV-PCOUNT parens,
|
|
|
|
;; and allocated PREV-SMCOUNT submatches.
|
|
|
|
(let ((elt (car elts)) (tail (cdr elts)))
|
|
|
|
(receive (s1 level1 pcount1 submatches1) (translate-regexp elt)
|
1999-09-08 11:21:40 -04:00
|
|
|
(let ((submatches1 (mapv (lambda (sm) (and sm (+ sm prev-pcount)))
|
|
|
|
submatches1)))
|
|
|
|
(if (pair? tail)
|
|
|
|
(receive (s level pcount submatches)
|
|
|
|
(recur tail
|
|
|
|
(+ pcount1 prev-pcount)
|
|
|
|
(+ prev-smcount (re-tsm elt)))
|
|
|
|
(values (string-append s1 "|" s) 3
|
|
|
|
(+ pcount1 pcount)
|
|
|
|
(vector-append submatches1 submatches)))
|
|
|
|
|
|
|
|
(values s1 level1 pcount1 submatches1))))))
|
1999-07-05 23:45:37 -04:00
|
|
|
|
|
|
|
(values "[^\000-\377]" 1 0 (n-falses tsm))))) ; Empty choice.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; Repeated cases: * + ? and {n,m} ranges.
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
(define (translate-repeat re)
|
|
|
|
(let ((from (re-repeat:from re))
|
|
|
|
(to (re-repeat:to re))
|
|
|
|
(body (re-repeat:body re))
|
|
|
|
(tsm (re-repeat:tsm re)))
|
|
|
|
|
|
|
|
(? ((and to (> from to)) ; Unsatisfiable
|
|
|
|
(values "[^\000-\377]" 1 0 (n-falses tsm)))
|
|
|
|
|
|
|
|
((and to (= from to 1)) (translate-seq body)) ; RE{1,1} => RE
|
|
|
|
|
|
|
|
((and to (= to 0)) ; RE{0,0} => ""
|
|
|
|
(values "" 2 0 (n-falses tsm)))
|
|
|
|
|
|
|
|
(else ; General case
|
|
|
|
(receive (s level pcount submatches) (translate-regexp body)
|
|
|
|
(receive (s level pcount submatches) ; Coerce S to level <2.
|
|
|
|
(if (> level 1)
|
|
|
|
(values (string-append "(" s ")")
|
|
|
|
0
|
|
|
|
(+ pcount 1)
|
|
|
|
(mapv (lambda (i) (and i (+ i 1))) submatches))
|
|
|
|
(values s level pcount submatches))
|
|
|
|
|
|
|
|
(values (if to
|
|
|
|
(? ((and (= from 0) (= to 1)) (string-append s "?"))
|
|
|
|
((= from to)
|
|
|
|
(string-append s "{" (number->string to) "}"))
|
|
|
|
(else
|
|
|
|
(string-append s "{" (number->string from)
|
|
|
|
"," (number->string to) "}")))
|
|
|
|
(? ((= from 0) (string-append s "*"))
|
|
|
|
((= from 1) (string-append s "+"))
|
|
|
|
(else (string-append s "{" (number->string from) ",}"))))
|
|
|
|
1 pcount submatches)))))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; Submatch
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
(define (translate-submatch re)
|
|
|
|
(let ((body (re-submatch:body re))
|
|
|
|
(pre-dsm (re-submatch:pre-dsm re)))
|
|
|
|
|
|
|
|
;; Translate the body, along with any leading or trailing dead submatches.
|
|
|
|
(receive (s level pcount submatches)
|
|
|
|
(translate-dsm body
|
|
|
|
pre-dsm
|
|
|
|
(- (re-submatch:tsm re)
|
|
|
|
(+ 1 pre-dsm (re-tsm body))))
|
|
|
|
|
|
|
|
;; If the whole expression isn't already wrapped in a paren, wrap it.
|
|
|
|
;; This outer paren becomes the new submatch -- add to submatches list.
|
|
|
|
(if (= level 0)
|
|
|
|
(values s 0 pcount (vector-append '#(1) submatches))
|
|
|
|
(values (string-append "(" s ")")
|
|
|
|
0
|
|
|
|
(+ pcount 1)
|
|
|
|
(mapv! (lambda (i) (and i (+ i 1))) ; Excuse me.
|
|
|
|
(vector-append '#(0) submatches)))))))
|
|
|
|
|
|
|
|
;;; Translating DSM
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;; Translate the body, and paste enough #F's before and after the submatches
|
|
|
|
;;; list to account for extra dead submatches.
|
|
|
|
|
|
|
|
(define (translate-dsm body pre-dsm post-dsm)
|
|
|
|
(receive (s level pcount submatches) (translate-regexp body)
|
|
|
|
(values s level pcount (pad-vector pre-dsm post-dsm submatches))))
|
|
|
|
|
|
|
|
;;; Constant regexps
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;; Convert a string into a regexp pattern that matches that string exactly --
|
|
|
|
;;; quote the special chars with backslashes.
|
|
|
|
|
|
|
|
(define translate-string
|
|
|
|
(let ((specials (string->char-set "[.*?()|\\$^+")))
|
|
|
|
(lambda (s)
|
|
|
|
(let ((len (string-length s)))
|
|
|
|
(if (zero? len)
|
|
|
|
(values "()" 0 1 '#()) ; Special case ""
|
|
|
|
|
1999-07-11 16:41:27 -04:00
|
|
|
(let* ((len2 (string-fold (lambda (c len) ; Length of answer str
|
|
|
|
(+ len (if (char-set-contains? specials c) 2 1)))
|
|
|
|
0 s))
|
1999-07-05 23:45:37 -04:00
|
|
|
(s2 (make-string len2))) ; Answer string
|
|
|
|
|
|
|
|
;; Copy the chars over to S2.
|
1999-07-11 16:41:27 -04:00
|
|
|
(string-fold (lambda (c i)
|
|
|
|
;; Write char C at index I, return the next index.
|
|
|
|
(let ((i (cond ((char-set-contains? specials c)
|
|
|
|
(string-set! s2 i #\\)
|
|
|
|
(+ i 1))
|
|
|
|
(else i))))
|
|
|
|
(string-set! s2 i c)
|
|
|
|
(+ i 1)))
|
|
|
|
0 s)
|
1999-07-05 23:45:37 -04:00
|
|
|
(values s2 (if (= len 1) 1 2)
|
|
|
|
0 '#())))))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; Translating char-sets to [...] strings
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
;;; This is the nastiest code in the system. We make an effort to return
|
|
|
|
;;; succinct encodings of the char-sets, in the event these encodings are
|
|
|
|
;;; being shown to humans.
|
|
|
|
;;; - A singleton set is rendered as that char.
|
|
|
|
;;; - A full set is rendered as "."
|
|
|
|
;;; - An empty set is rendered as [^\000-\177].
|
|
|
|
;;; - Otherwise, render it both as a [...] and as a [^...] spec, and
|
|
|
|
;;; take whichever is shortest.
|
|
|
|
|
|
|
|
;;; Take a char set, and return the standard
|
|
|
|
;;; [regexp-string, level, pcount, submatches]
|
|
|
|
;;; quadruple.
|
|
|
|
;;;
|
|
|
|
|
|
|
|
(define (translate-char-set cset)
|
|
|
|
(if (char-set-full? cset) (values "." 1 0 '#()) ; Full set
|
|
|
|
|
|
|
|
(let ((nchars (char-set-size cset))
|
|
|
|
(->bracket-string (lambda (cset in?)
|
|
|
|
(receive (loose ranges) (char-set->in-pair cset)
|
|
|
|
(hack-bracket-spec loose ranges in?)))))
|
|
|
|
|
|
|
|
(? ((= 0 nchars) (values "[^\000-\177]" 1 0 '#())) ; Empty set
|
|
|
|
|
|
|
|
((= 1 nchars) ; Singleton set
|
|
|
|
(translate-string (string (car (char-set-members cset)))))
|
|
|
|
|
|
|
|
;; General case. Try both [...] and [^...].
|
|
|
|
(else (let ((s- (->bracket-string cset #t))
|
|
|
|
(s+ (->bracket-string (char-set-invert cset) #f)))
|
|
|
|
(values (if (< (string-length s-) (string-length s+))
|
|
|
|
s- s+)
|
|
|
|
1 0 '#())))))))
|
|
|
|
|
|
|
|
|
|
|
|
;;; Commentary
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
;;; Hacking special chars in character-class strings:
|
|
|
|
;;; ] - ^ ]...^-
|
|
|
|
;;; ] - ]...-
|
|
|
|
;;; ] ^ ]...^
|
|
|
|
;;; ] ]...
|
|
|
|
;;; - ^ ...^- (or doubleton screw-case)
|
|
|
|
;;; - ...-
|
|
|
|
;;; ^ ...^ (or singleton screw-case)
|
|
|
|
;;;
|
|
|
|
;;; Two screw cases:
|
|
|
|
;;; "^-" must be converted to "-^" for IN.
|
|
|
|
;;; "^" must be converted to non-class "^" for IN.
|
|
|
|
|
|
|
|
;;; Rendering a general char-set into a correct Posix [...] bracket expression
|
|
|
|
;;; is a complete mess.
|
|
|
|
;;;
|
|
|
|
;;; The rules on bracket expressions:
|
|
|
|
;;; - ] terminates the exp unless it is the first char
|
|
|
|
;;; (after an optional leading ^).
|
|
|
|
;;; - .*[\ are not special in bracket expressions.
|
|
|
|
;;; - However, [. [= and [: *are* special, so you can't follow an
|
|
|
|
;;; open bracket by one of .=: -- argh. See below.
|
|
|
|
;;; - ^ isn't special unless it's the first char.
|
|
|
|
;;; - - is special unless it's first (after an optional ^), last,
|
|
|
|
;;; or as the ending char in a range (e.g., a--).
|
|
|
|
|
|
|
|
;;; This means:
|
|
|
|
;;; - You must ensure that ] doesn't begin or terminate a range.
|
|
|
|
;;; - You must ensure that .=: don't follow [
|
|
|
|
;;; + This can happen in the loose char list;
|
|
|
|
;;; + This can happen in the range list -- consider the pair of
|
|
|
|
;;; ranges "x-[.-%" Handle this by prohibiting [ as a range-terminator.
|
|
|
|
;;; + It can happen at the loose/range boundary: %[:-?
|
|
|
|
|
|
|
|
;;; First, run-length encode the set into loose and range-pairs.
|
|
|
|
;;; If the set is a singleton set, then punt the whole [...] effort,
|
|
|
|
;;; and do it as a simple char.
|
|
|
|
|
|
|
|
;;; Repeat until stable:
|
|
|
|
;;; - Sort the ranges in this order:
|
|
|
|
;;; 1. other ranges;
|
|
|
|
;;; 2. ranges that begin with ^ (not priority)
|
|
|
|
;;; 3. ranges that begin with .=: (priority)
|
|
|
|
;;; 4. ranges that end with [ (priority)
|
|
|
|
;;; This eliminates [. [= [: problems in the ranges, and
|
|
|
|
;;; minimises the chances of the problem at the loose/range boundary.
|
|
|
|
;;; and problems with initial ^ chars.
|
|
|
|
;;; - Sort the loose chars so that ] is first, then -, then .=:, then [,
|
|
|
|
;;; then others, then ^. This eliminates [. [= [: problems in the loose
|
|
|
|
;;; chars, and minimises the chances of the problem at the loose/range
|
|
|
|
;;; boundary.
|
|
|
|
;;; - Shrink ranges by moving an opening or closing range char into the
|
|
|
|
;;; loose-char set:
|
|
|
|
;;; + If ] opens or closes a range, shrink it out.
|
|
|
|
;;; + If any range opens with -, shrink it out.
|
|
|
|
;;; + If the first range opens with .=:, and the last loose char is [,
|
|
|
|
;;; shrink it out.
|
|
|
|
;;; + If there are no loose chars, the first range begins with ^, and
|
|
|
|
;;; we're doing an IN range, shrink out the ^.
|
|
|
|
;;; + Shrinking a range down to <3 chars means move it's elts into the
|
|
|
|
;;; loose char set.
|
|
|
|
;;; - If both [ and - are in the loose char set,
|
|
|
|
;;; pull - out as special end-hypen.
|
|
|
|
|
|
|
|
;;; Finally, we have to hack things so that ^ doesn't begin an IN sequence.
|
|
|
|
;;; - If it's a NOT-IN sequence, no worries.
|
|
|
|
;;; - If ^ is the opening loose char, then it's the only loose char.
|
|
|
|
;;; If there are ranges, move it to the end of the string.
|
|
|
|
;;; If there are no ranges, then just punt the char-class and convert
|
|
|
|
;;; it to a singleton ^. In fact, do this up-front, for any singleton
|
|
|
|
;;; set.
|
|
|
|
;;;
|
|
|
|
;;; If the special end-hyphen flag is set, add - to the end of the string.
|
|
|
|
|
|
|
|
;;; This general approach -- starting out with maximal ranges, and then
|
|
|
|
;;; shrinking them to avoid other syntax violations -- has the advantage
|
|
|
|
;;; of not relying on the details of the ASCII encodings.
|
|
|
|
|
|
|
|
;;; Ordering ranges:
|
|
|
|
;;; 1. other ranges (ordered by start char)
|
|
|
|
;;; 2. ranges that begin with ^ (not priority)
|
|
|
|
;;; 3. ranges that begin with .=:
|
|
|
|
;;; 4. ranges that end with [ (priority over #2 & #3)
|
|
|
|
|
|
|
|
(define (range< r1 r2)
|
|
|
|
(let ((r1-start (car r1)) (r1-end (cdr r1))
|
|
|
|
(r2-start (car r2)) (r2-end (cdr r2)))
|
|
|
|
(or (char=? r2-end #\[) ; Range ending with [ comes last.
|
|
|
|
(and (not (char=? r1-end #\[))
|
|
|
|
|
|
|
|
;; Range begin with one of .=: comes next-to-last
|
|
|
|
(or (char=? r2-start #\.) (char=? r2-start #\=) (char=? r2-start #\:)
|
|
|
|
(and (not (char=? r1-start #\.))
|
|
|
|
(not (char=? r1-start #\=))
|
|
|
|
(not (char=? r1-start #\:))
|
|
|
|
|
|
|
|
;; Range beginning with ^ comes before that.
|
|
|
|
(or (char=? r1-start #\^)
|
|
|
|
(and (not (char=? r2-start #\^))
|
|
|
|
|
|
|
|
;; Other ranges are ordered by start char.
|
|
|
|
(< (char->ascii r1-start)
|
|
|
|
(char->ascii r2-start))))))))))
|
|
|
|
|
|
|
|
;;; Order loose chars:
|
|
|
|
;;; ] is first,
|
|
|
|
;;; - is next,
|
|
|
|
;;; .=: are next,
|
|
|
|
;;; [ is next,
|
|
|
|
;;; then others (ordered by ascii val)
|
|
|
|
;;; ^ is last.
|
|
|
|
|
|
|
|
|
|
|
|
(define (loose<= c1 c2)
|
|
|
|
(or (char=? c1 #\]) ; ] is first,
|
|
|
|
(and (not (char=? c2 #\]))
|
|
|
|
|
|
|
|
(or (char=? c1 #\-) ; - is next,
|
|
|
|
(and (not (char=? c2 #\-))
|
|
|
|
|
|
|
|
;; .=: are next,
|
|
|
|
(or (char=? c1 #\.) (char=? c1 #\=) (char=? c1 #\:)
|
|
|
|
(and (not (char=? c2 #\.))
|
|
|
|
(not (char=? c2 #\=))
|
|
|
|
(not (char=? c2 #\:))
|
|
|
|
|
|
|
|
(or (char=? c1 #\[) ; [ is next,
|
|
|
|
(and (not (char=? c2 #\[))
|
|
|
|
|
|
|
|
(or (char=? c2 #\^) ; ^ is last,
|
|
|
|
(and (not (char=? c1 #\^))
|
|
|
|
|
|
|
|
;; other chars by ASCII.
|
|
|
|
(<= (char->ascii c1)
|
|
|
|
(char->ascii c2)))))))))))))
|
|
|
|
|
|
|
|
;;; Returns (1) a list of 0-3 loose chars, (2) a list of 0 or 1 ranges.
|
|
|
|
|
|
|
|
(define (shrink-range-start r)
|
|
|
|
(let ((start (char->ascii (car r)))
|
|
|
|
(end (char->ascii (cdr r))))
|
|
|
|
(shrink-range-finish-up start (+ start 1) end)))
|
|
|
|
|
|
|
|
(define (shrink-range-end r)
|
|
|
|
(let ((start (char->ascii (car r)))
|
|
|
|
(end (char->ascii (cdr r))))
|
|
|
|
(shrink-range-finish-up end start (- end 1))))
|
|
|
|
|
|
|
|
(define (shrink-range-finish-up c start end)
|
|
|
|
(? ((> start end) (values (list (ascii->char c)) '())) ; Empty range
|
|
|
|
|
|
|
|
((= start end) ; Collapse singleton range.
|
|
|
|
(values (list (ascii->char c) (ascii->char start))
|
|
|
|
'()))
|
|
|
|
|
|
|
|
((= (+ start 1) end) ; Collapse doubleton range.
|
|
|
|
(values (list (ascii->char c) (ascii->char start) (ascii->char end))
|
|
|
|
'()))
|
|
|
|
|
|
|
|
(else (values (list (ascii->char c))
|
|
|
|
(list (cons (ascii->char start) (ascii->char end)))))))
|
|
|
|
|
|
|
|
|
|
|
|
;;; We assume the bracket-spec is not a singleton, not empty, and not complete.
|
|
|
|
;;; (These cases get rendered as the letter, [^\000-\177], and ".",
|
|
|
|
;;; respectively.) We assume the loose chars and the ranges are all disjoint.
|
|
|
|
|
|
|
|
(define (hack-bracket-spec loose ranges in?)
|
|
|
|
(let lp ((loose0 loose) (ranges0 ranges) (end-hyphen? #f))
|
|
|
|
;; Repeat until stable:
|
|
|
|
(let ((loose (sort-list loose0 loose<=)) ; Sort loose chars and ranges.
|
|
|
|
(ranges (sort-list ranges0 range<)))
|
|
|
|
|
|
|
|
;; If ] opens or closes a range, shrink it out.
|
|
|
|
;; If - opens a range, shrink it out.
|
|
|
|
(receive (loose ranges)
|
|
|
|
(let recur ((ranges ranges))
|
|
|
|
(if (pair? ranges)
|
|
|
|
(let* ((range (car ranges))
|
|
|
|
(start (car range))
|
|
|
|
(end (cdr range))
|
|
|
|
(ranges (cdr ranges)))
|
|
|
|
(receive (new-loose new-ranges) (recur ranges)
|
|
|
|
(receive (new-loose0 new-ranges0)
|
|
|
|
(? ((char=? #\] start)
|
|
|
|
(shrink-range-start range))
|
|
|
|
|
|
|
|
((char=? #\] end)
|
|
|
|
(shrink-range-end range))
|
|
|
|
|
|
|
|
((char=? #\- start)
|
|
|
|
(shrink-range-start range))
|
|
|
|
|
|
|
|
(else (values '() (list range))))
|
|
|
|
(values (append new-loose0 new-loose)
|
|
|
|
(append new-ranges0 new-ranges)))))
|
|
|
|
(values loose '())))
|
|
|
|
|
|
|
|
(? ((or (not (equal? loose0 loose)) ; Loop if anything changed.
|
|
|
|
(not (equal? ranges0 ranges)))
|
|
|
|
(lp loose ranges end-hyphen?))
|
|
|
|
|
|
|
|
;; If the first range opens with .=:, and the last loose char is [,
|
|
|
|
;; shrink it out & loop.
|
|
|
|
((and (pair? ranges)
|
|
|
|
(memv (caar ranges) '(#\. #\= #\:))
|
|
|
|
(pair? loose)
|
|
|
|
(char=? #\[ (car (reverse loose))))
|
|
|
|
(receive (new-loose new-ranges)
|
|
|
|
(shrink-range-start (car ranges))
|
|
|
|
(lp (append new-loose loose) (append new-ranges (cdr ranges)) end-hyphen?)))
|
|
|
|
|
|
|
|
;; If there are no loose chars, the first range begins with ^, and
|
|
|
|
;; we're doing an IN range, shrink out the ^.
|
|
|
|
((and in? (null? loose) (pair? ranges) (char=? #\^ (caar ranges)))
|
|
|
|
(receive (new-loose new-ranges) (shrink-range-start (car ranges))
|
|
|
|
(lp (append new-loose loose) (append new-ranges ranges) end-hyphen?)))
|
|
|
|
|
|
|
|
;; If both [ and - are in the loose char set,
|
|
|
|
;; pull - out as special end-hypen.
|
|
|
|
((and (pair? loose)
|
|
|
|
(pair? (cdr loose))
|
|
|
|
(char=? (car loose) #\[)
|
|
|
|
(char=? (car loose) #\-))
|
|
|
|
(lp (cons (car loose) (cddr loose)) ranges #t))
|
|
|
|
|
|
|
|
;; No change! Build the answer...
|
|
|
|
(else (string-append (if in? "[" "[^")
|
|
|
|
(list->string loose)
|
|
|
|
(apply string-append
|
|
|
|
(map (lambda (r) (string (car r) #\- (cdr r)))
|
|
|
|
ranges))
|
|
|
|
"]")))))))
|