new from 0.5.2
This commit is contained in:
parent
6569af5b8b
commit
c9f54e79da
|
@ -0,0 +1,70 @@
|
||||||
|
To: alan@lcs.mit.edu
|
||||||
|
Subject: scoping macros
|
||||||
|
Reply-to: shivers@ai.mit.edu
|
||||||
|
--text follows this line--
|
||||||
|
So, I'd like to write a macro that introduces a lexical contour.
|
||||||
|
Here's my toy problem:
|
||||||
|
|
||||||
|
(color) - macro form that produces the current lexical color.
|
||||||
|
|
||||||
|
(with-green body ...)
|
||||||
|
(with-blue body ...)
|
||||||
|
Evaluate BODY forms in a lexical color contour that is green/blue,
|
||||||
|
respectively.
|
||||||
|
|
||||||
|
The default, top-level color is green.
|
||||||
|
|
||||||
|
This doesn't work:
|
||||||
|
|
||||||
|
---
|
||||||
|
(define-syntax color (syntax-rules () ((color) 'green)))
|
||||||
|
|
||||||
|
(define-syntax with-blue
|
||||||
|
(syntax-rules (color)
|
||||||
|
((with-blue body ...)
|
||||||
|
(let-syntax ((color (syntax-rules () ((color) 'blue))))
|
||||||
|
body ...))))
|
||||||
|
|
||||||
|
(define-syntax with-green
|
||||||
|
(syntax-rules (color)
|
||||||
|
((with-blue body ...)
|
||||||
|
(let-syntax ((color (syntax-rules () ((color) 'green))))
|
||||||
|
body ...))))
|
||||||
|
---
|
||||||
|
|
||||||
|
Everything comes out green. Removing COLOR from the syntax-rules keyword list
|
||||||
|
doesn't fix it.
|
||||||
|
|
||||||
|
This *does* work:
|
||||||
|
---
|
||||||
|
(define-syntax with-blue
|
||||||
|
(syntax-rules (color)
|
||||||
|
((with-blue body ...)
|
||||||
|
(let-syntax ((color (syntax-rules (color) ((color) 'blue))))
|
||||||
|
body ...))))
|
||||||
|
|
||||||
|
(with-blue (color))
|
||||||
|
'green
|
||||||
|
|
||||||
|
(define-syntax with-blue
|
||||||
|
(lambda (exp r c)
|
||||||
|
`(,(r 'let-syntax) ((color (,(r 'syntax-rules) ()
|
||||||
|
((color) 'blue))))
|
||||||
|
. ,(cdr exp))))
|
||||||
|
|
||||||
|
> (with-blue (color))
|
||||||
|
'blue
|
||||||
|
|
||||||
|
> (list (color) (with-blue (color)))
|
||||||
|
'(green blue)
|
||||||
|
|
||||||
|
> (define-syntax with-green
|
||||||
|
(lambda (exp r c)
|
||||||
|
`(,(r 'let-syntax) ((color (,(r 'syntax-rules) ()
|
||||||
|
((color) 'green))))
|
||||||
|
. ,(cdr exp))))
|
||||||
|
|
||||||
|
> (cons (color) (with-blue (list (color) (with-green (color)))))
|
||||||
|
'(green blue green)
|
||||||
|
---
|
||||||
|
|
|
@ -0,0 +1,60 @@
|
||||||
|
scsh name Posix ctype Alternate
|
||||||
|
-----------------------------------------
|
||||||
|
lower-case lower
|
||||||
|
upper-case upper
|
||||||
|
alphabetic alpha
|
||||||
|
numeric digit num
|
||||||
|
alphanumeric alnum alphanum
|
||||||
|
punctuation punct
|
||||||
|
graphic graph
|
||||||
|
blank (Gnu extension)
|
||||||
|
whitespace space white ("space" is potentially confusing.)
|
||||||
|
printing print
|
||||||
|
control cntrl
|
||||||
|
hex-digit xdigit hex
|
||||||
|
ascii ascii (Gnu extension)
|
||||||
|
|
||||||
|
SRE ::= ... <char-class> ...
|
||||||
|
|
||||||
|
<char-class> ::= <char>
|
||||||
|
| any | nonl | <class-name>
|
||||||
|
| (in <char-class-arg> ...)
|
||||||
|
| (not-in <char-class-arg> ...)
|
||||||
|
| (and <char-class-arg> ...)
|
||||||
|
| (diff <char-class-arg> ...)
|
||||||
|
| (- <char-or-string> ...)
|
||||||
|
| ,<cset-exp>
|
||||||
|
|
||||||
|
<char-class-arg> ::= <char-class> | <string>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
(diff <arg1> <arg2> ...) = (and <arg1> (not-in <arg2> ...))
|
||||||
|
|
||||||
|
~!@#$%^&*-_+=|:<>?/ in
|
||||||
|
|
||||||
|
(: ...) sequence
|
||||||
|
|
||||||
|
(and ...)
|
||||||
|
(or ...)
|
||||||
|
(not ...)
|
||||||
|
(diff ...)
|
||||||
|
|
||||||
|
(* ...)
|
||||||
|
(+ ...)
|
||||||
|
(- ...)
|
||||||
|
|
||||||
|
(in ...) ; union
|
||||||
|
(not-in ...) ; complement-of-union
|
||||||
|
(and ...) ; intersection
|
||||||
|
(diff ...) ; diff
|
||||||
|
|
||||||
|
(+ ...) ; union
|
||||||
|
(~ ...) ; complement-of-union
|
||||||
|
(& ...) ; intersection
|
||||||
|
(- ...) ; diff
|
||||||
|
|
||||||
|
(in ...) ; union
|
||||||
|
(~ ...) ; complement-of-union
|
||||||
|
(& ...) ; intersection
|
||||||
|
(- ...) ; diff
|
|
@ -0,0 +1,160 @@
|
||||||
|
(define-structure conditionals
|
||||||
|
(export (define-simple-syntax :syntax)
|
||||||
|
(when :syntax)
|
||||||
|
(unless :syntax)
|
||||||
|
(? :syntax)
|
||||||
|
(switchq :syntax)
|
||||||
|
(switch :syntax)
|
||||||
|
(prog0 :syntax)
|
||||||
|
(land* :syntax))
|
||||||
|
(open scheme)
|
||||||
|
(begin
|
||||||
|
|
||||||
|
;;; (define-simple-syntax (name subforms ...) expansion)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-syntax define-simple-syntax
|
||||||
|
(syntax-rules ()
|
||||||
|
((define-simple-syntax (name subforms ...) expansion)
|
||||||
|
(define-syntax name (syntax-rules () ((name subforms ...) expansion))))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; ? = COND
|
||||||
|
;;; (WHEN test body ...) (SWITCHQ = key clause ...)
|
||||||
|
;;; (UNLESS test body ...) (SWITCH = key clause ...)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Handy conditional forms. ? is so short that it renders WHEN pretty
|
||||||
|
;;; much useless.
|
||||||
|
|
||||||
|
(define-simple-syntax (when test body ...)
|
||||||
|
(if test (begin body ...)))
|
||||||
|
|
||||||
|
(define-simple-syntax (unless test body ...)
|
||||||
|
(if (not test) (begin body ...)))
|
||||||
|
|
||||||
|
;;; ? is synonym for COND.
|
||||||
|
(define-simple-syntax (? clause ...) (cond clause ...))
|
||||||
|
|
||||||
|
|
||||||
|
;;; (PROG0 val-exp exp ...)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-simple-syntax (prog0 val-exp exp ...)
|
||||||
|
(let ((v val-exp)) exp ... v))
|
||||||
|
|
||||||
|
|
||||||
|
;;; (land* (clause ...) body ...) -*- Scheme -*-
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Evaluate each clause. If any clause returns false, land* stops and
|
||||||
|
;;; returns false. If all the clauses evaluate to a true value, return
|
||||||
|
;;; the value of the body.
|
||||||
|
;;;
|
||||||
|
;;; The difference between LAND* and AND is that LAND* binds names to
|
||||||
|
;;; the values of its clauses, which may be used by subsequent clauses.
|
||||||
|
;;; Clauses are of the form
|
||||||
|
;;; (var exp) ; binds VAR to the value of EXP.
|
||||||
|
;;; (exp) ; No binding.
|
||||||
|
;;; var ; Reference -- no binding.
|
||||||
|
;;;
|
||||||
|
;;; Example:
|
||||||
|
;;; (land* ((probe (assq key alist)))
|
||||||
|
;;; (cdr probe))
|
||||||
|
;;;
|
||||||
|
;;; LAND* is due to Oleg Kiselyov (http://pobox.com/~oleg); I wrote this
|
||||||
|
;;; simple implementation as a high-level R5RS DEFINE-SYNTAX macro.
|
||||||
|
;;; Olin 98/9/29
|
||||||
|
|
||||||
|
(define-syntax land*
|
||||||
|
(syntax-rules ()
|
||||||
|
((land* () body ...) (begin body ...))
|
||||||
|
|
||||||
|
((land* ((var exp) clause ...) body ...)
|
||||||
|
(let ((var exp)) (and var (land* (clause ...) body ...))))
|
||||||
|
|
||||||
|
((land* ((#f exp) clause ...) body ...)
|
||||||
|
(and exp (land* (clause ...) body ...)))
|
||||||
|
|
||||||
|
((land* ((exp) clause ...) body ...)
|
||||||
|
(and exp (land* (clause ...) body ...)))
|
||||||
|
|
||||||
|
((land* (var clause ...) body ...)
|
||||||
|
(and var (land* (clause ...) body ...)))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Like CASE, but you specify the key-comparison procedure.
|
||||||
|
;;; SWITCH evaluates its keys each time through the conditional.
|
||||||
|
;;; SWITCHQ keys are not evaluated -- are simply constants.
|
||||||
|
;;; (switchq string=? (vector-ref vec i)
|
||||||
|
;;; (("plus" "minus") ...)
|
||||||
|
;;; (("times" "div") ...)
|
||||||
|
;;; (else ...))
|
||||||
|
|
||||||
|
(define-simple-syntax (switchq compare key clause ...)
|
||||||
|
(let ((k key) ; Eval KEY and COMPARE
|
||||||
|
(c compare)) ; just once, then call %switch.
|
||||||
|
(%switchq c k clause ...))) ; C, K are vars, hence replicable.
|
||||||
|
|
||||||
|
(define-syntax %switchq
|
||||||
|
(syntax-rules (else)
|
||||||
|
((%switchq compare key ((key1 ...) body1 body2 ...) rest ...)
|
||||||
|
(if (or (compare key 'key1) ...)
|
||||||
|
(begin body1 body2 ...)
|
||||||
|
(%switchq compare key rest ...)))
|
||||||
|
|
||||||
|
((%switchq compare key ((key1 ...)) rest ...) ; Null body.
|
||||||
|
(if (not (or (compare key 'key1) ...))
|
||||||
|
(%switchq compare key rest ...)))
|
||||||
|
|
||||||
|
((%switchq compare key (else body ...))
|
||||||
|
(begin body ...))
|
||||||
|
|
||||||
|
((%switchq compare key) '#f)))
|
||||||
|
|
||||||
|
|
||||||
|
(define-simple-syntax (switch compare key clause ...)
|
||||||
|
(let ((k key) ; Eval KEY and COMPARE
|
||||||
|
(c compare)) ; just once, then call %switch.
|
||||||
|
(%switch c k clause ...))) ; C, K are vars, hence replicable.
|
||||||
|
|
||||||
|
(define-syntax %switch
|
||||||
|
(syntax-rules (else)
|
||||||
|
((%switch compare key ((key1 ...) body1 body2 ...) rest ...)
|
||||||
|
(if (or (compare key key1) ...)
|
||||||
|
(begin body1 body2 ...)
|
||||||
|
(%switch compare key rest ...)))
|
||||||
|
|
||||||
|
((%switch compare key ((key1 ...)) rest ...) ; Null body.
|
||||||
|
(if (not (or (compare key key1) ...))
|
||||||
|
(%switch compare key rest ...)))
|
||||||
|
|
||||||
|
((%switch compare key (else body ...))
|
||||||
|
(begin body ...))
|
||||||
|
|
||||||
|
((%switch compare key) '#f)))
|
||||||
|
|
||||||
|
;;; I can't get this to work -- S48 complains "too many ...'s".
|
||||||
|
;(define-syntax switchq
|
||||||
|
; (syntax-rules (else)
|
||||||
|
; ((switchq compare key clause ...)
|
||||||
|
; (letrec-syntax ((%switchq (syntax-rules (else)
|
||||||
|
; ((%switchq compare key
|
||||||
|
; ((key1 ...) body1 body2 ...) rest ...)
|
||||||
|
; (if (or (compare key 'key1) ...)
|
||||||
|
; (begin body1 body2 ...)
|
||||||
|
; (%switchq compare key rest ...)))
|
||||||
|
;
|
||||||
|
; ; Null body.
|
||||||
|
; ((%switchq compare key ((key1 ...)) rest ...)
|
||||||
|
; (if (not (or (compare key 'key1) ...))
|
||||||
|
; (%switchq compare key rest ...)))
|
||||||
|
;
|
||||||
|
; ((%switchq compare key (else body ...))
|
||||||
|
; (begin body ...))
|
||||||
|
;
|
||||||
|
; ((%switchq compare key) '#f))))
|
||||||
|
;
|
||||||
|
; (let ((k key) ; Eval KEY and COMPARE
|
||||||
|
; (c compare)) ; just once, then call %switch.
|
||||||
|
; (%switchq c k clause ...)))))); C, K are vars, hence replicable.
|
||||||
|
))
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,121 @@
|
||||||
|
;;; These are some macros to support using regexp matching.
|
||||||
|
|
||||||
|
(define-structure let-match-package
|
||||||
|
(export (let-match :syntax)
|
||||||
|
(if-match :syntax)
|
||||||
|
(match-cond :syntax))
|
||||||
|
(for-syntax (open scheme
|
||||||
|
signals)) ; For ERROR
|
||||||
|
|
||||||
|
(open scsh scheme)
|
||||||
|
(access signals) ; for ERROR
|
||||||
|
|
||||||
|
(begin
|
||||||
|
|
||||||
|
;;; (let-match m mvars body ...)
|
||||||
|
;;; Bind the vars in MVARS to the match & submatch strings of match data M,
|
||||||
|
;;; and eval the body forms. #F is allowed in the MVARS list, as a don't-care
|
||||||
|
;;; parameter.
|
||||||
|
;;;
|
||||||
|
;;; (if-match m mvars conseq alt)
|
||||||
|
;;; The same as LET-MATCH -- eval the CONSEQ form in the scope of the
|
||||||
|
;;; bound MVARS. However, if the match data M evaluates to false, instead
|
||||||
|
;;; of blowing up, we execute the ALT form instead.
|
||||||
|
|
||||||
|
(define-syntax let-match
|
||||||
|
(lambda (exp r c)
|
||||||
|
(if (< (length exp) 3)
|
||||||
|
(error "No match-vars list in LET-MATCH" exp))
|
||||||
|
(let ((m (cadr exp)) ; The match expression
|
||||||
|
(mvars (caddr exp)) ; The match vars
|
||||||
|
(body (cdddr exp)) ; The expression's body forms
|
||||||
|
|
||||||
|
(%begin (r 'begin))
|
||||||
|
(%match:substring (r 'match:substring))
|
||||||
|
(%let* (r 'let*)))
|
||||||
|
|
||||||
|
(cond ((null? mvars) `(,%begin ,@body))
|
||||||
|
|
||||||
|
((pair? mvars)
|
||||||
|
(let* ((msv (or (car mvars) (r 'match-val))) ; "match-struct var"
|
||||||
|
(sm-bindings (let recur ((i 0) (vars (cdr mvars)))
|
||||||
|
(if (pair? vars)
|
||||||
|
(let ((var (car vars))
|
||||||
|
(bindings (recur (+ i 1) (cdr vars))))
|
||||||
|
(if var
|
||||||
|
(cons `(,var (,%match:substring ,msv ,i))
|
||||||
|
bindings)
|
||||||
|
bindings))
|
||||||
|
'()))))
|
||||||
|
`(,%let* ((,msv ,m) ,@sm-bindings) ,@body)))
|
||||||
|
|
||||||
|
|
||||||
|
(else (error "Illegal match-vars list in LET-MATCH" mvars exp))))))
|
||||||
|
|
||||||
|
(define-syntax if-match
|
||||||
|
(syntax-rules ()
|
||||||
|
((if-match match-exp mvars on-match no-match)
|
||||||
|
(cond (match-exp => (lambda (m) (let-match m mvars on-match)))
|
||||||
|
(else no-match)))))
|
||||||
|
|
||||||
|
;;; (MATCH-COND (<match-exp> <match-vars> <body> ...)
|
||||||
|
;;; (TEST <exp> <body> ...)
|
||||||
|
;;; (TEST <exp> => <proc>)
|
||||||
|
;;; (ELSE <body> ...))
|
||||||
|
;;;
|
||||||
|
;;; The first clause is as-in IF-MATCH; the next three clauses are as-in COND.
|
||||||
|
;;;
|
||||||
|
;;; It would be slicker if we could *add* extra clauses to the syntax
|
||||||
|
;;; of COND, but Scheme macros aren't extensible this way.
|
||||||
|
|
||||||
|
;;; Two defs. The other expander produces prettier output -- one COND
|
||||||
|
;;; rather than a mess of nested IF's.
|
||||||
|
;(define-syntax match-cond
|
||||||
|
; (syntax-rules (else test =>)
|
||||||
|
; ((match-cond (else body ...) clause2 ...) (begin body ...))
|
||||||
|
;
|
||||||
|
; ((match-cond) (cond))
|
||||||
|
;
|
||||||
|
; ((match-cond (test exp => proc) clause2 ...)
|
||||||
|
; (let ((v exp)) (if v (proc v) (match-cond clause2 ...))))
|
||||||
|
;
|
||||||
|
; ((match-cond (test exp body ...) clause2 ...)
|
||||||
|
; (if exp (begin body ...) (match-cond clause2 ...)))
|
||||||
|
;
|
||||||
|
; ((match-cond (test exp) clause2 ...)
|
||||||
|
; (or exp (match-cond clause2 ...)))
|
||||||
|
;
|
||||||
|
; ((match-cond (match-exp mvars body ...) clause2 ...)
|
||||||
|
; (if-match match-exp mvars (begin body ...)
|
||||||
|
; (match-cond clause2 ...)))))
|
||||||
|
|
||||||
|
(define-syntax match-cond
|
||||||
|
(syntax-rules ()
|
||||||
|
((match-cond clause ...) (match-cond-aux () clause ...))))
|
||||||
|
|
||||||
|
(define-syntax match-cond-aux
|
||||||
|
(syntax-rules (test else)
|
||||||
|
|
||||||
|
;; No more clauses.
|
||||||
|
((match-cond-aux (cond-clause ...))
|
||||||
|
(cond cond-clause ...))
|
||||||
|
|
||||||
|
;; (TEST . <cond-clause>)
|
||||||
|
((match-cond-aux (cond-clause ...)
|
||||||
|
(test . another-cond-clause) clause2 ...)
|
||||||
|
(match-cond-aux (cond-clause ... another-cond-clause)
|
||||||
|
clause2 ...))
|
||||||
|
|
||||||
|
;; (ELSE <body> ...)
|
||||||
|
((match-cond-aux (cond-clause ...)
|
||||||
|
(else body ...) clause2 ...)
|
||||||
|
(match-cond-aux (cond-clause ... (else body ...))))
|
||||||
|
|
||||||
|
;; (<match-exp> <mvars> <body> ...)
|
||||||
|
((match-cond-aux (cond-clause ...)
|
||||||
|
(match-exp mvars body ...) clause2 ...)
|
||||||
|
(match-cond-aux (cond-clause ... (match-exp => (lambda (m)
|
||||||
|
(let-match m mvars
|
||||||
|
body ...))))
|
||||||
|
clause2 ...))))
|
||||||
|
))
|
|
@ -0,0 +1,7 @@
|
||||||
|
;;; ,exec ,load loadem.scm
|
||||||
|
|
||||||
|
(config '(load "packages2.scm"))
|
||||||
|
(config '(load "cond-package.scm"))
|
||||||
|
;(map load-package '(rx-lib re-basics re-low-exports re-high-tools
|
||||||
|
; sre-parser-package re-posix-parsers sre-syntax-tools
|
||||||
|
; rx-syntax))
|
|
@ -0,0 +1,26 @@
|
||||||
|
(define-structure re-package (export)
|
||||||
|
(open scsh
|
||||||
|
formats
|
||||||
|
define-record-types ; re
|
||||||
|
defrec-package ; re
|
||||||
|
scsh-utilities ;
|
||||||
|
define-foreign-syntax ; re-low
|
||||||
|
weak ; re-low
|
||||||
|
let-opt ; re
|
||||||
|
sort ; posixstr
|
||||||
|
receiving ; all of them
|
||||||
|
scheme)
|
||||||
|
|
||||||
|
(files "/usr/home/shivers/src/scm/conditionals.scm"
|
||||||
|
re
|
||||||
|
re-low
|
||||||
|
simp
|
||||||
|
re-high
|
||||||
|
parse
|
||||||
|
posixstr
|
||||||
|
spencer
|
||||||
|
;re-syntax
|
||||||
|
)
|
||||||
|
|
||||||
|
(optimize auto-integrate)
|
||||||
|
)
|
|
@ -0,0 +1,21 @@
|
||||||
|
;;; These functions were dropped from the regexp API when I shifted scsh's
|
||||||
|
;;; regexps over to SREs. They are retained for backwards compatibility.
|
||||||
|
;;; -Olin 8/98
|
||||||
|
|
||||||
|
(define (string-match re str . maybe-start)
|
||||||
|
(apply regexp-search (->regexp re) str maybe-start))
|
||||||
|
|
||||||
|
(define make-regexp posix-string->regexp)
|
||||||
|
|
||||||
|
(define regexp-exec regexp-search)
|
||||||
|
|
||||||
|
(define (->regexp str-or-re)
|
||||||
|
(cond ((string? str-or-re) (posix-string->regexp str-or-re))
|
||||||
|
((regexp? str-or-re) str-or-re)
|
||||||
|
(else (error ->regexp
|
||||||
|
"Value must be either a Posix regexp string or a regexp value"
|
||||||
|
str-or-re))))
|
||||||
|
|
||||||
|
(define (regexp-quote str)
|
||||||
|
(receive (s lev pcount tvec) (regexp->posix-string (re-string str))
|
||||||
|
s))
|
|
@ -0,0 +1,235 @@
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
(define-interface re-low-exports-interface ; User-level stuff
|
||||||
|
(export regexp-match?
|
||||||
|
match:start match:end match:substring
|
||||||
|
clean-up-cres))
|
||||||
|
|
||||||
|
;;; TOO MUCH STUFF HERE
|
||||||
|
(define-interface re-low-internals-interface ; For scsh internals
|
||||||
|
(export make-regexp-match
|
||||||
|
regexp-match:string set-regexp-match:string
|
||||||
|
regexp-match:start set-regexp-match:start
|
||||||
|
regexp-match:end set-regexp-match:end
|
||||||
|
|
||||||
|
cre? new-cre
|
||||||
|
cre:string set-cre:string
|
||||||
|
cre:bytes set-cre:bytes
|
||||||
|
cre:bytes set-cre:bytes
|
||||||
|
cre:tvec set-cre:tvec
|
||||||
|
|
||||||
|
cre-search cre-search?))
|
||||||
|
|
||||||
|
(define-structures ((re-low-exports re-low-exports-interface)
|
||||||
|
(re-low-internals re-low-internals-interface))
|
||||||
|
(open scsh
|
||||||
|
scsh-utilities
|
||||||
|
defrec-package
|
||||||
|
let-opt
|
||||||
|
define-foreign-syntax
|
||||||
|
weak
|
||||||
|
receiving
|
||||||
|
scheme)
|
||||||
|
(files re-low)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Do these guys really need to open the scsh package?
|
||||||
|
|
||||||
|
(define-interface basic-re-interface
|
||||||
|
(export re-dsm? make-re-dsm
|
||||||
|
re-dsm:body
|
||||||
|
re-dsm:pre-dsm
|
||||||
|
re-dsm:tsm
|
||||||
|
re-dsm:posix set-re-dsm:posix
|
||||||
|
re-dsm:post-dsm
|
||||||
|
re-dsm open-dsm
|
||||||
|
|
||||||
|
re-seq? %%make-re-seq %make-re-seq make-re-seq re-seq
|
||||||
|
re-seq:elts
|
||||||
|
re-seq:tsm
|
||||||
|
re-seq:posix set-re-seq:posix
|
||||||
|
|
||||||
|
re-choice? %%make-re-choice %make-re-choice make-re-choice re-choice
|
||||||
|
re-choice:elts
|
||||||
|
re-choice:tsm
|
||||||
|
re-choice:posix set-re-choice:posix
|
||||||
|
|
||||||
|
re-repeat? %%make-re-repeat %make-re-repeat make-re-repeat re-repeat
|
||||||
|
re-repeat:from
|
||||||
|
re-repeat:to
|
||||||
|
re-repeat:body
|
||||||
|
re-repeat:tsm
|
||||||
|
re-repeat:posix set-re-repeat:posix
|
||||||
|
|
||||||
|
re-submatch?
|
||||||
|
%%make-re-submatch %make-re-submatch make-re-submatch re-submatch
|
||||||
|
re-submatch:body
|
||||||
|
re-submatch:pre-dsm
|
||||||
|
re-submatch:tsm
|
||||||
|
re-submatch:posix set-re-submatch:posix
|
||||||
|
re-submatch:post-dsm
|
||||||
|
|
||||||
|
re-string? make-re-string re-string
|
||||||
|
re-string:chars set-re-string:chars
|
||||||
|
re-string:posix set-re-string:posix
|
||||||
|
|
||||||
|
re-trivial re-trivial?
|
||||||
|
|
||||||
|
re-char-set? make-re-char-set re-char-set
|
||||||
|
re-char-set:cset set-re-char-set:cset
|
||||||
|
re-char-set:posix set-re-char-set:posix
|
||||||
|
|
||||||
|
;; Constructors for the Scheme unparser
|
||||||
|
make-re-string/posix
|
||||||
|
%make-re-seq/posix
|
||||||
|
%make-re-choice/posix
|
||||||
|
make-re-char-set/posix
|
||||||
|
%make-re-repeat/posix
|
||||||
|
%make-re-dsm/posix
|
||||||
|
%make-re-submatch/posix
|
||||||
|
|
||||||
|
re-empty re-empty?
|
||||||
|
re-bos re-bos? re-eos re-eos?
|
||||||
|
re-bol re-bol? re-eol re-eol?
|
||||||
|
re-bow re-bow? re-eow re-eow?
|
||||||
|
|
||||||
|
re-any re-any?
|
||||||
|
|
||||||
|
re-nonl
|
||||||
|
re-word
|
||||||
|
|
||||||
|
re?
|
||||||
|
re-tsm
|
||||||
|
|
||||||
|
flush-submatches ; Can be in code produced by RX expander.
|
||||||
|
uncase ; Can be in code produced by RX expander.
|
||||||
|
uncase-char-set ; Can be in code produced by RX expander.
|
||||||
|
uncase-string
|
||||||
|
|
||||||
|
char-set-empty?
|
||||||
|
char-set-full?))
|
||||||
|
|
||||||
|
;;; Stuff that could appear in code produced by (rx ...)
|
||||||
|
(define-interface rx-lib-interface
|
||||||
|
(export coerce-dynamic-regexp
|
||||||
|
coerce-dynamic-charset
|
||||||
|
spec->char-set
|
||||||
|
flush-submatches
|
||||||
|
uncase
|
||||||
|
uncase-char-set
|
||||||
|
uncase-string))
|
||||||
|
|
||||||
|
(define-structure rx-lib rx-lib-interface
|
||||||
|
(open scsh conditionals re-basics scheme)
|
||||||
|
(files rx-lib)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
(define-structures ((re-basics basic-re-interface)
|
||||||
|
(re-simp-package (export simplify-regexp)))
|
||||||
|
(open scsh
|
||||||
|
re-low-internals ; new-cre
|
||||||
|
conditionals
|
||||||
|
scsh-utilities
|
||||||
|
define-record-types
|
||||||
|
defrec-package
|
||||||
|
let-opt
|
||||||
|
receiving
|
||||||
|
scheme)
|
||||||
|
(files re simp)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-interface re-high-interface
|
||||||
|
(export regexp-search regexp-search?
|
||||||
|
regexp-substitute regexp-substitute/global))
|
||||||
|
|
||||||
|
(define-interface re-high-tools-interface (export compile-regexp))
|
||||||
|
|
||||||
|
(define-structures ((re-high-exports re-high-interface)
|
||||||
|
(re-high-tools re-high-tools-interface))
|
||||||
|
(open scsh
|
||||||
|
scsh-utilities
|
||||||
|
conditionals
|
||||||
|
|
||||||
|
;; compile-regexp needs:
|
||||||
|
re-low-internals ; new-cre
|
||||||
|
re-simp-package ; simplify-regexp
|
||||||
|
re-posix-parsers ; regexp->posix-string
|
||||||
|
|
||||||
|
re-basics ; re-tsm
|
||||||
|
|
||||||
|
let-opt
|
||||||
|
receiving
|
||||||
|
scheme)
|
||||||
|
(files re-high)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-interface sre-parser-interface
|
||||||
|
(export sre->regexp regexp->sre
|
||||||
|
parse-sre parse-sres regexp->scheme
|
||||||
|
char-set->in-pair
|
||||||
|
static-regexp?))
|
||||||
|
|
||||||
|
(define-interface posix-re-interface
|
||||||
|
(export regexp->posix-string ; posixstr.scm
|
||||||
|
posix-string->regexp ; spencer
|
||||||
|
))
|
||||||
|
|
||||||
|
;;; The Posix-string stuff needs char-set->in-pair from parse.scm
|
||||||
|
;;; The SRE parser needs the Posix string parser for POSIX-STRING SRE's.
|
||||||
|
|
||||||
|
(define-structures ((sre-parser-package sre-parser-interface)
|
||||||
|
(re-posix-parsers posix-re-interface))
|
||||||
|
(open scsh
|
||||||
|
conditionals
|
||||||
|
re-low-internals ; cre:string cre:tvec
|
||||||
|
re-basics
|
||||||
|
re-simp-package
|
||||||
|
sort ; Posix renderer
|
||||||
|
scsh-utilities
|
||||||
|
receiving
|
||||||
|
scheme)
|
||||||
|
(files parse ; sre-parser-package
|
||||||
|
posixstr spencer) ; re-posix-parsers
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
;;; re-syntax provides 2 structures:
|
||||||
|
;;; re-syntax (exports (rx :syntax))
|
||||||
|
;;; re-syntax-tools (exports (if-sre-form :syntax) sre-form?)
|
||||||
|
|
||||||
|
(define-interface sre-syntax-tools-interface
|
||||||
|
(export expand-rx sre-form?))
|
||||||
|
|
||||||
|
(define-structure sre-syntax-tools sre-syntax-tools-interface
|
||||||
|
(open scheme
|
||||||
|
receiving
|
||||||
|
conditionals
|
||||||
|
re-posix-parsers ; regexp->posix-string
|
||||||
|
sre-parser-package ; static-regexp? parse-sres
|
||||||
|
re-high-tools ; compile-regexp
|
||||||
|
re-basics ; For the posix-cacher and code-producer
|
||||||
|
re-simp-package)
|
||||||
|
(files re-syntax)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
(define-structure rx-syntax (export (rx :syntax)
|
||||||
|
(if-sre-form :syntax))
|
||||||
|
|
||||||
|
(open re-basics
|
||||||
|
rx-lib
|
||||||
|
scheme)
|
||||||
|
(for-syntax (open sre-syntax-tools scheme))
|
||||||
|
(begin (define-syntax rx expand-rx)
|
||||||
|
(define-syntax if-sre-form
|
||||||
|
(lambda (exp r c)
|
||||||
|
(if (sre-form? (cadr exp) r c)
|
||||||
|
(caddr exp)
|
||||||
|
(cadddr exp)))))
|
||||||
|
(optimize auto-integrate))
|
|
@ -0,0 +1,185 @@
|
||||||
|
;;; Module definitions for the scsh regexp system.
|
||||||
|
;;; -Olin <shivers@ai.mit.edu> 8/98
|
||||||
|
|
||||||
|
(define-interface basic-re-interface
|
||||||
|
(export re-dsm? make-re-dsm
|
||||||
|
re-dsm:body
|
||||||
|
re-dsm:pre-dsm
|
||||||
|
re-dsm:tsm
|
||||||
|
re-dsm:posix set-re-dsm:posix
|
||||||
|
re-dsm:post-dsm
|
||||||
|
re-dsm open-dsm
|
||||||
|
|
||||||
|
re-seq? %%make-re-seq %make-re-seq make-re-seq re-seq
|
||||||
|
re-seq:elts
|
||||||
|
re-seq:tsm
|
||||||
|
re-seq:posix set-re-seq:posix
|
||||||
|
|
||||||
|
re-choice? %%make-re-choice %make-re-choice make-re-choice re-choice
|
||||||
|
re-choice:elts
|
||||||
|
re-choice:tsm
|
||||||
|
re-choice:posix set-re-choice:posix
|
||||||
|
|
||||||
|
re-repeat? %%make-re-repeat %make-re-repeat make-re-repeat re-repeat
|
||||||
|
re-repeat:from
|
||||||
|
re-repeat:to
|
||||||
|
re-repeat:body
|
||||||
|
re-repeat:tsm
|
||||||
|
re-repeat:posix set-re-repeat:posix
|
||||||
|
|
||||||
|
re-submatch?
|
||||||
|
%%make-re-submatch %make-re-submatch make-re-submatch re-submatch
|
||||||
|
re-submatch:body
|
||||||
|
re-submatch:pre-dsm
|
||||||
|
re-submatch:tsm
|
||||||
|
re-submatch:posix set-re-submatch:posix
|
||||||
|
re-submatch:post-dsm
|
||||||
|
|
||||||
|
re-string? make-re-string re-string
|
||||||
|
re-string:chars set-re-string:chars
|
||||||
|
re-string:posix set-re-string:posix
|
||||||
|
|
||||||
|
re-trivial re-trivial?
|
||||||
|
|
||||||
|
re-char-set? make-re-char-set re-char-set
|
||||||
|
re-char-set:cset set-re-char-set:cset
|
||||||
|
re-char-set:posix set-re-char-set:posix
|
||||||
|
|
||||||
|
re-empty re-empty?
|
||||||
|
re-bos re-bos? re-eos re-eos?
|
||||||
|
re-bol re-bol? re-eol re-eol?
|
||||||
|
re-bow re-bow? re-eow re-eow?
|
||||||
|
|
||||||
|
re-any re-any?
|
||||||
|
|
||||||
|
re-nonl
|
||||||
|
re-word
|
||||||
|
|
||||||
|
regexp?
|
||||||
|
re-tsm
|
||||||
|
|
||||||
|
flush-submatches ; Can be in code produced by RX expander.
|
||||||
|
uncase ; Can be in code produced by RX expander.
|
||||||
|
uncase-char-set ; Can be in code produced by RX expander.
|
||||||
|
uncase-string
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
|
;;; These guys were made obsolete by the new SRE package and exist for
|
||||||
|
;;; backwards compatibility only.
|
||||||
|
(define-interface re-old-funs-interface
|
||||||
|
(export string-match make-regexp regexp-exec ->regexp regexp-quote))
|
||||||
|
|
||||||
|
|
||||||
|
(define-interface re-internals-interface
|
||||||
|
(export make-re-string/posix ; Constructors for the Scheme unparser
|
||||||
|
%make-re-seq/posix
|
||||||
|
%make-re-choice/posix
|
||||||
|
make-re-char-set/posix
|
||||||
|
%make-re-repeat/posix
|
||||||
|
%make-re-dsm/posix
|
||||||
|
%make-re-submatch/posix))
|
||||||
|
|
||||||
|
|
||||||
|
(define-interface posix-re-interface
|
||||||
|
(export regexp->posix-string ; posixstr.scm
|
||||||
|
posix-string->regexp ; spencer
|
||||||
|
))
|
||||||
|
|
||||||
|
(define-interface re-exports-interface
|
||||||
|
(compound-interface posix-re-interface
|
||||||
|
basic-re-interface
|
||||||
|
(export regexp-match?
|
||||||
|
match:start match:end match:substring
|
||||||
|
clean-up-cres
|
||||||
|
regexp-search regexp-search?
|
||||||
|
regexp-substitute regexp-substitute/global
|
||||||
|
sre->regexp regexp->sre
|
||||||
|
)))
|
||||||
|
|
||||||
|
|
||||||
|
(define-structures ((re-exports re-exports-interface)
|
||||||
|
(re-internals re-internals-interface)
|
||||||
|
(sre-syntax-tools (export expand-rx sre-form?))
|
||||||
|
)
|
||||||
|
(open scsh-utilities
|
||||||
|
defrec-package
|
||||||
|
define-foreign-syntax
|
||||||
|
weak
|
||||||
|
;re-posix-parsers ; regexp->posix-string
|
||||||
|
let-opt
|
||||||
|
sort ; Posix renderer
|
||||||
|
conditionals
|
||||||
|
define-record-types
|
||||||
|
defrec-package
|
||||||
|
receiving
|
||||||
|
scsh
|
||||||
|
scheme)
|
||||||
|
(files re-low re simp re-high
|
||||||
|
parse posixstr spencer re-syntax)
|
||||||
|
(optimize auto-integrate)
|
||||||
|
)
|
||||||
|
|
||||||
|
;;; Stuff that could appear in code produced by (rx ...)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-interface rx-lib-interface
|
||||||
|
(compound-interface (export coerce-dynamic-regexp
|
||||||
|
coerce-dynamic-charset
|
||||||
|
spec->char-set
|
||||||
|
flush-submatches
|
||||||
|
uncase
|
||||||
|
uncase-char-set
|
||||||
|
uncase-string)
|
||||||
|
re-internals-interface))
|
||||||
|
|
||||||
|
(define-structure rx-lib rx-lib-interface
|
||||||
|
(open re-internals
|
||||||
|
conditionals re-exports scsh scheme)
|
||||||
|
(files rx-lib)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-structure rx-syntax (export (rx :syntax)
|
||||||
|
(if-sre-form :syntax))
|
||||||
|
(open re-exports
|
||||||
|
rx-lib
|
||||||
|
scheme)
|
||||||
|
(for-syntax (open sre-syntax-tools scheme))
|
||||||
|
(begin (define-syntax rx expand-rx)
|
||||||
|
(define-syntax if-sre-form
|
||||||
|
(lambda (exp r c)
|
||||||
|
(if (sre-form? (cadr exp) r c)
|
||||||
|
(caddr exp)
|
||||||
|
(cadddr exp)))))
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-structure re-old-funs re-old-funs-interface
|
||||||
|
(open re-exports scsh scheme)
|
||||||
|
(files oldfuns))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; File Exports
|
||||||
|
;;; ---- -------
|
||||||
|
;;; parse sre->regexp regexp->sre
|
||||||
|
;;; parse-sre parse-sres regexp->scheme
|
||||||
|
;;; char-set->in-pair
|
||||||
|
;;; posixstr regexp->posix-string
|
||||||
|
;;; re-high compile-regexp regexp-search regexp-search?
|
||||||
|
;;; regexp-substitute regexp-substitute/global
|
||||||
|
;;; re-low match:start match:end match:substring
|
||||||
|
;;; CRE record, new-cre, compile-posix-re->c-struct
|
||||||
|
;;; cre-search cre-search? clean-up-cres
|
||||||
|
;;; re-syntax sre-form? if-sre-form expand-rx
|
||||||
|
;;; re.scm The ADT. flush-submatches uncase uncase-char-set
|
||||||
|
;;; char-set-full? char-set-empty?
|
||||||
|
;;; re-char-class? static-char-class?
|
||||||
|
;;; rx-lib coerce-dynamic-regexp coerce-dynamic-charset spec->char-set
|
||||||
|
;;; simp simplify-regexp
|
||||||
|
;;; spencer posix-string->regexp
|
|
@ -0,0 +1,309 @@
|
||||||
|
;;; Module definitions for the scsh regexp system.
|
||||||
|
;;; This is a sleazy modularisation -- we just load everything into
|
||||||
|
;;; scsh-level-0, and export from there.
|
||||||
|
;;; -Olin <shivers@ai.mit.edu> 8/98
|
||||||
|
|
||||||
|
(define-interface basic-re-interface
|
||||||
|
(export (re-dsm? (proc (:value) :boolean))
|
||||||
|
(make-re-dsm (proc (:value :exact-integer :exact-integer) :value))
|
||||||
|
(re-dsm:body (proc (:value) :value))
|
||||||
|
(re-dsm:pre-dsm (proc (:value) :exact-integer))
|
||||||
|
(re-dsm:tsm (proc (:value) :exact-integer))
|
||||||
|
(re-dsm:posix (proc (:value) :value))
|
||||||
|
(set-re-dsm:posix (proc (:value :value) :unspecific))
|
||||||
|
((re-dsm:post-dsm re-dsm) (proc (:value) :exact-integer))
|
||||||
|
(open-dsm (proc (:value) (some-values :value :exact-integer)))
|
||||||
|
|
||||||
|
(re-seq? (proc (:value) :boolean))
|
||||||
|
(%%make-re-seq (proc (:value :exact-integer :value) :value))
|
||||||
|
(%make-re-seq (proc (:value :exact-integer) :value))
|
||||||
|
((re-seq make-re-seq) (proc (:value) :value))
|
||||||
|
(re-seq:elts (proc (:value) :value))
|
||||||
|
(re-seq:tsm (proc (:value) :exact-integer))
|
||||||
|
(re-seq:posix (proc (:value) :value))
|
||||||
|
(set-re-seq:posix (proc (:value :value) :unspecific))
|
||||||
|
|
||||||
|
(re-choice? (proc (:value) :boolean))
|
||||||
|
(%%make-re-choice (proc (:value :exact-integer :value) :value))
|
||||||
|
(%make-re-choice (proc (:value :exact-integer) :value))
|
||||||
|
((make-re-choice re-choice) (proc (:value) :value))
|
||||||
|
(re-choice:elts (proc (:value) :value))
|
||||||
|
(re-choice:tsm (proc (:value) :exact-integer))
|
||||||
|
(re-choice:posix (proc (:value) :value))
|
||||||
|
(set-re-choice:posix (proc (:value :value) :unspecific))
|
||||||
|
|
||||||
|
(re-repeat? (proc (:value) :boolean))
|
||||||
|
(%%make-re-repeat (proc (:exact-integer :value :value
|
||||||
|
:exact-integer :value)
|
||||||
|
:value))
|
||||||
|
(%make-re-repeat (proc (:exact-integer :value :value :exact-integer )
|
||||||
|
:value))
|
||||||
|
((re-repeat make-re-repeat)
|
||||||
|
(proc (:exact-integer :value :value) :value))
|
||||||
|
((re-repeat:from re-repeat:tsm)
|
||||||
|
(proc (:value) :exact-integer))
|
||||||
|
(re-repeat:to (proc (:value) :value))
|
||||||
|
((re-repeat:body re-repeat:posix)
|
||||||
|
(proc (:value) :value))
|
||||||
|
(set-re-repeat:posix (proc (:value :value) :unspecific))
|
||||||
|
|
||||||
|
(re-submatch? (proc (:value) :boolean))
|
||||||
|
(%%make-re-submatch (proc (:value :exact-integer :exact-integer :value)
|
||||||
|
:value))
|
||||||
|
(%make-re-submatch (proc (:value :exact-integer :exact-integer) :value))
|
||||||
|
((make-re-submatch re-submatch)
|
||||||
|
(proc (:value &opt :exact-integer :exact-integer) :value))
|
||||||
|
|
||||||
|
(re-submatch:body (proc (:value) :value))
|
||||||
|
((re-submatch:pre-dsm re-submatch:tsm re-submatch:post-dsm)
|
||||||
|
(proc (:value) :exact-integer))
|
||||||
|
(re-submatch:posix (proc (:value) :value))
|
||||||
|
(set-re-submatch:posix (proc (:value :value) :unspecific))
|
||||||
|
|
||||||
|
(re-string? (proc (:value) :boolean))
|
||||||
|
((make-re-string re-string) (proc (:string) :value))
|
||||||
|
(re-string:chars (proc (:value) :string))
|
||||||
|
(set-re-string:chars (proc (:value :string) :unspecific))
|
||||||
|
(re-string:posix (proc (:value) :value))
|
||||||
|
(set-re-string:posix (proc (:value :value) :unspecific))
|
||||||
|
|
||||||
|
re-trivial
|
||||||
|
(re-trivial? (proc (:value) :boolean))
|
||||||
|
|
||||||
|
(re-char-set? (proc (:value) :boolean))
|
||||||
|
((make-re-char-set re-char-set) (proc (:value) :value))
|
||||||
|
(re-char-set:cset (proc (:value) :value))
|
||||||
|
(set-re-char-set:cset (proc (:value :value) :unspecific))
|
||||||
|
(re-char-set:posix (proc (:value) :value))
|
||||||
|
(set-re-char-set:posix (proc (:value :value) :unspecific))
|
||||||
|
|
||||||
|
re-empty
|
||||||
|
(re-empty? (proc (:value) :boolean))
|
||||||
|
re-bos re-eos
|
||||||
|
re-bol re-eol
|
||||||
|
re-bow re-eow
|
||||||
|
|
||||||
|
((re-bos? re-eos? re-bol? re-eol? re-bow? re-eow? re-any?)
|
||||||
|
(proc (:value) :boolean))
|
||||||
|
|
||||||
|
re-any
|
||||||
|
re-nonl
|
||||||
|
re-word
|
||||||
|
|
||||||
|
(regexp? (proc (:value) :boolean))
|
||||||
|
(re-tsm (proc (:value) :exact-integer))
|
||||||
|
|
||||||
|
;; These guys can be in code produced by RX expander.
|
||||||
|
(flush-submatches (proc (:value) :value))
|
||||||
|
(uncase (proc (:value) :value))
|
||||||
|
(uncase-char-set (proc (:value) :value))
|
||||||
|
(uncase-string (proc (:string) :value))
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
|
;;; These guys were made obsolete by the new SRE package and exist for
|
||||||
|
;;; backwards compatibility only.
|
||||||
|
(define-interface re-old-funs-interface
|
||||||
|
(export
|
||||||
|
(string-match (proc (:value :string &opt :exact-integer) :value))
|
||||||
|
(make-regexp (proc (:string) :value))
|
||||||
|
(regexp-exec (proc (:value :string &opt :exact-integer) :value))
|
||||||
|
(->regexp (proc (:value) :value))
|
||||||
|
(regexp-quote (proc (:string) :value))))
|
||||||
|
|
||||||
|
|
||||||
|
(define-interface re-internals-interface
|
||||||
|
;; These are constructors for the Scheme unparser
|
||||||
|
(export
|
||||||
|
(make-re-string/posix (proc (:string :string :vector) :value))
|
||||||
|
((%make-re-seq/posix %make-re-choice/posix)
|
||||||
|
(proc (:value :exact-integer :string :vector) :value))
|
||||||
|
(make-re-char-set/posix (proc (:value :string :vector) :value))
|
||||||
|
(%make-re-repeat/posix (proc (:exact-integer :value :value :exact-integer :string :vector)
|
||||||
|
:value))
|
||||||
|
(%make-re-dsm/posix (proc (:value :exact-integer :exact-integer :string :vector)
|
||||||
|
:value))
|
||||||
|
(%make-re-submatch/posix (proc (:value :exact-integer :exact-integer :string :vector) :value))))
|
||||||
|
|
||||||
|
|
||||||
|
(define re-match-internals-interface
|
||||||
|
(export (regexp-match:string (proc (:value) :string))
|
||||||
|
(regexp-match:start (proc (:value) :vector))
|
||||||
|
(regexp-match:end (proc (:value) :vector))))
|
||||||
|
|
||||||
|
|
||||||
|
(define-interface posix-re-interface
|
||||||
|
(export (regexp->posix-string (proc (:value) :string)) ; posixstr.scm
|
||||||
|
(posix-string->regexp (proc (:string) :value)) ; spencer
|
||||||
|
))
|
||||||
|
|
||||||
|
(define-interface re-subst-interface
|
||||||
|
(export
|
||||||
|
(regexp-substitute (proc (:value :value &rest :value) :value))
|
||||||
|
(regexp-substitute/global (proc (:value :value :string &rest :value) :value))))
|
||||||
|
|
||||||
|
(define-interface re-folders-interface
|
||||||
|
(export
|
||||||
|
(regexp-fold (proc (:value (proc (:exact-integer :value :value) :value)
|
||||||
|
:value
|
||||||
|
:string
|
||||||
|
&opt (proc (:exact-integer :value) :value)
|
||||||
|
:exact-integer)
|
||||||
|
:value))
|
||||||
|
(regexp-fold (proc (:value (proc (:value :exact-integer :value) :value)
|
||||||
|
:value
|
||||||
|
:string
|
||||||
|
&opt (proc (:exact-integer :value) :value)
|
||||||
|
:exact-integer)
|
||||||
|
:value))
|
||||||
|
(regexp-for-each (proc (:value (proc (:value) :unspecific)
|
||||||
|
:string &opt :exact-integer)
|
||||||
|
:unspecific))))
|
||||||
|
|
||||||
|
(define-interface re-level-0-interface
|
||||||
|
(compound-interface posix-re-interface
|
||||||
|
basic-re-interface
|
||||||
|
(export (regexp-match? (proc (:value) :boolean))
|
||||||
|
(match:start (proc (:value :exact-integer) :value))
|
||||||
|
(match:end (proc (:value :exact-integer) :value))
|
||||||
|
(match:substring (proc (:value :exact-integer) :value))
|
||||||
|
(clean-up-cres (proc () :unspecific))
|
||||||
|
(regexp-search (proc (:value :string &opt :exact-integer)
|
||||||
|
:value))
|
||||||
|
(regexp-search? (proc (:value :string &opt :exact-integer)
|
||||||
|
:boolean))
|
||||||
|
(sre->regexp (proc (:value) :value))
|
||||||
|
(regexp->sre (proc (:value) :value))
|
||||||
|
)))
|
||||||
|
|
||||||
|
|
||||||
|
(define-structures ((re-level-0 re-level-0-interface)
|
||||||
|
(re-match-internals re-match-internals-interface)
|
||||||
|
(re-internals re-internals-interface)
|
||||||
|
(sre-syntax-tools (export expand-rx static-regexp?
|
||||||
|
sre-form?
|
||||||
|
parse-sre parse-sres
|
||||||
|
sre->regexp regexp->sre
|
||||||
|
regexp->scheme)))
|
||||||
|
(open scsh-utilities
|
||||||
|
defrec-package
|
||||||
|
define-foreign-syntax
|
||||||
|
weak
|
||||||
|
;re-posix-parsers ; regexp->posix-string
|
||||||
|
let-opt
|
||||||
|
sort ; Posix renderer
|
||||||
|
conditionals
|
||||||
|
define-record-types
|
||||||
|
defrec-package
|
||||||
|
receiving
|
||||||
|
char-set-package
|
||||||
|
error-package
|
||||||
|
ascii
|
||||||
|
string-lib ; string-fold
|
||||||
|
scheme)
|
||||||
|
(files re-low re simp re-high
|
||||||
|
parse posixstr spencer re-syntax)
|
||||||
|
(optimize auto-integrate)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
;;; Stuff that could appear in code produced by (rx ...)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-interface rx-lib-interface
|
||||||
|
(compound-interface (export coerce-dynamic-regexp
|
||||||
|
coerce-dynamic-charset
|
||||||
|
spec->char-set
|
||||||
|
flush-submatches
|
||||||
|
uncase
|
||||||
|
uncase-char-set
|
||||||
|
uncase-string)
|
||||||
|
re-internals-interface))
|
||||||
|
|
||||||
|
(define-structure rx-lib rx-lib-interface
|
||||||
|
(open re-internals
|
||||||
|
conditionals
|
||||||
|
re-level-0
|
||||||
|
char-set-package
|
||||||
|
scsh-utilities ; fold
|
||||||
|
error-package
|
||||||
|
ascii
|
||||||
|
scheme)
|
||||||
|
(files rx-lib)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-interface rx-syntax-interface (export (rx :syntax)
|
||||||
|
(if-sre-form :syntax)))
|
||||||
|
|
||||||
|
(define-structure rx-syntax rx-syntax-interface
|
||||||
|
(open re-level-0
|
||||||
|
char-set-package
|
||||||
|
rx-lib
|
||||||
|
scheme)
|
||||||
|
(for-syntax (open sre-syntax-tools scheme))
|
||||||
|
(begin (define-syntax rx expand-rx)
|
||||||
|
(define-syntax if-sre-form
|
||||||
|
(lambda (exp r c)
|
||||||
|
(if (sre-form? (cadr exp) r c)
|
||||||
|
(caddr exp)
|
||||||
|
(cadddr exp)))))
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-structure re-old-funs re-old-funs-interface
|
||||||
|
(open re-level-0 error-package receiving scheme)
|
||||||
|
(files oldfuns)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
(define-structure re-subst re-subst-interface
|
||||||
|
(open re-level-0
|
||||||
|
re-match-internals
|
||||||
|
scsh-utilities ; fold & some string utilities that need to be moved.
|
||||||
|
scsh-level-0 ; write-string
|
||||||
|
string-lib ; string-copy!
|
||||||
|
scheme)
|
||||||
|
(files re-subst)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
(define-structure re-folders re-folders-interface
|
||||||
|
(open re-level-0 let-opt conditionals error-package scheme)
|
||||||
|
(files re-fold)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
(define-interface re-exports-interface
|
||||||
|
(compound-interface re-level-0-interface
|
||||||
|
rx-syntax-interface
|
||||||
|
re-subst-interface
|
||||||
|
re-folders-interface))
|
||||||
|
|
||||||
|
(define-structure re-exports re-exports-interface
|
||||||
|
(open rx-syntax re-level-0 re-subst re-folders)
|
||||||
|
(optimize auto-integrate))
|
||||||
|
|
||||||
|
|
||||||
|
;;; File Exports
|
||||||
|
;;; ---- -------
|
||||||
|
;;; parse sre->regexp regexp->sre
|
||||||
|
;;; parse-sre parse-sres regexp->scheme
|
||||||
|
;;; char-set->in-pair static-regexp?
|
||||||
|
;;; posixstr regexp->posix-string
|
||||||
|
;;; re-high compile-regexp regexp-search regexp-search?
|
||||||
|
;;; re-subst regexp-substitute regexp-substitute/global
|
||||||
|
;;; re-low match:start match:end match:substring
|
||||||
|
;;; CRE record, new-cre, compile-posix-re->c-struct
|
||||||
|
;;; cre-search cre-search? clean-up-cres
|
||||||
|
;;; re-syntax sre-form? if-sre-form expand-rx
|
||||||
|
;;; re.scm The ADT. flush-submatches uncase uncase-char-set
|
||||||
|
;;; char-set-full? char-set-empty?
|
||||||
|
;;; re-char-class? static-char-class?
|
||||||
|
;;; rx-lib coerce-dynamic-regexp coerce-dynamic-charset spec->char-set
|
||||||
|
;;; simp simplify-regexp
|
||||||
|
;;; spencer posix-string->regexp
|
|
@ -0,0 +1,667 @@
|
||||||
|
;;; Regexp support for Scheme
|
||||||
|
;;; Olin Shivers, January 1997, May 1998.
|
||||||
|
|
||||||
|
;;; Todo:
|
||||||
|
;;; - Better unparsers for (word ...) and (word+ ...).
|
||||||
|
;;; - Unparse char-sets into set-diff SREs -- find a char set that's a
|
||||||
|
;;; tight bound, then get the difference. This would really pretty up
|
||||||
|
;;; things like (- alpha "aeiou")
|
||||||
|
|
||||||
|
;;; Exports:
|
||||||
|
;;; (sre->regexp sre) SRE->ADT parser
|
||||||
|
;;; (regexp->sre re) ADT->SRE unparser
|
||||||
|
;;;
|
||||||
|
;;; Procedures that parse sexp regexps and translate ADTs for low-level macros:
|
||||||
|
;;; (parse-sre sre rename compare)
|
||||||
|
;;; (parse-sres sres rename compare)
|
||||||
|
;;; (regexp->scheme re rename)
|
||||||
|
;;;
|
||||||
|
;;; (char-set->in-pair cset) Char-set unparsing utility
|
||||||
|
|
||||||
|
;;; Character-set dependencies:
|
||||||
|
;;; The only stuff in here dependent on the implementation's character type
|
||||||
|
;;; is the char-set parsing and unparsing, which deal with ranges of
|
||||||
|
;;; characters. We assume an 8-bit ASCII superset.
|
||||||
|
|
||||||
|
;;; Imports:
|
||||||
|
;;; ? for COND, and SWITCHQ conditional form.
|
||||||
|
;;; every
|
||||||
|
|
||||||
|
;;; This code is much hairier than it would otherwise be because of the
|
||||||
|
;;; the presence of ,<exp> forms, which put a static/dynamic duality over
|
||||||
|
;;; a lot of the processing -- we have to be prepared to handle either
|
||||||
|
;;; re's or Scheme epressions that produce re's; char-sets or Scheme
|
||||||
|
;;; expressions that produce char-sets. It's a pain.
|
||||||
|
;;;
|
||||||
|
;;; See comments in re.scm ADT code about building regexp trees that have
|
||||||
|
;;; code in the record fields instead of values.
|
||||||
|
;;;
|
||||||
|
;;; The macro expander works by parsing the regexp form into an re record,
|
||||||
|
;;; and simplifying it. If the record is completely static, it is then
|
||||||
|
;;; translated, at macro-expand time, into a Posix regex string. If the
|
||||||
|
;;; regexp needs runtime values -- e.g, the computed from and to fields in
|
||||||
|
;;; (** "ha, " (- min 1) (+ max 1))
|
||||||
|
;;; -- the expander instead produces Scheme ADT constructors to build
|
||||||
|
;;; the regexp at run-time.
|
||||||
|
|
||||||
|
|
||||||
|
;;; Parser
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;;; Is a parsed regexp completely determined statically, or does it
|
||||||
|
;;; have dynamic components (e.g., a ,@<pattern> or a computed char-set)
|
||||||
|
;;; in the form of embedded code in some of the regexp's fields?
|
||||||
|
|
||||||
|
(define (static-regexp? re)
|
||||||
|
(? ((re-seq? re) (every static-regexp? (re-seq:elts re)))
|
||||||
|
((re-choice? re) (every static-regexp? (re-choice:elts re)))
|
||||||
|
|
||||||
|
((re-char-set? re) (char-set? (re-char-set:cset re))) ; Might be code.
|
||||||
|
|
||||||
|
((re-repeat? re) ; FROM & TO fields might be code.
|
||||||
|
(let ((to (re-repeat:to re)))
|
||||||
|
(and (integer? (re-repeat:from re))
|
||||||
|
(or (not to) (integer? to))
|
||||||
|
(static-regexp? (re-repeat:body re)))))
|
||||||
|
|
||||||
|
((re-dsm? re) (static-regexp? (re-dsm:body re)))
|
||||||
|
((re-submatch? re) (static-regexp? (re-submatch:body re)))
|
||||||
|
|
||||||
|
(else (or (re-bos? re) (re-eos? re) ; Otw, if it's not
|
||||||
|
(re-bol? re) (re-eol? re) ; one of these,
|
||||||
|
(re-bow? re) (re-eow? re) ; then it's Scheme code.
|
||||||
|
(re-string? re)))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Two useful standard char sets
|
||||||
|
(define nonl-chars (char-set-invert (char-set #\newline)))
|
||||||
|
(define word-chars (char-set-union (char-set #\_) char-set:alphanumeric))
|
||||||
|
|
||||||
|
;;; Little utility that should be moved to scsh's utilities.scm
|
||||||
|
(define (partition pred lis)
|
||||||
|
(let recur ((in '()) (out '()) (lis lis))
|
||||||
|
(if (pair? lis)
|
||||||
|
(let ((head (car lis))
|
||||||
|
(tail (cdr lis)))
|
||||||
|
(if (pred head)
|
||||||
|
(recur (cons head in) out tail)
|
||||||
|
(recur in (cons head out) tail)))
|
||||||
|
(values in out))))
|
||||||
|
|
||||||
|
|
||||||
|
(define (sre->regexp sre)
|
||||||
|
(parse-sre sre (lambda (x) x) equal?))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Parse a sexp regexp into a regexp value, which may be "dynamic" --
|
||||||
|
;;; i.e., some slots may be filled with the Scheme code that will produce
|
||||||
|
;;; their true vaues.
|
||||||
|
;;;
|
||||||
|
;;; R & C are rename and compare functions for low-level macro expanders.
|
||||||
|
|
||||||
|
;;; These two guys are little front-ends for the main routine.
|
||||||
|
|
||||||
|
(define (parse-sre sre r c) (parse-sre/context sre #t #f r c))
|
||||||
|
|
||||||
|
(define (parse-sres sres r c)
|
||||||
|
(re-seq (map (lambda (sre) (parse-sre sre r c)) sres)))
|
||||||
|
|
||||||
|
|
||||||
|
;;; (parse-sre/context sre case-sensitive? cset? r c)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; This is the main entry point. Parse SRE, given the lexical case-sensitivity
|
||||||
|
;;; flag CASE-SENSITIVE?. If CSET? is true, SRE *must* be parseable as a
|
||||||
|
;;; char-class SRE, and this function returns a character set, *not* a
|
||||||
|
;;; regexp value. If CSET? is false, SRE can be any SRE, and this function
|
||||||
|
;;; returns a regexp value. R and C are low-level macro rename and compare
|
||||||
|
;;; functions.
|
||||||
|
|
||||||
|
(define (parse-sre/context sre case-sensitive? cset? r c)
|
||||||
|
(let ((%bos (r 'bos)) (%eos (r 'eos))
|
||||||
|
(%bol (r 'bol)) (%eol (r 'eol))
|
||||||
|
(%bow (r 'bow)) (%eow (r 'eow))
|
||||||
|
|
||||||
|
(%word (r 'word))
|
||||||
|
|
||||||
|
(%flush-submatches (r 'flush-submatches))
|
||||||
|
(%coerce-dynamic-charset (r 'coerce-dynamic-charset))
|
||||||
|
(%coerce-dynamic-regexp (r 'coerce-dynamic-regexp)))
|
||||||
|
|
||||||
|
(let recur ((sre sre)
|
||||||
|
(case-sensitive? case-sensitive?)
|
||||||
|
(cset? cset?))
|
||||||
|
|
||||||
|
;; Parse the sequence of regexp expressions SEQ with a lexical
|
||||||
|
;; case-sensitivity context of CS?.
|
||||||
|
(define (parse-seq/context seq cs?)
|
||||||
|
(if cset?
|
||||||
|
(if (= 1 (length seq))
|
||||||
|
(recur (car sre) cs? #t)
|
||||||
|
(error "Non-singleton sequence not allowed in char-class context."
|
||||||
|
seq))
|
||||||
|
(re-seq (map (lambda (sre) (recur sre cs? cset?))
|
||||||
|
seq))))
|
||||||
|
|
||||||
|
(define (parse-seq seq) (parse-seq/context seq case-sensitive?))
|
||||||
|
(define (parse-char-class sre) (recur sre case-sensitive? #t))
|
||||||
|
|
||||||
|
(define (non-cset) ; Blow up if cset? is true.
|
||||||
|
(if cset? (error "Illegal SRE in char-class context." sre)))
|
||||||
|
|
||||||
|
(? ((char? sre) (parse-char-re sre case-sensitive? cset?))
|
||||||
|
((string? sre) (parse-string-re sre case-sensitive? cset?))
|
||||||
|
|
||||||
|
((c sre %bos) (non-cset) re-bos)
|
||||||
|
((c sre %eos) (non-cset) re-eos)
|
||||||
|
|
||||||
|
((c sre %bol) (non-cset) re-bol)
|
||||||
|
((c sre %eol) (non-cset) re-eol)
|
||||||
|
|
||||||
|
((c sre %bow) (non-cset) re-bow)
|
||||||
|
((c sre %eow) (non-cset) re-eow)
|
||||||
|
((c sre %word) (non-cset) re-word)
|
||||||
|
|
||||||
|
((pair? sre)
|
||||||
|
(case (car sre)
|
||||||
|
((*) (non-cset) (re-repeat 0 #f (parse-seq (cdr sre))))
|
||||||
|
((+) (non-cset) (re-repeat 1 #f (parse-seq (cdr sre))))
|
||||||
|
((?) (non-cset) (re-repeat 0 1 (parse-seq (cdr sre))))
|
||||||
|
((=) (non-cset) (let ((n (cadr sre)))
|
||||||
|
(re-repeat n n (parse-seq (cddr sre)))))
|
||||||
|
((>=) (non-cset) (re-repeat (cadr sre) #f (parse-seq (cddr sre))))
|
||||||
|
((**) (non-cset) (re-repeat (cadr sre) (caddr sre)
|
||||||
|
(parse-seq (cdddr sre))))
|
||||||
|
|
||||||
|
;; Choice is special wrt cset? because it's "polymorphic".
|
||||||
|
;; Note that RE-CHOICE guarantees to construct a char-set
|
||||||
|
;; or single-char string regexp if all of its args are char
|
||||||
|
;; classes.
|
||||||
|
((| or) (let ((elts (map (lambda (sre)
|
||||||
|
(recur sre case-sensitive? cset?))
|
||||||
|
(cdr sre))))
|
||||||
|
(if cset?
|
||||||
|
(assoc-cset-op char-set-union 'char-set-union elts r)
|
||||||
|
(re-choice elts))))
|
||||||
|
|
||||||
|
((: seq) (non-cset) (parse-seq (cdr sre)))
|
||||||
|
|
||||||
|
((word) (non-cset) (parse-seq `(,%bow ,@(cdr sre) ,%eow)))
|
||||||
|
((word+)
|
||||||
|
(recur `(,(r 'word) (,(r '+) (,(r '&) (,(r '|) ,(r 'alphanum) "_")
|
||||||
|
(,(r '|) . ,(cdr sre)))))
|
||||||
|
case-sensitive?
|
||||||
|
cset?))
|
||||||
|
|
||||||
|
((submatch) (non-cset) (re-submatch (parse-seq (cdr sre))))
|
||||||
|
((dsm) (non-cset) (re-dsm (parse-seq (cdddr sre))
|
||||||
|
(cadr sre)
|
||||||
|
(caddr sre)))
|
||||||
|
|
||||||
|
;; We could be more aggressive and push the uncase op down into
|
||||||
|
;; partially-static regexps, but enough is enough.
|
||||||
|
((uncase)
|
||||||
|
(let ((re-or-cset (parse-seq (cdr sre)))) ; Depending on CSET?.
|
||||||
|
(if cset?
|
||||||
|
|
||||||
|
(if (re-char-set? re-or-cset) ; A char set or code
|
||||||
|
(uncase-char-set re-or-cset) ; producing a char set.
|
||||||
|
`(,(r 'uncase) ,re-or-cset))
|
||||||
|
|
||||||
|
(if (static-regexp? re-or-cset) ; A regexp or code
|
||||||
|
(uncase re-or-cset) ; producing a regexp.
|
||||||
|
`(,(r 'uncase)
|
||||||
|
,(regexp->scheme (simplify-regexp re-or-cset) r))))))
|
||||||
|
|
||||||
|
;; These just change the lexical case-sensitivity context.
|
||||||
|
((w/nocase) (parse-seq/context (cdr sre) #f))
|
||||||
|
((w/case) (parse-seq/context (cdr sre) #t))
|
||||||
|
|
||||||
|
;; ,<exp> and ,@<exp>
|
||||||
|
((unquote)
|
||||||
|
(let ((exp (cadr sre)))
|
||||||
|
(if cset?
|
||||||
|
`(,%coerce-dynamic-charset ,exp)
|
||||||
|
`(,%flush-submatches (,%coerce-dynamic-regexp ,exp)))))
|
||||||
|
((unquote-splicing)
|
||||||
|
(let ((exp (cadr sre)))
|
||||||
|
(if cset?
|
||||||
|
`(,%coerce-dynamic-charset ,exp)
|
||||||
|
`(,%coerce-dynamic-regexp ,exp))))
|
||||||
|
|
||||||
|
((~) (let* ((cs (assoc-cset-op char-set-union 'char-set-union
|
||||||
|
(map parse-char-class (cdr sre))
|
||||||
|
r))
|
||||||
|
(cs (if (char-set? cs)
|
||||||
|
(char-set-invert cs)
|
||||||
|
`(,(r 'char-set-invert) ,cs))))
|
||||||
|
(if cset? cs (make-re-char-set cs))))
|
||||||
|
|
||||||
|
((&) (let ((cs (assoc-cset-op char-set-intersection 'char-set-intersection
|
||||||
|
(map parse-char-class (cdr sre))
|
||||||
|
r)))
|
||||||
|
(if cset? cs (make-re-char-set cs))))
|
||||||
|
|
||||||
|
((-) (if (pair? (cdr sre))
|
||||||
|
(let* ((cs1 (parse-char-class (cadr sre)))
|
||||||
|
(cs2 (assoc-cset-op char-set-union 'char-set-union
|
||||||
|
(map parse-char-class (cddr sre))
|
||||||
|
r))
|
||||||
|
(cs (if (and (char-set? cs1) (char-set? cs2))
|
||||||
|
(char-set-difference cs1 cs2)
|
||||||
|
`(,(r 'char-set-difference)
|
||||||
|
,(if (char-set? cs1)
|
||||||
|
(char-set->scheme cs1 r)
|
||||||
|
cs1)
|
||||||
|
. ,(if (char-set? cs2)
|
||||||
|
(list (char-set->scheme cs2 r))
|
||||||
|
(cdr cs2))))))
|
||||||
|
(if cset? cs (make-re-char-set cs)))
|
||||||
|
(error "SRE set-difference operator (- ...) requires at least one argument")))
|
||||||
|
|
||||||
|
((/) (let ((cset (range-class->char-set (cdr sre) case-sensitive?)))
|
||||||
|
(if cset? cset (make-re-char-set cset))))
|
||||||
|
|
||||||
|
((posix-string)
|
||||||
|
(if (and (= 1 (length (cdr sre)))
|
||||||
|
(string? (cadr sre)))
|
||||||
|
(posix-string->regexp (cadr sre))
|
||||||
|
(error "Illegal (posix-string ...) SRE body." sre)))
|
||||||
|
|
||||||
|
(else (if (every string? sre) ; A set spec -- ("wxyz").
|
||||||
|
(let* ((cs (apply char-set-union
|
||||||
|
(map string->char-set sre)))
|
||||||
|
(cs (if case-sensitive? cs (uncase-char-set cs))))
|
||||||
|
(if cset? cs (make-re-char-set cs)))
|
||||||
|
|
||||||
|
(error "Illegal SRE" sre)))))
|
||||||
|
|
||||||
|
;; It must be a char-class name (ANY, ALPHABETIC, etc.)
|
||||||
|
(else (let ((cs (case sre
|
||||||
|
((any) char-set:full)
|
||||||
|
((nonl) nonl-chars)
|
||||||
|
((lower-case lower) char-set:lower-case)
|
||||||
|
((upper-case upper) char-set:upper-case)
|
||||||
|
((alphabetic alpha) char-set:alphabetic)
|
||||||
|
((numeric digit num) char-set:numeric)
|
||||||
|
((alphanumeric alnum alphanum) char-set:alphanumeric)
|
||||||
|
((punctuation punct) char-set:punctuation)
|
||||||
|
((graphic graph) char-set:graphic)
|
||||||
|
((blank) char-set:blank)
|
||||||
|
((whitespace space white) char-set:whitespace)
|
||||||
|
((printing print) char-set:printing)
|
||||||
|
((control cntrl) char-set:control)
|
||||||
|
((hex-digit xdigit hex) char-set:hex-digit)
|
||||||
|
((ascii) char-set:ascii)
|
||||||
|
(else (error "Illegal regular expression" sre)))))
|
||||||
|
(if cset? cs (make-re-char-set cs))))))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; In a CSET? true context, S must be a 1-char string; convert to a char set
|
||||||
|
;;; according to CASE-SENSITIVE? setting.
|
||||||
|
;;; In a CSET? false context, convert S to a string re (CASE-SENSITIVE? true),
|
||||||
|
;;; or a sequence of char-sets (CASE-SENSITIVE? false).
|
||||||
|
|
||||||
|
(define (parse-string-re s case-sensitive? cset?)
|
||||||
|
(if (= 1 (string-length s))
|
||||||
|
(parse-char-re (string-ref s 0) case-sensitive? cset?)
|
||||||
|
(if cset?
|
||||||
|
(error "Non-singleton string not allowed in char-class context." s)
|
||||||
|
((if case-sensitive? make-re-string uncase-string) s))))
|
||||||
|
|
||||||
|
(define (parse-char-re c case-sensitive? cset?)
|
||||||
|
(if case-sensitive?
|
||||||
|
(if cset? (char-set c) (make-re-string (string c)))
|
||||||
|
(let ((cset (char-set (char-upcase c) (char-downcase c))))
|
||||||
|
(if cset? cset (make-re-char-set cset)))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; "Apply" the associative char-set function OP to the char-sets ELTS.
|
||||||
|
;;; If any of the ELTS is Scheme code instead of a real char set, then
|
||||||
|
;;; we instead produce Scheme code for the op, using OP-NAME as the name
|
||||||
|
;;; of the function, and R for the macro renamer function.
|
||||||
|
|
||||||
|
(define (assoc-cset-op op op-name elts r)
|
||||||
|
(receive (csets code-chunks) (partition char-set? elts)
|
||||||
|
(if (pair? code-chunks)
|
||||||
|
(? ((pair? csets)
|
||||||
|
`(,(r op-name) ,(char-set->scheme (apply op csets) r)
|
||||||
|
. ,code-chunks))
|
||||||
|
((pair? (cdr code-chunks)) `(,(r op-name) . ,code-chunks))
|
||||||
|
(else (car code-chunks))) ; Just one.
|
||||||
|
(apply op csets))))
|
||||||
|
|
||||||
|
;;; Parse a (/ <range-spec> ...) char-class into a character set in
|
||||||
|
;;; case-sensitivity context CS?.
|
||||||
|
;;; Each <range-spec> can be a character or a string of characters.
|
||||||
|
|
||||||
|
(define (range-class->char-set range-specs cs?)
|
||||||
|
(let* ((specs (apply string-append
|
||||||
|
(map (lambda (spec) (if (char? spec) (string spec) spec))
|
||||||
|
range-specs)))
|
||||||
|
(len (string-length specs))
|
||||||
|
(cset (char-set-copy char-set:empty)))
|
||||||
|
(if (odd? len)
|
||||||
|
(error "Unmatched range specifier" range-specs)
|
||||||
|
(let lp ((i (- len 1)) (cset cset))
|
||||||
|
(if (< i 0)
|
||||||
|
(if cs? cset (uncase-char-set cset)) ; Case fold if necessary.
|
||||||
|
(lp (- i 2)
|
||||||
|
(char-set-union!
|
||||||
|
cset
|
||||||
|
(ascii-range->char-set (char->ascii (string-ref specs (- i 1)))
|
||||||
|
(+ 1 (char->ascii (string-ref specs i)))))))))))
|
||||||
|
|
||||||
|
;;; (regexp->scheme re r)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Translate a regexp value RE into raw Scheme code that will create it, with
|
||||||
|
;;; calls to the regexp ADT constructor functions. R is a renaming function
|
||||||
|
;;; provided by low-level macro expanders.
|
||||||
|
|
||||||
|
(define (regexp->scheme re r)
|
||||||
|
(let ((%re-bos (r 're-bos)) (%re-eos (r 're-eos))
|
||||||
|
(%re-bol (r 're-bol)) (%re-eol (r 're-eol))
|
||||||
|
(%re-bow (r 're-bow)) (%re-eow (r 're-eow))
|
||||||
|
(%list (r 'list)))
|
||||||
|
|
||||||
|
(let recur ((re re))
|
||||||
|
;; If (fetch-posix re) = #f, produce (OP . ARGS);
|
||||||
|
;; Otherwise, produce (OP/POSIX ,@ARGS '<posix-translation>).
|
||||||
|
(define (doit op op/posix args fetch-posix)
|
||||||
|
(? ((fetch-posix re) =>
|
||||||
|
(lambda (psx) `(,(r op/posix) ,@args
|
||||||
|
',(cre:string psx) ',(cre:tvec psx))))
|
||||||
|
|
||||||
|
(else `(,(r op) . ,args))))
|
||||||
|
|
||||||
|
(? ((re-string? re) (if (re-trivial? re) (r 're-trivial) ; Special hack
|
||||||
|
(doit 'make-re-string 'make-re-string/posix
|
||||||
|
`(,(re-string:chars re))
|
||||||
|
re-string:posix)))
|
||||||
|
|
||||||
|
((re-seq? re) (doit '%make-re-seq '%make-re-seq/posix
|
||||||
|
`((,%list . ,(map recur (re-seq:elts re)))
|
||||||
|
,(re-seq:tsm re))
|
||||||
|
re-seq:posix))
|
||||||
|
|
||||||
|
((re-choice? re) (doit '%make-re-choice '%make-re-choice/posix
|
||||||
|
`((,%list . ,(map recur (re-choice:elts re)))
|
||||||
|
,(re-choice:tsm re))
|
||||||
|
re-choice:posix))
|
||||||
|
|
||||||
|
((re-char-set? re) (if (re-any? re) (r 're-any) ; Special hack for ANY.
|
||||||
|
(doit 'make-re-char-set 'make-re-char-set/posix
|
||||||
|
`(,(char-set->scheme (re-char-set:cset re) r))
|
||||||
|
re-char-set:posix)))
|
||||||
|
|
||||||
|
((re-repeat? re) (doit '%make-re-repeat '%make-re-repeat/posix
|
||||||
|
`(,(re-repeat:from re)
|
||||||
|
,(re-repeat:to re)
|
||||||
|
,(recur (re-repeat:body re))
|
||||||
|
,(re-repeat:tsm re))
|
||||||
|
re-repeat:posix))
|
||||||
|
|
||||||
|
((re-dsm? re) (doit '%make-re-dsm '%make-re-dsm/posix
|
||||||
|
`(,(recur (re-dsm:body re))
|
||||||
|
,(re-dsm:pre-dsm re)
|
||||||
|
,(re-dsm:tsm re))
|
||||||
|
re-dsm:posix))
|
||||||
|
|
||||||
|
((re-submatch? re) (doit '%make-re-submatch '%make-re-submatch/posix
|
||||||
|
`(,(recur (re-submatch:body re))
|
||||||
|
,(re-submatch:pre-dsm re)
|
||||||
|
,(re-submatch:tsm re))
|
||||||
|
re-submatch:posix))
|
||||||
|
|
||||||
|
((re-bos? re) %re-bos)
|
||||||
|
((re-eos? re) %re-eos)
|
||||||
|
((re-bol? re) %re-bol)
|
||||||
|
((re-eol? re) %re-eol)
|
||||||
|
((re-bow? re) %re-bow)
|
||||||
|
((re-eow? re) %re-eow)
|
||||||
|
|
||||||
|
(else re)))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Classify a character set.
|
||||||
|
;;; We pass in a char set CS and 15 parameters, one for each of the
|
||||||
|
;;; standard char sets. If we can classify CS as any of these char
|
||||||
|
;;; sets, we return the corresponding parameter's value, otw #f.
|
||||||
|
;;;
|
||||||
|
;;; This is gratuitously optimised by probing cset with a couple of
|
||||||
|
;;; witness chars (a,A,1,space), and doing an initial filter based
|
||||||
|
;;; on these witnesses.
|
||||||
|
|
||||||
|
(define (try-classify-char-set cs
|
||||||
|
full nonl lower upper alpha num alphanum
|
||||||
|
punct graph white print ctl hex blank ascii)
|
||||||
|
(let ((a (char-set-contains? cs #\a))
|
||||||
|
(biga (char-set-contains? cs #\A))
|
||||||
|
(one (char-set-contains? cs #\1))
|
||||||
|
(space (char-set-contains? cs #\space)))
|
||||||
|
|
||||||
|
(if a
|
||||||
|
(if biga
|
||||||
|
(if space
|
||||||
|
(and one (switch char-set= cs
|
||||||
|
((char-set:full) full)
|
||||||
|
((nonl-chars) nonl)
|
||||||
|
((char-set:printing) print)
|
||||||
|
((char-set:ascii) ascii)
|
||||||
|
(else #f)))
|
||||||
|
(if one
|
||||||
|
(switch char-set= cs
|
||||||
|
((char-set:alphanumeric) alphanum)
|
||||||
|
((char-set:graphic) graph)
|
||||||
|
((char-set:hex-digit) hex)
|
||||||
|
(else #f))
|
||||||
|
(and (char-set= cs char-set:alphabetic) alpha)))
|
||||||
|
(and (char-set= cs char-set:lower-case) lower)) ; a, not A
|
||||||
|
|
||||||
|
(if biga
|
||||||
|
(and (not space) (char-set= cs char-set:upper-case) upper)
|
||||||
|
(if one
|
||||||
|
(and (not space) (char-set= cs char-set:numeric) num)
|
||||||
|
(if space
|
||||||
|
(switch char-set= cs
|
||||||
|
((char-set:whitespace) white)
|
||||||
|
((char-set:blank) blank)
|
||||||
|
(else #f))
|
||||||
|
(switch char-set= cs
|
||||||
|
((char-set:punctuation) punct)
|
||||||
|
((char-set:control) ctl)
|
||||||
|
(else #f))))))))
|
||||||
|
|
||||||
|
|
||||||
|
(define (char-set->scheme cs r)
|
||||||
|
(let ((try (lambda (cs)
|
||||||
|
(try-classify-char-set cs
|
||||||
|
'char-set:full 'nonl-chars
|
||||||
|
'char-set:lower-case 'char-set:upper-case
|
||||||
|
'char-set:alphabetic 'char-set:numeric
|
||||||
|
'char-set:alphanumeric 'char-set:punctuation
|
||||||
|
'char-set:graphic 'char-set:whitespace
|
||||||
|
'char-set:printing 'char-set:control
|
||||||
|
'char-set:hex-digit 'char-set:blank
|
||||||
|
'char-set:ascii))))
|
||||||
|
(? ((not (char-set? cs)) cs) ; Dynamic -- *already* Scheme code.
|
||||||
|
((char-set-empty? cs) (r 'char-set:empty))
|
||||||
|
((try cs) => r)
|
||||||
|
((try (char-set-invert cs)) =>
|
||||||
|
(lambda (name) `(,(r 'char-set-invert) ,name)))
|
||||||
|
|
||||||
|
(else
|
||||||
|
(receive (loose+ ranges+) (char-set->in-pair cs)
|
||||||
|
(receive (loose- ranges-) (char-set->in-pair (char-set-invert cs))
|
||||||
|
(let ((makeit (r 'spec->char-set)))
|
||||||
|
(if (< (+ (length loose-) (* 12 (length ranges-)))
|
||||||
|
(+ (length loose+) (* 12 (length ranges+))))
|
||||||
|
`(,makeit #f ,(list->string loose-) ',ranges-)
|
||||||
|
`(,makeit #t ,(list->string loose+) ',ranges+)))))))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; This code needs work.
|
||||||
|
|
||||||
|
(define (char-set->sre cs r)
|
||||||
|
(if (char-set? cs)
|
||||||
|
(let ((try (lambda (cs)
|
||||||
|
(try-classify-char-set cs
|
||||||
|
'any 'nonl
|
||||||
|
'lower-case 'upper-case
|
||||||
|
'alphabetic 'numeric
|
||||||
|
'alphanumeric 'punctuation
|
||||||
|
'graphic 'whitespace
|
||||||
|
'printing 'control
|
||||||
|
'hex-digit 'blank
|
||||||
|
'ascii)))
|
||||||
|
(nchars (char-set-size cs)))
|
||||||
|
(? ((zero? nchars) `(,(r '|)))
|
||||||
|
((= 1 nchars) (apply string (char-set-members cs)))
|
||||||
|
((try cs) => r)
|
||||||
|
((try (char-set-invert cs)) =>
|
||||||
|
(lambda (name) `(,(r '~) ,name)))
|
||||||
|
(else (receive (cs rp comp?) (char-set->in-sexp-spec cs)
|
||||||
|
(let ((args (append (? ((string=? cs "") '())
|
||||||
|
((= 1 (string-length cs)) `(,cs))
|
||||||
|
(else `((,cs))))
|
||||||
|
(if (string=? rp "") '()
|
||||||
|
(list `(,(r '/) ,rp))))))
|
||||||
|
(if (and (= 1 (length args)) (not comp?))
|
||||||
|
(car args)
|
||||||
|
`(,(r (if comp? '~ '|)) . ,args)))))))
|
||||||
|
|
||||||
|
`(,(r 'unquote) ,cs))) ; dynamic -- ,<cset-exp>
|
||||||
|
|
||||||
|
|
||||||
|
;;; Unparse an re into a *list* of SREs (representing a sequence).
|
||||||
|
;;; This is for rendering the bodies of DSM, SUBMATCH, **, *, =, >=, and &'s,
|
||||||
|
;;; that is, forms whose body is an implicit sequence.
|
||||||
|
|
||||||
|
(define (regexp->sres/renamer re r)
|
||||||
|
(if (re-seq? re)
|
||||||
|
(let ((elts (re-seq:elts re)))
|
||||||
|
(if (pair? elts)
|
||||||
|
(map (lambda (re) (regexp->sre/renamer re r)) elts)
|
||||||
|
(let ((tsm (re-seq:tsm re))
|
||||||
|
(%dsm (r 'dsm)))
|
||||||
|
(if (zero? tsm) '() `((,%dsm ,tsm 0)))))) ; Empty sequence
|
||||||
|
(list (regexp->sre/renamer re r)))) ; Not a seq
|
||||||
|
|
||||||
|
|
||||||
|
(define (regexp->sre/renamer re r)
|
||||||
|
(let recur ((re re))
|
||||||
|
(? ((re-string? re) (re-string:chars re))
|
||||||
|
|
||||||
|
((re-seq? re) `(,(r ':) . ,(regexp->sres/renamer re r)))
|
||||||
|
|
||||||
|
((re-choice? re)
|
||||||
|
(let ((elts (re-choice:elts re))
|
||||||
|
(%| (r '|)))
|
||||||
|
(if (pair? elts)
|
||||||
|
`(,%| . ,(map recur elts))
|
||||||
|
(let ((tsm (re-choice:tsm re)))
|
||||||
|
(if (zero? tsm) `(,%|) `(,(r 'dsm) ,tsm 0 (,%|)))))))
|
||||||
|
|
||||||
|
((re-char-set? re) (char-set->sre (re-char-set:cset re) r))
|
||||||
|
|
||||||
|
((re-repeat? re)
|
||||||
|
(let ((from (re-repeat:from re))
|
||||||
|
(to (re-repeat:to re))
|
||||||
|
(bodies (regexp->sres/renamer (re-repeat:body re) r)))
|
||||||
|
(? ((and (eqv? from 0) (not to)) `(,(r '*) . ,bodies))
|
||||||
|
((and (eqv? from 0) (eqv? to 1)) `(,(r '?) . ,bodies))
|
||||||
|
((and (eqv? from 1) (not to)) `(,(r '+) . ,bodies))
|
||||||
|
((eqv? from to) `(,(r '=) ,to . bodies))
|
||||||
|
(to `(,(r '**) ,from ,to . ,bodies))
|
||||||
|
(else `(,(r '>=) ,from . ,bodies)))))
|
||||||
|
|
||||||
|
((re-dsm? re)
|
||||||
|
`(,(r 'dsm) ,(re-dsm:pre-dsm re) ,(re-dsm:post-dsm re)
|
||||||
|
. ,(regexp->sres/renamer (re-dsm:body re) r)))
|
||||||
|
|
||||||
|
((re-submatch? re)
|
||||||
|
`(,(r 'submatch) . ,(regexp->sres/renamer (re-submatch:body re) r)))
|
||||||
|
|
||||||
|
((re-bos? re) (r 'bos))
|
||||||
|
((re-eos? re) (r 'eos))
|
||||||
|
((re-bol? re) (r 'bol))
|
||||||
|
((re-eol? re) (r 'eol))
|
||||||
|
((re-bow? re) (r 'bow))
|
||||||
|
((re-eow? re) (r 'eow))
|
||||||
|
|
||||||
|
(else re)))) ; Presumably it's code.
|
||||||
|
|
||||||
|
(define (regexp->sre re) (regexp->sre/renamer re (lambda (x) x)))
|
||||||
|
|
||||||
|
;;; Character class unparsing
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; This is the code that takes char-sets and converts them into forms suitable
|
||||||
|
;;; for char-class SRE's or [...] Posix strings.
|
||||||
|
|
||||||
|
;;; Map a char-set to an (| ("...") (/"...")) or (~ ("...") (/"...")) SRE.
|
||||||
|
;;; We try it both ways, and return whichever is shortest.
|
||||||
|
;;; We return three values:
|
||||||
|
;;; - a string of chars that are members in the set;
|
||||||
|
;;; - a string of chars that, taken in pairs specifying ranges,
|
||||||
|
;;; give the rest of the members of the set.
|
||||||
|
;;; - A boolean COMP?, which says whether the set should be complemented
|
||||||
|
;;; (~ ...) or taken as-is (| ...).
|
||||||
|
;;;
|
||||||
|
;;; E.g., ["!?.", "AZaz09", #t]
|
||||||
|
|
||||||
|
(define (char-set->in-sexp-spec cset)
|
||||||
|
(let ((->sexp-pair (lambda (cset)
|
||||||
|
(receive (loose ranges) (char-set->in-pair cset)
|
||||||
|
(values (apply string loose)
|
||||||
|
(apply string
|
||||||
|
(fold-right (lambda (r lis)
|
||||||
|
`(,(car r) ,(cdr r) . ,lis))
|
||||||
|
'() ranges)))))))
|
||||||
|
(receive (cs+ rp+) (->sexp-pair cset)
|
||||||
|
(receive (cs- rp-) (->sexp-pair (char-set-invert cset))
|
||||||
|
(if (< (+ (string-length cs-) (string-length rp-))
|
||||||
|
(+ (string-length cs+) (string-length rp+)))
|
||||||
|
(values cs- rp- #t)
|
||||||
|
(values cs+ rp+ #f))))))
|
||||||
|
|
||||||
|
;;; Return 2 values characterizing the char set in a run-length encoding:
|
||||||
|
;;; - LOOSE List of singleton chars -- elts of the set.
|
||||||
|
;;; - RANGES List of (from . to) char ranges.
|
||||||
|
;;;
|
||||||
|
;;; E.g., [(#\! #\? #\.)
|
||||||
|
;;; ((#\A . #\Z) (#\a . #\z) (#\0 . #\9))]
|
||||||
|
|
||||||
|
(define (char-set->in-pair cset)
|
||||||
|
(let ((add-range (lambda (from to loose ranges)
|
||||||
|
(if from (case (- to from)
|
||||||
|
((0) (values (cons (ascii->char from) loose)
|
||||||
|
ranges))
|
||||||
|
((1) (values `(,(ascii->char from)
|
||||||
|
,(ascii->char to)
|
||||||
|
. ,loose)
|
||||||
|
ranges))
|
||||||
|
((2) (values `(,(ascii->char from)
|
||||||
|
,(ascii->char (+ from 1))
|
||||||
|
,(ascii->char to)
|
||||||
|
. ,loose)
|
||||||
|
ranges))
|
||||||
|
(else (values loose
|
||||||
|
`((,(ascii->char from) .
|
||||||
|
,(ascii->char to))
|
||||||
|
. ,ranges))))
|
||||||
|
(values loose ranges)))))
|
||||||
|
|
||||||
|
(let lp ((i 127) (from #f) (to #f) (loose '()) (ranges '()))
|
||||||
|
(if (< i 0)
|
||||||
|
(add-range from to loose ranges)
|
||||||
|
|
||||||
|
(let ((i-1 (- i 1)))
|
||||||
|
(if (char-set-contains? cset (ascii->char i))
|
||||||
|
(if from
|
||||||
|
(lp i-1 i to loose ranges) ; Continue the run.
|
||||||
|
(lp i-1 i i loose ranges)) ; Start a new run.
|
||||||
|
|
||||||
|
;; If there's a run going, finish it off.
|
||||||
|
(receive (loose ranges) (add-range from to loose ranges)
|
||||||
|
(lp i-1 #f #f loose ranges))))))))
|
|
@ -0,0 +1,618 @@
|
||||||
|
;;; Regexp-ADT -> Posix-string translator.
|
||||||
|
;;; Olin Shivers January 1997, May 1998.
|
||||||
|
|
||||||
|
;;; - If the regexp value contains nul character constants, or character sets
|
||||||
|
;;; that contain the nul character, they will show up in the Posix string
|
||||||
|
;;; we produce. Spencer's C regexp engine can handle regexp strings that
|
||||||
|
;;; contain nul bytes, but this might blow up other implementations -- that
|
||||||
|
;;; is, the nul byte might prematurely terminate the C string passed to the
|
||||||
|
;;; regexp engine.
|
||||||
|
;;;
|
||||||
|
;;; - The code is ASCII-specific in only one place: the expression for
|
||||||
|
;;; a regexp that matches nothing is the 6-char pattern "[^\000-\177]",
|
||||||
|
;;; which assumes a 7-bit character code. Note that the static simplifier
|
||||||
|
;;; can remove *all* occurences of this "empty regexp" except for the
|
||||||
|
;;; un-simplifiable case of a single, top-level empty regexp, e.g.
|
||||||
|
;;; (rx (in))
|
||||||
|
;;; We can handle this one special case specially, so we shouldn't *ever*
|
||||||
|
;;; have to produce this ASCII-specific pattern.
|
||||||
|
|
||||||
|
;;; Exports: regexp->posix-string
|
||||||
|
|
||||||
|
;;; Todo: A dumb, simple char-set renderer.
|
||||||
|
|
||||||
|
;;; These functions translate static regular expressions into Posix regexp
|
||||||
|
;;; strings. They generally return four values:
|
||||||
|
;;; - string (regexp)
|
||||||
|
;;;
|
||||||
|
;;; - syntax level: 0 parenthesized exp, 1 piece, 2 branch, 3 top
|
||||||
|
;;; ("piece", "branch" and "top" are Spencer's terms):
|
||||||
|
;;; + A parenthesized exp is syntactically equivalent to a piece.
|
||||||
|
;;; (But it's useful to know when an exp is parenthesized for
|
||||||
|
;;; eliminating redundant submatch-generated parens.)
|
||||||
|
;;; + A piece is something that would bind to a following *
|
||||||
|
;;; ("a" but not "aa").
|
||||||
|
;;; + A branch is a sequence of pieces -- something that would bind to a |
|
||||||
|
;;; ("ab*d" but not "ab*|d"). That is, a branch is not allowed to contain
|
||||||
|
;;; top-level |'s.
|
||||||
|
;;; + Top is for a sequence of branches -- "a|b*c|d".
|
||||||
|
;;;
|
||||||
|
;;; - paren count in the returned string.
|
||||||
|
;;;
|
||||||
|
;;; - Vector of parens numbers used for submatching. The first paren is
|
||||||
|
;;; numbered 1. #F means a dead submatch -- one we can tell statically
|
||||||
|
;;; will never match anything.
|
||||||
|
|
||||||
|
;;; Non-R4RS imports:
|
||||||
|
;;; ? = COND
|
||||||
|
;;; Multiple-value return: VALUES RECEIVE CALL-WITH-VALUES
|
||||||
|
;;; SORT-LIST
|
||||||
|
|
||||||
|
|
||||||
|
;;; Useful little utility -- pad vector V with
|
||||||
|
;;; PRE initial and POST following #f's.
|
||||||
|
|
||||||
|
(define (pad-vector pre post v)
|
||||||
|
(if (= pre post 0) v
|
||||||
|
(let* ((vlen (vector-length v))
|
||||||
|
(alen (+ pre post vlen))
|
||||||
|
(ans (make-vector alen #f)))
|
||||||
|
(do ((from (- vlen 1) (- from 1))
|
||||||
|
(to (+ pre vlen -1) (- to 1)))
|
||||||
|
((< from 0))
|
||||||
|
(vector-set! ans to (vector-ref v from)))
|
||||||
|
ans)))
|
||||||
|
|
||||||
|
(define (n-falses n) (make-vector n #f))
|
||||||
|
|
||||||
|
|
||||||
|
;;; There's no representation for regexps that never match anything (e.g.,
|
||||||
|
;;; (|)) in strict Posix notation. When we get one of these, we treat it
|
||||||
|
;;; specially, producing [#f #f #f #f].
|
||||||
|
;;;
|
||||||
|
;;; We can always detect these empty regexps, because they always simplify
|
||||||
|
;;; to one of these two values:
|
||||||
|
;;; - (make-re-char-set char-set:empty)
|
||||||
|
;;; - (dsm m n (make-re-char-set char-set:empty))
|
||||||
|
|
||||||
|
(define (simple-empty-re? re)
|
||||||
|
(or (and (re-char-set? re)
|
||||||
|
(char-set-empty? (re-char-set:cset re)))
|
||||||
|
(and (re-dsm? re)
|
||||||
|
(simple-empty-re? (re-dsm:body re)))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Top-level
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define (regexp->posix-string re)
|
||||||
|
;; We *must* simplify, to guarantee correct translation.
|
||||||
|
(let ((re (simplify-regexp re)))
|
||||||
|
(if (simple-empty-re? re) (values #f #f #f #f)
|
||||||
|
(translate-regexp re))))
|
||||||
|
|
||||||
|
|
||||||
|
(define (translate-regexp re)
|
||||||
|
(? ((re-string? re) (translate-string (re-string:chars re)))
|
||||||
|
|
||||||
|
((re-repeat? re) (translate-repeat re))
|
||||||
|
((re-choice? re) (translate-choice re))
|
||||||
|
((re-seq? re) (translate-seq re))
|
||||||
|
((re-char-set? re) (translate-char-set (re-char-set:cset re)))
|
||||||
|
|
||||||
|
((re-submatch? re) (translate-submatch re))
|
||||||
|
|
||||||
|
((re-bos? re) (values "^" 1 0 '#()))
|
||||||
|
((re-eos? re) (values "$" 1 0 '#()))
|
||||||
|
|
||||||
|
((re-bol? re) (error "Beginning-of-line regexp not supported in this implementation."))
|
||||||
|
((re-eol? re) (error "End-of-line regexp not supported in this implementation."))
|
||||||
|
|
||||||
|
((re-bow? re) (values "[[:<:]]" 1 0 '#())) ; These two are
|
||||||
|
((re-eow? re) (values "[[:>:]]" 1 0 '#())) ; Spencer-specific.
|
||||||
|
|
||||||
|
((re-dsm? re) (let ((pre-dsm (re-dsm:pre-dsm re))
|
||||||
|
(body (re-dsm:body re)))
|
||||||
|
(translate-dsm body pre-dsm
|
||||||
|
(- (re-dsm:tsm re)
|
||||||
|
(+ pre-dsm (re-tsm body))))))
|
||||||
|
|
||||||
|
(else (error "Illegal regular expression" re))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Translate reloc-elt ELT = (N . RE) from a sequence or choice
|
||||||
|
;;; into a Posix string.
|
||||||
|
;;; - Relocate the submatch indices by PREV-PCOUNT.
|
||||||
|
;;; (That is, assume rendering preceding elts used PREV-PCOUNT parens.)
|
||||||
|
;;; - Assume preceding elements allocated PREV-SMCOUNT submatches
|
||||||
|
;;; (we may have to pad our returned submatches string with some
|
||||||
|
;;; initial #F's to account for dead submatches PREV-SMCOUNT through N.)
|
||||||
|
;;; - If SUB-LEV3? is true, the result string is guaranteed to be < level 3.
|
||||||
|
;;; This is used by the & and | translators.
|
||||||
|
;;; - Returns the usual 4 values plus the final submatch count including
|
||||||
|
;;; this regexp.
|
||||||
|
|
||||||
|
(define (translate-elt elt prev-pcount prev-smcount sub-lev3?)
|
||||||
|
(let ((offset (car elt))
|
||||||
|
(re (cdr elt)))
|
||||||
|
|
||||||
|
(receive (s level pcount submatches) (translate-regexp re)
|
||||||
|
|
||||||
|
;; Relocate submatch indices by OFFSET and force level <3, if needed:
|
||||||
|
(receive (s level pcount submatches)
|
||||||
|
(if (and sub-lev3? (= level 3))
|
||||||
|
(values (string-append "(" s ")")
|
||||||
|
0
|
||||||
|
(+ pcount 1)
|
||||||
|
(mapv (lambda (sm) (and sm (+ prev-pcount 1 sm)))
|
||||||
|
submatches))
|
||||||
|
(values s level pcount
|
||||||
|
(mapv (lambda (sm) (and sm (+ prev-pcount sm)))
|
||||||
|
submatches)))
|
||||||
|
|
||||||
|
;; Tack onto submatches as many initial #F's as needed to bump
|
||||||
|
;; the previous submatches count from PREV-SMCOUNT to OFFSET.
|
||||||
|
(values s level pcount
|
||||||
|
(pad-vector (- offset prev-smcount) 0 submatches)
|
||||||
|
(+ offset (re-tsm re)))))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Force the string to be level < 3 by parenthesizing it if necessary.
|
||||||
|
|
||||||
|
(define (paren-if-necessary s lev pcount submatches)
|
||||||
|
(if (< lev 3)
|
||||||
|
(values s lev pcount submatches)
|
||||||
|
(values (string-append "(" s ")")
|
||||||
|
0
|
||||||
|
(+ pcount 1)
|
||||||
|
(mapv (lambda (sm) (and sm (+ 1 sm)))
|
||||||
|
submatches))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; (: re1 ... ren)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define (translate-seq re)
|
||||||
|
(let ((elts (re-seq:elts re))
|
||||||
|
(tsm (re-seq:tsm re)))
|
||||||
|
(let recur ((elts elts) (prev-pcount 0) (prev-smcount 0))
|
||||||
|
;; Render a sequence tail ELTS, assuming the previous elements translated
|
||||||
|
;; to a string with PREV-PCOUNT parens, and allocated PREV-SMCOUNT
|
||||||
|
;; submatches.
|
||||||
|
(if (pair? elts)
|
||||||
|
(let* ((elt (car elts))
|
||||||
|
(elts (cdr elts)))
|
||||||
|
|
||||||
|
(receive (s1 level1 pcount1 submatches1)
|
||||||
|
(translate-regexp elt)
|
||||||
|
|
||||||
|
(receive (s1 level1 pcount1 submatches1)
|
||||||
|
(paren-if-necessary s1 level1 pcount1 submatches1)
|
||||||
|
|
||||||
|
(receive (s level pcount submatches)
|
||||||
|
(recur elts
|
||||||
|
(+ pcount1 prev-pcount)
|
||||||
|
(+ prev-smcount (re-tsm elt)))
|
||||||
|
|
||||||
|
(values (string-append s1 s)
|
||||||
|
2
|
||||||
|
(+ pcount1 pcount)
|
||||||
|
(vector-append (mapv (lambda (sm) (+ sm prev-smcount))
|
||||||
|
submatches1)
|
||||||
|
submatches))))))
|
||||||
|
|
||||||
|
(values "" 2 0 '#()))))) ; Empty seq
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; (| re1 ... ren)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define (translate-choice re)
|
||||||
|
(let ((elts (re-choice:elts re))
|
||||||
|
(tsm (re-choice:tsm re)))
|
||||||
|
(if (pair? elts)
|
||||||
|
(let recur ((elts elts) (prev-pcount 0) (prev-smcount 0))
|
||||||
|
;; ELTS is a non-empty choice tail. Render it, assuming the
|
||||||
|
;; previous elements translated to a string with PREV-PCOUNT parens,
|
||||||
|
;; and allocated PREV-SMCOUNT submatches.
|
||||||
|
(let ((elt (car elts)) (tail (cdr elts)))
|
||||||
|
(receive (s1 level1 pcount1 submatches1) (translate-regexp elt)
|
||||||
|
(let ((submatches1 (mapv (lambda (sm) (and sm (+ sm prev-pcount)))
|
||||||
|
submatches1)))
|
||||||
|
(if (pair? tail)
|
||||||
|
(receive (s level pcount submatches)
|
||||||
|
(recur tail
|
||||||
|
(+ pcount1 prev-pcount)
|
||||||
|
(+ prev-smcount (re-tsm elt)))
|
||||||
|
(values (string-append s1 "|" s) 3
|
||||||
|
(+ pcount1 pcount)
|
||||||
|
(vector-append submatches1 submatches)))
|
||||||
|
|
||||||
|
(values s1 level1 pcount1 submatches1))))))
|
||||||
|
|
||||||
|
(values "[^\000-\377]" 1 0 (n-falses tsm))))) ; Empty choice.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Repeated cases: * + ? and {n,m} ranges.
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define (translate-repeat re)
|
||||||
|
(let ((from (re-repeat:from re))
|
||||||
|
(to (re-repeat:to re))
|
||||||
|
(body (re-repeat:body re))
|
||||||
|
(tsm (re-repeat:tsm re)))
|
||||||
|
|
||||||
|
(? ((and to (> from to)) ; Unsatisfiable
|
||||||
|
(values "[^\000-\377]" 1 0 (n-falses tsm)))
|
||||||
|
|
||||||
|
((and to (= from to 1)) (translate-seq body)) ; RE{1,1} => RE
|
||||||
|
|
||||||
|
((and to (= to 0)) ; RE{0,0} => ""
|
||||||
|
(values "" 2 0 (n-falses tsm)))
|
||||||
|
|
||||||
|
(else ; General case
|
||||||
|
(receive (s level pcount submatches) (translate-regexp body)
|
||||||
|
(receive (s level pcount submatches) ; Coerce S to level <2.
|
||||||
|
(if (> level 1)
|
||||||
|
(values (string-append "(" s ")")
|
||||||
|
0
|
||||||
|
(+ pcount 1)
|
||||||
|
(mapv (lambda (i) (and i (+ i 1))) submatches))
|
||||||
|
(values s level pcount submatches))
|
||||||
|
|
||||||
|
(values (if to
|
||||||
|
(? ((and (= from 0) (= to 1)) (string-append s "?"))
|
||||||
|
((= from to)
|
||||||
|
(string-append s "{" (number->string to) "}"))
|
||||||
|
(else
|
||||||
|
(string-append s "{" (number->string from)
|
||||||
|
"," (number->string to) "}")))
|
||||||
|
(? ((= from 0) (string-append s "*"))
|
||||||
|
((= from 1) (string-append s "+"))
|
||||||
|
(else (string-append s "{" (number->string from) ",}"))))
|
||||||
|
1 pcount submatches)))))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Submatch
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define (translate-submatch re)
|
||||||
|
(let ((body (re-submatch:body re))
|
||||||
|
(pre-dsm (re-submatch:pre-dsm re)))
|
||||||
|
|
||||||
|
;; Translate the body, along with any leading or trailing dead submatches.
|
||||||
|
(receive (s level pcount submatches)
|
||||||
|
(translate-dsm body
|
||||||
|
pre-dsm
|
||||||
|
(- (re-submatch:tsm re)
|
||||||
|
(+ 1 pre-dsm (re-tsm body))))
|
||||||
|
|
||||||
|
;; If the whole expression isn't already wrapped in a paren, wrap it.
|
||||||
|
;; This outer paren becomes the new submatch -- add to submatches list.
|
||||||
|
(if (= level 0)
|
||||||
|
(values s 0 pcount (vector-append '#(1) submatches))
|
||||||
|
(values (string-append "(" s ")")
|
||||||
|
0
|
||||||
|
(+ pcount 1)
|
||||||
|
(mapv! (lambda (i) (and i (+ i 1))) ; Excuse me.
|
||||||
|
(vector-append '#(0) submatches)))))))
|
||||||
|
|
||||||
|
;;; Translating DSM
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Translate the body, and paste enough #F's before and after the submatches
|
||||||
|
;;; list to account for extra dead submatches.
|
||||||
|
|
||||||
|
(define (translate-dsm body pre-dsm post-dsm)
|
||||||
|
(receive (s level pcount submatches) (translate-regexp body)
|
||||||
|
(values s level pcount (pad-vector pre-dsm post-dsm submatches))))
|
||||||
|
|
||||||
|
;;; Constant regexps
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Convert a string into a regexp pattern that matches that string exactly --
|
||||||
|
;;; quote the special chars with backslashes.
|
||||||
|
|
||||||
|
(define translate-string
|
||||||
|
(let ((specials (string->char-set "[.*?()|\\$^+")))
|
||||||
|
(lambda (s)
|
||||||
|
(let ((len (string-length s)))
|
||||||
|
(if (zero? len)
|
||||||
|
(values "()" 0 1 '#()) ; Special case ""
|
||||||
|
|
||||||
|
(let* ((len2 (string-fold (lambda (c len) ; Length of answer str
|
||||||
|
(+ len (if (char-set-contains? specials c) 2 1)))
|
||||||
|
0 s))
|
||||||
|
(s2 (make-string len2))) ; Answer string
|
||||||
|
|
||||||
|
;; Copy the chars over to S2.
|
||||||
|
(string-fold (lambda (c i)
|
||||||
|
;; Write char C at index I, return the next index.
|
||||||
|
(let ((i (cond ((char-set-contains? specials c)
|
||||||
|
(string-set! s2 i #\\)
|
||||||
|
(+ i 1))
|
||||||
|
(else i))))
|
||||||
|
(string-set! s2 i c)
|
||||||
|
(+ i 1)))
|
||||||
|
0 s)
|
||||||
|
(values s2 (if (= len 1) 1 2)
|
||||||
|
0 '#())))))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Translating char-sets to [...] strings
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; This is the nastiest code in the system. We make an effort to return
|
||||||
|
;;; succinct encodings of the char-sets, in the event these encodings are
|
||||||
|
;;; being shown to humans.
|
||||||
|
;;; - A singleton set is rendered as that char.
|
||||||
|
;;; - A full set is rendered as "."
|
||||||
|
;;; - An empty set is rendered as [^\000-\177].
|
||||||
|
;;; - Otherwise, render it both as a [...] and as a [^...] spec, and
|
||||||
|
;;; take whichever is shortest.
|
||||||
|
|
||||||
|
;;; Take a char set, and return the standard
|
||||||
|
;;; [regexp-string, level, pcount, submatches]
|
||||||
|
;;; quadruple.
|
||||||
|
;;;
|
||||||
|
|
||||||
|
(define (translate-char-set cset)
|
||||||
|
(if (char-set-full? cset) (values "." 1 0 '#()) ; Full set
|
||||||
|
|
||||||
|
(let ((nchars (char-set-size cset))
|
||||||
|
(->bracket-string (lambda (cset in?)
|
||||||
|
(receive (loose ranges) (char-set->in-pair cset)
|
||||||
|
(hack-bracket-spec loose ranges in?)))))
|
||||||
|
|
||||||
|
(? ((= 0 nchars) (values "[^\000-\177]" 1 0 '#())) ; Empty set
|
||||||
|
|
||||||
|
((= 1 nchars) ; Singleton set
|
||||||
|
(translate-string (string (car (char-set-members cset)))))
|
||||||
|
|
||||||
|
;; General case. Try both [...] and [^...].
|
||||||
|
(else (let ((s- (->bracket-string cset #t))
|
||||||
|
(s+ (->bracket-string (char-set-invert cset) #f)))
|
||||||
|
(values (if (< (string-length s-) (string-length s+))
|
||||||
|
s- s+)
|
||||||
|
1 0 '#())))))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Commentary
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;;; Hacking special chars in character-class strings:
|
||||||
|
;;; ] - ^ ]...^-
|
||||||
|
;;; ] - ]...-
|
||||||
|
;;; ] ^ ]...^
|
||||||
|
;;; ] ]...
|
||||||
|
;;; - ^ ...^- (or doubleton screw-case)
|
||||||
|
;;; - ...-
|
||||||
|
;;; ^ ...^ (or singleton screw-case)
|
||||||
|
;;;
|
||||||
|
;;; Two screw cases:
|
||||||
|
;;; "^-" must be converted to "-^" for IN.
|
||||||
|
;;; "^" must be converted to non-class "^" for IN.
|
||||||
|
|
||||||
|
;;; Rendering a general char-set into a correct Posix [...] bracket expression
|
||||||
|
;;; is a complete mess.
|
||||||
|
;;;
|
||||||
|
;;; The rules on bracket expressions:
|
||||||
|
;;; - ] terminates the exp unless it is the first char
|
||||||
|
;;; (after an optional leading ^).
|
||||||
|
;;; - .*[\ are not special in bracket expressions.
|
||||||
|
;;; - However, [. [= and [: *are* special, so you can't follow an
|
||||||
|
;;; open bracket by one of .=: -- argh. See below.
|
||||||
|
;;; - ^ isn't special unless it's the first char.
|
||||||
|
;;; - - is special unless it's first (after an optional ^), last,
|
||||||
|
;;; or as the ending char in a range (e.g., a--).
|
||||||
|
|
||||||
|
;;; This means:
|
||||||
|
;;; - You must ensure that ] doesn't begin or terminate a range.
|
||||||
|
;;; - You must ensure that .=: don't follow [
|
||||||
|
;;; + This can happen in the loose char list;
|
||||||
|
;;; + This can happen in the range list -- consider the pair of
|
||||||
|
;;; ranges "x-[.-%" Handle this by prohibiting [ as a range-terminator.
|
||||||
|
;;; + It can happen at the loose/range boundary: %[:-?
|
||||||
|
|
||||||
|
;;; First, run-length encode the set into loose and range-pairs.
|
||||||
|
;;; If the set is a singleton set, then punt the whole [...] effort,
|
||||||
|
;;; and do it as a simple char.
|
||||||
|
|
||||||
|
;;; Repeat until stable:
|
||||||
|
;;; - Sort the ranges in this order:
|
||||||
|
;;; 1. other ranges;
|
||||||
|
;;; 2. ranges that begin with ^ (not priority)
|
||||||
|
;;; 3. ranges that begin with .=: (priority)
|
||||||
|
;;; 4. ranges that end with [ (priority)
|
||||||
|
;;; This eliminates [. [= [: problems in the ranges, and
|
||||||
|
;;; minimises the chances of the problem at the loose/range boundary.
|
||||||
|
;;; and problems with initial ^ chars.
|
||||||
|
;;; - Sort the loose chars so that ] is first, then -, then .=:, then [,
|
||||||
|
;;; then others, then ^. This eliminates [. [= [: problems in the loose
|
||||||
|
;;; chars, and minimises the chances of the problem at the loose/range
|
||||||
|
;;; boundary.
|
||||||
|
;;; - Shrink ranges by moving an opening or closing range char into the
|
||||||
|
;;; loose-char set:
|
||||||
|
;;; + If ] opens or closes a range, shrink it out.
|
||||||
|
;;; + If any range opens with -, shrink it out.
|
||||||
|
;;; + If the first range opens with .=:, and the last loose char is [,
|
||||||
|
;;; shrink it out.
|
||||||
|
;;; + If there are no loose chars, the first range begins with ^, and
|
||||||
|
;;; we're doing an IN range, shrink out the ^.
|
||||||
|
;;; + Shrinking a range down to <3 chars means move it's elts into the
|
||||||
|
;;; loose char set.
|
||||||
|
;;; - If both [ and - are in the loose char set,
|
||||||
|
;;; pull - out as special end-hypen.
|
||||||
|
|
||||||
|
;;; Finally, we have to hack things so that ^ doesn't begin an IN sequence.
|
||||||
|
;;; - If it's a NOT-IN sequence, no worries.
|
||||||
|
;;; - If ^ is the opening loose char, then it's the only loose char.
|
||||||
|
;;; If there are ranges, move it to the end of the string.
|
||||||
|
;;; If there are no ranges, then just punt the char-class and convert
|
||||||
|
;;; it to a singleton ^. In fact, do this up-front, for any singleton
|
||||||
|
;;; set.
|
||||||
|
;;;
|
||||||
|
;;; If the special end-hyphen flag is set, add - to the end of the string.
|
||||||
|
|
||||||
|
;;; This general approach -- starting out with maximal ranges, and then
|
||||||
|
;;; shrinking them to avoid other syntax violations -- has the advantage
|
||||||
|
;;; of not relying on the details of the ASCII encodings.
|
||||||
|
|
||||||
|
;;; Ordering ranges:
|
||||||
|
;;; 1. other ranges (ordered by start char)
|
||||||
|
;;; 2. ranges that begin with ^ (not priority)
|
||||||
|
;;; 3. ranges that begin with .=:
|
||||||
|
;;; 4. ranges that end with [ (priority over #2 & #3)
|
||||||
|
|
||||||
|
(define (range< r1 r2)
|
||||||
|
(let ((r1-start (car r1)) (r1-end (cdr r1))
|
||||||
|
(r2-start (car r2)) (r2-end (cdr r2)))
|
||||||
|
(or (char=? r2-end #\[) ; Range ending with [ comes last.
|
||||||
|
(and (not (char=? r1-end #\[))
|
||||||
|
|
||||||
|
;; Range begin with one of .=: comes next-to-last
|
||||||
|
(or (char=? r2-start #\.) (char=? r2-start #\=) (char=? r2-start #\:)
|
||||||
|
(and (not (char=? r1-start #\.))
|
||||||
|
(not (char=? r1-start #\=))
|
||||||
|
(not (char=? r1-start #\:))
|
||||||
|
|
||||||
|
;; Range beginning with ^ comes before that.
|
||||||
|
(or (char=? r1-start #\^)
|
||||||
|
(and (not (char=? r2-start #\^))
|
||||||
|
|
||||||
|
;; Other ranges are ordered by start char.
|
||||||
|
(< (char->ascii r1-start)
|
||||||
|
(char->ascii r2-start))))))))))
|
||||||
|
|
||||||
|
;;; Order loose chars:
|
||||||
|
;;; ] is first,
|
||||||
|
;;; - is next,
|
||||||
|
;;; .=: are next,
|
||||||
|
;;; [ is next,
|
||||||
|
;;; then others (ordered by ascii val)
|
||||||
|
;;; ^ is last.
|
||||||
|
|
||||||
|
|
||||||
|
(define (loose<= c1 c2)
|
||||||
|
(or (char=? c1 #\]) ; ] is first,
|
||||||
|
(and (not (char=? c2 #\]))
|
||||||
|
|
||||||
|
(or (char=? c1 #\-) ; - is next,
|
||||||
|
(and (not (char=? c2 #\-))
|
||||||
|
|
||||||
|
;; .=: are next,
|
||||||
|
(or (char=? c1 #\.) (char=? c1 #\=) (char=? c1 #\:)
|
||||||
|
(and (not (char=? c2 #\.))
|
||||||
|
(not (char=? c2 #\=))
|
||||||
|
(not (char=? c2 #\:))
|
||||||
|
|
||||||
|
(or (char=? c1 #\[) ; [ is next,
|
||||||
|
(and (not (char=? c2 #\[))
|
||||||
|
|
||||||
|
(or (char=? c2 #\^) ; ^ is last,
|
||||||
|
(and (not (char=? c1 #\^))
|
||||||
|
|
||||||
|
;; other chars by ASCII.
|
||||||
|
(<= (char->ascii c1)
|
||||||
|
(char->ascii c2)))))))))))))
|
||||||
|
|
||||||
|
;;; Returns (1) a list of 0-3 loose chars, (2) a list of 0 or 1 ranges.
|
||||||
|
|
||||||
|
(define (shrink-range-start r)
|
||||||
|
(let ((start (char->ascii (car r)))
|
||||||
|
(end (char->ascii (cdr r))))
|
||||||
|
(shrink-range-finish-up start (+ start 1) end)))
|
||||||
|
|
||||||
|
(define (shrink-range-end r)
|
||||||
|
(let ((start (char->ascii (car r)))
|
||||||
|
(end (char->ascii (cdr r))))
|
||||||
|
(shrink-range-finish-up end start (- end 1))))
|
||||||
|
|
||||||
|
(define (shrink-range-finish-up c start end)
|
||||||
|
(? ((> start end) (values (list (ascii->char c)) '())) ; Empty range
|
||||||
|
|
||||||
|
((= start end) ; Collapse singleton range.
|
||||||
|
(values (list (ascii->char c) (ascii->char start))
|
||||||
|
'()))
|
||||||
|
|
||||||
|
((= (+ start 1) end) ; Collapse doubleton range.
|
||||||
|
(values (list (ascii->char c) (ascii->char start) (ascii->char end))
|
||||||
|
'()))
|
||||||
|
|
||||||
|
(else (values (list (ascii->char c))
|
||||||
|
(list (cons (ascii->char start) (ascii->char end)))))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; We assume the bracket-spec is not a singleton, not empty, and not complete.
|
||||||
|
;;; (These cases get rendered as the letter, [^\000-\177], and ".",
|
||||||
|
;;; respectively.) We assume the loose chars and the ranges are all disjoint.
|
||||||
|
|
||||||
|
(define (hack-bracket-spec loose ranges in?)
|
||||||
|
(let lp ((loose0 loose) (ranges0 ranges) (end-hyphen? #f))
|
||||||
|
;; Repeat until stable:
|
||||||
|
(let ((loose (sort-list loose0 loose<=)) ; Sort loose chars and ranges.
|
||||||
|
(ranges (sort-list ranges0 range<)))
|
||||||
|
|
||||||
|
;; If ] opens or closes a range, shrink it out.
|
||||||
|
;; If - opens a range, shrink it out.
|
||||||
|
(receive (loose ranges)
|
||||||
|
(let recur ((ranges ranges))
|
||||||
|
(if (pair? ranges)
|
||||||
|
(let* ((range (car ranges))
|
||||||
|
(start (car range))
|
||||||
|
(end (cdr range))
|
||||||
|
(ranges (cdr ranges)))
|
||||||
|
(receive (new-loose new-ranges) (recur ranges)
|
||||||
|
(receive (new-loose0 new-ranges0)
|
||||||
|
(? ((char=? #\] start)
|
||||||
|
(shrink-range-start range))
|
||||||
|
|
||||||
|
((char=? #\] end)
|
||||||
|
(shrink-range-end range))
|
||||||
|
|
||||||
|
((char=? #\- start)
|
||||||
|
(shrink-range-start range))
|
||||||
|
|
||||||
|
(else (values '() (list range))))
|
||||||
|
(values (append new-loose0 new-loose)
|
||||||
|
(append new-ranges0 new-ranges)))))
|
||||||
|
(values loose '())))
|
||||||
|
|
||||||
|
(? ((or (not (equal? loose0 loose)) ; Loop if anything changed.
|
||||||
|
(not (equal? ranges0 ranges)))
|
||||||
|
(lp loose ranges end-hyphen?))
|
||||||
|
|
||||||
|
;; If the first range opens with .=:, and the last loose char is [,
|
||||||
|
;; shrink it out & loop.
|
||||||
|
((and (pair? ranges)
|
||||||
|
(memv (caar ranges) '(#\. #\= #\:))
|
||||||
|
(pair? loose)
|
||||||
|
(char=? #\[ (car (reverse loose))))
|
||||||
|
(receive (new-loose new-ranges)
|
||||||
|
(shrink-range-start (car ranges))
|
||||||
|
(lp (append new-loose loose) (append new-ranges (cdr ranges)) end-hyphen?)))
|
||||||
|
|
||||||
|
;; If there are no loose chars, the first range begins with ^, and
|
||||||
|
;; we're doing an IN range, shrink out the ^.
|
||||||
|
((and in? (null? loose) (pair? ranges) (char=? #\^ (caar ranges)))
|
||||||
|
(receive (new-loose new-ranges) (shrink-range-start (car ranges))
|
||||||
|
(lp (append new-loose loose) (append new-ranges ranges) end-hyphen?)))
|
||||||
|
|
||||||
|
;; If both [ and - are in the loose char set,
|
||||||
|
;; pull - out as special end-hypen.
|
||||||
|
((and (pair? loose)
|
||||||
|
(pair? (cdr loose))
|
||||||
|
(char=? (car loose) #\[)
|
||||||
|
(char=? (car loose) #\-))
|
||||||
|
(lp (cons (car loose) (cddr loose)) ranges #t))
|
||||||
|
|
||||||
|
;; No change! Build the answer...
|
||||||
|
(else (string-append (if in? "[" "[^")
|
||||||
|
(list->string loose)
|
||||||
|
(apply string-append
|
||||||
|
(map (lambda (r) (string (car r) #\- (cdr r)))
|
||||||
|
ranges))
|
||||||
|
"]")))))))
|
|
@ -0,0 +1,114 @@
|
||||||
|
;;; Regexp "fold" combinators -*- scheme -*-
|
||||||
|
;;; Copyright (c) 1998 by Olin Shivers.
|
||||||
|
|
||||||
|
;;; REGEXP-FOLD re kons knil s [finish start] -> value
|
||||||
|
;;; REGEXP-FOLD-RIGHT re kons knil s [finish start] -> value
|
||||||
|
;;; REGEXP-FOR-EACH re proc s [start] -> unspecific
|
||||||
|
|
||||||
|
;;; Non-R4RS imports: let-optionals :optional error ?
|
||||||
|
|
||||||
|
;;; regexp-fold re kons knil s [finish start] -> value
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; The following definition is a bit unwieldy, but the intuition is
|
||||||
|
;;; simple: this procedure uses the regexp RE to divide up string S into
|
||||||
|
;;; non-matching/matching chunks, and then "folds" the procedure KONS
|
||||||
|
;;; across this sequence of chunks.
|
||||||
|
;;;
|
||||||
|
;;; Search from START (defaulting to 0) for a match to RE; call
|
||||||
|
;;; this match M. Let I be the index of the end of the match
|
||||||
|
;;; (that is, (match:end M 0)). Loop as follows:
|
||||||
|
;;; (regexp-fold re kons (kons START M knil) s finish I)
|
||||||
|
;;; If there is no match, return instead
|
||||||
|
;;; (finish START knil)
|
||||||
|
;;; FINISH defaults to (lambda (i knil) knil)
|
||||||
|
;;;
|
||||||
|
;;; In other words, we divide up S into a sequence of non-matching/matching
|
||||||
|
;;; chunks:
|
||||||
|
;;; NM1 M1 NM1 M2 ... NMk Mk NMlast
|
||||||
|
;;; where NM1 is the initial part of S that isn't matched by the RE, M1 is the
|
||||||
|
;;; first match, NM2 is the following part of S that isn't matched, M2 is the
|
||||||
|
;;; second match, and so forth -- NMlast is the final non-matching chunk of
|
||||||
|
;;; S. We apply KONS from left to right to build up a result, passing it one
|
||||||
|
;;; non-matching/matching chunk each time: on an application (KONS i m KNIL),
|
||||||
|
;;; the non-matching chunk goes from I to (match:begin m 0), and the following
|
||||||
|
;;; matching chunk goes from (match:begin m 0) to (match:end m 0). The last
|
||||||
|
;;; non-matching chunk NMlast is processed by FINISH. So the computation we
|
||||||
|
;;; perform is
|
||||||
|
;;; (final q (kons Jk MTCHk ... (kons J2 MTCH2 (kons J1 MTCH1 knil))...))
|
||||||
|
;;; where Ji is the index of the start of NMi, MTCHi is a match value
|
||||||
|
;;; describing Mi, and Q is the index of the beginning of NMlast.
|
||||||
|
|
||||||
|
(define (regexp-fold re kons knil s . maybe-finish+start)
|
||||||
|
(let-optionals maybe-finish+start ((finish (lambda (i x) x))
|
||||||
|
(start 0))
|
||||||
|
(if (> start (string-length s))
|
||||||
|
(error "Illegal START parameter"
|
||||||
|
regexp-fold re kons knil s finish start))
|
||||||
|
(let lp ((i start) (val knil))
|
||||||
|
(? ((regexp-search re s i) =>
|
||||||
|
(lambda (m)
|
||||||
|
(let ((next-i (match:end m 0)))
|
||||||
|
(if (= next-i (match:start m 0))
|
||||||
|
(error "An empty-string regexp match has put regexp-fold into an infinite loop."
|
||||||
|
re s start next-i)
|
||||||
|
(lp next-i (kons i m val))))))
|
||||||
|
(else (finish i val))))))
|
||||||
|
|
||||||
|
;;; regexp-fold-right re kons knil s [finish start] -> value
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; This procedure repeatedly matches regexp RE across string S.
|
||||||
|
;;; This divides S up into a sequence of matching/non-matching chunks:
|
||||||
|
;;; NM0 M1 NM1 M2 NM2 ... Mk NMk
|
||||||
|
;;; where NM0 is the initial part of S that isn't matched by the RE,
|
||||||
|
;;; M1 is the first match, NM1 is the following part of S that isn't
|
||||||
|
;;; matched, M2 is the second match, and so forth. We apply KONS from
|
||||||
|
;;; right to left to build up a result
|
||||||
|
;;; (final q (kons MTCH1 J1 (kons MTCH2 J2 ...(kons MTCHk JK knil)...)))
|
||||||
|
;;; where MTCHi is a match value describing Mi, Ji is the index of the end of
|
||||||
|
;;; NMi (or, equivalently, the beginning of Mi+1), and Q is the index of the
|
||||||
|
;;; beginning of M1. In other words, KONS is passed a match, an index
|
||||||
|
;;; describing the following non-matching text, and the value produced by
|
||||||
|
;;; folding the following text. The FINAL function "polishes off" the fold
|
||||||
|
;;; operation by handling the initial chunk of non-matching text (NM0, above).
|
||||||
|
;;; FINISH defaults to (lambda (i knil) knil)
|
||||||
|
|
||||||
|
(define (regexp-fold-right re kons knil s . maybe-finish+start)
|
||||||
|
(let-optionals maybe-finish+start ((finish (lambda (i x) x))
|
||||||
|
(start 0))
|
||||||
|
(if (> start (string-length s))
|
||||||
|
(error "Illegal START parameter" regexp-fold-right re kons knil s
|
||||||
|
finish start))
|
||||||
|
|
||||||
|
(? ((regexp-search re s start) =>
|
||||||
|
(lambda (m)
|
||||||
|
(finish (match:start m 0)
|
||||||
|
(let recur ((last-m m))
|
||||||
|
(? ((regexp-search re s (match:end last-m 0)) =>
|
||||||
|
(lambda (m)
|
||||||
|
(let ((i (match:start m 0)))
|
||||||
|
(if (= i (match:end m 0))
|
||||||
|
(error "An empty-string regexp match has put regexp-fold-right into an infinite loop."
|
||||||
|
re s start i)
|
||||||
|
(kons last-m i (recur m))))))
|
||||||
|
(else (kons last-m (string-length s) knil)))))))
|
||||||
|
(else (finish (string-length s) knil)))))
|
||||||
|
|
||||||
|
;;; regexp-for-each re proc s [start] -> unspecific
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Repeatedly match regexp RE against string S.
|
||||||
|
;;; Apply PROC to each match that is produced.
|
||||||
|
;;; Matches do not overlap.
|
||||||
|
|
||||||
|
(define (regexp-for-each re proc s . maybe-start)
|
||||||
|
(let ((start (:optional maybe-start 0)))
|
||||||
|
(if (> start (string-length s))
|
||||||
|
(apply error "Illegal START parameter" regexp-for-each re proc s start)
|
||||||
|
(let lp ((i start))
|
||||||
|
(? ((regexp-search re s i) =>
|
||||||
|
(lambda (m)
|
||||||
|
(let ((next-i (match:end m 0)))
|
||||||
|
(if (= (match:start m 0) next-i)
|
||||||
|
(error "An empty-string regexp match has put regexp-for-each into an infinite loop."
|
||||||
|
re proc s start next-i))
|
||||||
|
(proc m)
|
||||||
|
(lp next-i)))))))))
|
|
@ -0,0 +1,62 @@
|
||||||
|
;;; Regular expression matching for scsh
|
||||||
|
;;; Copyright (c) 1998 by Olin Shivers.
|
||||||
|
|
||||||
|
|
||||||
|
;;; Translates the re to a Posix string, and returns a CRE record,
|
||||||
|
;;; but doesn't actually compile the Posix string into a C regex_t struct.
|
||||||
|
;;; Uses the :POSIX field to cache the CRE record.
|
||||||
|
|
||||||
|
(define (compile-regexp re)
|
||||||
|
(let* ((compile (lambda () (receive (s lev pcount tvec)
|
||||||
|
(regexp->posix-string re)
|
||||||
|
(new-cre s tvec))))
|
||||||
|
|
||||||
|
(check-cache (lambda (fetch set)
|
||||||
|
(or (fetch re) ; Already cached.
|
||||||
|
(let ((cre (compile))) ; Compile it,
|
||||||
|
(set re cre) ; cache it,
|
||||||
|
cre))))) ; and return it.
|
||||||
|
|
||||||
|
(? ((re-seq? re)
|
||||||
|
(check-cache re-seq:posix set-re-seq:posix))
|
||||||
|
((re-choice? re)
|
||||||
|
(check-cache re-choice:posix set-re-choice:posix))
|
||||||
|
((re-repeat? re)
|
||||||
|
(check-cache re-repeat:posix set-re-repeat:posix))
|
||||||
|
((re-char-set? re)
|
||||||
|
(check-cache re-char-set:posix set-re-char-set:posix))
|
||||||
|
((re-string? re)
|
||||||
|
(check-cache re-string:posix set-re-string:posix))
|
||||||
|
((re-submatch? re)
|
||||||
|
(check-cache re-submatch:posix set-re-submatch:posix))
|
||||||
|
((re-dsm? re)
|
||||||
|
(check-cache re-dsm:posix set-re-dsm:posix))
|
||||||
|
|
||||||
|
((re-bos? re) (or bos-cre (set! bos-cre (compile))))
|
||||||
|
((re-eos? re) (or eos-cre (set! eos-cre (compile))))
|
||||||
|
|
||||||
|
((re-bol? re) (error "BOL regexp not supported in this implementation."))
|
||||||
|
((re-eol? re) (error "EOL regexp not supported in this implementation."))
|
||||||
|
|
||||||
|
((re-bow? re) (or bow-cre (set! bow-cre (compile))))
|
||||||
|
((re-eow? re) (or eow-cre (set! eow-cre (compile))))
|
||||||
|
|
||||||
|
(else (error "compile-regexp -- not a regexp" re)))))
|
||||||
|
|
||||||
|
(define bos-cre #f)
|
||||||
|
(define eos-cre #f)
|
||||||
|
(define bow-cre #f)
|
||||||
|
(define eow-cre #f)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
(define (regexp-search re str . maybe-start)
|
||||||
|
(let* ((tsm (re-tsm re))
|
||||||
|
(svec (make-vector (+ 1 tsm) #f))
|
||||||
|
(evec (make-vector (+ 1 tsm) #f))
|
||||||
|
(cre (compile-regexp re)))
|
||||||
|
(cre-search cre svec evec str (:optional maybe-start 0))))
|
||||||
|
|
||||||
|
|
||||||
|
(define (regexp-search? re str . maybe-start)
|
||||||
|
(cre-search? (compile-regexp re) str (:optional maybe-start 0)))
|
|
@ -0,0 +1,61 @@
|
||||||
|
/* This is an Scheme48/C interface file,
|
||||||
|
** automatically generated by cig.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h> /* For malloc. */
|
||||||
|
#include "libcig.h"
|
||||||
|
|
||||||
|
/* Make sure foreign-function stubs interface to the C funs correctly: */
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include "../regexp/regex.h"
|
||||||
|
#include "re1.h"
|
||||||
|
|
||||||
|
scheme_value df_compile_re(long nargs, scheme_value *args)
|
||||||
|
{
|
||||||
|
extern int compile_re(scheme_value , int , regex_t* *);
|
||||||
|
scheme_value ret1;
|
||||||
|
int r1;
|
||||||
|
regex_t* r2;
|
||||||
|
|
||||||
|
cig_check_nargs(3, nargs, "compile_re");
|
||||||
|
r1 = compile_re(args[2], EXTRACT_BOOLEAN(args[1]), &r2);
|
||||||
|
ret1 = ENTER_FIXNUM(r1);
|
||||||
|
AlienVal(VECTOR_REF(*args,0)) = (long) r2;
|
||||||
|
return ret1;
|
||||||
|
}
|
||||||
|
|
||||||
|
scheme_value df_re_search(long nargs, scheme_value *args)
|
||||||
|
{
|
||||||
|
extern scheme_value re_search(const regex_t *, scheme_value , int , scheme_value , int , scheme_value , scheme_value );
|
||||||
|
scheme_value ret1;
|
||||||
|
scheme_value r1;
|
||||||
|
|
||||||
|
cig_check_nargs(7, nargs, "re_search");
|
||||||
|
r1 = re_search((const regex_t *)AlienVal(args[6]), args[5], EXTRACT_FIXNUM(args[4]), args[3], EXTRACT_FIXNUM(args[2]), args[1], args[0]);
|
||||||
|
ret1 = r1;
|
||||||
|
return ret1;
|
||||||
|
}
|
||||||
|
|
||||||
|
scheme_value df_re_errint2str(long nargs, scheme_value *args)
|
||||||
|
{
|
||||||
|
extern const char *re_errint2str(int , const regex_t *);
|
||||||
|
scheme_value ret1;
|
||||||
|
const char *r1;
|
||||||
|
|
||||||
|
cig_check_nargs(3, nargs, "re_errint2str");
|
||||||
|
r1 = re_errint2str(EXTRACT_FIXNUM(args[2]), (const regex_t *)AlienVal(args[1]));
|
||||||
|
ret1 = VECTOR_REF(*args,0);
|
||||||
|
{AlienVal(CAR(ret1)) = (long) r1; CDR(ret1) = strlen_or_false(r1);}
|
||||||
|
return ret1;
|
||||||
|
}
|
||||||
|
|
||||||
|
scheme_value df_free_re(long nargs, scheme_value *args)
|
||||||
|
{
|
||||||
|
extern void free_re(regex_t* );
|
||||||
|
|
||||||
|
cig_check_nargs(1, nargs, "free_re");
|
||||||
|
free_re((regex_t* )AlienVal(args[0]));
|
||||||
|
return SCHFALSE;
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,152 @@
|
||||||
|
;;; Regular expression matching for scsh
|
||||||
|
;;; Copyright (c) 1994 by Olin Shivers.
|
||||||
|
|
||||||
|
(foreign-source
|
||||||
|
"/* Make sure foreign-function stubs interface to the C funs correctly: */"
|
||||||
|
"#include <sys/types.h>"
|
||||||
|
"#include \"../regexp/regex.h\""
|
||||||
|
"#include \"re1.h\""
|
||||||
|
"" ""
|
||||||
|
)
|
||||||
|
|
||||||
|
;;; Match data for regexp matches.
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-record regexp-match
|
||||||
|
string ; The string against which we matched
|
||||||
|
start ; vector of starting indices
|
||||||
|
end) ; vector of ending indices
|
||||||
|
|
||||||
|
(define (match:start match . maybe-index)
|
||||||
|
(vector-ref (regexp-match:start match)
|
||||||
|
(:optional maybe-index 0)))
|
||||||
|
|
||||||
|
(define (match:end match . maybe-index)
|
||||||
|
(vector-ref (regexp-match:end match)
|
||||||
|
(:optional maybe-index 0)))
|
||||||
|
|
||||||
|
(define (match:substring match . maybe-index)
|
||||||
|
(let* ((i (:optional maybe-index 0))
|
||||||
|
(start (vector-ref (regexp-match:start match) i)))
|
||||||
|
(and start (substring (regexp-match:string match)
|
||||||
|
start
|
||||||
|
(vector-ref (regexp-match:end match) i)))))
|
||||||
|
|
||||||
|
;;; Compiling regexps
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; There's no legal Posix string expressing the empty match (e.g., (|))
|
||||||
|
;;; that will never match anything. So when we have one of these, we set
|
||||||
|
;;; the STRING field to #f. The matchers will spot this case and handle it
|
||||||
|
;;; specially.
|
||||||
|
|
||||||
|
;;; We compile the string two ways, on demand -- one for cre-search, and
|
||||||
|
;;; one for cre-search?.
|
||||||
|
|
||||||
|
(define-record cre ; A compiled regular expression
|
||||||
|
string ; The Posix string form of the regexp or #F.
|
||||||
|
max-paren ; Max paren in STRING needed for submatches.
|
||||||
|
(bytes #f) ; Pointer to the compiled form, in the C heap, or #F.
|
||||||
|
(bytes/nm #f) ; Same as BYTES, but compiled with no-submatch.
|
||||||
|
tvec ; Translation vector for the submatches
|
||||||
|
((disclose self) (list "cre" (cre:string self))))
|
||||||
|
|
||||||
|
(define (new-cre str tvec) (make-cre str (max-live-posix-submatch tvec) tvec))
|
||||||
|
|
||||||
|
(define (max-live-posix-submatch tvec)
|
||||||
|
(vfold (lambda (sm mlpsm) (if sm (max mlpsm sm) mlpsm)) 0 tvec))
|
||||||
|
|
||||||
|
(define (compile-posix-re->c-struct re-string sm?)
|
||||||
|
(receive (errcode c-struct) (%compile-re re-string sm?)
|
||||||
|
(if (zero? errcode) c-struct
|
||||||
|
(error errcode (%regerror-msg errcode c-struct)
|
||||||
|
compile-posix-re->c-struct re-string sm?))))
|
||||||
|
|
||||||
|
(define-foreign %compile-re (compile_re (string-desc pattern) (bool submatches?))
|
||||||
|
integer ; 0 or error code
|
||||||
|
(C regex_t*))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Searching with compiled regexps
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; cre-search returns match info; cre-search? is just a predicate.
|
||||||
|
|
||||||
|
(define (cre-search cre start-vec end-vec str start)
|
||||||
|
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
|
||||||
|
(and re-str
|
||||||
|
(let* ((C-bytes (or (cre:bytes cre)
|
||||||
|
(let ((C-bytes (compile-posix-re->c-struct re-str #t)))
|
||||||
|
(set-cre:bytes cre C-bytes)
|
||||||
|
(register-re-c-struct cre C-bytes)
|
||||||
|
C-bytes)))
|
||||||
|
(retcode (%cre-search C-bytes str start
|
||||||
|
(cre:tvec cre)
|
||||||
|
(cre:max-paren cre)
|
||||||
|
start-vec end-vec)))
|
||||||
|
(if (integer? retcode)
|
||||||
|
(error retcode (%regerror-msg retcode C-bytes)
|
||||||
|
cre-search cre start-vec end-vec str start)
|
||||||
|
(and retcode (make-regexp-match str start-vec end-vec)))))))
|
||||||
|
|
||||||
|
(define (cre-search? cre str start)
|
||||||
|
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
|
||||||
|
(and re-str
|
||||||
|
(let* ((C-bytes (or (cre:bytes/nm cre)
|
||||||
|
(let ((C-bytes (compile-posix-re->c-struct re-str #f)))
|
||||||
|
(set-cre:bytes/nm cre C-bytes)
|
||||||
|
(register-re-c-struct cre C-bytes)
|
||||||
|
C-bytes)))
|
||||||
|
(retcode (%cre-search C-bytes str start '#() -1 '#() '#())))
|
||||||
|
(if (integer? retcode)
|
||||||
|
(error retcode (%regerror-msg retcode C-bytes)
|
||||||
|
cre-search? cre str start)
|
||||||
|
retcode)))))
|
||||||
|
|
||||||
|
(define-foreign %cre-search
|
||||||
|
(re_search ((C "const regex_t *~a") compiled-regexp)
|
||||||
|
(string-desc str)
|
||||||
|
(integer start)
|
||||||
|
(vector-desc tvec) (integer max-psm)
|
||||||
|
(vector-desc svec) (vector-desc evec))
|
||||||
|
desc) ; 0 success, #f no-match, or non-zero int error code.
|
||||||
|
|
||||||
|
|
||||||
|
;;; Generate an error msg from an error code.
|
||||||
|
|
||||||
|
(define-foreign %regerror-msg (re_errint2str (integer errcode)
|
||||||
|
((C "const regex_t *~a") re))
|
||||||
|
string)
|
||||||
|
|
||||||
|
|
||||||
|
;;; Reclaiming compiled regexp storage
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Avert your eyes from the unsightly crock.
|
||||||
|
;;;
|
||||||
|
;;; S48 0.36 doesn't have finalizers, so we don't have a way to free
|
||||||
|
;;; the C regexp_t structure when its CRE record is gc'd. So our current
|
||||||
|
;;; lame approximation is to keep track of all the CRE's with a list of
|
||||||
|
;;; (cre-weak-pointer . regex_t*)
|
||||||
|
;;; pairs. From time to time, we should walk the list. If we deref the
|
||||||
|
;;; weak pointer and discover the CRE's been GC'd, we free the regex_t
|
||||||
|
;;; struct.
|
||||||
|
;;;
|
||||||
|
;;; Note this code is completely thread unsafe.
|
||||||
|
|
||||||
|
;;; Free the space used by a compiled regexp.
|
||||||
|
(define-foreign %free-re (free_re ((C regex_t*) re)) ignore)
|
||||||
|
|
||||||
|
(define *master-cre-list* '())
|
||||||
|
|
||||||
|
;;; Whenever we make a new CRE, use this proc to add it to the master list.
|
||||||
|
(define (register-re-c-struct cre c-bytes)
|
||||||
|
(set! *master-cre-list* (cons (cons (make-weak-pointer cre) c-bytes)
|
||||||
|
*master-cre-list*)))
|
||||||
|
|
||||||
|
(define (clean-up-cres)
|
||||||
|
(set! *master-cre-list*
|
||||||
|
(fold (lambda (elt lis)
|
||||||
|
(if (weak-pointer-ref (car elt)) ; Still alive
|
||||||
|
(cons elt lis)
|
||||||
|
(begin (%free-re (cdr elt))
|
||||||
|
lis)))
|
||||||
|
'()
|
||||||
|
*master-cre-list*)))
|
|
@ -0,0 +1,142 @@
|
||||||
|
;;; Substitution ops with regexps
|
||||||
|
;;; Copyright (c) 1998 by Olin Shivers.
|
||||||
|
|
||||||
|
;;; These function have to be in a separate package because they use
|
||||||
|
;;; the scsh I/O function WRITE-STRING. The rest of the regexp system
|
||||||
|
;;; has no dependencies on scsh system code, and is defined independently
|
||||||
|
;;; of scsh -- which scsh, in turn, relies upon: pieces of scsh-level-0
|
||||||
|
;;; use the regexp basics. So we have to split this code out to avoid
|
||||||
|
;;; a circular dependency in the modules: scsh-level-0 needs the regexp
|
||||||
|
;;; package which needs WRITE-STRING, which comes from the regexp package.
|
||||||
|
|
||||||
|
(define (regexp-substitute port match . items)
|
||||||
|
(let* ((str (regexp-match:string match))
|
||||||
|
(sv (regexp-match:start match))
|
||||||
|
(ev (regexp-match:end match))
|
||||||
|
(range (lambda (item) ; Return start & end of
|
||||||
|
(cond ((integer? item) ; ITEM's range in STR.
|
||||||
|
(values (vector-ref sv item)
|
||||||
|
(vector-ref ev item)))
|
||||||
|
((eq? 'pre item) (values 0 (vector-ref sv 0)))
|
||||||
|
((eq? 'post item) (values (vector-ref ev 0)
|
||||||
|
(string-length str)))
|
||||||
|
(else (error "Illegal substitution item."
|
||||||
|
item
|
||||||
|
regexp-substitute))))))
|
||||||
|
(if port
|
||||||
|
|
||||||
|
;; Output port case.
|
||||||
|
(for-each (lambda (item)
|
||||||
|
(if (string? item) (write-string item port)
|
||||||
|
(receive (si ei) (range item)
|
||||||
|
(write-string str port si ei))))
|
||||||
|
items)
|
||||||
|
|
||||||
|
;; Here's the string case. Make two passes -- one to
|
||||||
|
;; compute the length of the target string, one to fill it in.
|
||||||
|
(let* ((len (fold (lambda (item i)
|
||||||
|
(+ i (if (string? item) (string-length item)
|
||||||
|
(receive (si ei) (range item) (- ei si)))))
|
||||||
|
0 items))
|
||||||
|
(ans (make-string len)))
|
||||||
|
|
||||||
|
(fold (lambda (item index)
|
||||||
|
(cond ((string? item)
|
||||||
|
(string-copy! ans index item)
|
||||||
|
(+ index (string-length item)))
|
||||||
|
(else (receive (si ei) (range item)
|
||||||
|
(string-copy! ans index str si ei)
|
||||||
|
(+ index (- ei si))))))
|
||||||
|
0 items)
|
||||||
|
ans))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
(define (regexp-substitute/global port re str . items)
|
||||||
|
(let ((str-len (string-length str))
|
||||||
|
(range (lambda (start sv ev item) ; Return start & end of
|
||||||
|
(cond ((integer? item) ; ITEM's range in STR.
|
||||||
|
(values (vector-ref sv item)
|
||||||
|
(vector-ref ev item)))
|
||||||
|
((eq? 'pre item) (values start (vector-ref sv 0)))
|
||||||
|
(else (error "Illegal substitution item."
|
||||||
|
item
|
||||||
|
regexp-substitute/global)))))
|
||||||
|
(num-posts (fold (lambda (item count)
|
||||||
|
(+ count (if (eq? item 'post) 1 0)))
|
||||||
|
0 items)))
|
||||||
|
|
||||||
|
(if (and port (< num-posts 2))
|
||||||
|
|
||||||
|
;; Output port case, with zero or one POST items.
|
||||||
|
(let recur ((start 0))
|
||||||
|
(if (<= start str-len)
|
||||||
|
(let ((match (regexp-search re str start)))
|
||||||
|
(if match
|
||||||
|
(let* ((sv (regexp-match:start match))
|
||||||
|
(ev (regexp-match:end match))
|
||||||
|
(s (vector-ref sv 0))
|
||||||
|
(e (vector-ref ev 0))
|
||||||
|
(empty? (= s e)))
|
||||||
|
(for-each (lambda (item)
|
||||||
|
(cond ((string? item) (write-string item port))
|
||||||
|
|
||||||
|
((procedure? item) (write-string (item match) port))
|
||||||
|
|
||||||
|
((eq? 'post0 item)
|
||||||
|
(if (and empty? (< s str-len))
|
||||||
|
(write-char (string-ref str s) port)))
|
||||||
|
|
||||||
|
((eq? 'post item)
|
||||||
|
(recur (if empty? (+ 1 e) e)))
|
||||||
|
|
||||||
|
(else (receive (si ei)
|
||||||
|
(range start sv ev item)
|
||||||
|
(write-string str port si ei)))))
|
||||||
|
items))
|
||||||
|
|
||||||
|
(write-string str port start))))) ; No match.
|
||||||
|
|
||||||
|
;; Either we're making a string, or >1 POST.
|
||||||
|
(let* ((pieces (let recur ((start 0))
|
||||||
|
(if (> start str-len) '()
|
||||||
|
(let ((match (regexp-search re str start))
|
||||||
|
(cached-post #f))
|
||||||
|
(if match
|
||||||
|
(let* ((sv (regexp-match:start match))
|
||||||
|
(ev (regexp-match:end match))
|
||||||
|
(s (vector-ref sv 0))
|
||||||
|
(e (vector-ref ev 0))
|
||||||
|
(empty? (= s e)))
|
||||||
|
(fold (lambda (item pieces)
|
||||||
|
(cond ((string? item)
|
||||||
|
(cons item pieces))
|
||||||
|
|
||||||
|
((procedure? item)
|
||||||
|
(cons (item match) pieces))
|
||||||
|
|
||||||
|
((eq? 'post0 item)
|
||||||
|
(if (and empty? (< s str-len))
|
||||||
|
(cons (string (string-ref str s))
|
||||||
|
pieces)
|
||||||
|
pieces))
|
||||||
|
|
||||||
|
((eq? 'post item)
|
||||||
|
(if (not cached-post)
|
||||||
|
(set! cached-post
|
||||||
|
(recur (if empty? (+ e 1) e))))
|
||||||
|
(append cached-post pieces))
|
||||||
|
|
||||||
|
(else (receive (si ei)
|
||||||
|
(range start sv ev item)
|
||||||
|
(cons (substring str si ei)
|
||||||
|
pieces)))))
|
||||||
|
'() items))
|
||||||
|
|
||||||
|
;; No match. Return str[start,end].
|
||||||
|
(list (if (zero? start) str
|
||||||
|
(substring str start (string-length str)))))))))
|
||||||
|
|
||||||
|
(pieces (reverse pieces)))
|
||||||
|
(if port (for-each (lambda (p) (write-string p port)) pieces)
|
||||||
|
(apply string-append pieces))))))
|
|
@ -0,0 +1,115 @@
|
||||||
|
;;; SRE syntax support for regular expressions
|
||||||
|
;;; Olin Shivers, June 1998.
|
||||||
|
|
||||||
|
;;; Export SRE-FORM?, EXPAND-RX
|
||||||
|
|
||||||
|
;;; Is the form an SRE expression?
|
||||||
|
;;; We only shallowly check the initial keyword of a compound form.
|
||||||
|
|
||||||
|
(define (sre-form? exp r same?) ; An SRE is
|
||||||
|
(let ((kw? (lambda (x kw) (same? x (r kw)))))
|
||||||
|
(or (string? exp) ; "foo"
|
||||||
|
(and (pair? exp)
|
||||||
|
(let ((head (car exp)))
|
||||||
|
(or (every string? exp) ; ("aeiou")
|
||||||
|
(kw? head '*) ; (* re ...)
|
||||||
|
(kw? head '+) ; (+ re ...)
|
||||||
|
(kw? head '?) ; (? re ...)
|
||||||
|
(kw? head '=) ; (= n re ...)
|
||||||
|
(kw? head '>=) ; (>= n re ...)
|
||||||
|
(kw? head '**) ; (** m n re ...)
|
||||||
|
|
||||||
|
(kw? head '|) ; (| re ...)
|
||||||
|
(kw? head 'or) ; (| re ...)
|
||||||
|
(kw? head ':) ; (: re ...)
|
||||||
|
(kw? head 'seq) ; (: re ...)
|
||||||
|
|
||||||
|
(kw? head '-) ; (- re ...)
|
||||||
|
(kw? head '&) ; (& re ...)
|
||||||
|
(kw? head '~) ; (~ re ...)
|
||||||
|
|
||||||
|
(kw? head 'submatch) ; (submatch re ...)
|
||||||
|
(kw? head 'dsm) ; (dsm pre post re ...)
|
||||||
|
|
||||||
|
(kw? head 'uncase) ; (uncase re ...)
|
||||||
|
(kw? head 'w/case) ; (w/case re ...)
|
||||||
|
(kw? head 'w/nocase) ; (w/nocase re ...)
|
||||||
|
|
||||||
|
(kw? head 'unquote) ; ,exp
|
||||||
|
(kw? head 'unquote-splicing) ; ,@exp
|
||||||
|
|
||||||
|
(kw? head 'posix-string) ; (posix-string string)
|
||||||
|
|
||||||
|
(kw? head 'word+) ; (word+ re ...)
|
||||||
|
(kw? head 'word)))) ; (word re ...)
|
||||||
|
|
||||||
|
(kw? exp 'any) ; any
|
||||||
|
(kw? exp 'nonl) ; nonl
|
||||||
|
(kw? exp 'word) ; word
|
||||||
|
(kw? exp 'bos) (kw? exp 'eos) ; bos / eos
|
||||||
|
(kw? exp 'bol) (kw? exp 'eol) ; bol / eol
|
||||||
|
(kw? exp 'bow) (kw? exp 'eow) ; bow / eow
|
||||||
|
|
||||||
|
(kw? exp 'lower-case) (kw? exp 'lower); The char class names
|
||||||
|
(kw? exp 'upper-case) (kw? exp 'upper)
|
||||||
|
(kw? exp 'alphabetic) (kw? exp 'alpha)
|
||||||
|
(kw? exp 'numeric) (kw? exp 'num) (kw? exp 'digit)
|
||||||
|
(kw? exp 'alphanumeric) (kw? exp 'alphanum) (kw? exp 'alnum)
|
||||||
|
(kw? exp 'blank)
|
||||||
|
(kw? exp 'control) (kw? exp 'cntrl)
|
||||||
|
(kw? exp 'printing) (kw? exp 'print)
|
||||||
|
(kw? exp 'punctuation) (kw? exp 'punct)
|
||||||
|
(kw? exp 'hex-digit) (kw? exp 'hex) (kw? exp 'xdigit)
|
||||||
|
(kw? exp 'graphic) (kw? exp 'graph)
|
||||||
|
(kw? exp 'whitespace) (kw? exp 'white) (kw? exp 'space)
|
||||||
|
(kw? exp 'ascii))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; (if-sre-form form conseq-form alt-form)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; If FORM is an SRE, expand into CONSEQ-FORM, otherwise ALT-FORM.
|
||||||
|
;;; This is useful for expanding a subform of a macro that can
|
||||||
|
;;; be either a regexp or something else, e.g.
|
||||||
|
;;; (if-sre-form test ; If TEST is a regexp,
|
||||||
|
;;; (regexp-search? (rx test) line) ; match it against the line,
|
||||||
|
;;; (test line)) ; otw it's a predicate.
|
||||||
|
|
||||||
|
;;; The macro is actually defined directly in the module file.
|
||||||
|
;;; (define-syntax if-sre-form
|
||||||
|
;;; (lambda (exp r c)
|
||||||
|
;;; (if (sre-form? (cadr exp) r c)
|
||||||
|
;;; (caddr exp)
|
||||||
|
;;; (cadddr exp))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; (RX re ...)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; The basic SRE form.
|
||||||
|
|
||||||
|
(define (expand-rx exp r c)
|
||||||
|
(let ((re (simplify-regexp (parse-sres (cdr exp) r c))))
|
||||||
|
|
||||||
|
;; If it's static, pre-compute the Posix string & tvec now,
|
||||||
|
;; so the re->scheme unparser will find it and toss it into
|
||||||
|
;; the constructor. We do this only for the top-level regexp.
|
||||||
|
(if (static-regexp? re) (compile-regexp re))
|
||||||
|
|
||||||
|
(regexp->scheme re r)))
|
||||||
|
|
||||||
|
|
||||||
|
;(define-syntax rx (syntax-rules () ((rx stuff ...) (really-rx stuff ...))))
|
||||||
|
;(define-syntax really-rx
|
||||||
|
; (syntax-rules () ((really-rx stuff ...) (rx/cs stuff ...))))
|
||||||
|
;
|
||||||
|
;(define-syntax rx/cs (lambda (exp r c) (expand-rx exp #t r c)))
|
||||||
|
;(define-syntax rx/ci (lambda (exp r c) (expand-rx exp #f r c)))
|
||||||
|
;
|
||||||
|
;(define-syntax case-sensitive
|
||||||
|
; (lambda (exp r c)
|
||||||
|
; (let ((%ls (r 'let-syntax))
|
||||||
|
; (%really-rx (r 'really-rx))
|
||||||
|
; (%sr (r 'syntax-rules))
|
||||||
|
; (%rx/cs (r 'rx/cs)))
|
||||||
|
; `(,ls ((,%really-rx (,sr () ((,%really-rx stuff ...) (,%rx/cs stuff ...)))))
|
||||||
|
; . ,(cdr exp)))))
|
||||||
|
|
|
@ -0,0 +1,592 @@
|
||||||
|
;;; The regexp data type
|
||||||
|
;;; Olin Shivers, January 1997, May 1998.
|
||||||
|
|
||||||
|
;;; A DSM around a choice gets absorbed into the choice's first elt.
|
||||||
|
;;; But this prevents it from being moved out into a containing
|
||||||
|
;;; choice or seq elt, or outer DSM. Fix.
|
||||||
|
|
||||||
|
;;; A regexp is a: dsm, submatch, seq, choice, repeat,
|
||||||
|
;;; char-set, string, bos, eos
|
||||||
|
|
||||||
|
;;; Deleted sub-match regexp
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; This stands for a regexp containing TSM submatches, of which
|
||||||
|
;;; PRE-DSM come first as dead submatches, then the regexp BODY with its
|
||||||
|
;;; submatches, then POST-DSM as dead submatches.
|
||||||
|
|
||||||
|
(define-record-type re-dsm :re-dsm
|
||||||
|
(%%make-re-dsm body pre-dsm tsm posix)
|
||||||
|
re-dsm?
|
||||||
|
(body re-dsm:body) ; A Regexp
|
||||||
|
(pre-dsm re-dsm:pre-dsm) ; Integer -- initial dead submatches
|
||||||
|
(tsm re-dsm:tsm) ; Total submatch count
|
||||||
|
(posix re-dsm:posix set-re-dsm:posix)) ; Posix bits
|
||||||
|
|
||||||
|
(define (%make-re-dsm body pre-dsm tsm) (%%make-re-dsm body pre-dsm tsm #f))
|
||||||
|
|
||||||
|
;;; This is only used in code that the (RX ...) macro produces
|
||||||
|
;;; for static regexps.
|
||||||
|
(define (%make-re-dsm/posix body pre-dsm tsm posix-str tvec)
|
||||||
|
(%%make-re-dsm body pre-dsm tsm (new-cre posix-str tvec)))
|
||||||
|
|
||||||
|
(define (make-re-dsm body pre-dsm post-dsm)
|
||||||
|
(%make-re-dsm body pre-dsm (+ post-dsm pre-dsm (re-tsm body))))
|
||||||
|
|
||||||
|
;;; "Virtual field" for the RE-DSM record -- how many dead submatches
|
||||||
|
;;; come after the body:
|
||||||
|
|
||||||
|
(define (re-dsm:post-dsm re) ; Number of post-body DSM's =
|
||||||
|
(- (re-dsm:tsm re) ; total submatches
|
||||||
|
(+ (re-dsm:pre-dsm re) ; minus pre-body dead submatches
|
||||||
|
(re-tsm (re-dsm:body re))))) ; minus body's submatches.
|
||||||
|
|
||||||
|
;;; Slightly smart DSM constructor:
|
||||||
|
;;; - Absorb this DSM into an inner dsm, or submatch.
|
||||||
|
;;; - Punt unnecessary DSM's.
|
||||||
|
|
||||||
|
(define (re-dsm body pre-dsm post-dsm)
|
||||||
|
(let ((tsm (+ pre-dsm (re-tsm body) post-dsm)))
|
||||||
|
(receive (body1 pre-dsm1) (open-dsm body)
|
||||||
|
(let ((pre-dsm (+ pre-dsm pre-dsm1)))
|
||||||
|
|
||||||
|
(? ((= tsm (re-tsm body1)) body1) ; Trivial DSM
|
||||||
|
|
||||||
|
((re-submatch? body1) ; Absorb into submatch.
|
||||||
|
(%make-re-submatch (re-submatch:body body1)
|
||||||
|
(+ pre-dsm (re-submatch:pre-dsm body1))
|
||||||
|
tsm))
|
||||||
|
|
||||||
|
(else (%make-re-dsm body1 pre-dsm tsm))))))) ; Non-trivial DSM
|
||||||
|
|
||||||
|
;;; Take a regexp RE and return an equivalent (re', pre-dsm) pair of values.
|
||||||
|
;;; Recurses into DSM records. It is the case that
|
||||||
|
;;; (<= (+ pre-dsm (re-tsm re')) (re-tsm re))
|
||||||
|
;;; The post-dsm value is (- (re-tsm re) (re-tsm re') pre-dsm).
|
||||||
|
|
||||||
|
(define (open-dsm re)
|
||||||
|
(let lp ((re re) (pre-dsm 0))
|
||||||
|
(if (re-dsm? re)
|
||||||
|
(lp (re-dsm:body re) (+ pre-dsm (re-dsm:pre-dsm re)))
|
||||||
|
(values re pre-dsm))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Sequence: (& re ...)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-record-type re-seq :re-seq
|
||||||
|
(%%make-re-seq elts tsm posix)
|
||||||
|
re-seq?
|
||||||
|
(elts re-seq:elts) ; Regexp list
|
||||||
|
(tsm re-seq:tsm) ; Total submatch count
|
||||||
|
(posix re-seq:posix set-re-seq:posix)) ; Posix record
|
||||||
|
|
||||||
|
(define (%make-re-seq elts tsm) (%%make-re-seq elts tsm #f))
|
||||||
|
|
||||||
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
||||||
|
(define (%make-re-seq/posix elts tsm posix-str tvec)
|
||||||
|
(%%make-re-seq elts tsm (new-cre posix-str tvec)))
|
||||||
|
|
||||||
|
(define (make-re-seq res)
|
||||||
|
(%make-re-seq res
|
||||||
|
(fold (lambda (re sm-count) (+ (re-tsm re) sm-count))
|
||||||
|
0 res)))
|
||||||
|
|
||||||
|
;;; Slightly smart sequence constructor:
|
||||||
|
;;; - Flattens nested sequences
|
||||||
|
;;; - Drops trivial "" elements
|
||||||
|
;;; - Empty sequence => ""
|
||||||
|
;;; - Singleton sequence is reduced to its one element.
|
||||||
|
;;; - We don't descend into DSM's; too much work for this routine.
|
||||||
|
|
||||||
|
(define (re-seq res)
|
||||||
|
(let ((res (let recur ((res res)) ; Flatten nested seqs & drop ""'s.
|
||||||
|
(if (pair? res)
|
||||||
|
(let* ((re (car res))
|
||||||
|
(tail (recur (cdr res))))
|
||||||
|
(? ((re-seq? re) ; Flatten nested seqs
|
||||||
|
(append (recur (re-seq:elts re)) tail))
|
||||||
|
((re-trivial? re) tail) ; Drop trivial elts
|
||||||
|
(else (cons re tail))))
|
||||||
|
'()))))
|
||||||
|
|
||||||
|
(if (pair? res)
|
||||||
|
(if (pair? (cdr res))
|
||||||
|
(make-re-seq res) ; General case
|
||||||
|
(car res)) ; Singleton sequence
|
||||||
|
re-trivial))) ; Empty seq -- ""
|
||||||
|
|
||||||
|
|
||||||
|
;;; Choice: (| re ...)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-record-type re-choice :re-choice
|
||||||
|
(%%make-re-choice elts tsm posix)
|
||||||
|
re-choice?
|
||||||
|
(elts re-choice:elts) ; List of rel-items
|
||||||
|
(tsm re-choice:tsm) ; Total submatch count
|
||||||
|
(posix re-choice:posix set-re-choice:posix)) ; Posix string
|
||||||
|
|
||||||
|
(define (%make-re-choice elts tsm) (%%make-re-choice elts tsm #f))
|
||||||
|
|
||||||
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
||||||
|
(define (%make-re-choice/posix elts tsm posix-str tvec)
|
||||||
|
(%%make-re-choice elts tsm (new-cre posix-str tvec)))
|
||||||
|
|
||||||
|
(define (make-re-choice res)
|
||||||
|
(%make-re-choice res
|
||||||
|
(fold (lambda (re sm-count) (+ (re-tsm re) sm-count))
|
||||||
|
0 res)))
|
||||||
|
|
||||||
|
;;; Slightly smart choice constructor:
|
||||||
|
;;; - Flattens nested choices
|
||||||
|
;;; - Drops empty (impossible) elements
|
||||||
|
;;; - Empty choice => empty-match
|
||||||
|
;;; - Singleton choice is reduced to its one element.
|
||||||
|
;;; - We don't descend into DSM's; too much work for this routine.
|
||||||
|
;;;
|
||||||
|
;;; This routine guarantees to preserve char-classness -- if it is applied
|
||||||
|
;;; to a list of char-class regexps (char-set and singleton-string re's),
|
||||||
|
;;; it will return a char-class regexp.
|
||||||
|
|
||||||
|
(define (re-choice res)
|
||||||
|
(let ((res (let recur ((res res)) ; Flatten nested choices
|
||||||
|
(if (pair? res) ; & drop empty re's.
|
||||||
|
(let* ((re (car res))
|
||||||
|
(tail (recur (cdr res))))
|
||||||
|
(? ((re-choice? re) ; Flatten nested choices
|
||||||
|
(append (recur (re-choice:elts re)) tail))
|
||||||
|
((re-empty? re) tail) ; Drop empty re's.
|
||||||
|
(else (cons re tail))))
|
||||||
|
'()))))
|
||||||
|
;; If all elts are char-class re's, fold them together.
|
||||||
|
(if (every static-char-class? res)
|
||||||
|
(let ((cset (apply char-set-union
|
||||||
|
(map (lambda (elt)
|
||||||
|
(if (re-char-set? elt)
|
||||||
|
(re-char-set:cset elt)
|
||||||
|
(string->char-set (re-string:chars elt))))
|
||||||
|
res))))
|
||||||
|
(if (= 1 (char-set-size cset))
|
||||||
|
(make-re-string (apply string (char-set-members cset)))
|
||||||
|
(make-re-char-set cset)))
|
||||||
|
|
||||||
|
(if (pair? res)
|
||||||
|
(if (pair? (cdr res))
|
||||||
|
(make-re-choice res) ; General case
|
||||||
|
(car res)) ; Singleton sequence
|
||||||
|
re-empty)))) ; Empty choice = ("")
|
||||||
|
|
||||||
|
;;; Repetition (*,?,+,=,>=,**)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; The repeat record's body contains all of the repeat record's submatches --
|
||||||
|
;;; there is no pre-dsm field allowing for initial & trailing dead submatches.
|
||||||
|
;;; This is not a limit on expressiveness because repeat commutes with dsm --
|
||||||
|
;;; we can always move submatches that come before and after body to an outer
|
||||||
|
;;; DSM. Hence
|
||||||
|
;;; (= (re-repeat:tsm re) (re-tsm (re-repeat:body re)))
|
||||||
|
|
||||||
|
(define-record-type re-repeat :re-repeat
|
||||||
|
(%%make-re-repeat from to body tsm posix)
|
||||||
|
re-repeat?
|
||||||
|
(from re-repeat:from) ; Integer (Macro expander abuses.)
|
||||||
|
(to re-repeat:to) ; Integer or #f for infinite (Macro expander abuses.)
|
||||||
|
(body re-repeat:body) ; Regexp
|
||||||
|
(tsm re-repeat:tsm) ; Total submatch count
|
||||||
|
(posix re-repeat:posix set-re-repeat:posix)) ; Posix record
|
||||||
|
|
||||||
|
(define (%make-re-repeat from to body tsm)
|
||||||
|
(%%make-re-repeat from to body tsm #f))
|
||||||
|
|
||||||
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
||||||
|
(define (%make-re-repeat/posix from to body tsm posix-str tvec)
|
||||||
|
(%%make-re-repeat from to body tsm (new-cre posix-str tvec)))
|
||||||
|
|
||||||
|
(define (make-re-repeat from to body)
|
||||||
|
(%make-re-repeat (check-arg (lambda (from)
|
||||||
|
(or (not (integer? from)) ; Dynamic
|
||||||
|
(>= from 0)))
|
||||||
|
from
|
||||||
|
make-re-repeat)
|
||||||
|
(check-arg (lambda (to)
|
||||||
|
(or (not (integer? to)) ; #f or dynamic
|
||||||
|
(and (integer? to) (>= to 0))))
|
||||||
|
to
|
||||||
|
make-re-repeat)
|
||||||
|
body
|
||||||
|
(re-tsm body)))
|
||||||
|
|
||||||
|
;;; Slightly smart repeat constructor
|
||||||
|
;;; - Flattens nested repeats.
|
||||||
|
;;; - re{1,1}, re{0,0}, and re{m,n} where m>n reduced.
|
||||||
|
;;; - If re is empty-match: from=0 => "", from>0 => empty-match.
|
||||||
|
;;; - If re is eos, bos, or "", and to <= from, reduce to simply re.
|
||||||
|
;;; - Commutes into DSM records.
|
||||||
|
|
||||||
|
(define (re-repeat from to body)
|
||||||
|
(receive (re pre-dsm) (reduce-repeat from to body 0)
|
||||||
|
(re-dsm re pre-dsm (- (re-tsm body) (+ pre-dsm (re-tsm re))))))
|
||||||
|
|
||||||
|
;;; This guy does all the work (and is also called by the repeat simplifier)
|
||||||
|
|
||||||
|
(define (reduce-repeat from to body pre-dsm)
|
||||||
|
(receive (from to body1 pre-dsm)
|
||||||
|
;; Collapse nested repeats and dsm's:
|
||||||
|
(let iter ((from from) (to to) (body body) (dsm0 pre-dsm))
|
||||||
|
(receive (body body-dsm0) (open-dsm body)
|
||||||
|
(let ((dsm0 (+ dsm0 body-dsm0)))
|
||||||
|
(if (and (integer? from) ; Stop if FROM or TO
|
||||||
|
(or (not to) (integer? to)) ; are code.
|
||||||
|
(re-repeat? body))
|
||||||
|
(let ((bfrom (re-repeat:from body))
|
||||||
|
(bto (re-repeat:to body))
|
||||||
|
(bbody (re-repeat:body body)))
|
||||||
|
(if (or (not (integer? bfrom)) ; Stop if bfrom or
|
||||||
|
(and bto (not (integer? bto)))) ; bto are code.
|
||||||
|
(values from to body dsm0)
|
||||||
|
(iter (* from bfrom)
|
||||||
|
(and to bto (* to bto))
|
||||||
|
bbody
|
||||||
|
dsm0)))
|
||||||
|
(values from to body dsm0)))))
|
||||||
|
|
||||||
|
(? ((and (eqv? from 1) (eqv? to 1)) ; re{1,1} => re
|
||||||
|
(values body1 pre-dsm))
|
||||||
|
|
||||||
|
((and (eqv? from 0) (eqv? to 0)) ; re{0,0} => ""
|
||||||
|
(values re-trivial (+ (re-tsm body1) pre-dsm)))
|
||||||
|
|
||||||
|
;; re{m,n} => re-empty when m>n:
|
||||||
|
((and (integer? from) (integer? to) (> from to))
|
||||||
|
(values re-empty (+ (re-tsm body1) pre-dsm)))
|
||||||
|
|
||||||
|
;; Reduce the body = re-empty case.
|
||||||
|
((and (re-empty? body1) (integer? from)) ; (+ (in)) => (in)
|
||||||
|
(values (if (> from 0) re-empty re-trivial) ; (* (in)) => ""
|
||||||
|
pre-dsm))
|
||||||
|
|
||||||
|
;; If BODY1 is eos, bos, or "", and m<=n, reduce to simply BODY1.
|
||||||
|
((and (integer? from)
|
||||||
|
(or (and (integer? to) (<= from to)) (not to))
|
||||||
|
(or (re-eos? body1)
|
||||||
|
(re-bos? body1)
|
||||||
|
(and (re-string? body1)
|
||||||
|
(string=? "" (re-string:chars body1)))))
|
||||||
|
(values body1 pre-dsm))
|
||||||
|
|
||||||
|
(else (values (make-re-repeat from to body1) ; general case
|
||||||
|
pre-dsm)))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Submatch
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; A submatch record introduces a new submatch. This is followed by
|
||||||
|
;;; PRE-DSM dead submatches (caused by simplifying the body), then the
|
||||||
|
;;; BODY, then perhaps more dead submatches, all for a total of TSM
|
||||||
|
;;; submatches.
|
||||||
|
|
||||||
|
(define-record-type re-submatch :re-submatch
|
||||||
|
(%%make-re-submatch body pre-dsm tsm posix)
|
||||||
|
re-submatch?
|
||||||
|
(body re-submatch:body) ; Regexp
|
||||||
|
(pre-dsm re-submatch:pre-dsm) ; Deleted submatches preceding the body
|
||||||
|
(tsm re-submatch:tsm) ; Total submatch count for the record
|
||||||
|
(posix re-submatch:posix set-re-submatch:posix)) ; Posix string
|
||||||
|
|
||||||
|
(define (%make-re-submatch body pre-dsm tsm)
|
||||||
|
(%%make-re-submatch body pre-dsm tsm #f))
|
||||||
|
|
||||||
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
||||||
|
(define (%make-re-submatch/posix body pre-dsm tsm posix-str tvec)
|
||||||
|
(%%make-re-submatch body pre-dsm tsm (new-cre posix-str tvec)))
|
||||||
|
|
||||||
|
|
||||||
|
;;; "Virtual field" for the RE-SUBMATCH record -- how many dead submatches
|
||||||
|
;;; come after the body:
|
||||||
|
|
||||||
|
(define (re-submatch:post-dsm re) ; Number of post-body DSM's =
|
||||||
|
(- (re-submatch:tsm re) ; total submatches
|
||||||
|
(+ 1 ; minus *this* submatch
|
||||||
|
(re-submatch:pre-dsm re) ; minus pre-body dead submatches
|
||||||
|
(re-tsm (re-submatch:body re))))); minus body's submatches.
|
||||||
|
|
||||||
|
(define (make-re-submatch body . maybe-pre+post-dsm)
|
||||||
|
(let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
|
||||||
|
(%make-re-submatch body pre-dsm (+ pre-dsm 1 (re-tsm body) post-dsm))))
|
||||||
|
|
||||||
|
;;; Slightly smart submatch constructor
|
||||||
|
;;; - DSM's unpacked
|
||||||
|
;;; - If BODY is the re-empty, we'll never match, so just produce a DSM.
|
||||||
|
|
||||||
|
(define (re-submatch body . maybe-pre+post-dsm)
|
||||||
|
(let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
|
||||||
|
(let ((tsm (+ 1 pre-dsm (re-tsm body) post-dsm)))
|
||||||
|
(receive (body1 pre-dsm1) (open-dsm body)
|
||||||
|
(if (re-empty? body1)
|
||||||
|
(re-dsm re-empty tsm 0)
|
||||||
|
(%make-re-submatch body1 (+ pre-dsm pre-dsm1) tsm))))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Other regexps : string, char-set, bos & eos
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Also, re-empty and re-trivial.
|
||||||
|
|
||||||
|
(define-record re-string
|
||||||
|
chars ; String
|
||||||
|
(posix #f) ; Posix record
|
||||||
|
((disclose self) (list "re-string" (re-string:chars self))))
|
||||||
|
|
||||||
|
(define re-string make-re-string) ; For consistency w/other re makers.
|
||||||
|
|
||||||
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
||||||
|
(define (make-re-string/posix chars posix-str tvec)
|
||||||
|
(let ((re (make-re-string chars)))
|
||||||
|
(set-re-string:posix re (new-cre posix-str tvec))
|
||||||
|
re))
|
||||||
|
|
||||||
|
;;; Matches the empty string anywhere.
|
||||||
|
(define re-trivial (make-re-string/posix "" "" '#()))
|
||||||
|
|
||||||
|
(define (re-trivial? re)
|
||||||
|
(and (re-string? re) (zero? (string-length (re-string:chars re)))))
|
||||||
|
|
||||||
|
(define-record re-char-set
|
||||||
|
cset ; A character set (Macro expander abuses.)
|
||||||
|
(posix #f)) ; Posix record
|
||||||
|
|
||||||
|
(define re-char-set make-re-char-set) ; For consistency w/other re makers.
|
||||||
|
|
||||||
|
;;; This is only used in code that (RE ...) macro produces for static regexps.
|
||||||
|
(define (make-re-char-set/posix cs posix-str tvec)
|
||||||
|
(let ((re (make-re-char-set cs)))
|
||||||
|
(set-re-char-set:posix re (new-cre posix-str tvec))
|
||||||
|
re))
|
||||||
|
|
||||||
|
;;; Never matches
|
||||||
|
;;; NEED TO OPTIMIZE - PRE-SET POSIX FIELD.
|
||||||
|
(define re-empty (make-re-char-set char-set:empty))
|
||||||
|
|
||||||
|
(define (re-empty? re)
|
||||||
|
(and (re-char-set? re)
|
||||||
|
(let ((cs (re-char-set:cset re)))
|
||||||
|
(and (char-set? cs) ; Might be code...
|
||||||
|
(char-set-empty? cs)))))
|
||||||
|
|
||||||
|
(define-record re-bos) (define re-bos (make-re-bos))
|
||||||
|
(define-record re-eos) (define re-eos (make-re-eos))
|
||||||
|
|
||||||
|
(define-record re-bol) (define re-bol (make-re-bol))
|
||||||
|
(define-record re-eol) (define re-eol (make-re-eol))
|
||||||
|
|
||||||
|
(define-record re-bow) (define re-bow (make-re-bow))
|
||||||
|
(define-record re-eow) (define re-eow (make-re-eow))
|
||||||
|
|
||||||
|
|
||||||
|
(define re-any (make-re-char-set/posix char-set:full "." '#()))
|
||||||
|
|
||||||
|
(define (re-any? re)
|
||||||
|
(and (re-char-set? re)
|
||||||
|
(let ((cs (re-char-set:cset re)))
|
||||||
|
(and (char-set? cs) ; Might be code...
|
||||||
|
(char-set-full? cs)))))
|
||||||
|
|
||||||
|
(define re-nonl (make-re-char-set/posix (char-set #\newline) "[^\n]" '#()))
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define (regexp? x)
|
||||||
|
(or (re-seq? x) (re-choice? x) (re-repeat? x)
|
||||||
|
(re-char-set? x) (re-string? x)
|
||||||
|
(re-bos? x) (re-eos? x)
|
||||||
|
(re-bol? x) (re-eol? x)
|
||||||
|
(re-bow? x) (re-eow? x)
|
||||||
|
(re-submatch? x) (re-dsm? x)))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Return the total number of submatches bound in RE.
|
||||||
|
|
||||||
|
(define (re-tsm re)
|
||||||
|
(? ((re-seq? re) (re-seq:tsm re))
|
||||||
|
((re-choice? re) (re-choice:tsm re))
|
||||||
|
((re-repeat? re) (re-repeat:tsm re))
|
||||||
|
((re-dsm? re) (re-dsm:tsm re))
|
||||||
|
((re-submatch? re) (re-submatch:tsm re))
|
||||||
|
(else 0)))
|
||||||
|
|
||||||
|
|
||||||
|
(define re-word
|
||||||
|
(let ((wcs (char-set-union char-set:alphanumeric ; Word chars
|
||||||
|
(char-set #\_))))
|
||||||
|
(make-re-seq (list re-bow
|
||||||
|
(make-re-repeat 1 #f (make-re-char-set wcs))
|
||||||
|
re-eow))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; (flush-submatches re)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Return regular expression RE with all submatch-binding elements
|
||||||
|
;;; stripped out -- (= 0 (re-tsm (flush-submatches re))).
|
||||||
|
|
||||||
|
(define (flush-submatches re)
|
||||||
|
(? ((zero? (re-tsm re)) re) ; RE has no submatches.
|
||||||
|
|
||||||
|
((re-seq? re) (re-seq (map flush-submatches (re-seq:elts re))))
|
||||||
|
((re-choice? re) (re-choice (map flush-submatches (re-choice:elts re))))
|
||||||
|
|
||||||
|
((re-repeat? re) (re-repeat (re-repeat:from re)
|
||||||
|
(re-repeat:to re)
|
||||||
|
(flush-submatches (re-repeat:body re))))
|
||||||
|
|
||||||
|
((re-submatch? re) (flush-submatches (re-submatch:body re)))
|
||||||
|
((re-dsm? re) (flush-submatches (re-dsm:body re)))
|
||||||
|
|
||||||
|
(else re)))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Map F over ELTS. (F x) returns two values -- the "real" return value,
|
||||||
|
;;; and a "changed?" flag. If CHANGED? is false, then the "real" return value
|
||||||
|
;;; should be identical to the original argument X. MAP/CHANGED constructs
|
||||||
|
;;; the mapped list sharing as long an unchanged tail as possible with the
|
||||||
|
;;; list ELTS; if F changes no argument, MAP/CHANGED returns exactly the list
|
||||||
|
;;; ELTS. MAP/CHANGED returns two values: the mapped list, and a changed?
|
||||||
|
;;; flag for the entire list.
|
||||||
|
|
||||||
|
(define (map/changed f elts)
|
||||||
|
(let recur ((elts elts))
|
||||||
|
(if (pair? elts)
|
||||||
|
(let ((elt (car elts)))
|
||||||
|
(receive (new-elts elts-changed?) (recur (cdr elts))
|
||||||
|
(receive (new-elt elt-changed?) (f elt)
|
||||||
|
(if (or elts-changed? elt-changed?)
|
||||||
|
(values (cons new-elt new-elts) #t)
|
||||||
|
(values elts #f)))))
|
||||||
|
(values '() #f))))
|
||||||
|
|
||||||
|
|
||||||
|
(define (uncase re)
|
||||||
|
(receive (new-re changed?)
|
||||||
|
(let recur ((re re))
|
||||||
|
(? ((re-seq? re)
|
||||||
|
(let ((elts (re-seq:elts re)))
|
||||||
|
(receive (new-elts elts-changed?)
|
||||||
|
(map/changed recur elts)
|
||||||
|
(if elts-changed?
|
||||||
|
(values (%make-re-seq new-elts (re-seq:tsm re)) #t)
|
||||||
|
(values re #f)))))
|
||||||
|
|
||||||
|
((re-choice? re)
|
||||||
|
(let ((elts (re-choice:elts re)))
|
||||||
|
(receive (new-elts elts-changed?)
|
||||||
|
(map/changed recur elts)
|
||||||
|
(if elts-changed?
|
||||||
|
(values (re-choice new-elts) #t)
|
||||||
|
(values re #f)))))
|
||||||
|
|
||||||
|
((re-char-set? re)
|
||||||
|
(let* ((cs (re-char-set:cset re))
|
||||||
|
(new-cs (uncase-char-set cs))) ; Better not be code.
|
||||||
|
(if (char-set= cs new-cs)
|
||||||
|
(values re #f)
|
||||||
|
(values (make-re-char-set new-cs) #t))))
|
||||||
|
|
||||||
|
((re-repeat? re)
|
||||||
|
(receive (new-body body-changed?) (recur (re-repeat:body re))
|
||||||
|
(if body-changed?
|
||||||
|
(values (re-repeat (re-repeat:from re)
|
||||||
|
(re-repeat:to re)
|
||||||
|
new-body)
|
||||||
|
#t)
|
||||||
|
(values re #f))))
|
||||||
|
|
||||||
|
((re-submatch? re)
|
||||||
|
(receive (new-body body-changed?) (recur (re-submatch? re))
|
||||||
|
(if body-changed?
|
||||||
|
(values (%make-re-submatch new-body
|
||||||
|
(re-submatch:pre-dsm re)
|
||||||
|
(re-submatch:tsm re))
|
||||||
|
#t)
|
||||||
|
(values re #f))))
|
||||||
|
|
||||||
|
((re-string? re)
|
||||||
|
(let ((cf-re (uncase-string (re-string:chars re))))
|
||||||
|
(if (re-string? cf-re)
|
||||||
|
(values re #f)
|
||||||
|
(values cf-re #t))))
|
||||||
|
|
||||||
|
(else (values re #f))))
|
||||||
|
new-re))
|
||||||
|
|
||||||
|
|
||||||
|
;;; (uncase-char-set cs)
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; Return a char-set cs' such that cs' contains every char c in cs in both
|
||||||
|
;;; its upcase and downcase form.
|
||||||
|
|
||||||
|
(define (uncase-char-set cs)
|
||||||
|
(char-set-fold (lambda (c new-cset)
|
||||||
|
(char-set-adjoin! new-cset
|
||||||
|
(char-downcase c)
|
||||||
|
(char-upcase c)))
|
||||||
|
(char-set-copy char-set:empty)
|
||||||
|
cs))
|
||||||
|
|
||||||
|
|
||||||
|
;;; I actually make an effort to keep this a re-string
|
||||||
|
;;; if possible (if the string contains no case-sensitive
|
||||||
|
;;; characters). Returns a regexp matching the string in
|
||||||
|
;;; a case-insensitive fashion.
|
||||||
|
|
||||||
|
(define (uncase-string s)
|
||||||
|
;; SEQ is a list of chars and doubleton char-sets.
|
||||||
|
(let* ((seq (string-fold-right (lambda (c lis)
|
||||||
|
(cons (? ((char-lower-case? c) (char-set c (char-upcase c)))
|
||||||
|
((char-upper-case? c) (char-set c (char-downcase c)))
|
||||||
|
(else c))
|
||||||
|
lis))
|
||||||
|
'() s))
|
||||||
|
|
||||||
|
;; Coalesce adjacent chars together into a string.
|
||||||
|
(fixup (lambda (chars seq)
|
||||||
|
(if (pair? chars)
|
||||||
|
(cons (make-re-string (list->string (reverse chars)))
|
||||||
|
seq)
|
||||||
|
seq)))
|
||||||
|
|
||||||
|
(new-seq (let recur ((seq seq) (chars '()))
|
||||||
|
(if (pair? seq)
|
||||||
|
(let ((elt (car seq))
|
||||||
|
(seq (cdr seq)))
|
||||||
|
(if (char? elt)
|
||||||
|
(recur seq (cons elt chars))
|
||||||
|
(fixup chars (cons (make-re-char-set elt)
|
||||||
|
(recur seq '())))))
|
||||||
|
(fixup chars '())))))
|
||||||
|
|
||||||
|
(if (= 1 (length new-seq)) (car new-seq)
|
||||||
|
(make-re-seq new-seq))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
(define char-set-full?
|
||||||
|
(let ((allchars-nchars (char-set-size char-set:full)))
|
||||||
|
(lambda (cs) (= allchars-nchars (char-set-size cs)))))
|
||||||
|
|
||||||
|
(define (char-set-empty? cs) (zero? (char-set-size cs)))
|
||||||
|
|
||||||
|
|
||||||
|
;;; A "char-class" re is either a char-set re or a string re whose string
|
||||||
|
;;; has only one character.
|
||||||
|
|
||||||
|
(define (re-char-class? re)
|
||||||
|
(or (re-char-set? re)
|
||||||
|
(and (re-string? re)
|
||||||
|
(= 1 (string-length (re-string:chars re))))))
|
||||||
|
|
||||||
|
(define (static-char-class? re)
|
||||||
|
(or (and (re-char-set? re)
|
||||||
|
(char-set? (re-char-set:cset re))) ; This might be code.
|
||||||
|
(and (re-string? re) ; But never this, so no check.
|
||||||
|
(= 1 (string-length (re-string:chars re))))))
|
|
@ -0,0 +1,173 @@
|
||||||
|
/* Scheme48 interface to Henry Spencer's Posix regular expression package.
|
||||||
|
** Copyright (c) 1993, 1994, 1998 by Olin Shivers.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Todo:
|
||||||
|
** not_eol not_bol support on searchers
|
||||||
|
** error code -> err msg
|
||||||
|
** regex freeing
|
||||||
|
** regexp-string -> regex_t caching
|
||||||
|
** make filter_stringvec return an error code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include "../regexp/regex.h"
|
||||||
|
#include "../cstuff.h"
|
||||||
|
|
||||||
|
/* Make sure our exports match up w/the implementation: */
|
||||||
|
#include "re1.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Compile regexp into a malloc'd struct.
|
||||||
|
** The flag sm_p is true if we want to compile for submatches.
|
||||||
|
** On success, store pointer to struct into cr and return 0.
|
||||||
|
** On failure, free the struct, store NULL into cr,
|
||||||
|
** and return a non-zero error code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
int compile_re(scheme_value re_str, int sm_p, regex_t **cr)
|
||||||
|
{
|
||||||
|
char *s = &STRING_REF(re_str, 0);
|
||||||
|
int len = STRING_LENGTH(re_str);
|
||||||
|
int err;
|
||||||
|
regex_t *re = Alloc(regex_t);
|
||||||
|
|
||||||
|
if( !re ) return -1;
|
||||||
|
|
||||||
|
re->re_endp = s + len;
|
||||||
|
err = regcomp(re, s, REG_EXTENDED | REG_PEND
|
||||||
|
| (sm_p ? 0 : REG_NOSUB));
|
||||||
|
if( err ) {Free(re); *cr=0;}
|
||||||
|
else *cr=re;
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Do a regex search of RE through string STR, beginning at STR[START].
|
||||||
|
** - STR is passed as a Scheme value as it is allowed to contain nul bytes.
|
||||||
|
**
|
||||||
|
** - trans_vec contains the translation from the user's "virtual" submatches to
|
||||||
|
** the actual submatches the engine will report:
|
||||||
|
** - trans_vec[i] = #F means user submatch #i is a dead submatch.
|
||||||
|
** - trans_vec[i] = j means user submatch #i corresponds to paren #j in re.
|
||||||
|
**
|
||||||
|
** Indexing fence-posts are a little complicated due to the fact that you
|
||||||
|
** get an extra match elt back from the matcher -- match 0 is not a
|
||||||
|
** paren-based *sub*match, but rather the match info for the whole thing.
|
||||||
|
**
|
||||||
|
** So, here is how it works:
|
||||||
|
** length(start_vec) = length(end_vec) = length(trans_vec) + 1
|
||||||
|
** because trans_vec doesn't have a translation for submatch 0, which
|
||||||
|
** is SRE submatch #0 => Posix submatch #0. For SRE submatch #i (1, 2, ...),
|
||||||
|
** we want the submatch associated with Posix paren # trans_vec[i-1].
|
||||||
|
**
|
||||||
|
** - MAX_PSM is the maximum paren in which we have submatch interest -- the
|
||||||
|
** max element in TRANS_VEC. Any parens after paren #MAX_PSM are just for
|
||||||
|
** grouping, not for marking submatches. We only have to allocate MAX_PSM+1
|
||||||
|
** elements in the submatch vector we pass into the search engine. If
|
||||||
|
** MAX_PSM = -1, then we don't even want the whole-match match bounds, which
|
||||||
|
** is really good -- the search engine can really fly in this case.
|
||||||
|
**
|
||||||
|
** If we match, map re's submatches over to the exported start_vec and
|
||||||
|
** end_vec match vectors using trans_vec.
|
||||||
|
**
|
||||||
|
** Return 0 on success; #f if no match; non-zero integer error code otherwise.
|
||||||
|
*/
|
||||||
|
|
||||||
|
scheme_value re_search(const regex_t *re, scheme_value str, int start,
|
||||||
|
scheme_value trans_vec, int max_psm,
|
||||||
|
scheme_value start_vec, scheme_value end_vec)
|
||||||
|
{
|
||||||
|
char *s = &STRING_REF(str,0); /* Passed as a scheme_value because */
|
||||||
|
int len = STRING_LENGTH(str); /* it might contain nul bytes. */
|
||||||
|
|
||||||
|
int vlen = VECTOR_LENGTH(start_vec);
|
||||||
|
int retval;
|
||||||
|
|
||||||
|
regmatch_t static_pmatch[10], *pm;
|
||||||
|
|
||||||
|
/* If max_psm+1 > 10, we can't use static_pmatch. */
|
||||||
|
if( max_psm < 10 ) pm = static_pmatch;
|
||||||
|
else {
|
||||||
|
pm = Malloc(regmatch_t, max_psm+1);/* Add 1 for the whole-match info. */
|
||||||
|
if( !pm ) return ENTER_FIXNUM(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
pm[0].rm_so = start;
|
||||||
|
pm[0].rm_eo = len;
|
||||||
|
|
||||||
|
retval = regexec(re, s, max_psm+1, pm, REG_STARTEND); /* Do it. */
|
||||||
|
|
||||||
|
/* We matched and have match-bound info, so translate it over. */
|
||||||
|
if( !retval && max_psm >= 0 ) {
|
||||||
|
int i;
|
||||||
|
|
||||||
|
VECTOR_REF(start_vec,0) = ENTER_FIXNUM(pm[0].rm_so); /* whole-match */
|
||||||
|
VECTOR_REF(end_vec,0) = ENTER_FIXNUM(pm[0].rm_eo);
|
||||||
|
|
||||||
|
for( i=vlen-1; --i >= 0; ) { /* submatches */
|
||||||
|
scheme_value j_scm = VECTOR_REF(trans_vec,i);
|
||||||
|
if( j_scm != SCHFALSE ) {
|
||||||
|
int j = EXTRACT_FIXNUM(j_scm);
|
||||||
|
int k = pm[j].rm_so,
|
||||||
|
l = pm[j].rm_eo;
|
||||||
|
VECTOR_REF(start_vec,i+1) = (k != -1) ? ENTER_FIXNUM(k) : SCHFALSE;
|
||||||
|
VECTOR_REF(end_vec, i+1) = (l != -1) ? ENTER_FIXNUM(l) : SCHFALSE;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if( max_psm >= 10 ) Free(pm);
|
||||||
|
|
||||||
|
if( retval==REG_NOMATCH ) return SCHFALSE;
|
||||||
|
if( ! retval ) return SCHTRUE;
|
||||||
|
return ENTER_FIXNUM(retval);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Filter a vector of strings by regexp RE_STR.
|
||||||
|
** Stringvec is a NULL-terminated vector of strings;
|
||||||
|
** filter it in-place, copying the survivors back to compact them.
|
||||||
|
** Put the number of survivors in nummatch.
|
||||||
|
*/
|
||||||
|
|
||||||
|
int filter_stringvec(scheme_value re_str, char const **stringvec)
|
||||||
|
{
|
||||||
|
int re_len = STRING_LENGTH(re_str);/* Passed as a scheme_value because */
|
||||||
|
char *re_chars = &STRING_REF(re_str,0);/* it might contain nul bytes. */
|
||||||
|
regex_t re;
|
||||||
|
|
||||||
|
char const **p, **q;
|
||||||
|
|
||||||
|
/* REG_NOSUB -- We just want to know if it matches or not. */
|
||||||
|
re.re_endp = re_chars + re_len;
|
||||||
|
if( regcomp(&re, re_chars, REG_EXTENDED | REG_PEND | REG_NOSUB) ) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(p=q=stringvec; *p; p++) {
|
||||||
|
char const *s = *p;
|
||||||
|
if( ! regexec(&re, s, 0, 0, 0) ) *q++ = s;
|
||||||
|
}
|
||||||
|
|
||||||
|
regfree(&re);
|
||||||
|
return q-stringvec;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const char *re_errint2str(int errcode, const regex_t *re)
|
||||||
|
{
|
||||||
|
int size = regerror(errcode, re, 0, 0);
|
||||||
|
char *s = Malloc(char,size);
|
||||||
|
if(s) regerror(errcode, re, s, size);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void free_re(regex_t *re)
|
||||||
|
{
|
||||||
|
regfree(re);
|
||||||
|
Free(re);
|
||||||
|
}
|
|
@ -0,0 +1,15 @@
|
||||||
|
/* Exports from re1.c */
|
||||||
|
|
||||||
|
int compile_re(scheme_value sre, int sm_p, regex_t **cr);
|
||||||
|
|
||||||
|
scheme_value re_search(const regex_t *re, scheme_value str, int start,
|
||||||
|
scheme_value trans_vec, int max_psm,
|
||||||
|
scheme_value start_vec, scheme_value end_vec);
|
||||||
|
|
||||||
|
/* Filter a vector of strings by a regexp. */
|
||||||
|
int filter_stringvec(scheme_value re_str, char const **stringvec);
|
||||||
|
|
||||||
|
/* Error code -> error msg */
|
||||||
|
const char *re_errint2str(int errcode, const regex_t *re);
|
||||||
|
|
||||||
|
void free_re(regex_t *re); /* Free the malloc'd regexp. */
|
|
@ -0,0 +1,5 @@
|
||||||
|
;;; We keep the regression tests here.
|
||||||
|
;;; If I had any.
|
||||||
|
|
||||||
|
(define sre+posix
|
||||||
|
'())
|
|
@ -0,0 +1,40 @@
|
||||||
|
;;; Procedures that appear in code produced by (RX ...).
|
||||||
|
|
||||||
|
;;; In sexp syntax, a ,<exp> or ,@<exp> form may evaluate to a string, char,
|
||||||
|
;;; char-set, or regexp value. Coerce one of these to a regexp value.
|
||||||
|
|
||||||
|
(define (coerce-dynamic-regexp x)
|
||||||
|
(? ((string? x) (make-re-string x))
|
||||||
|
((char? x) (make-re-string (string x)))
|
||||||
|
((char-set? x) (make-re-char-set x))
|
||||||
|
((regexp? x) x)
|
||||||
|
(else (error "Cannot coerce value to regular expression." x))))
|
||||||
|
|
||||||
|
;;; In a char-set context (e.g., as an operand of the SRE - operator),
|
||||||
|
;;; a ,<exp> or form must be coercable to a char-set.
|
||||||
|
|
||||||
|
(define (coerce-dynamic-charset x)
|
||||||
|
(? ((string? x)
|
||||||
|
(if (= 1 (string-length x)) (string->char-set x)
|
||||||
|
(error "Multi-char string not allowed as ,<exp> or ,@<exp> SRE in char-class context."
|
||||||
|
x)))
|
||||||
|
((char? x) (char-set x))
|
||||||
|
((char-set? x) x)
|
||||||
|
((re-char-set? x) (re-char-set:cset x))
|
||||||
|
(else (error "Cannot coerce value to character set" x))))
|
||||||
|
|
||||||
|
|
||||||
|
(define (spec->char-set in? loose ranges)
|
||||||
|
(let ((doit (lambda (loose ranges)
|
||||||
|
(fold (lambda (r cset)
|
||||||
|
(let ((from (char->ascii (car r)))
|
||||||
|
(to (char->ascii (cdr r))))
|
||||||
|
(do ((i from (+ i 1))
|
||||||
|
(cs cset (char-set-adjoin! cs (ascii->char i))))
|
||||||
|
((> i to) cs))))
|
||||||
|
(string->char-set loose)
|
||||||
|
ranges))))
|
||||||
|
(if in?
|
||||||
|
(doit loose ranges)
|
||||||
|
(char-set-invert! (doit loose ranges)))))
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
Notes on Tom Lord's rx regexp package.
|
||||||
|
|
||||||
|
API info nodes should give C prototypes. re_compile_pattern doesn't.
|
||||||
|
|
||||||
|
Many thread-reentrancy problems:
|
||||||
|
- syntax var is a shared global. Needs to be passed as arg to
|
||||||
|
pattern compiler -- which could easily be done by simply having
|
||||||
|
the client pre-set the pat->syntax field.
|
||||||
|
|
||||||
|
- regs field should not be a part of the pattern -- you can't
|
||||||
|
use the pattern in multiple concurrent matches.
|
||||||
|
|
||||||
|
Similarly for pat->no_sub, pat->not_bol, pat->not-eol. These fields are
|
||||||
|
properly part of of the client's request, not the pattern.
|
|
@ -0,0 +1,41 @@
|
||||||
|
(define-syntax color (syntax-rules () ((color) 'green)))
|
||||||
|
|
||||||
|
(define-syntax with-blue
|
||||||
|
(lambda (exp r c)
|
||||||
|
`(,(r 'let-syntax) ((color (,(r 'syntax-rules) ()
|
||||||
|
((color) 'blue))))
|
||||||
|
. ,(cdr exp))))
|
||||||
|
|
||||||
|
(with-blue (color))
|
||||||
|
|
||||||
|
;;; This has a problem -- WITH-BLUE is not hygenic:
|
||||||
|
(let ((color (lambda () 'foo)))
|
||||||
|
(with-blue (color)))
|
||||||
|
|
||||||
|
=> blue
|
||||||
|
|
||||||
|
;;; Let's fix this by adding a layer of indirection --
|
||||||
|
;;; 1. (color) ==> (hidden-color)
|
||||||
|
;;; 2. with-blue frobs the definition of *our* hidden-color
|
||||||
|
|
||||||
|
(define-syntax hidden-color (lambda (exp r c) `(,(r 'quote) green)))
|
||||||
|
(define-syntax color (lambda (exp r c) `(,(r 'hidden-color))))
|
||||||
|
|
||||||
|
(define-syntax with-blue
|
||||||
|
(lambda (exp r c)
|
||||||
|
`(,(r 'let-syntax)
|
||||||
|
((,(r 'hidden-color) (,(r 'syntax-rules) ()
|
||||||
|
((,(r 'hidden-color)) (,(r 'quote) blue)))))
|
||||||
|
. ,(cdr exp))))
|
||||||
|
|
||||||
|
;;; Without all the renaming, the above is
|
||||||
|
;;; (let-syntax ((hidden-color (syntax-rules () ((hidden-color 'blue)))))
|
||||||
|
;;; body ...)
|
||||||
|
;;; where *all* symbols on the first line are renamed, *including*
|
||||||
|
;;; hidden-color, so we should be redefining the same hidden-color
|
||||||
|
;;; to which (color) expands.
|
||||||
|
|
||||||
|
;;; It doesn't work:
|
||||||
|
(with-blue (color))
|
||||||
|
=> green
|
||||||
|
|
|
@ -0,0 +1,43 @@
|
||||||
|
The simplifier produces regexps with some simple invariants:
|
||||||
|
|
||||||
|
- DSM's are only top-level, never appearing in the body of a DSM,
|
||||||
|
repeat, sequence, choice, or submatch.
|
||||||
|
|
||||||
|
- A repeat's body is not a repeat, trivial match, or empty match.
|
||||||
|
|
||||||
|
- A choice's body contains more than one element; no element is
|
||||||
|
- a choice,
|
||||||
|
- a DSM, or
|
||||||
|
- an empty-match.
|
||||||
|
|
||||||
|
- A choice contains 0 or 1 char-set, bos, and eos elements.
|
||||||
|
|
||||||
|
- A sequence's body contains more than one element; no element is
|
||||||
|
- a sequence,
|
||||||
|
- a DSM,
|
||||||
|
- a trivial match, or
|
||||||
|
- an empty-match
|
||||||
|
|
||||||
|
- There are no empty matches in the regexp unless the entire regexp
|
||||||
|
is either an empty match, or a dsm whose body is an empty match.
|
||||||
|
(This is good, because there is no way to write an empty match
|
||||||
|
in Posix notation in a char-set independent way -- you have to
|
||||||
|
use the six-char "[^\000-\177]" for ASCII.)
|
||||||
|
|
||||||
|
To see these invariants:
|
||||||
|
|
||||||
|
- We can always bubble up empty matches:
|
||||||
|
- If a sequence has one, the whole sequence is reduced to an empty match.
|
||||||
|
- They can be deleted from a choice; if the choice reduces to 0 elements,
|
||||||
|
the choice can be reduced to an empty match.
|
||||||
|
- A repeat of an empty match is either an empty match or a trivial match,
|
||||||
|
depending upon whether FROM is >0 or 0, respectively.
|
||||||
|
- DSM of an empty match: the DSM itself can be bubbled upwards (see below).
|
||||||
|
|
||||||
|
- We can always bubble up DSM regexps:
|
||||||
|
- If an elt of a choice or sequence is a DSM, it can be "absorbed"
|
||||||
|
into the element's relocation offset.
|
||||||
|
- Repeat commutes with DSM.
|
||||||
|
- A DSM body can be "absorbed" into a submatch record by increasing the
|
||||||
|
submatch's DSM0 count.
|
||||||
|
- Nested DSM's can be collapsed together.
|
|
@ -0,0 +1,403 @@
|
||||||
|
;;; Olin Shivers, June 1998
|
||||||
|
;;; Copyright (c) 1998 by the Scheme Underground.
|
||||||
|
|
||||||
|
;;; One export: (simplify-regexp re) -> re
|
||||||
|
|
||||||
|
;;; Regexp simplifier
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; (| (in c1 ...) (in c2 ...) re ...) => (| (in c1 ... c2 ...) re ...)
|
||||||
|
;;; (| (not-in c1 ...) (not-in c2 ...)) => (| (not-in [intersect (c1 ...)
|
||||||
|
;;; (c2 ...)])
|
||||||
|
;;; A run of BOS's or a run of EOS's in a sequence may be elided.
|
||||||
|
;;; Nested exponents can be collapsed (*, +, ?) -- multiply the "from's"
|
||||||
|
;;; together; multiply the "to's" together.
|
||||||
|
;;; Exponent range [1,1] simplifies, as does [0,0].
|
||||||
|
;;; Uniquify branches
|
||||||
|
;;; Adjacent literals in a sequence can be collapsed
|
||||||
|
;;; A singleton-char char class can be collapsed into a constant
|
||||||
|
;;; Nested choices can be collapsed
|
||||||
|
;;; Nested sequences can be collapsed
|
||||||
|
;;; An empty sequence (:) can be turned into an empty-string match "".
|
||||||
|
;;; Singleton choices and sequences can be reduced to their body.
|
||||||
|
;;;
|
||||||
|
;;; The simplifier is carefully written so that it won't blow up
|
||||||
|
;;; when applied to a dynamic regexp -- that is,
|
||||||
|
;;; - a chunk of Scheme code that produces a regexp instead of
|
||||||
|
;;; an actual regexp value;
|
||||||
|
;;; - a repeat regexp whose FROM or TO fields are chunks of Scheme code
|
||||||
|
;;; rather than integers;
|
||||||
|
;;; - a char-set regexp whose CSET field is a chunk of Scheme code rather
|
||||||
|
;;; than an actual char-set value.
|
||||||
|
;;; This is useful because the RX macro can build such a regexp as part
|
||||||
|
;;; of its expansion process.
|
||||||
|
|
||||||
|
(define (simplify-regexp re)
|
||||||
|
(receive (simp-re pre-dsm) (simp-re re)
|
||||||
|
(re-dsm simp-re pre-dsm (- (re-tsm re) (+ (re-tsm simp-re) pre-dsm)))))
|
||||||
|
|
||||||
|
(define (simp-re re)
|
||||||
|
(? ((re-string? re) (values re 0))
|
||||||
|
((re-seq? re) (simp-seq re))
|
||||||
|
((re-choice? re) (simp-choice re))
|
||||||
|
|
||||||
|
;; Singleton char-sets reduce to the character.
|
||||||
|
;; Bear in mind the cset field might be Scheme code instead
|
||||||
|
;; of an actual char set if the regexp is dynamic.
|
||||||
|
((re-char-set? re)
|
||||||
|
(values (let ((cs (re-char-set:cset re)))
|
||||||
|
(if (and (char-set? cs)
|
||||||
|
(= 1 (char-set-size cs)))
|
||||||
|
(make-re-string (string (car (char-set-members cs))))
|
||||||
|
re))
|
||||||
|
0))
|
||||||
|
|
||||||
|
((re-repeat? re) (simp-repeat re))
|
||||||
|
|
||||||
|
((re-submatch? re) (simp-submatch re))
|
||||||
|
((re-dsm? re) (simp-dsm re))
|
||||||
|
|
||||||
|
(else (values re 0))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; If the body of a submatch is the empty re, reduce it to the empty re.
|
||||||
|
|
||||||
|
(define (simp-submatch re)
|
||||||
|
(let ((tsm (re-submatch:tsm re))
|
||||||
|
(pre-dsm (re-submatch:pre-dsm re)))
|
||||||
|
(receive (body1 pre-dsm1) (simp-re (re-submatch:body re))
|
||||||
|
(if (re-empty? body1)
|
||||||
|
(values re-empty tsm)
|
||||||
|
(values (%make-re-submatch body1 (+ pre-dsm pre-dsm1) tsm)
|
||||||
|
0)))))
|
||||||
|
|
||||||
|
;;; - Flatten nested DSM's.
|
||||||
|
;;; - Return pre-dsm field and body field as the two return values.
|
||||||
|
|
||||||
|
(define (simp-dsm re)
|
||||||
|
(receive (body pre-dsm1) (simp-re (re-dsm:body re))
|
||||||
|
(values body (+ (re-dsm:pre-dsm re) pre-dsm1))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Simplifying sequences
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; - Collapse nested sequences and DSM's.
|
||||||
|
;;; - Merge adjacent strings, identical adjacent anchors (bos, eos, etc.).
|
||||||
|
;;; - Bubble DSM's forwards past elts that don't contain live submatches.
|
||||||
|
;;; (Going past live submatches would switch the submatch indexes around,
|
||||||
|
;;; which would be an error). This helps to coalesce DSMs and if we bring
|
||||||
|
;;; them all the way to the front, we can pop them off and make them a
|
||||||
|
;;; pre-dsm for the entire seq record.
|
||||||
|
;;; - If an elt is the re-empty, reduce the whole re to the empty re.
|
||||||
|
;;; - Reduce singleton and empty seq.
|
||||||
|
|
||||||
|
(define (simp-seq re)
|
||||||
|
(let ((tsm (re-seq:tsm re))
|
||||||
|
(elts (map simplify-regexp (re-seq:elts re))))
|
||||||
|
(if (pair? elts)
|
||||||
|
|
||||||
|
(call-with-current-continuation
|
||||||
|
(lambda (abort)
|
||||||
|
(receive (pre-dsm head tail) (simp-seq1 elts abort tsm)
|
||||||
|
(values (if (pair? tail)
|
||||||
|
(%make-re-seq (cons head tail) (- tsm pre-dsm))
|
||||||
|
head) ; Singleton seq
|
||||||
|
pre-dsm))))
|
||||||
|
|
||||||
|
(values re-trivial 0)))) ; Empty seq
|
||||||
|
|
||||||
|
|
||||||
|
;;; Simplify the non-empty sequence ELTS.
|
||||||
|
;;; - Return the result split out into three values:
|
||||||
|
;;; [head-elt-pre-dsm, head-elt, tail].
|
||||||
|
;;; - If any elt is the empty (impossible) re, abort by calling
|
||||||
|
;;; (abort elt tsm). TSM is otherwise unused.
|
||||||
|
|
||||||
|
(define (simp-seq1 elts abort tsm)
|
||||||
|
(let recur ((elt (car elts)) (elts (cdr elts)))
|
||||||
|
(receive (elt pre-dsm) (open-dsm elt)
|
||||||
|
(? ((re-seq? elt) ; Flatten nested seqs.
|
||||||
|
(let ((sub-elts (re-seq:elts elt)))
|
||||||
|
(recur (re-dsm (car sub-elts) pre-dsm 0)
|
||||||
|
(append (cdr sub-elts) elts))))
|
||||||
|
|
||||||
|
((re-empty? elt) (abort elt tsm)) ; Bomb out on the empty
|
||||||
|
; (impossible) re.
|
||||||
|
((pair? elts)
|
||||||
|
(receive (next-pre-dsm next tail) ; Simplify the tail,
|
||||||
|
(recur (car elts) (cdr elts)) ; then think about
|
||||||
|
; the head:
|
||||||
|
;; This guy is called when we couldn't find any other
|
||||||
|
;; simplification. If ELT contains live submatches, then
|
||||||
|
;; there really is nothing to be done at this step -- just
|
||||||
|
;; assemble the pieces together and return them. If ELT
|
||||||
|
;; *doesn't* contain any live submatches, do the same, but
|
||||||
|
;; bubble its following next-pre-dsm submatches forwards.
|
||||||
|
(define (no-simp)
|
||||||
|
(if (has-live-submatches? elt)
|
||||||
|
(values pre-dsm elt (cons (re-dsm next next-pre-dsm 0) tail))
|
||||||
|
(values (+ pre-dsm next-pre-dsm) elt (cons next tail))))
|
||||||
|
|
||||||
|
;; Coalesces two adjacent bol's, two adjacent eol's, etc.
|
||||||
|
(define (coalesce-anchor anchor?)
|
||||||
|
(if (and (anchor? elt) (anchor? next))
|
||||||
|
(values (+ pre-dsm next-pre-dsm) elt tail)
|
||||||
|
(no-simp)))
|
||||||
|
|
||||||
|
(? ((re-trivial? elt) ; Drop trivial re's.
|
||||||
|
(values (+ pre-dsm next-pre-dsm) next tail))
|
||||||
|
|
||||||
|
;; Coalesce adjacent strings
|
||||||
|
((re-string? elt)
|
||||||
|
(if (re-string? next)
|
||||||
|
(values (+ pre-dsm next-pre-dsm)
|
||||||
|
(make-re-string (string-append (re-string:chars elt)
|
||||||
|
(re-string:chars next)))
|
||||||
|
tail)
|
||||||
|
(no-simp)))
|
||||||
|
|
||||||
|
;; Coalesce adjacent bol/eol/bos/eos/bow/eow's.
|
||||||
|
((re-bol? elt) (coalesce-anchor re-bol?))
|
||||||
|
((re-eol? elt) (coalesce-anchor re-eol?))
|
||||||
|
((re-bos? elt) (coalesce-anchor re-bos?))
|
||||||
|
((re-eos? elt) (coalesce-anchor re-eos?))
|
||||||
|
((re-bow? elt) (coalesce-anchor re-bow?))
|
||||||
|
((re-eow? elt) (coalesce-anchor re-eow?))
|
||||||
|
(else (no-simp)))))
|
||||||
|
|
||||||
|
(else (values pre-dsm elt '()))))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Simplifying choices
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
;;; - Collapse nested choices and DSM's.
|
||||||
|
;;; - Delete re-empty's.
|
||||||
|
;;; - Merge sets; merge identical anchors (bos, eos, etc.).
|
||||||
|
;;; But you can't merge across an element that contains a live submatch,
|
||||||
|
;;; see below.
|
||||||
|
;;; - A singleton string "c" is included into the char-set merge as a
|
||||||
|
;;; singleton set.
|
||||||
|
;;; - Bubble DSM's forwards past elts that don't contain live submatches.
|
||||||
|
;;; (Going past live submatches would switch the submatch indexes around,
|
||||||
|
;;; which would be an error). This helps to coalesce DSMs and if we bring
|
||||||
|
;;; them all the way to the front, we can pop them off and make them a
|
||||||
|
;;; pre-dsm for the entire seq record.
|
||||||
|
;;; - Reduce singleton and empty choice.
|
||||||
|
;;;
|
||||||
|
;;; You have to be careful simplifying choices -- you can't merge two sets
|
||||||
|
;;; that appear on different sides of an element containing a live submatch.
|
||||||
|
;;; The problem is that the assignment of submatches breaks ties left-to-right.
|
||||||
|
;;; So these aren't the same:
|
||||||
|
;;; (| (submatch "x") any) (| any (submatch "x"))
|
||||||
|
;;; The first assigns the submatch, the second doesn't -- the ANY gets credit.
|
||||||
|
;;; We want to collapse multiple char-sets, bos's, and eos's, but we have
|
||||||
|
;;; to deal with this issue. So
|
||||||
|
;;; - When we coalesce anchors, we retain the *leftmost* one.
|
||||||
|
;;; - We coalesce sets that appear between live-submatch boundaries.
|
||||||
|
;;; When we do this, we subtract from the set any char that was in
|
||||||
|
;;; an earlier coalesced char-set. If this gets us down to the empty set,
|
||||||
|
;;; we drop it. If it gets us down to a singleton set, we convert it into
|
||||||
|
;;; a singleton string.
|
||||||
|
;;; Whew. I had to think about this one.
|
||||||
|
|
||||||
|
(define (simp-choice re)
|
||||||
|
(let ((tsm (re-choice:tsm re)))
|
||||||
|
|
||||||
|
(receive (pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
|
||||||
|
(simp-choice1 (map simplify-regexp (re-choice:elts re)))
|
||||||
|
|
||||||
|
(let ((tail (assemble-boundary-tail char-set:empty cset
|
||||||
|
bos? eos? bol? eol? bow? eow?
|
||||||
|
#f #f #f #f #f #f
|
||||||
|
tail)))
|
||||||
|
(values (if (pair? tail)
|
||||||
|
(if (pair? (cdr tail))
|
||||||
|
(%make-re-choice tail (- tsm pre-dsm))
|
||||||
|
(car tail)) ; Singleton choice
|
||||||
|
re-empty) ; Empty choice
|
||||||
|
pre-dsm)))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Given the return values from simp-choice1, this tacks all
|
||||||
|
;;; the various pieces (CSET, BOS?, EOS?, etc.) onto the front of
|
||||||
|
;;; TAIL. However, elements are not added onto TAIL that are already
|
||||||
|
;;; described by PREV-CSET, PREV-BOS?, etc. -- they will be added onto
|
||||||
|
;;; some earlier bit of the final result.
|
||||||
|
|
||||||
|
(define (assemble-boundary-tail prev-cset cset
|
||||||
|
bos? eos? bol? eol? bow? eow?
|
||||||
|
prev-bos? prev-eos?
|
||||||
|
prev-bol? prev-eol?
|
||||||
|
prev-bow? prev-eow?
|
||||||
|
tail)
|
||||||
|
(let* ((cset (char-set-difference cset prev-cset))
|
||||||
|
(numchars (char-set-size cset))
|
||||||
|
(tail (if (and eos? (not prev-eos?)) (cons re-eos tail) tail))
|
||||||
|
(tail (if (and eol? (not prev-eol?)) (cons re-eol tail) tail))
|
||||||
|
(tail (if (and eow? (not prev-eow?)) (cons re-eow tail) tail))
|
||||||
|
(tail (if (and bow? (not prev-bow?)) (cons re-bow tail) tail))
|
||||||
|
(tail (if (and bol? (not prev-bol?)) (cons re-bol tail) tail))
|
||||||
|
(tail (if (and bos? (not prev-bos?)) (cons re-bos tail) tail))
|
||||||
|
(tail (? ((zero? numchars) tail) ; Drop empty char set.
|
||||||
|
((= 1 numchars) ; {c} => "c"
|
||||||
|
(cons (make-re-string (string (car (char-set-members cset))))
|
||||||
|
tail))
|
||||||
|
(else (cons (make-re-char-set cset) tail)))))
|
||||||
|
tail))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Simplify the non-empty list of choices ELTS.
|
||||||
|
;;; Return the result split out into the values
|
||||||
|
;;; [pre-dsm, cset, bos?, eos?, bol?, eol?, bow?, eow?, tail]
|
||||||
|
|
||||||
|
(define (simp-choice1 elts)
|
||||||
|
(let recur ((elts elts)
|
||||||
|
|
||||||
|
(prev-cset char-set:empty) ; Chars we've already seen.
|
||||||
|
|
||||||
|
(prev-bos? #f) (prev-eos? #f) ; These flags say if we've
|
||||||
|
(prev-bol? #f) (prev-eol? #f) ; already seen one of these
|
||||||
|
(prev-bow? #f) (prev-eow? #f)) ; anchors.
|
||||||
|
|
||||||
|
|
||||||
|
(if (pair? elts)
|
||||||
|
(let ((elt (car elts))
|
||||||
|
(elts (cdr elts)))
|
||||||
|
(receive (elt pre-dsm) (open-dsm elt)
|
||||||
|
(if (re-choice? elt)
|
||||||
|
|
||||||
|
;; Flatten nested choices.
|
||||||
|
(let ((sub-elts (re-seq:elts elt)))
|
||||||
|
(receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
|
||||||
|
(recur (append sub-elts elts)
|
||||||
|
prev-cset
|
||||||
|
prev-bos? prev-eos?
|
||||||
|
prev-bol? prev-eol?
|
||||||
|
prev-bow? prev-eow?)
|
||||||
|
(values (+ pre-dsm tail-pre-dsm)
|
||||||
|
cset bos? eos? bol? eol? bow? eow? tail)))
|
||||||
|
|
||||||
|
;; Simplify the tail, then think about the head.
|
||||||
|
(receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
|
||||||
|
(recur elts
|
||||||
|
(? ((and (re-string? elt)
|
||||||
|
(= 1 (string-length (re-string:chars elt))))
|
||||||
|
(char-set-union prev-cset
|
||||||
|
(string->char-set (re-string:chars elt))))
|
||||||
|
|
||||||
|
;; The cset might be a Scheme exp.
|
||||||
|
((and (re-char-set? elt)
|
||||||
|
(char-set? (re-char-set:cset elt)))
|
||||||
|
(char-set-union prev-cset
|
||||||
|
(re-char-set:cset elt)))
|
||||||
|
|
||||||
|
(else prev-cset))
|
||||||
|
(or prev-bos? (re-bos? elt))
|
||||||
|
(or prev-eos? (re-eos? elt))
|
||||||
|
(or prev-bol? (re-bol? elt))
|
||||||
|
(or prev-eol? (re-eol? elt))
|
||||||
|
(or prev-bow? (re-bow? elt))
|
||||||
|
(or prev-eow? (re-eow? elt)))
|
||||||
|
|
||||||
|
;; This guy is called when we couldn't find any other
|
||||||
|
;; simplification. If ELT contains live submatches, then we
|
||||||
|
;; are at a merge boundary, and have to take all the
|
||||||
|
;; TAIL-PRE-DSM, CSET, BOS?, EOS?, ... stuff we've collected
|
||||||
|
;; and tack them onto TAIL as elements, then put ELT on
|
||||||
|
;; front. Otherwise, we can commute TAIL-PRE-DSM, CSET,
|
||||||
|
;; BOS?, etc. with ELT, since it contains no live
|
||||||
|
;; submatches, so just tack ELT onto TAIL.
|
||||||
|
|
||||||
|
(define (no-simp)
|
||||||
|
(if (has-live-submatches? elt)
|
||||||
|
(let ((tail (assemble-boundary-tail prev-cset cset
|
||||||
|
bos? eos?
|
||||||
|
bol? eol?
|
||||||
|
bow? eow?
|
||||||
|
prev-bos? prev-eos?
|
||||||
|
prev-bol? prev-eol?
|
||||||
|
prev-bow? prev-eow?
|
||||||
|
tail)))
|
||||||
|
(values pre-dsm char-set:empty #f #f #f #f #f #f
|
||||||
|
(if (pair? tail)
|
||||||
|
;; Tack tail-pre-dsm onto
|
||||||
|
;; TAIL's first elt.
|
||||||
|
(cons elt
|
||||||
|
(cons (re-dsm (car tail)
|
||||||
|
tail-pre-dsm 0)
|
||||||
|
(cdr tail)))
|
||||||
|
|
||||||
|
;; Squirrel case: TAIL is empty, so use
|
||||||
|
;; TAIL-PRE-DSM as ELT's post-dsm.
|
||||||
|
(list (re-dsm elt 0 tail-pre-dsm)))))
|
||||||
|
|
||||||
|
;; ELT has no live submatches, so we can commute all
|
||||||
|
;; the recursion state forwards past it.
|
||||||
|
(values (+ pre-dsm tail-pre-dsm)
|
||||||
|
cset bos? eos? bol? eol? bow? eow?
|
||||||
|
(cons elt tail))))
|
||||||
|
|
||||||
|
(? ((and (re-char-set? elt)
|
||||||
|
(char-set? (re-char-set:cset elt))) ; Might be Scheme code
|
||||||
|
(values (+ pre-dsm tail-pre-dsm)
|
||||||
|
(char-set-union cset (re-char-set:cset elt))
|
||||||
|
bos? eos? bol? eol? bow? eow? tail))
|
||||||
|
|
||||||
|
;; Treat a singleton string "c" as a singleton set {c}.
|
||||||
|
((and (re-string? elt) (= 1 (string-length (re-string:chars elt))))
|
||||||
|
(values (+ pre-dsm tail-pre-dsm)
|
||||||
|
(char-set-union cset (string->char-set (re-string:chars elt)))
|
||||||
|
bos? eos? bol? eol? bow? eow? tail))
|
||||||
|
|
||||||
|
;; Coalesce bol/eol/bos/eos/bow/eow's.
|
||||||
|
((re-bos? elt) (values (+ pre-dsm tail-pre-dsm) cset
|
||||||
|
#t eos? bol? eol? bow? eow? tail))
|
||||||
|
((re-eos? elt) (values (+ pre-dsm tail-pre-dsm) cset
|
||||||
|
bos? #t bol? eol? bow? eow? tail))
|
||||||
|
((re-bol? elt) (values (+ pre-dsm tail-pre-dsm) cset
|
||||||
|
bos? eos? #t eol? bow? eow? tail))
|
||||||
|
((re-eol? elt) (values (+ pre-dsm tail-pre-dsm) cset
|
||||||
|
bos? eos? bol? #t bow? eow? tail))
|
||||||
|
((re-bow? elt) (values (+ pre-dsm tail-pre-dsm) cset
|
||||||
|
bos? eos? bol? eol? #t eow? tail))
|
||||||
|
((re-eow? elt) (values (+ pre-dsm tail-pre-dsm) cset
|
||||||
|
bos? eos? bol? eol? bow? #t tail))
|
||||||
|
|
||||||
|
(else (no-simp)))))))
|
||||||
|
|
||||||
|
(values 0 char-set:empty #f #f #f #f #f #f '()))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
(define (simp-repeat re)
|
||||||
|
(let ((from (re-repeat:from re))
|
||||||
|
(to (re-repeat:to re))
|
||||||
|
(body (re-repeat:body re)))
|
||||||
|
(receive (simp-body pre-dsm) (simp-re body) ; Simplify body.
|
||||||
|
;; The fancy reductions are all handled by REDUCE-REPEAT.
|
||||||
|
(reduce-repeat from to simp-body pre-dsm))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Does RE contain a live submatch?
|
||||||
|
;;; If RE is dynamic, we can't tell, so we err conservatively,
|
||||||
|
;;; which means we say "yes."
|
||||||
|
|
||||||
|
(define (has-live-submatches? re)
|
||||||
|
(or (re-submatch? re)
|
||||||
|
(? ((re-seq? re) (every has-live-submatches? (re-seq:elts re)))
|
||||||
|
((re-choice? re) (every has-live-submatches? (re-choice:elts re)))
|
||||||
|
((re-repeat? re) (has-live-submatches? (re-repeat:body re)))
|
||||||
|
((re-dsm? re) (has-live-submatches? (re-dsm:body re)))
|
||||||
|
|
||||||
|
;; If it's not one of these things, then this isn't a regexp -- it's
|
||||||
|
;; a chunk of Scheme code producing a regexp, and we conservatively
|
||||||
|
;; return #T -- the expression *might* produce a regexp containing
|
||||||
|
;; a live submatch:
|
||||||
|
(else (not (or (re-char-set? re) (re-string? re)
|
||||||
|
(re-bos? re) (re-eos? re)
|
||||||
|
(re-bol? re) (re-eol? re)
|
||||||
|
(re-bow? re) (re-eow? re)))))))
|
|
@ -0,0 +1,171 @@
|
||||||
|
;;; Parse Spencer-style regexps into the regexp ADT.
|
||||||
|
;;; Olin Shivers, July 1998.
|
||||||
|
|
||||||
|
;;; One export: (posix-string->regexp s)
|
||||||
|
|
||||||
|
;;; Need better error checking on {m,n} brace parsing.
|
||||||
|
|
||||||
|
(define (parse-posix-regexp-string s)
|
||||||
|
(receive (re i) (parse-posix-exp s 0)
|
||||||
|
(if (= i (string-length s)) re
|
||||||
|
(error "Illegal Posix regexp -- terminated early" s i))))
|
||||||
|
|
||||||
|
(define posix-string->regexp parse-posix-regexp-string)
|
||||||
|
|
||||||
|
;;; An complete expression is a sequence of |-separated branches.
|
||||||
|
|
||||||
|
(define (parse-posix-exp s i)
|
||||||
|
(let ((len (string-length s)))
|
||||||
|
(if (< i len)
|
||||||
|
(let lp ((i i) (branches '()))
|
||||||
|
(receive (branch i) (parse-posix-branch s i)
|
||||||
|
(let ((branches (cons branch branches)))
|
||||||
|
(if (and (< i len)
|
||||||
|
(char=? #\| (string-ref s i)))
|
||||||
|
(lp (+ i 1) branches)
|
||||||
|
(values (re-choice (reverse branches)) i)))))
|
||||||
|
(values re-trivial i))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; A branch is a sequence of pieces -- stuff that goes in-between |'s.
|
||||||
|
|
||||||
|
(define (parse-posix-branch s i)
|
||||||
|
(let ((len (string-length s)))
|
||||||
|
(let lp ((i i) (pieces '()))
|
||||||
|
(if (< i len)
|
||||||
|
(receive (piece i) (parse-posix-piece s i)
|
||||||
|
(let ((pieces (cons piece pieces)))
|
||||||
|
(if (< i len)
|
||||||
|
(case (string-ref s i)
|
||||||
|
((#\) #\|) (values (re-seq (reverse pieces)) i))
|
||||||
|
(else (lp i pieces)))
|
||||||
|
(values (re-seq (reverse pieces)) i))))
|
||||||
|
|
||||||
|
(values (re-seq (reverse pieces)) i)))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; A piece is an atom possibly followed by a * ? + or {...} multiplier.
|
||||||
|
;;; I.e. an element of a branch sequence.
|
||||||
|
|
||||||
|
(define (parse-posix-piece s i)
|
||||||
|
(let ((len (string-length s)))
|
||||||
|
(receive (atom i) (parse-posix-atom s i)
|
||||||
|
(if (< i len)
|
||||||
|
(case (string-ref s i)
|
||||||
|
((#\* #\+ #\?)
|
||||||
|
(receive (from to) (case (string-ref s i)
|
||||||
|
((#\*) (values 0 #f))
|
||||||
|
((#\+) (values 1 #f))
|
||||||
|
((#\?) (values 0 1)))
|
||||||
|
(values (re-repeat from to atom) (+ i 1))))
|
||||||
|
|
||||||
|
((#\{) (receive (from to i) (parse-posix-braces s (+ i 1))
|
||||||
|
(values (re-repeat from to atom) i)))
|
||||||
|
|
||||||
|
(else (values atom i)))
|
||||||
|
|
||||||
|
(values atom i)))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; An atom is something that would bind to a following * operator --
|
||||||
|
;;; a letter, [...] charset, ^, $, or (...).
|
||||||
|
|
||||||
|
(define (parse-posix-atom s i)
|
||||||
|
(let ((len (string-length s)))
|
||||||
|
(if (< i (string-length s))
|
||||||
|
(let ((c (string-ref s i)))
|
||||||
|
(case c
|
||||||
|
((#\^) (values re-bos (+ i 1)))
|
||||||
|
((#\$) (values re-eos (+ i 1)))
|
||||||
|
((#\.) (values re-any (+ i 1)))
|
||||||
|
|
||||||
|
((#\[) (parse-posix-bracket s (+ i 1)))
|
||||||
|
|
||||||
|
((#\() (receive (re i) (parse-posix-exp s (+ i 1))
|
||||||
|
(if (and (< i len) (char=? #\) (string-ref s i)))
|
||||||
|
(values (re-submatch re) (+ i 1))
|
||||||
|
(error "Regexp subexpression has no terminating close parenthesis" s i))))
|
||||||
|
|
||||||
|
((#\\) (let ((i (+ i 1)))
|
||||||
|
(if (< i len)
|
||||||
|
(values (make-re-string (string (string-ref s i)))
|
||||||
|
(+ i 1))
|
||||||
|
(error "Regexps may not terminate with a backslash" s))))
|
||||||
|
|
||||||
|
((#\) #\| #\* #\+ #\? #\{) (values re-trivial i))
|
||||||
|
|
||||||
|
(else (values (make-re-string (string c)) (+ i 1)))))
|
||||||
|
|
||||||
|
(values re-trivial i))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Parse a [...] or [^...] bracket expression into a regexp.
|
||||||
|
;;; I is the index of the char following the left bracket.
|
||||||
|
|
||||||
|
(define db-cset (char-set #\. #\= #\:)) ; Not allowed after a #\[.
|
||||||
|
|
||||||
|
(define (parse-posix-bracket s i)
|
||||||
|
(let ((len (string-length s)))
|
||||||
|
(if (>= i len) (error "Missing close right bracket in regexp" s i)
|
||||||
|
|
||||||
|
(receive (negate? i0) (let ((c (string-ref s i)))
|
||||||
|
(if (char=? c #\^)
|
||||||
|
(values #t (+ i 1))
|
||||||
|
(values #f i)))
|
||||||
|
(let lp ((i i0) (cset (char-set-copy char-set:empty)))
|
||||||
|
(if (>= i len) (error "Missing close right bracket in regexp" s i)
|
||||||
|
|
||||||
|
(let ((c (string-ref s i))
|
||||||
|
(i1 (+ i 1)))
|
||||||
|
(case c
|
||||||
|
((#\[)
|
||||||
|
;; We don't handle [..] [==] [::] frobs.
|
||||||
|
(if (and (< i1 len)
|
||||||
|
(char-set-contains? db-cset (string-ref s i1)))
|
||||||
|
(error "double-bracket regexps not supported." s i)
|
||||||
|
(lp i1 (char-set-adjoin! cset #\[))))
|
||||||
|
|
||||||
|
((#\]) (if (= i i0)
|
||||||
|
(lp i1 (char-set-adjoin! cset #\]))
|
||||||
|
(let ((cset (if negate?
|
||||||
|
(char-set-invert! cset)
|
||||||
|
cset)))
|
||||||
|
(values (make-re-char-set cset) i1))))
|
||||||
|
|
||||||
|
((#\-) (if (or (= i i0) ; first char or last char
|
||||||
|
(and (< i1 len)
|
||||||
|
(char=? #\] (string-ref s i1))))
|
||||||
|
(lp i1 (char-set-adjoin! cset #\-))
|
||||||
|
(error "Illegal - in [...] regexp" s i)))
|
||||||
|
|
||||||
|
;; Regular letter -- either alone, or startpoint of a range.
|
||||||
|
(else (if (and (< (+ i1 1) len)
|
||||||
|
(char=? #\- (string-ref s i1)))
|
||||||
|
|
||||||
|
;; Range
|
||||||
|
(let* ((i-tochar (+ i1 1))
|
||||||
|
(to (char->ascii (string-ref s i-tochar))))
|
||||||
|
(do ((j (char->ascii c) (+ j 1))
|
||||||
|
(cset cset (char-set-adjoin! cset (ascii->char j))))
|
||||||
|
((> j to) (lp (+ i-tochar 1) cset))))
|
||||||
|
|
||||||
|
;; Just a letter
|
||||||
|
(lp i1 (char-set-adjoin! cset c))))))))))))
|
||||||
|
|
||||||
|
|
||||||
|
;;; Parse out a [from,to] repetition pair from a {m,n} {m} or {m,} expression.
|
||||||
|
;;; I is the index of the char following the left brace.
|
||||||
|
|
||||||
|
(define (parse-posix-braces s i)
|
||||||
|
(let ((comma (string-index s #\,) i)
|
||||||
|
(rb (string-index s #\} i)))
|
||||||
|
(if rb
|
||||||
|
(if (and comma (< comma rb))
|
||||||
|
(values (string->number (substring s i comma))
|
||||||
|
(and (not (= (+ comma 1) rb))
|
||||||
|
(string->number (substring s (+ comma 1) rb)))
|
||||||
|
(+ rb 1))
|
||||||
|
(let ((m (string->number (substring s i rb))))
|
||||||
|
(values m m (+ rb 1))))
|
||||||
|
(error "Missing close brace in regexp" s i))))
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
;;; Test routines
|
||||||
|
;;; ,open re-posix-parsers sre-parser-package re-simp-package pp
|
||||||
|
|
||||||
|
(define (test-string)
|
||||||
|
(let lp ()
|
||||||
|
(write-char #\newline)
|
||||||
|
(let ((re-s (read-line)))
|
||||||
|
(if (not (eof-object? re-s))
|
||||||
|
(let ((re (posix-string->regexp re-s)))
|
||||||
|
(print-re re)
|
||||||
|
(lp))))))
|
||||||
|
|
||||||
|
(define (test-sre)
|
||||||
|
(let lp ()
|
||||||
|
(write-char #\newline)
|
||||||
|
(let ((sre (read)))
|
||||||
|
(if (not (eof-object? sre))
|
||||||
|
(let ((re (sre->regexp sre)))
|
||||||
|
(print-re re)
|
||||||
|
(lp))))))
|
||||||
|
|
||||||
|
|
||||||
|
(define (print-re re)
|
||||||
|
(let ((simp-re (simplify-regexp re)))
|
||||||
|
(cond ((static-regexp? re)
|
||||||
|
(receive (s lev pcount tvec) (regexp->posix-string re)
|
||||||
|
(format #t "plain: ~a\n lev=~a pcount=~a tvec=~a\n"
|
||||||
|
s lev pcount tvec))
|
||||||
|
(receive (s lev pcount tvec) (regexp->posix-string simp-re)
|
||||||
|
(format #t "simp: ~a\n lev=~a pcount=~a tvec=~a\n"
|
||||||
|
s lev pcount tvec))))
|
||||||
|
(p (regexp->sre re))
|
||||||
|
(p (regexp->sre simp-re))))
|
||||||
|
|
||||||
|
(define (test-match)
|
||||||
|
(let lp ()
|
||||||
|
(write-string "sre: ")
|
||||||
|
(let ((sre (read)))
|
||||||
|
(if (not (eof-object? sre))
|
||||||
|
(let ((re (sre->regexp sre)))
|
||||||
|
(let lp2 ()
|
||||||
|
(let ((line (read-line)))
|
||||||
|
(cond ((not (eof-object? line))
|
||||||
|
(cond ((regexp-search re line) =>
|
||||||
|
(lambda (m)
|
||||||
|
(format #t "Hit at [~a,~a).\n"
|
||||||
|
(match:start m)
|
||||||
|
(match:end m)))))
|
||||||
|
(lp2))
|
||||||
|
(else (lp))))))))))
|
|
@ -0,0 +1,88 @@
|
||||||
|
- scsh integration
|
||||||
|
Affected: fr nawk filemtch glob rdelim re scsh-interfaces scsh-package
|
||||||
|
|
||||||
|
- Naming conventions. "re" vs. "regexp", should I have "smart" versions
|
||||||
|
of make-re-string, etc.
|
||||||
|
|
||||||
|
- Remove all "reduce" forms from scsh, replace with foldl, foldr forms.
|
||||||
|
- Check FPS, network code
|
||||||
|
|
||||||
|
- The match fun should allow you to state the beginning of string is not a
|
||||||
|
real bos & likewise for eos. Similarly for bol & eol.
|
||||||
|
execution flag:
|
||||||
|
-- REG_NOTBOL -- beginning of string doesn't count as ^ match.
|
||||||
|
-- REG_NOTEOL -- end of string doesn't count as $ match.
|
||||||
|
|
||||||
|
- Hack awk, expect, chat, dir-match for new regexp system
|
||||||
|
Current:
|
||||||
|
(awk (test body ...)
|
||||||
|
(:range test1 test2 body ...)
|
||||||
|
(else body ...)
|
||||||
|
(test => proc)
|
||||||
|
(test ==> vars body ...))
|
||||||
|
|
||||||
|
test ::=
|
||||||
|
integer
|
||||||
|
expression
|
||||||
|
string
|
||||||
|
|
||||||
|
|
||||||
|
New:
|
||||||
|
(else body ...)
|
||||||
|
(:range test1 test2 body ...)
|
||||||
|
(after body ...)
|
||||||
|
(test => proc)
|
||||||
|
(test ==> vars body ...)
|
||||||
|
(test body ...)
|
||||||
|
|
||||||
|
test ::= integer | sre | (WHEN exp) | exp
|
||||||
|
|
||||||
|
-------------------------------------------------------------------------------
|
||||||
|
Must disallow, due to Posix' RE_CONTEXT_INVALID_OPS
|
||||||
|
...^*...
|
||||||
|
*... ...(*... ...|*...
|
||||||
|
|... ...| ...|$... ...||... ...(|...
|
||||||
|
|
||||||
|
That is:
|
||||||
|
1. Do simplification below to remove repeats of zero-length matches.
|
||||||
|
2. An empty elt of a choice renders as ().
|
||||||
|
3. ...|$... Hack it: If first char of a rendered choice elt is $, prefix
|
||||||
|
with ().
|
||||||
|
|
||||||
|
Simplify ^{0,n} -> ""
|
||||||
|
^{m,n} -> ^ (0<m<=n)
|
||||||
|
^{m,n} -> (in) (m>n)
|
||||||
|
Similarly for bos/eos bol/eol bow/eow ""
|
||||||
|
|
||||||
|
Spencer says:
|
||||||
|
A repetition operator (?, *, +, or bounds) cannot follow
|
||||||
|
another repetition operator. A repetition operator cannot
|
||||||
|
begin an expression or subexpression or follow `^' or `|'.
|
||||||
|
|
||||||
|
`|' cannot appear first or last in a (sub)expression or
|
||||||
|
after another `|', i.e. an operand of `|' cannot be an
|
||||||
|
empty subexpression. An empty parenthesized subexpres-
|
||||||
|
sion, `()', is legal and matches an empty (sub)string. An
|
||||||
|
empty string is not a legal RE.
|
||||||
|
|
||||||
|
|
||||||
|
Fix the printer and reader so control chars are printed as
|
||||||
|
\ddd; do syntax for control-char input
|
||||||
|
|
||||||
|
-------------------------------------------------------------------------------
|
||||||
|
Less important:
|
||||||
|
- Support for searching vs. matching
|
||||||
|
- Case-scope hacking (needs s48 0.51 CODE-QUOTE)
|
||||||
|
- simp caching
|
||||||
|
- Better char-set->sre renderer
|
||||||
|
First, bound the cset with tightest possible superset,
|
||||||
|
then look for negations.
|
||||||
|
|
||||||
|
Possible interesting extensions:
|
||||||
|
- An ADT->DFA compiler
|
||||||
|
- A DFA->Scheme-code compiler
|
||||||
|
- An ADT interpreter
|
||||||
|
- A pattern notation for matching against s-expressions.
|
||||||
|
This would be handy for specifying the grammar of Scheme macros,
|
||||||
|
for example.
|
||||||
|
- Only allocate svec and evec if we match?
|
Loading…
Reference in New Issue