Ported Dorai's pregexp to Ikarus, adding many somewhat reasonable

performance hacks.
This commit is contained in:
Abdulaziz Ghuloum 2007-12-19 14:58:40 -05:00
parent 22ff670e81
commit ce3a16bc6a
2 changed files with 1229 additions and 0 deletions

437
other-libs/pregexp-test.ss Executable file
View File

@ -0,0 +1,437 @@
#!/usr/bin/env scheme-script
;;; ported to Ikarus by Abdulaziz Ghuloum on Dec 19 2007.
;last change by dorai on 2005-04-24
(import (ikarus) (pregexp))
(define *failed* 0)
(define-syntax test
(syntax-rules ()
[(_) #t]
[(_ q a rest ...)
(begin
(test-each q a)
(test rest ...))]))
(define-syntax test-each
(syntax-rules ()
[(_ expr expected-answer)
(begin
(display "Trying ")
(write 'expr)
(newline)
(display " --> ")
(let ((__actual-answer expr))
(write __actual-answer)
(display " ... ")
(if (equal? __actual-answer 'expected-answer)
(display "OK")
(begin
(set! *failed* (+ *failed* 1))
(display "FAILED!!!")
(newline)
(display " ;;; expected ")
(write 'expected-answer))))
(newline))]))
(define bottomline
(lambda ()
(newline)
(if (= *failed* 0)
(display "All tests succeeded! :-) :-) :-)")
(begin (display *failed*)
(display " test")
(if (> *failed* 1) (display "s"))
(display " failed! :-( :-( :-(")))
(newline)))
;keeping the document honest
(test
(pregexp "c.r")
(:sub (:or (:seq #\c :any #\r)))
(pregexp-match-positions "brain" "bird")
#f
(pregexp-match-positions "needle" "hay needle stack")
((4 . 10))
(pregexp-match-positions "needle"
"his hay needle stack -- my hay needle stack -- her hay needle stack"
24 43)
((31 . 37))
(pregexp-match "brain" "bird")
#f
(pregexp-match "needle" "hay needle stack")
("needle")
(pregexp-split ":" "/bin:/usr/bin:/usr/bin/X11:/usr/local/bin")
("/bin" "/usr/bin" "/usr/bin/X11" "/usr/local/bin")
(pregexp-split " " "pea soup")
("pea" "soup")
(pregexp-split "" "smithereens")
("s" "m" "i" "t" "h" "e" "r" "e" "e" "n" "s")
(pregexp-split " +" "split pea soup")
("split" "pea" "soup")
(pregexp-split " *" "split pea soup")
("s" "p" "l" "i" "t" "p" "e" "a" "s" "o" "u" "p")
(pregexp-replace "te" "liberte" "ty")
"liberty"
(pregexp-replace* "te" "liberte egalite fraternite" "ty")
"liberty egality fratyrnity"
(pregexp-match-positions "^contact" "first contact")
#f
(pregexp-match-positions "laugh$" "laugh laugh laugh laugh")
((18 . 23))
(pregexp-match-positions "yack\\b" "yackety yack")
((8 . 12))
(pregexp-match-positions "an\\B" "an analysis")
((3 . 5))
(pregexp-match "p.t" "pet")
("pet")
(pregexp-match "\\d\\d" "0 dear, 1 have to read catch 22 before 9")
("22")
(pregexp-match "[[:alpha:]_]" "--x--")
("x")
(pregexp-match "[[:alpha:]_]" "--_--")
("_")
(pregexp-match "[[:alpha:]_]" "--:--")
#f
(pregexp-match "[:alpha:]" "--a--")
("a")
(pregexp-match "[:alpha:]" "--_--")
#f
(pregexp-match-positions "c[ad]*r" "cadaddadddr")
((0 . 11))
(pregexp-match-positions "c[ad]*r" "cr")
((0 . 2))
(pregexp-match-positions "c[ad]+r" "cadaddadddr")
((0 . 11))
(pregexp-match-positions "c[ad]+r" "cr")
#f
(pregexp-match-positions "c[ad]?r" "cadaddadddr")
#f
(pregexp-match-positions "c[ad]?r" "cr")
((0 . 2))
(pregexp-match-positions "c[ad]?r" "car")
((0 . 3))
(pregexp-match "[aeiou]{3}" "vacuous")
("uou")
(pregexp-match "[aeiou]{3}" "evolve")
#f
(pregexp-match "[aeiou]{2,3}" "evolve")
#f
(pregexp-match "[aeiou]{2,3}" "zeugma")
("eu")
(pregexp-match "<.*>" "<tag1> <tag2> <tag3>")
("<tag1> <tag2> <tag3>")
(pregexp-match "<.*?>" "<tag1> <tag2> <tag3>")
("<tag1>")
(pregexp-match "([a-z]+) ([0-9]+), ([0-9]+)" "jan 1, 1970")
("jan 1, 1970" "jan" "1" "1970")
(pregexp-match "(poo )*" "poo poo platter")
("poo poo " "poo ")
(pregexp-match "([a-z ]+;)*" "lather; rinse; repeat;")
("lather; rinse; repeat;" " repeat;")
)
(define date-re
;match `month year' or `month day, year'.
;subpattern matches day, if present
(pregexp "([a-z]+) +([0-9]+,)? *([0-9]+)"))
(test
(pregexp-match date-re "jan 1, 1970")
("jan 1, 1970" "jan" "1," "1970")
(pregexp-match date-re "jan 1970")
("jan 1970" "jan" #f "1970")
(pregexp-replace "_(.+?)_"
"the _nina_, the _pinta_, and the _santa maria_"
"*\\1*")
"the *nina*, the _pinta_, and the _santa maria_"
(pregexp-replace* "_(.+?)_"
"the _nina_, the _pinta_, and the _santa maria_"
"*\\1*")
"the *nina*, the *pinta*, and the *santa maria*"
(pregexp-replace "(\\S+) (\\S+) (\\S+)"
"eat to live"
"\\3 \\2 \\1")
"live to eat"
(pregexp-match "([a-z]+) and \\1"
"billions and billions")
("billions and billions" "billions")
(pregexp-match "([a-z]+) and \\1"
"billions and millions")
#f
(pregexp-replace* "(\\S+) \\1"
"now is the the time for all good men to to come to the aid of of the party"
"\\1")
"now is the time for all good men to come to the aid of the party"
(pregexp-replace* "(\\d+)\\1"
"123340983242432420980980234"
"{\\1,\\1}")
"12{3,3}40983{24,24}3242{098,098}0234"
(pregexp-match "^(?:[a-z]*/)*([a-z]+)$" "/usr/local/bin/mzscheme")
("/usr/local/bin/mzscheme" "mzscheme")
(pregexp-match "(?i:hearth)" "HeartH")
("HeartH")
(pregexp-match "(?x: a lot)" "alot")
("alot")
(pregexp-match "(?x: a \\ lot)" "a lot")
("a lot")
(pregexp-match "(?x:
a \\ man \\; \\ ; ignore
a \\ plan \\; \\ ; me
a \\ canal ; completely
)"
"a man; a plan; a canal")
("a man; a plan; a canal")
(pregexp-match "(?ix:
a \\ man \\; \\ ; ignore
a \\ plan \\; \\ ; me
a \\ canal ; completely
)"
"A Man; a Plan; a Canal")
("A Man; a Plan; a Canal")
(pregexp-match "(?i:the (?-i:TeX)book)"
"The TeXbook")
("The TeXbook")
(pregexp-match "f(ee|i|o|um)" "a small, final fee")
("fi" "i")
(pregexp-replace* "([yi])s(e[sdr]?|ing|ation)"
"it is energising to analyse an organisation pulsing with noisy organisms"
"\\1z\\2")
"it is energizing to analyze an organization pulsing with noisy organisms"
(pregexp-match "f(?:ee|i|o|um)" "fun for all")
("fo")
(pregexp-match "call|call-with-current-continuation"
"call-with-current-continuation")
("call")
(pregexp-match "call-with-current-continuation|call"
"call-with-current-continuation")
("call-with-current-continuation")
(pregexp-match "(?:call|call-with-current-continuation) constrained"
"call-with-current-continuation constrained")
("call-with-current-continuation constrained")
(pregexp-match "(?>a+)." "aaaa")
#f
(pregexp-match-positions "grey(?=hound)"
"i left my grey socks at the greyhound")
((28 . 32))
(pregexp-match-positions "grey(?!hound)"
"the gray greyhound ate the grey socks")
((27 . 31))
(pregexp-match-positions "(?<=grey)hound"
"the hound in the picture is not a greyhound")
((38 . 43))
(pregexp-match-positions "(?<!grey)hound"
"the greyhound in the picture is not a hound")
((38 . 43))
)
(define n0-255
"(?x:
\\d ; 0 through 9
| \\d\\d ; 00 through 99
| [01]\\d\\d ;000 through 199
| 2[0-4]\\d ;200 through 249
| 25[0-5] ;250 through 255
)")
(define ip-re1
(string-append
"^" ;nothing before
n0-255 ;the first n0-255,
"(?x:" ;then the subpattern of
"\\." ;a dot followed by
n0-255 ;an n0-255,
")" ;which is
"{3}" ;repeated exactly 3 times
"$" ;with nothing following
))
(test
(pregexp-match ip-re1
"1.2.3.4")
("1.2.3.4")
(pregexp-match ip-re1
"55.155.255.265")
#f
(pregexp-match ip-re1
"0.00.000.00")
("0.00.000.00")
)
(define ip-re
(string-append
"(?=[1-9])" ;ensure there's a non-0 digit
ip-re1))
(test
(pregexp-match ip-re
"1.2.3.4")
("1.2.3.4")
(pregexp-match ip-re
"0.0.0.0")
#f
)
(set! ip-re
(string-append
"(?![0.]*$)" ;not just zeros and dots
;dot is not metachar inside []
ip-re1))
(test
(pregexp-match ip-re
"1.2.3.4")
("1.2.3.4")
(pregexp-match ip-re
"0.0.0.0")
#f
;misc
(pregexp-match "a[^a]*b" "glauber")
("aub")
(pregexp-match "a([^a]*)b" "glauber")
("aub" "u")
(pregexp-match "a([^a]*)b" "ababababab")
("ab" "")
(pregexp-match "(?x: s e * k )" "seeeeek")
("seeeeek")
(pregexp-match "(?x: t ;matches t
h ; matches h
e ;;; matches e
\\ ; ; ; matches space
\\; ; matches ;
)"
"the ;")
("the ;")
(pregexp-replace* "^(.*)$" "foobar" "\\1abc")
"foobarabc"
(pregexp-replace* "^(.*)$" "foobar" "abc\\1")
"abcfoobar"
(pregexp-replace* "(.*)$" "foobar" "abc\\1")
"abcfoobar"
)
(test
;PLT bug 6095 from Neil W. Van Dyke
(pregexp "[a-z-]")
(:sub (:or (:seq (:one-of-chars (:char-range #\a #\z) #\-))))
;
(pregexp "[-a-z]")
(:sub (:or (:seq (:one-of-chars #\- (:char-range #\a #\z)))))
;PLT bug 6442 from David T. Pierson
(pregexp-match-positions "(a(b))?c" "abc")
((0 . 3) (0 . 2) (1 . 2))
;
(pregexp-match-positions "(a(b))?c" "c")
((0 . 1) #f #f)
;PLT bug 7233 from Edi Weitz
(length (pregexp-match "(a)|(b)" "b"))
3
;PLT bug 7232 from Neil Van Dyke
(pregexp "[-a]")
(:sub (:or (:seq (:one-of-chars #\- #\a))))
;
(pregexp "[a-]")
(:sub (:or (:seq (:one-of-chars #\a #\-))))
)
(bottomline)

792
other-libs/pregexp.ss Normal file
View File

@ -0,0 +1,792 @@
;pregexp.scm
;Portable regular expressions for Scheme
;Dorai Sitaram
;http://www.ccs.neu.edu/~dorai
;dorai AT ccs DOT neu DOT edu
;Oct 2, 1999
;;; ported to ikarus by Abdulaziz Ghuloum on Dec 19, 2007.
;;; also added special cases for when the first argument to
;;; any of these procedures is a known string at macro
;;; expansion time.
(library (pregexp)
(export pregexp pregexp-match-positions pregexp-match
pregexp-split pregexp-replace pregexp-replace*)
(import (ikarus))
(define *pregexp-version* 20071219) ;last change
(define *pregexp-comment-char* #\;)
(define *pregexp-nul-char-int* #\x0)
(define *pregexp-return-char* #\return)
(define *pregexp-tab-char* #\tab)
(define *pregexp-space-sensitive?* #t)
(define pregexp-reverse!
;the useful reverse! isn't R5RS
(lambda (s)
(let loop ((s s) (r '()))
(if (null? s) r
(let ((d (cdr s)))
(set-cdr! s r)
(loop d s))))))
(define pregexp-error
;R5RS won't give me a portable error procedure.
;modify this as needed
(case-lambda
[(who) (error who "an error occurred")]
[(who msg . args) (apply error who (format "~a" msg) args)]))
(define pregexp-read-pattern
(lambda (s i n)
(if (>= i n)
(list
(list ':or (list ':seq)) i)
(let loop ((branches '()) (i i))
(if (or (>= i n)
(char=? (string-ref s i) #\)))
(list (cons ':or (pregexp-reverse! branches)) i)
(let ((vv (pregexp-read-branch
s
(if (char=? (string-ref s i) #\|) (+ i 1) i) n)))
(loop (cons (car vv) branches) (cadr vv))))))))
(define pregexp-read-branch
(lambda (s i n)
(let loop ((pieces '()) (i i))
(cond ((>= i n)
(list (cons ':seq (pregexp-reverse! pieces)) i))
((let ((c (string-ref s i)))
(or (char=? c #\|)
(char=? c #\))))
(list (cons ':seq (pregexp-reverse! pieces)) i))
(else (let ((vv (pregexp-read-piece s i n)))
(loop (cons (car vv) pieces) (cadr vv))))))))
(define pregexp-read-piece
(lambda (s i n)
(let ((c (string-ref s i)))
(case c
((#\^) (list ':bos (+ i 1)))
((#\$) (list ':eos (+ i 1)))
((#\.) (pregexp-wrap-quantifier-if-any
(list ':any (+ i 1)) s n))
((#\[) (let ((i+1 (+ i 1)))
(pregexp-wrap-quantifier-if-any
(case (and (< i+1 n) (string-ref s i+1))
((#\^)
(let ((vv (pregexp-read-char-list s (+ i 2) n)))
(list (list ':neg-char (car vv)) (cadr vv))))
(else (pregexp-read-char-list s i+1 n)))
s n)))
((#\()
(pregexp-wrap-quantifier-if-any
(pregexp-read-subpattern s (+ i 1) n) s n))
((#\\ )
(pregexp-wrap-quantifier-if-any
(cond ((pregexp-read-escaped-number s i n) =>
(lambda (num-i)
(list (list ':backref (car num-i)) (cadr num-i))))
((pregexp-read-escaped-char s i n) =>
(lambda (char-i)
(list (car char-i) (cadr char-i))))
(else (pregexp-error 'pregexp-read-piece 'backslash)))
s n))
(else
(if (or *pregexp-space-sensitive?*
(and (not (char-whitespace? c))
(not (char=? c *pregexp-comment-char*))))
(pregexp-wrap-quantifier-if-any
(list c (+ i 1)) s n)
(let loop ((i i) (in-comment? #f))
(if (>= i n) (list ':empty i)
(let ((c (string-ref s i)))
(cond (in-comment?
(loop (+ i 1)
(not (char=? c #\newline))))
((char-whitespace? c)
(loop (+ i 1) #f))
((char=? c *pregexp-comment-char*)
(loop (+ i 1) #t))
(else (list ':empty i))))))))))))
(define pregexp-read-escaped-number
(lambda (s i n)
; s[i] = \
(and (< (+ i 1) n) ;must have at least something following \
(let ((c (string-ref s (+ i 1))))
(and (char-numeric? c)
(let loop ((i (+ i 2)) (r (list c)))
(if (>= i n)
(list (string->number
(list->string (pregexp-reverse! r))) i)
(let ((c (string-ref s i)))
(if (char-numeric? c)
(loop (+ i 1) (cons c r))
(list (string->number
(list->string (pregexp-reverse! r)))
i))))))))))
(define pregexp-read-escaped-char
(lambda (s i n)
; s[i] = \
(and (< (+ i 1) n)
(let ((c (string-ref s (+ i 1))))
(case c
((#\b) (list ':wbdry (+ i 2)))
((#\B) (list ':not-wbdry (+ i 2)))
((#\d) (list ':digit (+ i 2)))
((#\D) (list '(:neg-char :digit) (+ i 2)))
((#\n) (list #\newline (+ i 2)))
((#\r) (list *pregexp-return-char* (+ i 2)))
((#\s) (list ':space (+ i 2)))
((#\S) (list '(:neg-char :space) (+ i 2)))
((#\t) (list *pregexp-tab-char* (+ i 2)))
((#\w) (list ':word (+ i 2)))
((#\W) (list '(:neg-char :word) (+ i 2)))
(else (list c (+ i 2))))))))
(define pregexp-read-posix-char-class
(lambda (s i n)
; lbrack, colon already read
(let ((neg? #f))
(let loop ((i i) (r (list #\:)))
(if (>= i n)
(pregexp-error 'pregexp-read-posix-char-class)
(let ((c (string-ref s i)))
(cond ((char=? c #\^)
(set! neg? #t)
(loop (+ i 1) r))
((char-alphabetic? c)
(loop (+ i 1) (cons c r)))
((char=? c #\:)
(if (or (>= (+ i 1) n)
(not (char=? (string-ref s (+ i 1)) #\])))
(pregexp-error 'pregexp-read-posix-char-class)
(let ((posix-class
(string->symbol
(list->string (pregexp-reverse! r)))))
(list (if neg? (list ':neg-char posix-class)
posix-class)
(+ i 2)))))
(else
(pregexp-error 'pregexp-read-posix-char-class)))))))))
(define pregexp-read-cluster-type
(lambda (s i n)
; s[i-1] = left-paren
(let ((c (string-ref s i)))
(case c
((#\?)
(let ((i (+ i 1)))
(case (string-ref s i)
((#\:) (list '() (+ i 1)))
((#\=) (list '(:lookahead) (+ i 1)))
((#\!) (list '(:neg-lookahead) (+ i 1)))
((#\>) (list '(:no-backtrack) (+ i 1)))
((#\<)
(list (case (string-ref s (+ i 1))
((#\=) '(:lookbehind))
((#\!) '(:neg-lookbehind))
(else (pregexp-error 'pregexp-read-cluster-type)))
(+ i 2)))
(else (let loop ((i i) (r '()) (inv? #f))
(let ((c (string-ref s i)))
(case c
((#\-) (loop (+ i 1) r #t))
((#\i) (loop (+ i 1)
(cons (if inv? ':case-sensitive
':case-insensitive) r) #f))
((#\x)
(set! *pregexp-space-sensitive?* inv?)
(loop (+ i 1) r #f))
((#\:) (list r (+ i 1)))
(else (pregexp-error
'pregexp-read-cluster-type)))))))))
(else (list '(:sub) i))))))
(define pregexp-read-subpattern
(lambda (s i n)
(let* ((remember-space-sensitive? *pregexp-space-sensitive?*)
(ctyp-i (pregexp-read-cluster-type s i n))
(ctyp (car ctyp-i))
(i (cadr ctyp-i))
(vv (pregexp-read-pattern s i n)))
(set! *pregexp-space-sensitive?* remember-space-sensitive?)
(let ((vv-re (car vv))
(vv-i (cadr vv)))
(if (and (< vv-i n)
(char=? (string-ref s vv-i)
#\)))
(list
(let loop ((ctyp ctyp) (re vv-re))
(if (null? ctyp) re
(loop (cdr ctyp)
(list (car ctyp) re))))
(+ vv-i 1))
(pregexp-error 'pregexp-read-subpattern))))))
(define pregexp-wrap-quantifier-if-any
(lambda (vv s n)
(let ((re (car vv)))
(let loop ((i (cadr vv)))
(if (>= i n) vv
(let ((c (string-ref s i)))
(if (and (char-whitespace? c) (not *pregexp-space-sensitive?*))
(loop (+ i 1))
(case c
((#\* #\+ #\? #\{)
(let* ((new-re (list ':between 'minimal?
'at-least 'at-most re))
(new-vv (list new-re 'next-i)))
(case c
((#\*) (set-car! (cddr new-re) 0)
(set-car! (cdddr new-re) #f))
((#\+) (set-car! (cddr new-re) 1)
(set-car! (cdddr new-re) #f))
((#\?) (set-car! (cddr new-re) 0)
(set-car! (cdddr new-re) 1))
((#\{) (let ((pq (pregexp-read-nums s (+ i 1) n)))
(if (not pq)
(pregexp-error
'pregexp-wrap-quantifier-if-any
'left-brace-must-be-followed-by-number))
(set-car! (cddr new-re) (car pq))
(set-car! (cdddr new-re) (cadr pq))
(set! i (caddr pq)))))
(let loop ((i (+ i 1)))
(if (>= i n)
(begin (set-car! (cdr new-re) #f)
(set-car! (cdr new-vv) i))
(let ((c (string-ref s i)))
(cond ((and (char-whitespace? c)
(not *pregexp-space-sensitive?*))
(loop (+ i 1)))
((char=? c #\?)
(set-car! (cdr new-re) #t)
(set-car! (cdr new-vv) (+ i 1)))
(else (set-car! (cdr new-re) #f)
(set-car! (cdr new-vv) i))))))
new-vv))
(else vv)))))))))
;
(define pregexp-read-nums
(lambda (s i n)
; s[i-1] = {
; returns (p q k) where s[k] = }
(let loop ((p '()) (q '()) (k i) (reading 1))
(if (>= k n) (pregexp-error 'pregexp-read-nums))
(let ((c (string-ref s k)))
(cond ((char-numeric? c)
(if (= reading 1)
(loop (cons c p) q (+ k 1) 1)
(loop p (cons c q) (+ k 1) 2)))
((and (char-whitespace? c) (not *pregexp-space-sensitive?*))
(loop p q (+ k 1) reading))
((and (char=? c #\,) (= reading 1))
(loop p q (+ k 1) 2))
((char=? c #\})
(let ((p (string->number (list->string (pregexp-reverse! p))))
(q (string->number (list->string (pregexp-reverse! q)))))
(cond ((and (not p) (= reading 1)) (list 0 #f k))
((= reading 1) (list p p k))
(else (list p q k)))))
(else #f))))))
(define pregexp-invert-char-list
(lambda (vv)
(set-car! (car vv) ':none-of-chars)
vv))
;
(define pregexp-read-char-list
(lambda (s i n)
(let loop ((r '()) (i i))
(if (>= i n)
(pregexp-error 'pregexp-read-char-list
'character-class-ended-too-soon)
(let ((c (string-ref s i)))
(case c
((#\]) (if (null? r)
(loop (cons c r) (+ i 1))
(list (cons ':one-of-chars (pregexp-reverse! r))
(+ i 1))))
((#\\ )
(let ((char-i (pregexp-read-escaped-char s i n)))
(if char-i (loop (cons (car char-i) r) (cadr char-i))
(pregexp-error 'pregexp-read-char-list 'backslash))))
((#\-) (if (or (null? r)
(let ((i+1 (+ i 1)))
(and (< i+1 n)
(char=? (string-ref s i+1) #\]))))
(loop (cons c r) (+ i 1))
(let ((c-prev (car r)))
(if (char? c-prev)
(loop (cons (list ':char-range c-prev
(string-ref s (+ i 1))) (cdr r))
(+ i 2))
(loop (cons c r) (+ i 1))))))
((#\[) (if (char=? (string-ref s (+ i 1)) #\:)
(let ((posix-char-class-i
(pregexp-read-posix-char-class s (+ i 2) n)))
(loop (cons (car posix-char-class-i) r)
(cadr posix-char-class-i)))
(loop (cons c r) (+ i 1))))
(else (loop (cons c r) (+ i 1)))))))))
;
(define pregexp-string-match
(lambda (s1 s i n sk fk)
(let ((n1 (string-length s1)))
(if (> n1 n) (fk)
(let loop ((j 0) (k i))
(cond ((>= j n1) (sk k))
((>= k n) (fk))
((char=? (string-ref s1 j) (string-ref s k))
(loop (+ j 1) (+ k 1)))
(else (fk))))))))
(define pregexp-char-word?
(lambda (c)
;too restrictive for Scheme but this
;is what \w is in most regexp notations
(or (char-alphabetic? c)
(char-numeric? c)
(char=? c #\_))))
(define pregexp-at-word-boundary?
(lambda (s i n)
(or (= i 0) (>= i n)
(let ((c/i (string-ref s i))
(c/i-1 (string-ref s (- i 1))))
(let ((c/i/w? (pregexp-check-if-in-char-class?
c/i ':word))
(c/i-1/w? (pregexp-check-if-in-char-class?
c/i-1 ':word)))
(or (and c/i/w? (not c/i-1/w?))
(and (not c/i/w?) c/i-1/w?)))))))
(define pregexp-check-if-in-char-class?
(lambda (c char-class)
(case char-class
((:any) (not (char=? c #\newline)))
;
((:alnum) (or (char-alphabetic? c) (char-numeric? c)))
((:alpha) (char-alphabetic? c))
((:ascii) (< (char->integer c) 128))
((:blank) (or (char=? c #\space) (char=? c *pregexp-tab-char*)))
((:cntrl) (< (char->integer c) 32))
((:digit) (char-numeric? c))
((:graph) (and (>= (char->integer c) 32)
(not (char-whitespace? c))))
((:lower) (char-lower-case? c))
((:print) (>= (char->integer c) 32))
((:punct) (and (>= (char->integer c) 32)
(not (char-whitespace? c))
(not (char-alphabetic? c))
(not (char-numeric? c))))
((:space) (char-whitespace? c))
((:upper) (char-upper-case? c))
((:word) (or (char-alphabetic? c)
(char-numeric? c)
(char=? c #\_)))
((:xdigit) (or (char-numeric? c)
(char-ci=? c #\a) (char-ci=? c #\b)
(char-ci=? c #\c) (char-ci=? c #\d)
(char-ci=? c #\e) (char-ci=? c #\f)))
(else (pregexp-error 'pregexp-check-if-in-char-class?)))))
(define pregexp-list-ref
(lambda (s i)
;like list-ref but returns #f if index is
;out of bounds
(let loop ((s s) (k 0))
(cond ((null? s) #f)
((= k i) (car s))
(else (loop (cdr s) (+ k 1)))))))
;re is a compiled regexp. It's a list that can't be
;nil. pregexp-match-positions-aux returns a 2-elt list whose
;car is the string-index following the matched
;portion and whose cadr contains the submatches.
;The proc returns false if there's no match.
;Am spelling loop- as loup- because these shouldn't
;be translated into CL loops by scm2cl (although
;they are tail-recursive in Scheme)
(define pregexp-make-backref-list
(lambda (re)
(let sub ((re re))
(if (pair? re)
(let ((car-re (car re))
(sub-cdr-re (sub (cdr re))))
(if (eqv? car-re ':sub)
(cons (cons re #f) sub-cdr-re)
(append (sub car-re) sub-cdr-re)))
'()))))
(define pregexp-match-positions-aux
(lambda (re s sn start n i)
(let ((identity (lambda (x) x))
(backrefs (pregexp-make-backref-list re))
(case-sensitive? #t))
(let sub ((re re) (i i) (sk identity) (fk (lambda () #f)))
;(printf "sub ~s ~s\n" i re)
(cond ((eqv? re ':bos)
;(if (= i 0) (sk i) (fk))
(if (= i start) (sk i) (fk))
)
((eqv? re ':eos)
;(if (>= i sn) (sk i) (fk))
(if (>= i n) (sk i) (fk))
)
((eqv? re ':empty)
(sk i))
((eqv? re ':wbdry)
(if (pregexp-at-word-boundary? s i n)
(sk i)
(fk)))
((eqv? re ':not-wbdry)
(if (pregexp-at-word-boundary? s i n)
(fk)
(sk i)))
((and (char? re) (< i n))
;(printf "bingo\n")
(if ((if case-sensitive? char=? char-ci=?)
(string-ref s i) re)
(sk (+ i 1)) (fk)))
((and (not (pair? re)) (< i n))
(if (pregexp-check-if-in-char-class?
(string-ref s i) re)
(sk (+ i 1)) (fk)))
((and (pair? re) (eqv? (car re) ':char-range) (< i n))
(let ((c (string-ref s i)))
(if (let ((c< (if case-sensitive? char<=? char-ci<=?)))
(and (c< (cadr re) c)
(c< c (caddr re))))
(sk (+ i 1)) (fk))))
((pair? re)
(case (car re)
((:char-range)
(if (>= i n) (fk)
(pregexp-error 'pregexp-match-positions-aux)))
((:one-of-chars)
(if (>= i n) (fk)
(let loup-one-of-chars ((chars (cdr re)))
(if (null? chars) (fk)
(sub (car chars) i sk
(lambda ()
(loup-one-of-chars (cdr chars))))))))
((:neg-char)
(if (>= i n) (fk)
(sub (cadr re) i
(lambda (i1) (fk))
(lambda () (sk (+ i 1))))))
((:seq)
(let loup-seq ((res (cdr re)) (i i))
(if (null? res) (sk i )
(sub (car res) i
(lambda (i1 )
(loup-seq (cdr res) i1 ))
fk))))
((:or)
(let loup-or ((res (cdr re)))
(if (null? res) (fk)
(sub (car res) i
(lambda (i1 )
(or (sk i1 )
(loup-or (cdr res))))
(lambda () (loup-or (cdr res)))))))
((:backref)
(let* ((c (pregexp-list-ref backrefs (cadr re)))
(backref
(cond (c => cdr)
(else
(pregexp-error 'pregexp-match-positions-aux
'non-existent-backref re)
#f))))
(if backref
(pregexp-string-match
(substring s (car backref) (cdr backref))
s i n (lambda (i) (sk i)) fk)
(sk i))))
((:sub)
(sub (cadr re) i
(lambda (i1)
(set-cdr! (assv re backrefs) (cons i i1))
(sk i1)) fk))
((:lookahead)
(let ((found-it?
(sub (cadr re) i
identity (lambda () #f))))
(if found-it? (sk i) (fk))))
((:neg-lookahead)
(let ((found-it?
(sub (cadr re) i
identity (lambda () #f))))
(if found-it? (fk) (sk i))))
((:lookbehind)
(let ((n-actual n) (sn-actual sn))
(set! n i) (set! sn i)
(let ((found-it?
(sub (list ':seq '(:between #f 0 #f :any)
(cadr re) ':eos) 0
identity (lambda () #f))))
(set! n n-actual) (set! sn sn-actual)
(if found-it? (sk i) (fk)))))
((:neg-lookbehind)
(let ((n-actual n) (sn-actual sn))
(set! n i) (set! sn i)
(let ((found-it?
(sub (list ':seq '(:between #f 0 #f :any)
(cadr re) ':eos) 0
identity (lambda () #f))))
(set! n n-actual) (set! sn sn-actual)
(if found-it? (fk) (sk i)))))
((:no-backtrack)
(let ((found-it? (sub (cadr re) i
identity (lambda () #f))))
(if found-it?
(sk found-it?)
(fk))))
((:case-sensitive :case-insensitive)
(let ((old case-sensitive?))
(set! case-sensitive?
(eqv? (car re) ':case-sensitive))
(sub (cadr re) i
(lambda (i1)
(set! case-sensitive? old)
(sk i1))
(lambda ()
(set! case-sensitive? old)
(fk)))))
((:between)
(let* ((maximal? (not (cadr re)))
(p (caddr re))
(q (cadddr re))
(could-loop-infinitely? (and maximal? (not q)))
(re (car (cddddr re))))
(let loup-p ((k 0) (i i) )
(if (< k p)
(sub re i
(lambda (i1 )
(if (and could-loop-infinitely?
(= i1 i))
(pregexp-error
'pregexp-match-positions-aux
'greedy-quantifier-operand-could-be-empty))
(loup-p (+ k 1) i1 ))
fk)
(let ((q (and q (- q p))))
(let loup-q ((k 0) (i i))
(let ((fk (lambda ()
(sk i ))))
(if (and q (>= k q)) (fk)
(if maximal?
(sub re i
(lambda (i1)
(if (and could-loop-infinitely?
(= i1 i))
(pregexp-error
'pregexp-match-positions-aux
'greedy-quantifier-operand-could-be-empty))
(or (loup-q (+ k 1) i1)
(fk)))
fk)
(or (fk)
(sub re i
(lambda (i1)
(loup-q (+ k 1) i1))
fk)))))))))))
(else (pregexp-error 'pregexp-match-positions-aux))))
((>= i n) (fk))
(else (pregexp-error 'pregexp-match-positions-aux))))
;(printf "done\n")
(let ((backrefs (map cdr backrefs)))
(and (car backrefs) backrefs)))))
(define pregexp-replace-aux
(lambda (str ins n backrefs)
(let loop ((i 0) (r ""))
(if (>= i n) r
(let ((c (string-ref ins i)))
(if (char=? c #\\ )
(let* ((br-i (pregexp-read-escaped-number ins i n))
(br (if br-i (car br-i)
(if (char=? (string-ref ins (+ i 1)) #\&) 0
#f)))
(i (if br-i (cadr br-i)
(if br (+ i 2)
(+ i 1)))))
(if (not br)
(let ((c2 (string-ref ins i)))
(loop (+ i 1)
(if (char=? c2 #\$) r
(string-append r (string c2)))))
(loop i
(let ((backref (pregexp-list-ref backrefs br)))
(if backref
(string-append r
(substring str (car backref) (cdr backref)))
r)))))
(loop (+ i 1) (string-append r (string c)))))))))
(define pregexp-proc
(let ([pregexp
(lambda (s)
(set! *pregexp-space-sensitive?* #t) ;in case it got corrupted
(list ':sub (car (pregexp-read-pattern s 0 (string-length s)))))])
pregexp))
(define-syntax pregexp
(lambda (x) ;;; aziz's touch
(syntax-case x ()
[(_ str)
(string? (syntax->datum #'str))
#'(let-syntax ([foo
(lambda (t)
(list #'quote (pregexp-proc str)))])
foo)]
[(_ args ...) #'(pregexp-proc args ...)]
[id (identifier? #'id) #'pregexp-proc])))
(define-syntax define-regexp-proc
(lambda (stx)
(syntax-case stx ()
[(_ name expr)
(with-syntax ([(name^) (generate-temporaries (list #'name))])
#'(begin
(define name^ expr)
(define-syntax name
(lambda (x)
(syntax-case x ()
[(_ str args (... ...)) (string? #'str)
#'(name^ (pregexp str) args (... ...))]
[(_ args (... ...))
#'(name^ args (... ...))]
[id (identifier? #'id) #'name^])))))])))
(define-regexp-proc pregexp-match-positions
(lambda (pat str . opt-args)
(cond ((string? pat) (set! pat (pregexp pat)))
((pair? pat) #t)
(else (pregexp-error 'pregexp-match-positions
'pattern-must-be-compiled-or-string-regexp
pat)))
(let* ((str-len (string-length str))
(start (if (null? opt-args) 0
(let ((start (car opt-args)))
(set! opt-args (cdr opt-args))
start)))
(end (if (null? opt-args) str-len
(car opt-args))))
(let loop ((i start))
(and (<= i end)
(or (pregexp-match-positions-aux
pat str str-len start end i)
(loop (+ i 1))))))))
(define-regexp-proc pregexp-match
(lambda (pat str . opt-args)
(let ((ix-prs (apply pregexp-match-positions pat str opt-args)))
(and ix-prs
(map
(lambda (ix-pr)
(and ix-pr
(substring str (car ix-pr) (cdr ix-pr))))
ix-prs)))))
(define-regexp-proc pregexp-split
(lambda (pat str)
;split str into substrings, using pat as delimiter
(let ((n (string-length str))
;;; aziz
(pat (if (string? pat) (pregexp pat) pat)))
(let loop ((i 0) (r '()) (picked-up-one-undelimited-char? #f))
(cond ((>= i n) (pregexp-reverse! r))
((pregexp-match-positions pat str i n)
=>
(lambda (y)
(let ((jk (car y)))
(let ((j (car jk)) (k (cdr jk)))
;(printf "j = ~a; k = ~a; i = ~a~n" j k i)
(cond ((= j k)
;(printf "producing ~s~n" (substring str i (+ j 1)))
(loop (+ k 1)
(cons (substring str i (+ j 1)) r) #t))
((and (= j i) picked-up-one-undelimited-char?)
(loop k r #f))
(else
;(printf "producing ~s~n" (substring str i j))
(loop k (cons (substring str i j) r) #f)))))))
(else (loop n (cons (substring str i n) r) #f)))))))
(define-regexp-proc pregexp-replace
(lambda (pat str ins)
(let* ((n (string-length str))
(pp (pregexp-match-positions pat str 0 n)))
(if (not pp) str
(let ((ins-len (string-length ins))
(m-i (caar pp))
(m-n (cdar pp)))
(string-append
(substring str 0 m-i)
(pregexp-replace-aux str ins ins-len pp)
(substring str m-n n)))))))
(define-regexp-proc pregexp-replace*
(lambda (pat str ins)
;return str with every occurrence of pat
;replaced by ins
(let ((pat (if (string? pat) (pregexp pat) pat))
(n (string-length str))
(ins-len (string-length ins)))
(let loop ((i 0) (r ""))
;i = index in str to start replacing from
;r = already calculated prefix of answer
(if (>= i n) r
(let ((pp (pregexp-match-positions pat str i n)))
(if (not pp)
(if (= i 0)
;this implies pat didn't match str at
;all, so let's return original str
str
;else: all matches already found and
;replaced in r, so let's just
;append the rest of str
(string-append
r (substring str i n)))
(loop (cdar pp)
(string-append
r
(substring str i (caar pp))
(pregexp-replace-aux str ins ins-len pp))))))))))
(define pregexp-quote
(lambda (s)
(let loop ((i (- (string-length s) 1)) (r '()))
(if (< i 0) (list->string r)
(loop (- i 1)
(let ((c (string-ref s i)))
(if (memv c '(#\\ #\. #\? #\* #\+ #\| #\^ #\$
#\[ #\] #\{ #\} #\( #\)))
(cons #\\ (cons c r))
(cons c r))))))))
)