From d87538a015a50dd6a2dac2fd31c1a77ca9a02cf5 Mon Sep 17 00:00:00 2001 From: Abdulaziz Ghuloum Date: Wed, 19 Dec 2007 14:33:21 -0500 Subject: [PATCH] Ported Dorai's pregexp to Ikarus. --- other-libs/pregexp-test.ss | 437 +++++++++++++++++++++ other-libs/pregexp.ss | 770 +++++++++++++++++++++++++++++++++++++ 2 files changed, 1207 insertions(+) create mode 100755 other-libs/pregexp-test.ss create mode 100644 other-libs/pregexp.ss diff --git a/other-libs/pregexp-test.ss b/other-libs/pregexp-test.ss new file mode 100755 index 0000000..00dda8f --- /dev/null +++ b/other-libs/pregexp-test.ss @@ -0,0 +1,437 @@ +#!/usr/bin/env scheme-script +;;; ported to Ikarus by Abdulaziz Ghuloum on Dec 19 2007. + +;last change by dorai on 2005-04-24 + + +(import (ikarus) (pregexp)) + +(define *failed* 0) + +(define-syntax test + (syntax-rules () + [(_) #t] + [(_ q a rest ...) + (begin + (test-each q a) + (test rest ...))])) + +(define-syntax test-each + (syntax-rules () + [(_ expr expected-answer) + (begin + (display "Trying ") + (write 'expr) + (newline) + (display " --> ") + (let ((__actual-answer expr)) + (write __actual-answer) + (display " ... ") + (if (equal? __actual-answer 'expected-answer) + (display "OK") + (begin + (set! *failed* (+ *failed* 1)) + (display "FAILED!!!") + (newline) + (display " ;;; expected ") + (write 'expected-answer)))) + (newline))])) + +(define bottomline + (lambda () + (newline) + (if (= *failed* 0) + (display "All tests succeeded! :-) :-) :-)") + (begin (display *failed*) + (display " test") + (if (> *failed* 1) (display "s")) + (display " failed! :-( :-( :-("))) + (newline))) + +;keeping the document honest + +(test + + (pregexp "c.r") + (:sub (:or (:seq #\c :any #\r))) + + (pregexp-match-positions "brain" "bird") + #f + + (pregexp-match-positions "needle" "hay needle stack") + ((4 . 10)) + + (pregexp-match-positions "needle" + "his hay needle stack -- my hay needle stack -- her hay needle stack" + 24 43) + ((31 . 37)) + + (pregexp-match "brain" "bird") + #f + + (pregexp-match "needle" "hay needle stack") + ("needle") + + (pregexp-split ":" "/bin:/usr/bin:/usr/bin/X11:/usr/local/bin") + ("/bin" "/usr/bin" "/usr/bin/X11" "/usr/local/bin") + + (pregexp-split " " "pea soup") + ("pea" "soup") + + (pregexp-split "" "smithereens") + ("s" "m" "i" "t" "h" "e" "r" "e" "e" "n" "s") + + (pregexp-split " +" "split pea soup") + ("split" "pea" "soup") + + (pregexp-split " *" "split pea soup") + ("s" "p" "l" "i" "t" "p" "e" "a" "s" "o" "u" "p") + + (pregexp-replace "te" "liberte" "ty") + "liberty" + + (pregexp-replace* "te" "liberte egalite fraternite" "ty") + "liberty egality fratyrnity" + + (pregexp-match-positions "^contact" "first contact") + #f + + (pregexp-match-positions "laugh$" "laugh laugh laugh laugh") + ((18 . 23)) + + (pregexp-match-positions "yack\\b" "yackety yack") + ((8 . 12)) + + (pregexp-match-positions "an\\B" "an analysis") + ((3 . 5)) + + (pregexp-match "p.t" "pet") + ("pet") + + + (pregexp-match "\\d\\d" "0 dear, 1 have to read catch 22 before 9") + ("22") + + (pregexp-match "[[:alpha:]_]" "--x--") + ("x") + + (pregexp-match "[[:alpha:]_]" "--_--") + ("_") + + (pregexp-match "[[:alpha:]_]" "--:--") + #f + + (pregexp-match "[:alpha:]" "--a--") + ("a") + + (pregexp-match "[:alpha:]" "--_--") + #f + + (pregexp-match-positions "c[ad]*r" "cadaddadddr") + ((0 . 11)) + + (pregexp-match-positions "c[ad]*r" "cr") + ((0 . 2)) + + (pregexp-match-positions "c[ad]+r" "cadaddadddr") + ((0 . 11)) + + (pregexp-match-positions "c[ad]+r" "cr") + #f + + (pregexp-match-positions "c[ad]?r" "cadaddadddr") + #f + + (pregexp-match-positions "c[ad]?r" "cr") + ((0 . 2)) + + (pregexp-match-positions "c[ad]?r" "car") + ((0 . 3)) + + (pregexp-match "[aeiou]{3}" "vacuous") + ("uou") + + (pregexp-match "[aeiou]{3}" "evolve") + #f + + (pregexp-match "[aeiou]{2,3}" "evolve") + #f + + (pregexp-match "[aeiou]{2,3}" "zeugma") + ("eu") + + (pregexp-match "<.*>" " ") + (" ") + + + (pregexp-match "<.*?>" " ") + ("") + + (pregexp-match "([a-z]+) ([0-9]+), ([0-9]+)" "jan 1, 1970") + ("jan 1, 1970" "jan" "1" "1970") + + (pregexp-match "(poo )*" "poo poo platter") + ("poo poo " "poo ") + + (pregexp-match "([a-z ]+;)*" "lather; rinse; repeat;") + ("lather; rinse; repeat;" " repeat;") + + ) + +(define date-re + ;match `month year' or `month day, year'. + ;subpattern matches day, if present + (pregexp "([a-z]+) +([0-9]+,)? *([0-9]+)")) + +(test + + (pregexp-match date-re "jan 1, 1970") + ("jan 1, 1970" "jan" "1," "1970") + + (pregexp-match date-re "jan 1970") + ("jan 1970" "jan" #f "1970") + + (pregexp-replace "_(.+?)_" + "the _nina_, the _pinta_, and the _santa maria_" + "*\\1*") + "the *nina*, the _pinta_, and the _santa maria_" + + (pregexp-replace* "_(.+?)_" + "the _nina_, the _pinta_, and the _santa maria_" + "*\\1*") + "the *nina*, the *pinta*, and the *santa maria*" + + (pregexp-replace "(\\S+) (\\S+) (\\S+)" + "eat to live" + "\\3 \\2 \\1") + "live to eat" + + (pregexp-match "([a-z]+) and \\1" + "billions and billions") + ("billions and billions" "billions") + + (pregexp-match "([a-z]+) and \\1" + "billions and millions") + #f + + (pregexp-replace* "(\\S+) \\1" + "now is the the time for all good men to to come to the aid of of the party" + "\\1") + "now is the time for all good men to come to the aid of the party" + + (pregexp-replace* "(\\d+)\\1" + "123340983242432420980980234" + "{\\1,\\1}") + "12{3,3}40983{24,24}3242{098,098}0234" + + (pregexp-match "^(?:[a-z]*/)*([a-z]+)$" "/usr/local/bin/mzscheme") + ("/usr/local/bin/mzscheme" "mzscheme") + + (pregexp-match "(?i:hearth)" "HeartH") + ("HeartH") + + (pregexp-match "(?x: a lot)" "alot") + ("alot") + + (pregexp-match "(?x: a \\ lot)" "a lot") + ("a lot") + + (pregexp-match "(?x: + a \\ man \\; \\ ; ignore + a \\ plan \\; \\ ; me + a \\ canal ; completely + )" + "a man; a plan; a canal") + ("a man; a plan; a canal") + + (pregexp-match "(?ix: + a \\ man \\; \\ ; ignore + a \\ plan \\; \\ ; me + a \\ canal ; completely + )" + "A Man; a Plan; a Canal") + ("A Man; a Plan; a Canal") + + (pregexp-match "(?i:the (?-i:TeX)book)" + "The TeXbook") + ("The TeXbook") + + (pregexp-match "f(ee|i|o|um)" "a small, final fee") + ("fi" "i") + + (pregexp-replace* "([yi])s(e[sdr]?|ing|ation)" + "it is energising to analyse an organisation pulsing with noisy organisms" + "\\1z\\2") + "it is energizing to analyze an organization pulsing with noisy organisms" + + (pregexp-match "f(?:ee|i|o|um)" "fun for all") + ("fo") + + (pregexp-match "call|call-with-current-continuation" + "call-with-current-continuation") + ("call") + + (pregexp-match "call-with-current-continuation|call" + "call-with-current-continuation") + ("call-with-current-continuation") + + (pregexp-match "(?:call|call-with-current-continuation) constrained" + "call-with-current-continuation constrained") + ("call-with-current-continuation constrained") + + (pregexp-match "(?>a+)." "aaaa") + #f + + (pregexp-match-positions "grey(?=hound)" + "i left my grey socks at the greyhound") + ((28 . 32)) + + (pregexp-match-positions "grey(?!hound)" + "the gray greyhound ate the grey socks") + ((27 . 31)) + + (pregexp-match-positions "(?<=grey)hound" + "the hound in the picture is not a greyhound") + ((38 . 43)) + + (pregexp-match-positions "(?= i n) + (list + (list ':or (list ':seq)) i) + (let loop ((branches '()) (i i)) + (if (or (>= i n) + (char=? (string-ref s i) #\))) + (list (cons ':or (pregexp-reverse! branches)) i) + (let ((vv (pregexp-read-branch + s + (if (char=? (string-ref s i) #\|) (+ i 1) i) n))) + (loop (cons (car vv) branches) (cadr vv)))))))) + +(define pregexp-read-branch + (lambda (s i n) + (let loop ((pieces '()) (i i)) + (cond ((>= i n) + (list (cons ':seq (pregexp-reverse! pieces)) i)) + ((let ((c (string-ref s i))) + (or (char=? c #\|) + (char=? c #\)))) + (list (cons ':seq (pregexp-reverse! pieces)) i)) + (else (let ((vv (pregexp-read-piece s i n))) + (loop (cons (car vv) pieces) (cadr vv)))))))) + +(define pregexp-read-piece + (lambda (s i n) + (let ((c (string-ref s i))) + (case c + ((#\^) (list ':bos (+ i 1))) + ((#\$) (list ':eos (+ i 1))) + ((#\.) (pregexp-wrap-quantifier-if-any + (list ':any (+ i 1)) s n)) + ((#\[) (let ((i+1 (+ i 1))) + (pregexp-wrap-quantifier-if-any + (case (and (< i+1 n) (string-ref s i+1)) + ((#\^) + (let ((vv (pregexp-read-char-list s (+ i 2) n))) + (list (list ':neg-char (car vv)) (cadr vv)))) + (else (pregexp-read-char-list s i+1 n))) + s n))) + ((#\() + (pregexp-wrap-quantifier-if-any + (pregexp-read-subpattern s (+ i 1) n) s n)) + ((#\\ ) + (pregexp-wrap-quantifier-if-any + (cond ((pregexp-read-escaped-number s i n) => + (lambda (num-i) + (list (list ':backref (car num-i)) (cadr num-i)))) + ((pregexp-read-escaped-char s i n) => + (lambda (char-i) + (list (car char-i) (cadr char-i)))) + (else (pregexp-error 'pregexp-read-piece 'backslash))) + s n)) + (else + (if (or *pregexp-space-sensitive?* + (and (not (char-whitespace? c)) + (not (char=? c *pregexp-comment-char*)))) + (pregexp-wrap-quantifier-if-any + (list c (+ i 1)) s n) + (let loop ((i i) (in-comment? #f)) + (if (>= i n) (list ':empty i) + (let ((c (string-ref s i))) + (cond (in-comment? + (loop (+ i 1) + (not (char=? c #\newline)))) + ((char-whitespace? c) + (loop (+ i 1) #f)) + ((char=? c *pregexp-comment-char*) + (loop (+ i 1) #t)) + (else (list ':empty i)))))))))))) + +(define pregexp-read-escaped-number + (lambda (s i n) + ; s[i] = \ + (and (< (+ i 1) n) ;must have at least something following \ + (let ((c (string-ref s (+ i 1)))) + (and (char-numeric? c) + (let loop ((i (+ i 2)) (r (list c))) + (if (>= i n) + (list (string->number + (list->string (pregexp-reverse! r))) i) + (let ((c (string-ref s i))) + (if (char-numeric? c) + (loop (+ i 1) (cons c r)) + (list (string->number + (list->string (pregexp-reverse! r))) + i)))))))))) + +(define pregexp-read-escaped-char + (lambda (s i n) + ; s[i] = \ + (and (< (+ i 1) n) + (let ((c (string-ref s (+ i 1)))) + (case c + ((#\b) (list ':wbdry (+ i 2))) + ((#\B) (list ':not-wbdry (+ i 2))) + ((#\d) (list ':digit (+ i 2))) + ((#\D) (list '(:neg-char :digit) (+ i 2))) + ((#\n) (list #\newline (+ i 2))) + ((#\r) (list *pregexp-return-char* (+ i 2))) + ((#\s) (list ':space (+ i 2))) + ((#\S) (list '(:neg-char :space) (+ i 2))) + ((#\t) (list *pregexp-tab-char* (+ i 2))) + ((#\w) (list ':word (+ i 2))) + ((#\W) (list '(:neg-char :word) (+ i 2))) + (else (list c (+ i 2)))))))) + +(define pregexp-read-posix-char-class + (lambda (s i n) + ; lbrack, colon already read + (let ((neg? #f)) + (let loop ((i i) (r (list #\:))) + (if (>= i n) + (pregexp-error 'pregexp-read-posix-char-class) + (let ((c (string-ref s i))) + (cond ((char=? c #\^) + (set! neg? #t) + (loop (+ i 1) r)) + ((char-alphabetic? c) + (loop (+ i 1) (cons c r))) + ((char=? c #\:) + (if (or (>= (+ i 1) n) + (not (char=? (string-ref s (+ i 1)) #\]))) + (pregexp-error 'pregexp-read-posix-char-class) + (let ((posix-class + (string->symbol + (list->string (pregexp-reverse! r))))) + (list (if neg? (list ':neg-char posix-class) + posix-class) + (+ i 2))))) + (else + (pregexp-error 'pregexp-read-posix-char-class))))))))) + +(define pregexp-read-cluster-type + (lambda (s i n) + ; s[i-1] = left-paren + (let ((c (string-ref s i))) + (case c + ((#\?) + (let ((i (+ i 1))) + (case (string-ref s i) + ((#\:) (list '() (+ i 1))) + ((#\=) (list '(:lookahead) (+ i 1))) + ((#\!) (list '(:neg-lookahead) (+ i 1))) + ((#\>) (list '(:no-backtrack) (+ i 1))) + ((#\<) + (list (case (string-ref s (+ i 1)) + ((#\=) '(:lookbehind)) + ((#\!) '(:neg-lookbehind)) + (else (pregexp-error 'pregexp-read-cluster-type))) + (+ i 2))) + (else (let loop ((i i) (r '()) (inv? #f)) + (let ((c (string-ref s i))) + (case c + ((#\-) (loop (+ i 1) r #t)) + ((#\i) (loop (+ i 1) + (cons (if inv? ':case-sensitive + ':case-insensitive) r) #f)) + ((#\x) + (set! *pregexp-space-sensitive?* inv?) + (loop (+ i 1) r #f)) + ((#\:) (list r (+ i 1))) + (else (pregexp-error + 'pregexp-read-cluster-type))))))))) + (else (list '(:sub) i)))))) + +(define pregexp-read-subpattern + (lambda (s i n) + (let* ((remember-space-sensitive? *pregexp-space-sensitive?*) + (ctyp-i (pregexp-read-cluster-type s i n)) + (ctyp (car ctyp-i)) + (i (cadr ctyp-i)) + (vv (pregexp-read-pattern s i n))) + (set! *pregexp-space-sensitive?* remember-space-sensitive?) + (let ((vv-re (car vv)) + (vv-i (cadr vv))) + (if (and (< vv-i n) + (char=? (string-ref s vv-i) + #\))) + (list + (let loop ((ctyp ctyp) (re vv-re)) + (if (null? ctyp) re + (loop (cdr ctyp) + (list (car ctyp) re)))) + (+ vv-i 1)) + (pregexp-error 'pregexp-read-subpattern)))))) + +(define pregexp-wrap-quantifier-if-any + (lambda (vv s n) + (let ((re (car vv))) + (let loop ((i (cadr vv))) + (if (>= i n) vv + (let ((c (string-ref s i))) + (if (and (char-whitespace? c) (not *pregexp-space-sensitive?*)) + (loop (+ i 1)) + (case c + ((#\* #\+ #\? #\{) + (let* ((new-re (list ':between 'minimal? + 'at-least 'at-most re)) + (new-vv (list new-re 'next-i))) + (case c + ((#\*) (set-car! (cddr new-re) 0) + (set-car! (cdddr new-re) #f)) + ((#\+) (set-car! (cddr new-re) 1) + (set-car! (cdddr new-re) #f)) + ((#\?) (set-car! (cddr new-re) 0) + (set-car! (cdddr new-re) 1)) + ((#\{) (let ((pq (pregexp-read-nums s (+ i 1) n))) + (if (not pq) + (pregexp-error + 'pregexp-wrap-quantifier-if-any + 'left-brace-must-be-followed-by-number)) + (set-car! (cddr new-re) (car pq)) + (set-car! (cdddr new-re) (cadr pq)) + (set! i (caddr pq))))) + (let loop ((i (+ i 1))) + (if (>= i n) + (begin (set-car! (cdr new-re) #f) + (set-car! (cdr new-vv) i)) + (let ((c (string-ref s i))) + (cond ((and (char-whitespace? c) + (not *pregexp-space-sensitive?*)) + (loop (+ i 1))) + ((char=? c #\?) + (set-car! (cdr new-re) #t) + (set-car! (cdr new-vv) (+ i 1))) + (else (set-car! (cdr new-re) #f) + (set-car! (cdr new-vv) i)))))) + new-vv)) + (else vv))))))))) + +; + +(define pregexp-read-nums + (lambda (s i n) + ; s[i-1] = { + ; returns (p q k) where s[k] = } + (let loop ((p '()) (q '()) (k i) (reading 1)) + (if (>= k n) (pregexp-error 'pregexp-read-nums)) + (let ((c (string-ref s k))) + (cond ((char-numeric? c) + (if (= reading 1) + (loop (cons c p) q (+ k 1) 1) + (loop p (cons c q) (+ k 1) 2))) + ((and (char-whitespace? c) (not *pregexp-space-sensitive?*)) + (loop p q (+ k 1) reading)) + ((and (char=? c #\,) (= reading 1)) + (loop p q (+ k 1) 2)) + ((char=? c #\}) + (let ((p (string->number (list->string (pregexp-reverse! p)))) + (q (string->number (list->string (pregexp-reverse! q))))) + (cond ((and (not p) (= reading 1)) (list 0 #f k)) + ((= reading 1) (list p p k)) + (else (list p q k))))) + (else #f)))))) + +(define pregexp-invert-char-list + (lambda (vv) + (set-car! (car vv) ':none-of-chars) + vv)) + +; + +(define pregexp-read-char-list + (lambda (s i n) + (let loop ((r '()) (i i)) + (if (>= i n) + (pregexp-error 'pregexp-read-char-list + 'character-class-ended-too-soon) + (let ((c (string-ref s i))) + (case c + ((#\]) (if (null? r) + (loop (cons c r) (+ i 1)) + (list (cons ':one-of-chars (pregexp-reverse! r)) + (+ i 1)))) + ((#\\ ) + (let ((char-i (pregexp-read-escaped-char s i n))) + (if char-i (loop (cons (car char-i) r) (cadr char-i)) + (pregexp-error 'pregexp-read-char-list 'backslash)))) + ((#\-) (if (or (null? r) + (let ((i+1 (+ i 1))) + (and (< i+1 n) + (char=? (string-ref s i+1) #\])))) + (loop (cons c r) (+ i 1)) + (let ((c-prev (car r))) + (if (char? c-prev) + (loop (cons (list ':char-range c-prev + (string-ref s (+ i 1))) (cdr r)) + (+ i 2)) + (loop (cons c r) (+ i 1)))))) + ((#\[) (if (char=? (string-ref s (+ i 1)) #\:) + (let ((posix-char-class-i + (pregexp-read-posix-char-class s (+ i 2) n))) + (loop (cons (car posix-char-class-i) r) + (cadr posix-char-class-i))) + (loop (cons c r) (+ i 1)))) + (else (loop (cons c r) (+ i 1))))))))) + + +; + +(define pregexp-string-match + (lambda (s1 s i n sk fk) + (let ((n1 (string-length s1))) + (if (> n1 n) (fk) + (let loop ((j 0) (k i)) + (cond ((>= j n1) (sk k)) + ((>= k n) (fk)) + ((char=? (string-ref s1 j) (string-ref s k)) + (loop (+ j 1) (+ k 1))) + (else (fk)))))))) + +(define pregexp-char-word? + (lambda (c) + ;too restrictive for Scheme but this + ;is what \w is in most regexp notations + (or (char-alphabetic? c) + (char-numeric? c) + (char=? c #\_)))) + +(define pregexp-at-word-boundary? + (lambda (s i n) + (or (= i 0) (>= i n) + (let ((c/i (string-ref s i)) + (c/i-1 (string-ref s (- i 1)))) + (let ((c/i/w? (pregexp-check-if-in-char-class? + c/i ':word)) + (c/i-1/w? (pregexp-check-if-in-char-class? + c/i-1 ':word))) + (or (and c/i/w? (not c/i-1/w?)) + (and (not c/i/w?) c/i-1/w?))))))) + +(define pregexp-check-if-in-char-class? + (lambda (c char-class) + (case char-class + ((:any) (not (char=? c #\newline))) + ; + ((:alnum) (or (char-alphabetic? c) (char-numeric? c))) + ((:alpha) (char-alphabetic? c)) + ((:ascii) (< (char->integer c) 128)) + ((:blank) (or (char=? c #\space) (char=? c *pregexp-tab-char*))) + ((:cntrl) (< (char->integer c) 32)) + ((:digit) (char-numeric? c)) + ((:graph) (and (>= (char->integer c) 32) + (not (char-whitespace? c)))) + ((:lower) (char-lower-case? c)) + ((:print) (>= (char->integer c) 32)) + ((:punct) (and (>= (char->integer c) 32) + (not (char-whitespace? c)) + (not (char-alphabetic? c)) + (not (char-numeric? c)))) + ((:space) (char-whitespace? c)) + ((:upper) (char-upper-case? c)) + ((:word) (or (char-alphabetic? c) + (char-numeric? c) + (char=? c #\_))) + ((:xdigit) (or (char-numeric? c) + (char-ci=? c #\a) (char-ci=? c #\b) + (char-ci=? c #\c) (char-ci=? c #\d) + (char-ci=? c #\e) (char-ci=? c #\f))) + (else (pregexp-error 'pregexp-check-if-in-char-class?))))) + +(define pregexp-list-ref + (lambda (s i) + ;like list-ref but returns #f if index is + ;out of bounds + (let loop ((s s) (k 0)) + (cond ((null? s) #f) + ((= k i) (car s)) + (else (loop (cdr s) (+ k 1))))))) + +;re is a compiled regexp. It's a list that can't be +;nil. pregexp-match-positions-aux returns a 2-elt list whose +;car is the string-index following the matched +;portion and whose cadr contains the submatches. +;The proc returns false if there's no match. + +;Am spelling loop- as loup- because these shouldn't +;be translated into CL loops by scm2cl (although +;they are tail-recursive in Scheme) + +(define pregexp-make-backref-list + (lambda (re) + (let sub ((re re)) + (if (pair? re) + (let ((car-re (car re)) + (sub-cdr-re (sub (cdr re)))) + (if (eqv? car-re ':sub) + (cons (cons re #f) sub-cdr-re) + (append (sub car-re) sub-cdr-re))) + '())))) + +(define pregexp-match-positions-aux + (lambda (re s sn start n i) + (let ((identity (lambda (x) x)) + (backrefs (pregexp-make-backref-list re)) + (case-sensitive? #t)) + (let sub ((re re) (i i) (sk identity) (fk (lambda () #f))) + ;(printf "sub ~s ~s\n" i re) + (cond ((eqv? re ':bos) + ;(if (= i 0) (sk i) (fk)) + (if (= i start) (sk i) (fk)) + ) + ((eqv? re ':eos) + ;(if (>= i sn) (sk i) (fk)) + (if (>= i n) (sk i) (fk)) + ) + ((eqv? re ':empty) + (sk i)) + ((eqv? re ':wbdry) + (if (pregexp-at-word-boundary? s i n) + (sk i) + (fk))) + ((eqv? re ':not-wbdry) + (if (pregexp-at-word-boundary? s i n) + (fk) + (sk i))) + ((and (char? re) (< i n)) + ;(printf "bingo\n") + (if ((if case-sensitive? char=? char-ci=?) + (string-ref s i) re) + (sk (+ i 1)) (fk))) + ((and (not (pair? re)) (< i n)) + (if (pregexp-check-if-in-char-class? + (string-ref s i) re) + (sk (+ i 1)) (fk))) + ((and (pair? re) (eqv? (car re) ':char-range) (< i n)) + (let ((c (string-ref s i))) + (if (let ((c< (if case-sensitive? char<=? char-ci<=?))) + (and (c< (cadr re) c) + (c< c (caddr re)))) + (sk (+ i 1)) (fk)))) + ((pair? re) + (case (car re) + ((:char-range) + (if (>= i n) (fk) + (pregexp-error 'pregexp-match-positions-aux))) + ((:one-of-chars) + (if (>= i n) (fk) + (let loup-one-of-chars ((chars (cdr re))) + (if (null? chars) (fk) + (sub (car chars) i sk + (lambda () + (loup-one-of-chars (cdr chars)))))))) + ((:neg-char) + (if (>= i n) (fk) + (sub (cadr re) i + (lambda (i1) (fk)) + (lambda () (sk (+ i 1)))))) + ((:seq) + (let loup-seq ((res (cdr re)) (i i)) + (if (null? res) (sk i ) + (sub (car res) i + (lambda (i1 ) + (loup-seq (cdr res) i1 )) + fk)))) + ((:or) + (let loup-or ((res (cdr re))) + (if (null? res) (fk) + (sub (car res) i + (lambda (i1 ) + (or (sk i1 ) + (loup-or (cdr res)))) + (lambda () (loup-or (cdr res))))))) + ((:backref) + (let* ((c (pregexp-list-ref backrefs (cadr re))) + (backref + (cond (c => cdr) + (else + (pregexp-error 'pregexp-match-positions-aux + 'non-existent-backref re) + #f)))) + (if backref + (pregexp-string-match + (substring s (car backref) (cdr backref)) + s i n (lambda (i) (sk i)) fk) + (sk i)))) + ((:sub) + (sub (cadr re) i + (lambda (i1) + (set-cdr! (assv re backrefs) (cons i i1)) + (sk i1)) fk)) + ((:lookahead) + (let ((found-it? + (sub (cadr re) i + identity (lambda () #f)))) + (if found-it? (sk i) (fk)))) + ((:neg-lookahead) + (let ((found-it? + (sub (cadr re) i + identity (lambda () #f)))) + (if found-it? (fk) (sk i)))) + ((:lookbehind) + (let ((n-actual n) (sn-actual sn)) + (set! n i) (set! sn i) + (let ((found-it? + (sub (list ':seq '(:between #f 0 #f :any) + (cadr re) ':eos) 0 + identity (lambda () #f)))) + (set! n n-actual) (set! sn sn-actual) + (if found-it? (sk i) (fk))))) + ((:neg-lookbehind) + (let ((n-actual n) (sn-actual sn)) + (set! n i) (set! sn i) + (let ((found-it? + (sub (list ':seq '(:between #f 0 #f :any) + (cadr re) ':eos) 0 + identity (lambda () #f)))) + (set! n n-actual) (set! sn sn-actual) + (if found-it? (fk) (sk i))))) + ((:no-backtrack) + (let ((found-it? (sub (cadr re) i + identity (lambda () #f)))) + (if found-it? + (sk found-it?) + (fk)))) + ((:case-sensitive :case-insensitive) + (let ((old case-sensitive?)) + (set! case-sensitive? + (eqv? (car re) ':case-sensitive)) + (sub (cadr re) i + (lambda (i1) + (set! case-sensitive? old) + (sk i1)) + (lambda () + (set! case-sensitive? old) + (fk))))) + ((:between) + (let* ((maximal? (not (cadr re))) + (p (caddr re)) + (q (cadddr re)) + (could-loop-infinitely? (and maximal? (not q))) + (re (car (cddddr re)))) + (let loup-p ((k 0) (i i) ) + (if (< k p) + (sub re i + (lambda (i1 ) + (if (and could-loop-infinitely? + (= i1 i)) + (pregexp-error + 'pregexp-match-positions-aux + 'greedy-quantifier-operand-could-be-empty)) + (loup-p (+ k 1) i1 )) + fk) + (let ((q (and q (- q p)))) + (let loup-q ((k 0) (i i)) + (let ((fk (lambda () + (sk i )))) + (if (and q (>= k q)) (fk) + (if maximal? + (sub re i + (lambda (i1) + (if (and could-loop-infinitely? + (= i1 i)) + (pregexp-error + 'pregexp-match-positions-aux + 'greedy-quantifier-operand-could-be-empty)) + (or (loup-q (+ k 1) i1) + (fk))) + fk) + (or (fk) + (sub re i + (lambda (i1) + (loup-q (+ k 1) i1)) + fk))))))))))) + (else (pregexp-error 'pregexp-match-positions-aux)))) + ((>= i n) (fk)) + (else (pregexp-error 'pregexp-match-positions-aux)))) + ;(printf "done\n") + (let ((backrefs (map cdr backrefs))) + (and (car backrefs) backrefs))))) + +(define pregexp-replace-aux + (lambda (str ins n backrefs) + (let loop ((i 0) (r "")) + (if (>= i n) r + (let ((c (string-ref ins i))) + (if (char=? c #\\ ) + (let* ((br-i (pregexp-read-escaped-number ins i n)) + (br (if br-i (car br-i) + (if (char=? (string-ref ins (+ i 1)) #\&) 0 + #f))) + (i (if br-i (cadr br-i) + (if br (+ i 2) + (+ i 1))))) + (if (not br) + (let ((c2 (string-ref ins i))) + (loop (+ i 1) + (if (char=? c2 #\$) r + (string-append r (string c2))))) + (loop i + (let ((backref (pregexp-list-ref backrefs br))) + (if backref + (string-append r + (substring str (car backref) (cdr backref))) + r))))) + (loop (+ i 1) (string-append r (string c))))))))) + +(define pregexp-proc + (letrec ([pregexp + (lambda (s) + (set! *pregexp-space-sensitive?* #t) ;in case it got corrupted + (list ':sub (car (pregexp-read-pattern s 0 (string-length s)))))]) + pregexp)) + +(define-syntax pregexp + (lambda (x) ;;; aziz's touch + (syntax-case x () + [(_ str) + (string? (syntax->datum #'str)) + #'(let-syntax ([foo + (lambda (t) + (list #'quote (pregexp-proc str)))]) + foo)] + [(_ . rest) #'(pregexp-proc . rest)] + [id (identifier? #'id) #'pregexp-proc]))) + + +(define pregexp-match-positions + (lambda (pat str . opt-args) + (cond ((string? pat) (set! pat (pregexp pat))) + ((pair? pat) #t) + (else (pregexp-error 'pregexp-match-positions + 'pattern-must-be-compiled-or-string-regexp + pat))) + (let* ((str-len (string-length str)) + (start (if (null? opt-args) 0 + (let ((start (car opt-args))) + (set! opt-args (cdr opt-args)) + start))) + (end (if (null? opt-args) str-len + (car opt-args)))) + (let loop ((i start)) + (and (<= i end) + (or (pregexp-match-positions-aux + pat str str-len start end i) + (loop (+ i 1)))))))) + +(define pregexp-match + (lambda (pat str . opt-args) + (let ((ix-prs (apply pregexp-match-positions pat str opt-args))) + (and ix-prs + (map + (lambda (ix-pr) + (and ix-pr + (substring str (car ix-pr) (cdr ix-pr)))) + ix-prs))))) + +(define pregexp-split + (lambda (pat str) + ;split str into substrings, using pat as delimiter + (let ((n (string-length str))) + (let loop ((i 0) (r '()) (picked-up-one-undelimited-char? #f)) + (cond ((>= i n) (pregexp-reverse! r)) + ((pregexp-match-positions pat str i n) + => + (lambda (y) + (let ((jk (car y))) + (let ((j (car jk)) (k (cdr jk))) + ;(printf "j = ~a; k = ~a; i = ~a~n" j k i) + (cond ((= j k) + ;(printf "producing ~s~n" (substring str i (+ j 1))) + (loop (+ k 1) + (cons (substring str i (+ j 1)) r) #t)) + ((and (= j i) picked-up-one-undelimited-char?) + (loop k r #f)) + (else + ;(printf "producing ~s~n" (substring str i j)) + (loop k (cons (substring str i j) r) #f))))))) + (else (loop n (cons (substring str i n) r) #f))))))) + +(define pregexp-replace + (lambda (pat str ins) + (let* ((n (string-length str)) + (pp (pregexp-match-positions pat str 0 n))) + (if (not pp) str + (let ((ins-len (string-length ins)) + (m-i (caar pp)) + (m-n (cdar pp))) + (string-append + (substring str 0 m-i) + (pregexp-replace-aux str ins ins-len pp) + (substring str m-n n))))))) + +(define pregexp-replace* + (lambda (pat str ins) + ;return str with every occurrence of pat + ;replaced by ins + (let ((pat (if (string? pat) (pregexp pat) pat)) + (n (string-length str)) + (ins-len (string-length ins))) + (let loop ((i 0) (r "")) + ;i = index in str to start replacing from + ;r = already calculated prefix of answer + (if (>= i n) r + (let ((pp (pregexp-match-positions pat str i n))) + (if (not pp) + (if (= i 0) + ;this implies pat didn't match str at + ;all, so let's return original str + str + ;else: all matches already found and + ;replaced in r, so let's just + ;append the rest of str + (string-append + r (substring str i n))) + (loop (cdar pp) + (string-append + r + (substring str i (caar pp)) + (pregexp-replace-aux str ins ins-len pp)))))))))) + +(define pregexp-quote + (lambda (s) + (let loop ((i (- (string-length s) 1)) (r '())) + (if (< i 0) (list->string r) + (loop (- i 1) + (let ((c (string-ref s i))) + (if (memv c '(#\\ #\. #\? #\* #\+ #\| #\^ #\$ + #\[ #\] #\{ #\} #\( #\))) + (cons #\\ (cons c r)) + (cons c r)))))))) + +) +;(trace pregexp-read-pattern pregexp-read-char-list pregexp-read-piece) +;eof