scsh-0.5/scsh/rx/spencer.scm

172 lines
5.2 KiB
Scheme

;;; Parse Spencer-style regexps into the regexp ADT.
;;; Olin Shivers, July 1998.
;;; One export: (posix-string->regexp s)
;;; Need better error checking on {m,n} brace parsing.
(define (parse-posix-regexp-string s)
(receive (re i) (parse-posix-exp s 0)
(if (= i (string-length s)) re
(error "Illegal Posix regexp -- terminated early" s i))))
(define posix-string->regexp parse-posix-regexp-string)
;;; An complete expression is a sequence of |-separated branches.
(define (parse-posix-exp s i)
(let ((len (string-length s)))
(if (< i len)
(let lp ((i i) (branches '()))
(receive (branch i) (parse-posix-branch s i)
(let ((branches (cons branch branches)))
(if (and (< i len)
(char=? #\| (string-ref s i)))
(lp (+ i 1) branches)
(values (re-choice (reverse branches)) i)))))
(values trivial-re i))))
;;; A branch is a sequence of pieces -- stuff that goes in-between |'s.
(define (parse-posix-branch s i)
(let ((len (string-length s)))
(let lp ((i i) (pieces '()))
(if (< i len)
(receive (piece i) (parse-posix-piece s i)
(let ((pieces (cons piece pieces)))
(if (< i len)
(case (string-ref s i)
((#\) #\|) (values (re-seq (reverse pieces)) i))
(else (lp i pieces)))
(values (re-seq (reverse pieces)) i))))
(values (re-seq (reverse pieces)) i)))))
;;; A piece is an atom possibly followed by a * ? + or {...} multiplier.
;;; I.e. an element of a branch sequence.
(define (parse-posix-piece s i)
(let ((len (string-length s)))
(receive (atom i) (parse-posix-atom s i)
(if (< i len)
(case (string-ref s i)
((#\* #\+ #\?)
(receive (from to) (case (string-ref s i)
((#\*) (values 0 #f))
((#\+) (values 1 #f))
((#\?) (values 0 1)))
(values (re-repeat from to atom) (+ i 1))))
((#\{) (receive (from to i) (parse-posix-braces s (+ i 1))
(values (re-repeat from to atom) i)))
(else (values atom i)))
(values atom i)))))
;;; An atom is something that would bind to a following * operator --
;;; a letter, [...] charset, ^, $, or (...).
(define (parse-posix-atom s i)
(let ((len (string-length s)))
(if (< i (string-length s))
(let ((c (string-ref s i)))
(case c
((#\^) (values re-bos (+ i 1)))
((#\$) (values re-eos (+ i 1)))
((#\.) (values re-any (+ i 1)))
((#\[) (parse-posix-bracket s (+ i 1)))
((#\() (receive (re i) (parse-posix-exp s (+ i 1))
(if (and (< i len) (char=? #\) (string-ref s i)))
(values (re-submatch re) (+ i 1))
(error "Regexp subexpression has no terminating close parenthesis" s i))))
((#\\) (let ((i (+ i 1)))
(if (< i len)
(values (make-re-string (string (string-ref s i)))
(+ i 1))
(error "Regexps may not terminate with a backslash" s))))
((#\) #\| #\* #\+ #\? #\{) (values trivial-re i))
(else (values (make-re-string (string c)) (+ i 1)))))
(values trivial-re i))))
;;; Parse a [...] or [^...] bracket expression into a regexp.
;;; I is the index of the char following the left bracket.
(define db-cset (char-set #\. #\= #\:)) ; Not allowed after a #\[.
(define (parse-posix-bracket s i)
(let ((len (string-length s)))
(if (>= i len) (error "Missing close right bracket in regexp" s i)
(receive (negate? i0) (let ((c (string-ref s i)))
(if (char=? c #\^)
(values #t (+ i 1))
(values #f i)))
(let lp ((i i0) (cset (char-set-copy char-set:empty)))
(if (>= i len) (error "Missing close right bracket in regexp" s i)
(let ((c (string-ref s i))
(i1 (+ i 1)))
(case c
((#\[)
;; We don't handle [..] [==] [::] frobs.
(if (and (< i1 len)
(char-set-contains? db-cset (string-ref s i1)))
(error "double-bracket regexps not supported." s i)
(lp i1 (char-set-adjoin! cset #\[))))
((#\]) (if (= i i0)
(lp i1 (char-set-adjoin! cset #\]))
(let ((cset (if negate?
(char-set-invert! cset)
cset)))
(values (make-re-char-set cset) i1))))
((#\-) (if (or (= i i0) ; first char or last char
(and (< i1 len)
(char=? #\] (string-ref s i1))))
(lp i1 (char-set-adjoin! cset #\-))
(error "Illegal - in [...] regexp" s i)))
;; Regular letter -- either alone, or startpoint of a range.
(else (if (and (< (+ i1 1) len)
(char=? #\- (string-ref s i1)))
;; Range
(let* ((i-tochar (+ i1 1))
(to (char->ascii (string-ref s i-tochar))))
(do ((j (char->ascii c) (+ j 1))
(cset cset (char-set-adjoin! cset (ascii->char j))))
((> j to) (lp (+ i-tochar 1) cset))))
;; Just a letter
(lp i1 (char-set-adjoin! cset c))))))))))))
;;; Parse out a [from,to] repetition pair from a {m,n} {m} or {m,} expression.
;;; I is the index of the char following the left brace.
(define (parse-posix-braces s i)
(let ((comma (string-index s #\,) i)
(rb (string-index s #\} i)))
(if rb
(if (and comma (< comma rb))
(values (string->number (substring s i comma))
(and (not (= (+ comma 1) rb))
(string->number (substring s (+ comma 1) rb)))
(+ rb 1))
(let ((m (string->number (substring s i rb))))
(values m m (+ rb 1))))
(error "Missing close brace in regexp" s i))))