diff --git a/scsh/rx/alanltr b/scsh/rx/alanltr new file mode 100644 index 0000000..63c7a40 --- /dev/null +++ b/scsh/rx/alanltr @@ -0,0 +1,70 @@ +To: alan@lcs.mit.edu +Subject: scoping macros +Reply-to: shivers@ai.mit.edu +--text follows this line-- +So, I'd like to write a macro that introduces a lexical contour. +Here's my toy problem: + +(color) - macro form that produces the current lexical color. + +(with-green body ...) +(with-blue body ...) + Evaluate BODY forms in a lexical color contour that is green/blue, + respectively. + +The default, top-level color is green. + +This doesn't work: + +--- +(define-syntax color (syntax-rules () ((color) 'green))) + +(define-syntax with-blue + (syntax-rules (color) + ((with-blue body ...) + (let-syntax ((color (syntax-rules () ((color) 'blue)))) + body ...)))) + +(define-syntax with-green + (syntax-rules (color) + ((with-blue body ...) + (let-syntax ((color (syntax-rules () ((color) 'green)))) + body ...)))) +--- + +Everything comes out green. Removing COLOR from the syntax-rules keyword list +doesn't fix it. + +This *does* work: +--- +(define-syntax with-blue + (syntax-rules (color) + ((with-blue body ...) + (let-syntax ((color (syntax-rules (color) ((color) 'blue)))) + body ...)))) + +(with-blue (color)) +'green + +(define-syntax with-blue + (lambda (exp r c) + `(,(r 'let-syntax) ((color (,(r 'syntax-rules) () + ((color) 'blue)))) + . ,(cdr exp)))) + +> (with-blue (color)) +'blue + +> (list (color) (with-blue (color))) +'(green blue) + +> (define-syntax with-green + (lambda (exp r c) + `(,(r 'let-syntax) ((color (,(r 'syntax-rules) () + ((color) 'green)))) + . ,(cdr exp)))) + +> (cons (color) (with-blue (list (color) (with-green (color))))) +'(green blue green) +--- + diff --git a/scsh/rx/cclass b/scsh/rx/cclass new file mode 100644 index 0000000..89f6181 --- /dev/null +++ b/scsh/rx/cclass @@ -0,0 +1,60 @@ +scsh name Posix ctype Alternate +----------------------------------------- +lower-case lower +upper-case upper +alphabetic alpha +numeric digit num +alphanumeric alnum alphanum +punctuation punct +graphic graph +blank (Gnu extension) +whitespace space white ("space" is potentially confusing.) +printing print +control cntrl +hex-digit xdigit hex +ascii ascii (Gnu extension) + +SRE ::= ... ... + + ::= + | any | nonl | + | (in ...) + | (not-in ...) + | (and ...) + | (diff ...) + | (- ...) + | , + + ::= | + + + +(diff ...) = (and (not-in ...)) + +~!@#$%^&*-_+=|:<>?/ in + +(: ...) sequence + +(and ...) +(or ...) +(not ...) +(diff ...) + +(* ...) +(+ ...) +(- ...) + +(in ...) ; union +(not-in ...) ; complement-of-union +(and ...) ; intersection +(diff ...) ; diff + +(+ ...) ; union +(~ ...) ; complement-of-union +(& ...) ; intersection +(- ...) ; diff + +(in ...) ; union +(~ ...) ; complement-of-union +(& ...) ; intersection +(- ...) ; diff diff --git a/scsh/rx/cond-package.scm b/scsh/rx/cond-package.scm new file mode 100644 index 0000000..1907ace --- /dev/null +++ b/scsh/rx/cond-package.scm @@ -0,0 +1,160 @@ +(define-structure conditionals + (export (define-simple-syntax :syntax) + (when :syntax) + (unless :syntax) + (? :syntax) + (switchq :syntax) + (switch :syntax) + (prog0 :syntax) + (land* :syntax)) + (open scheme) + (begin + +;;; (define-simple-syntax (name subforms ...) expansion) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define-syntax define-simple-syntax + (syntax-rules () + ((define-simple-syntax (name subforms ...) expansion) + (define-syntax name (syntax-rules () ((name subforms ...) expansion)))))) + + +;;; ? = COND +;;; (WHEN test body ...) (SWITCHQ = key clause ...) +;;; (UNLESS test body ...) (SWITCH = key clause ...) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handy conditional forms. ? is so short that it renders WHEN pretty +;;; much useless. + +(define-simple-syntax (when test body ...) + (if test (begin body ...))) + +(define-simple-syntax (unless test body ...) + (if (not test) (begin body ...))) + +;;; ? is synonym for COND. +(define-simple-syntax (? clause ...) (cond clause ...)) + + +;;; (PROG0 val-exp exp ...) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define-simple-syntax (prog0 val-exp exp ...) + (let ((v val-exp)) exp ... v)) + + +;;; (land* (clause ...) body ...) -*- Scheme -*- +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Evaluate each clause. If any clause returns false, land* stops and +;;; returns false. If all the clauses evaluate to a true value, return +;;; the value of the body. +;;; +;;; The difference between LAND* and AND is that LAND* binds names to +;;; the values of its clauses, which may be used by subsequent clauses. +;;; Clauses are of the form +;;; (var exp) ; binds VAR to the value of EXP. +;;; (exp) ; No binding. +;;; var ; Reference -- no binding. +;;; +;;; Example: +;;; (land* ((probe (assq key alist))) +;;; (cdr probe)) +;;; +;;; LAND* is due to Oleg Kiselyov (http://pobox.com/~oleg); I wrote this +;;; simple implementation as a high-level R5RS DEFINE-SYNTAX macro. +;;; Olin 98/9/29 + +(define-syntax land* + (syntax-rules () + ((land* () body ...) (begin body ...)) + + ((land* ((var exp) clause ...) body ...) + (let ((var exp)) (and var (land* (clause ...) body ...)))) + + ((land* ((#f exp) clause ...) body ...) + (and exp (land* (clause ...) body ...))) + + ((land* ((exp) clause ...) body ...) + (and exp (land* (clause ...) body ...))) + + ((land* (var clause ...) body ...) + (and var (land* (clause ...) body ...))))) + + + +;;; Like CASE, but you specify the key-comparison procedure. +;;; SWITCH evaluates its keys each time through the conditional. +;;; SWITCHQ keys are not evaluated -- are simply constants. +;;; (switchq string=? (vector-ref vec i) +;;; (("plus" "minus") ...) +;;; (("times" "div") ...) +;;; (else ...)) + +(define-simple-syntax (switchq compare key clause ...) + (let ((k key) ; Eval KEY and COMPARE + (c compare)) ; just once, then call %switch. + (%switchq c k clause ...))) ; C, K are vars, hence replicable. + +(define-syntax %switchq + (syntax-rules (else) + ((%switchq compare key ((key1 ...) body1 body2 ...) rest ...) + (if (or (compare key 'key1) ...) + (begin body1 body2 ...) + (%switchq compare key rest ...))) + + ((%switchq compare key ((key1 ...)) rest ...) ; Null body. + (if (not (or (compare key 'key1) ...)) + (%switchq compare key rest ...))) + + ((%switchq compare key (else body ...)) + (begin body ...)) + + ((%switchq compare key) '#f))) + + +(define-simple-syntax (switch compare key clause ...) + (let ((k key) ; Eval KEY and COMPARE + (c compare)) ; just once, then call %switch. + (%switch c k clause ...))) ; C, K are vars, hence replicable. + +(define-syntax %switch + (syntax-rules (else) + ((%switch compare key ((key1 ...) body1 body2 ...) rest ...) + (if (or (compare key key1) ...) + (begin body1 body2 ...) + (%switch compare key rest ...))) + + ((%switch compare key ((key1 ...)) rest ...) ; Null body. + (if (not (or (compare key key1) ...)) + (%switch compare key rest ...))) + + ((%switch compare key (else body ...)) + (begin body ...)) + + ((%switch compare key) '#f))) + +;;; I can't get this to work -- S48 complains "too many ...'s". +;(define-syntax switchq +; (syntax-rules (else) +; ((switchq compare key clause ...) +; (letrec-syntax ((%switchq (syntax-rules (else) +; ((%switchq compare key +; ((key1 ...) body1 body2 ...) rest ...) +; (if (or (compare key 'key1) ...) +; (begin body1 body2 ...) +; (%switchq compare key rest ...))) +; +; ; Null body. +; ((%switchq compare key ((key1 ...)) rest ...) +; (if (not (or (compare key 'key1) ...)) +; (%switchq compare key rest ...))) +; +; ((%switchq compare key (else body ...)) +; (begin body ...)) +; +; ((%switchq compare key) '#f)))) +; +; (let ((k key) ; Eval KEY and COMPARE +; (c compare)) ; just once, then call %switch. +; (%switchq c k clause ...)))))); C, K are vars, hence replicable. +)) diff --git a/scsh/rx/doc2.txt b/scsh/rx/doc2.txt new file mode 100644 index 0000000..51b33df --- /dev/null +++ b/scsh/rx/doc2.txt @@ -0,0 +1,1687 @@ +Scsh regular expressions +Olin Shivers +July 1998 +Last Update: 99/7/12 + +* Todo + Module naming and structuring. + Documentation. + +This document can be viewed in emacs outline mode, with *'s introducing +section headings -- just say M-x outline-mode in emacs. + +* Table of contents +------------------- +Preamble: 100% and 80% solutions +Overview +Summary SRE syntax +Examples +A short tutorial +Discussion and design notes +Regexp functions +The scsh regexp ADT +Syntax-hacking tools +Acknowledgements + +* Preamble: 100% and 80% solutions +---------------------------------- +There's a problem with tool design in the free software and academic +community. The tool designers are usually people who are building tools for +some larger goal. For example, let's take the case of someone who wants to do +web hacking in Scheme. His Scheme system doesn't have a sockets interface, so +he sits down and hacks one up for his particular Scheme implementation. Now, +socket API's are not what this programmer is interested in; he wants to get on +with things and hack the exciting stuff -- his real interest is Web services. +So he does a quick 80% job, which is adequate to get him up and running, and +then he's on to his orignal goal. + +Unfortunately, his quickly-built socket interface isn't general. It just +covers the bits this particular hacker needed for his applications. So the +next guy that comes along and needs a socket interface can't use this one. +Not only does it lack coverage, but the deep structure wasn't thought out well +enough to allow for quality extension. So *he* does his *own* 80% +implementation. Five hackers later, five different, incompatible, ungeneral +implementations had been built. No one can use each others code. + +The alternate way systems like this end up going over a cliff is that they +get patched over and over again, and what one ends up with is more 80% +bandaids and 20% structured code. When systems like DOS evolve organically +into systems like Win95, it's unsuprising and unavoidable that what one +ends up with is a horrible design. + +As an alternative to five hackers doing five 80% solutions of the same proble, +we would be much better off if each programmer picked a different task, and +really crushed it -- a 100% solution. Then each time a programmer solved a +problem, no one else would have to redo the effort. Of course, it's true that +100% solutions are significantly harder to design and build than 80% +solutions. But they have one tremendous labor-savings advantage: you don't +have to constantly reinvent the wheel. The up-front investment buys you +forward progress; you aren't trapped endlessly reinventing the same awkward +wheel. + +Examples: I've done this three times. The first time was when I needed an +emacs mode in graduate school for interacting with Scheme processes. I looked +around, and I found a snarled up mess of many, many 80% solutions, some for +Lisp, some for Scheme, some for shells, some for gdb, and so forth. These +modes had all started off at some point as the original emacs shell.el mode, +then had been hacked up, eventually drifting into divergence. The keybindings +had no commonality. Some modes recovered old commands with a "yank" type form, +on c-c y. Some modes recovered old commands with m-p and m-n. It was hugely +confusing and not very functional. + +The right thing to do was to carefully implement one, common base mode for +process interaction, and to carefully put in hooks for customising this base +mode into language-specific modes -- lisp, shell, Scheme, etc. So that's what +I did. I carefully went over the keybindings and functionality of all the +process modes I could find -- even going back to old Lisp Machine bindings for +Zwei -- and then I designed and implemented a base mode called comint. Now, +all process modes are implemented on top of comint, and no one, ever, has to +re-implement this code. Users only have to learn one set of bindings for +the common functions. Features put into the common code are available for free +to all the derived modes. Extensions are done, not by doing a completely new +design, but *in terms of* the original system -- it may not be perfect, but +it's good enough to allow people to move on and do other things. + +The second time was the design of the Scheme Unix API found in scsh. Most +Schemes have a couple of functions for changing directory, some minimal socket +hacking, and perhaps forking off a shell command with the system() C function. +But no one has done a complete job, and the functions are never compatible. +So I sat down to do a careful, 100% job -- I wanted to cover everything in +section 2 of the Unix man pages, in a manner that was harmonious with the deep +structures of the Scheme language. As a design task, it was a tremendous +amount of work, taking several years, and multiple revisions. But now it's +done. Scsh's socket code, for instance, *completely* implements the socket +API. My hope in doing all this was that other people could profit from my +investment. If you are building your own Scheme system, *you* don't have to +put in the time. You can just steal the design. Or the code. + +The regexp notation in this document represents a third attempt at this kind +of design. Looking back, I'm amazed at how much time I poured into the design, +not to mention the complete reference implementation. I sold myself on doing a +serious job with the philosophy of the 100% design -- the point is to save +other people the trouble. If the design is good enough, then instead of having +to do your own, you can steal mine and use the time saved... to do your own +100% design of something *else*, and fill in another gap. + +I am not saying that these three designs of mine represent the last word on +the issues -- "100%" is really a bit of a misnomer, since no design is ever +really 100%. I would prefer to think of them as sufficiently good that they at +least present low-water marks -- future systems, I'd hope, can at least build +upon these designs, hopefully *in terms of* these designs. You don't ever have +to do *worse* -- you can just steal the design. If you don't have a +significantly better idea, I'd encourage you to adopt the design for the +benefits of compatibility. If you *do* have an improvement, email me about it, +so we can fold it in to the core design and *everyone* can win -- and we can +also make your improvement part of the standard, so that people can use your +good idea and *still* be portable. + +But here's what I'd really like: instead of tweaking regexps, you go do your +own 100% design or two. Because I'd like to use them. If everyone does just +one, then that's all anyone has to do. + -Olin + + +* Overview +---------- +This document describes the regular-expression system used in scsh. +The system is composed of several pieces: + +- An s-expression notation for writing down general regular expressions. + In most systems, regexps are encoded as string literals. In scsh, + they are written using s-expressions. This notation has several + advantages over the traditional string-based notation; these advantages + are discussed in a following section. + +- An abstract data type (ADT) representation for regexp values. + Traditional regular-expression systems compute regular expressions + from run-time values using strings. This can be awkward. Scsh, instead, + provides a separate data type for regexps, with a set of basic constructor + and accessor functions; regular expressions can be dynamically computed + and manipulated using these functions. + +- Some tools that work on the regexp ADT: case-sensitve -> case-insensitive + regexp transform, a regexp simplifier, and so forth. + +- Parsers and unparsers that can convert between external representations + and the regexp ADT. The supported external representations are + + Posix strings + + S-expression notation + Being able to convert regexps to Posix strings allows implementations + to implement regexp matching using standard Posix C-based engines. + +- Macro support for the s-expression notation. + The RX macro provides a new special form that allows you to embed regexps in + the s-expression notation within a Scheme program. The macro parses the + regexp into the ADT, simplifies it, and translates it to a Posix strings, + which can be used by a traditional C-based regexp engine. + +- Matchers + Spencer's Posix regexp engine is linked in to the runtime; the + regexp code uses this engine to provide text matching. + +The regexp language supported is a complete superset of Posix functionality, +providing: + sequencing and choice (|) + repetition (*, +, ?, {m,n}) + character clases and wildcard ([...], .) + beginning/end of string anchors + beginning/end of line anchors + beginning/end of word anchors + case-sensitivity control + submatch-marking (...) + +------------------------------------------------------------------------------- +* Summary SRE syntax +-------------------- +Here is a summary of the syntax; the next section is a friendlier tutorial +introduction. + +SRE ::= + literal match -- Interpreted relative to + the current case-sensitivity lexical context + (default is case-sensitive). + + ( ....) Set of chars, e.g., ("abc" "XYZ") + Interpreted relative to the current + case-sensitivity lexical context. + + (* ...) 0 or more matches + (+ ...) 1 or more matches + (? ...) 0 or 1 matches + (= ...) matches + (>= ...) or more matches + (** ...) to matches + and are Scheme expressions producing non-negative integers. + may also be #f, meaning "infinity" + + (| ...) Choice (OR is R5RS symbol; | is unspecified) + (or ...) + + (: ...) Sequence (SEQ is Common Lisp symbol) + (seq ...) + + (submatch ...) Numbered submatch + (dsm
   ...)	Deleted submatches
+	
 and  are numerals.
+
+    (uncase  ...)			Case-folded match.
+
+    (w/case    ...)		Introduce a lexical case-sensitivity
+    (w/nocase  ...)		context.
+
+    ,@			Dynamically computed regexp.
+    ,			Same as ,@, but no submatch info.
+				     must produce a character, string,
+                                    char-set, or regexp.
+
+    bos eos			Beginning/end of string
+    bol eol			Beginning/end of line
+    
+    bow eow			Beginning/end of word
+    (word   ...)		(: bow  ... eow)
+    (word+  ...)	(word (+ (& (| alphanumeric "_")
+                                            (|  ...))))
+    word			(word+ any)
+
+    (posix-string )	Posix string -- for backwards compatibility.
+
+    			Singleton char set
+    		alphanumeric, whitespace, etc.
+				    These two forms are interpreted subject to
+				    the lexical case-sensitivity context.
+
+    (~   ...)		[^...] -- complement-of-union
+    (-   ...)		Difference
+    (&   ...)		Intersection
+
+    (/  ...)	Character range -- interpreted subject to
+				the lexical case-sensitivy context.
+
+ ::= any			.
+	         nonl	    		[^\n]
+
+		 (Posix character classes:)
+		 lower-case upper-case alphabetic numeric alphanumeric
+		 punctuation graphic whitespace printing control hex-digit
+
+		 blank ascii		(Gnu character classes)
+
+		 (Shorter equivalent nicknames:)
+		 lower upper alpha digit num alnum alphanum punct graph
+		 space white print cntrl xdigit hex ascii
+
+ ::=  | 
+		 The chars are taken in pairs to form inclusive ranges.
+
+
+The ~, -, &, and word+ operators may only be applied to SRE's that specify
+character sets. Here are the "type-checking" rules:
+
+ ::= (~  ...)		Set complement-of-union
+	     | (-  ...)		Set difference
+	     | (&  ...)		Intersection
+	     | (|  ...)		Set union
+	     | (/  ...)		Range
+
+	     | ()			Constant set
+	     | 				Singleton constant set
+	     | 				For 1-char string "c"
+
+	     | 			Constant set
+
+	     | ,				 evals to a char-set,
+	     | ,@				char, single-char string,
+    						or re-char-set regexp.
+
+	     | (uncase )		Case-folding
+	     | (w/case )		
+	     | (w/nocase )		
+
+
+-------------------------------------------------------------------------------
+* Examples
+----------
+
+(- alpha ("aeiouAEIOU"))		; Various forms of non-vowel letter
+(- alpha ("aeiou") ("AEIOU"))
+(w/nocase (- alpha ("aeiou")))
+(- (/"azAZ") ("aeiouAEIOU"))
+(w/nocase (- (/"az") ("aeiou")))
+
+;;; Upper-case letter, lower-case vowel, or digit
+(| upper ("aeiou") digit)
+(| (/"AZ09") ("aeiou"))
+
+;;; Not an SRE, but Scheme code containing some embedded SREs.
+(let* ((ws (rx (+ whitespace)))			; Seq of whitespace
+       (date (rx (: (| "Jan" "Feb" "Mar" ...)	; A month/day date.
+	            ,ws
+                    (| ("123456789")	    	; 1-9
+		       (: ("12") digit)    	; 10-29
+		       "30" "31")))))		; 30-31
+
+  ;; Now we can use DATE several times:
+  (rx ... ,date ... (* ... ,date ...)	    
+      ... .... ,date))
+
+;;; More Scheme code
+(define (csl re)		; A comma-separated list of RE's is
+  (rx (| ""			; either zero of them (empty string), or
+	 (: ,re 		; one RE, followed by
+            (* ", " ,re)))))	; Zero or more comma-space-RE matches.
+
+(csl (rx (| "John" "Paul" "George" "Ringo")))
+
+
+-------------------------------------------------------------------------------
+* A short tutorial
+------------------
+S-expression regexps are called "SRE"s. Keep in mind that they are *not*
+Scheme expressions; they are another, separate notation that is expressed
+using the underlying framework of s-expression list structure -- lists,
+symbols, etc. SRE's can be *embedded* inside of Scheme expressions using
+special forms that extend Scheme's syntax; there are places in the SRE 
+grammar where one may place a Scheme expression -- in these ways, SRE's and
+Scheme expressions can be intertwined. But this isn't fundamental; SRE's may
+be used in a completely Scheme-independent context. By simply restricting
+the notation to eliminate two special Scheme-embedding forms, they can be
+a completely independent notation.
+
+
+** Constant strings
+
+The simplest SRE is a string, denoting a constant regexp. For example, the SRE
+    "Spot"
+matches only the string <>. There is
+no interpretation of the characters in the string at all -- the SRE
+    ".*["
+matches the string <>.
+
+
+** Simple character sets
+
+To specify a set of characters, write a list whose single element is
+a string containing the set's elements. So the SRE
+    ("aeiou")
+only matches a vowel. One way to think of this, notationally, is that the
+set brackets are (" and ").
+
+
+** Wild card
+
+Another simple SRE is the symbol ANY, which matches any single character --
+including newline and ASCII nul.
+
+
+** Sequences
+
+We can form sequences of SRE's with the SRE (:  ...).
+So the SRE
+    (: "x" any "z")
+matches any three-character string starting with "x" and ending with "z".
+As we'll see shortly, many SRE's have bodies that are implicit sequences of
+SRE's, analogous to the manner in which the body of a Scheme LAMBDA or LET
+expression is an implicit BEGIN sequence. The regexp (seq  ...) is
+completely equivalent to (:  ...); it's included in order to have a 
+syntax that doesn't require : to be a legal symbol (e.g., for Common Lisp).
+
+
+** Choices
+
+The SRE (|  ...) is a regexp that matches anything any of the
+ regexps match. So the regular expression
+    (| "sasha" "Pete")
+matches either the string "sasha" or the string "Pete". The regexp
+    (| ("aeiou") ("0123456789"))
+is the same as
+    ("aeiou0123456789")
+The regexp (or  ...) is completely equivalent to (|  ...); 
+it's included in order to have a syntax that doesn't require | to be a 
+legal symbol.
+
+
+** Repetition
+
+There are several SRE forms that match multiple occurences of a regular
+expression. For example, the SRE (*  ...) matches zero or more
+occurences of the sequence (:  ...). Here is the complete list
+of SRE repetition forms:
+
+SRE					Means 		At least    no more than
+---					--------	--------    -----------
+(*  ...)				zero-or-more	0	    infinity
+(+  ...)				one-or-more	1	    infinity
+(?  ...)				zero-or-one	0	    1
+(=   ...)			exactly-n		    
+(>=   ...)			n-or-more		    infinity
+(**    ...)		n-to-m			    
+
+A  field is a Scheme expression that produces an integer.
+A  field is a Scheme expression that produces either an integer,
+or false, meaning infinity.
+
+While it is illegal for the  or  fields to be negative, it *is*
+allowed for  to be greater than  in a ** form -- this simply
+produces a regexp that will never match anything.
+
+As an example, we can describe the names of car/cdr access functions
+("car", "cdr", "cadr", "cdar", "caar" , "cddr", "caaadr", etc.) with
+either of the SREs
+    (: "c" (+ (| "a" "d")) "r")
+    (: "c" (+ ("ad")) "r")
+We can limit the a/d chains to 4 characters or less with the SRE
+    (: "c" (** 1 4 ("ad")) "r")
+
+Some boundary cases:
+    (** 5 2 "foo")	; Will never match
+    (** 0 0 "foo")	; Matches the empty string
+
+
+** Character classes
+
+There is a special set of SRE's that form "character classes" -- basically, 
+a regexp that matches one character from some specified set of characters.
+There are operators to take the intersection, union, complement, and
+difference of character classes to produce a new character class. (Except 
+for union, these capabilities are not provided for general regexps as they 
+are computationally intractable in the general case.)
+
+A single character is the simplest character class -- #\x is a character
+class that matches only the character "x".  A string that has only one
+letter is also a character class -- "x" is the same SRE as #\x.
+
+The character-set notation () we've seen is a primitive character
+class, as is the wildcard ANY. When arguments to the choice operator, |, are
+all character classes, then the choice form is itself a character-class. 
+So these SREs are all character-classes:
+    ("aeiou")
+    (| #\a #\e #\i #\o #\u)
+    (| ("aeiou") ("1234567890"))
+However, these SRE's are *not* character-classes:
+    "aeiou"
+    (| "foo" #\x)
+
+The (~  ...) char class matches one character not in the specified 
+classes:
+    (~ ("0248") ("1359"))
+matches any character that is not a digit. 
+
+More compactly, we can use the / operator to specify character sets by giving
+the endpoints of contiguous ranges, where the endpoints are specified by a
+sequence of strings and characters.  For example, any of these char classes
+    (/ #\A #\Z  #\a #\z  #\0 #\9)
+    (/ "AZ" #\a #\z "09")
+    (/ "AZ" #\a "z09")
+    (/"AZaz09")
+matches a letter or a digit. The range endpoints are taken in pairs to
+form inclusive ranges of characters. Note that the exact set of characters
+included in a range is dependent on the underlying implementation's 
+character type, so ranges may not be portable across different implementations.
+
+There is a wide selection of predefined, named character classes that may be
+used. One such SRE is the wildcard ANY. NONL is a character class matching
+anything but newline; it is equivalent to
+    (~ #\newline)
+and is useful as a wildcard in line-oriented matching.
+
+There are also predefined named char classes for the standard Posix and Gnu
+character classes:
+    scsh name		Posix/ctype	Alt name    Comment
+    -------------------------------------------------------
+    lower-case		lower
+    upper-case		upper
+    alphabetic		alpha
+    numeric		digit		num
+    alphanumeric	alnum		alphanum
+    punctuation		punct
+    graphic		graph
+    blank					    (Gnu extension)
+    whitespace		space		white	    ("space" is deprecated.)
+    printing		print
+    control		cntrl
+    hex-digit		xdigit		hex
+    ascii					    (Gnu extension)
+See the scsh character-set documentation or the Posix isalpha(3) man page
+for the exact definitions of these sets.
+
+You can use either the long scsh name or the shorter Posix and alternate names
+to refer to these char classes. The standard Posix name "space" is provided,
+but deprecated, since it is ambiguous. It means "whitespace," the set of
+whitespace characters, not the singleton set of the #\space character.
+If you want a short name for the set of whitespace characters, use the
+char-class name "white" instead.
+
+Char classes may be intersected with the operator (&  ...), and
+set-difference can be performed with (-  ...). These operators are
+particularly useful when you want to specify a set by negation *with respect
+to a limited universe*. For example, the set of all non-vowel letters is
+    (- alpha ("aeiou") ("AEIOU"))
+whereas writing a simple complement 
+    (~ ("aeiouAEIOU"))
+gives a char class that will match any non-vowel -- including punctuation,
+digits, white space, control characters, and ASCII nul.
+
+We can *compute* a char class by writing the SRE 
+    ,
+where  is a Scheme expression producing a value that can be
+coerced to a character set: a character set, character, one-character
+string, or char-class regexp value. This regexp matches any character
+from the set.
+
+The char-class SRE ,@ is entirely equivalent to ,
+when  produces a character set (but see below for the more
+general non-char-class context, where there *is* a distinction between
+, and ,@).
+
+Example: An SRE that matches a lower-case vowel, upper-case letter, 
+or digit is
+    (| ("aeiou") (/"AZ09"))
+or, equivalently
+    (| ("aeiou") upper-case numeric)
+
+Boundary cases: the empty-complement char class
+    (~)
+matches any character; it is equivalent to 
+    any
+The empty-union char class
+    (|)
+never matches at all. This is rarely useful for human-written regexps,
+but may be of occasional utility in machine-generated regexps, perhaps
+produced by macros.
+
+The rules for determining if an SRE is a simple, char-class SRE or a
+more complex SRE form a little "type system" for SRE's. See the summary
+section preceding this one for a complete listing of these rules.
+
+** Case sensitivity
+
+There are three forms that control case sensitivity:
+    (uncase    ...)
+    (w/case    ...)
+    (w/nocase  ...)
+
+UNCASE is a regexp operator producing a regexp that matches any
+case permutation of any string that matches (:  ...).
+For example, the regexp
+    (uncase "foo")
+matches the strings foo, foO, fOo, fOO, Foo, ...
+
+Expressions in SRE notation are interpreted in a lexical case-sensitivy
+context. The forms W/CASE and W/NOCASE are the scoping operators for this
+context, which controls how constant strings and char-class forms are
+interpreted in their bodies. So, for example, the regexp
+    (w/nocase "abc"
+              (* "FOO" (w/case "Bar"))
+	      ("aeiou"))
+defines a case-insensitive match for all of its elements except for the
+sub-element "Bar", which must match exactly capital-B, little-a, little-r.
+The default, the outermost, top-level context is case sensitive.
+
+The lexical case-sensitivity context affects the interpretation of
+    - constant strings, such as "foo"
+    - chars, such as #\x
+    - char sets, such as ("abc")
+    - ranges, such as (/"az")
+that appear within that context. It does not affect dynamically computed
+regexps -- ones that are introduced by , and ,@ forms. It does
+not affect named char-classes -- presumably, if you wrote LOWER, you didn't
+mean ALPHA.
+
+UNCASE is *not* the same as W/NOCASE. To point up one distinction,
+consider the two regexps
+    (uncase   (~ "a"))
+    (w/nocase (~ "a"))
+The regexp (~ "a") matches any character except "a" -- which means it *does*
+match "A". Now, (uncase ) matches any case-permutation of a string that
+ matches. (~ "a") matches "A", so (uncase (~ "a")) matches "A" and "a"
+-- and, for that matter, every other character. So (uncase (~ "a")) is
+equivalent to ANY.
+
+In contrast, (w/nocase (~ "a")) establishes a case-insensitive lexical
+context in which the "a" is interpreted, making the SRE equivalent to
+(~ ("aA")).
+
+
+** Dynamic regexps
+
+SRE notation allows you to compute parts of a regular expression
+at run time. The SRE
+    ,
+is a regexp whose body  is a Scheme expression producing a
+string, character, char-set, or regexp as its value. Strings and
+characters are converted into constant regexps; char-sets are converted
+into char-class regexps; and regexp values are substituted in place.
+So we can write regexps like this
+    (: "feeding the "
+       ,(if (> n 1) "geese" "goose"))
+This is how you can drop computed strings, such as someone's name,
+or the decimal numeral for a computed number, into a complex regexp.
+
+If we have a large, complex regular expression that is used multiple
+times in some other, containing regular expression, we can name it, using 
+the binding forms of the embedding language (e.g., Scheme), and refer to
+it by name in the containing expression. E.g.: The Scheme expression
+
+    (let* ((ws (rx (+ whitespace)))			; Seq of whitespace
+
+	   (date (rx (: (| "Jan" "Feb" "Mar" ...)	; A month/day date.
+		        ,ws
+                        (| ("123456789")	    	; 1-9
+			   (: ("12") digit)	    	; 10-29
+			   "30" "31")))))		; 30-31
+
+      ;; Now we can use DATE several times:
+      (rx ... ,date ... (* ... ,date ...)	    
+          ... .... ,date))
+        
+where the (RX  ...) macro is the Scheme special form that produces
+a Scheme regexp value given a body in SRE notation.
+
+As we saw in the char-class section, if a dynamic regexp is used
+in a char-class context (e.g., as an argument to a ~ operation),
+the expression must be coercable not merely to a general regexp,
+but to a character set -- so it must be either a singleton string,
+a character, a scsh char set, or a char-class regexp.
+
+We can also define and use functions on regexps in the host language. 
+For example,
+
+    (define (csl re)		; A comma-separated list of RE's is either
+      (rx (| ""			; zero of them (empty string), or
+             (: ,re		; RE followed by
+                (* ", " ,re))))); zero or more comma-space-RE matches.
+
+    (rx ... ,date ...
+        ,(csl (rx (| "John" "Paul" "George" "Ringo")))
+	...
+        ... ,(csl date) ...)
+
+I leave the extension of CSL to allow for an optional "and" between the last
+two matches as an exercise for the interested reader (e.g., to match
+"John, Paul, George and Ringo").
+
+Note, in passing, one of the nice features of SRE notation: SREs can
+be commented.
+
+When we embed a computed regexp inside another regular expression with
+the , form, we must specify how to account for the submatches that
+may be in the computed part. For example, suppose we have the regexp
+    (rx (submatch (* "foo"))
+        (submatch (? "bar"))
+        ,(f x)
+        (submatch "baz"))
+It's clear that the submatch for the (* "foo") part of the regexp is
+submatch #1, and the (? "bar") part is submatch #2. But what number
+submatch is the "baz" submatch? It's not clear. Suppose the Scheme
+expression (f x) produces a regular expression that itself has 3
+subforms. Are these counted (making the "baz" submatch #6), or not
+counted (making the "bar" submatch #3)?
+
+SRE notation provides for both possibilities. The SRE
+    ,
+does *not* contribute its submatches to its containing regexp; it
+has zero submatches. So one can reliably assign submatch indices to
+forms appearing after a , form in a regexp.
+
+On the other hand, the SRE
+    ,@
+"splices" its resulting regexp into place, *exposing* its submatches
+to the containing regexp. This is useful if the computed regexp is defined
+to produce a certain number of submatches -- if that is part of 's
+"contract."
+
+
+** String, line, and word units
+
+The regexps BOS and EOS match the empty string at the beginning and end of
+the string, respectively.
+
+The regexps BOL and EOL match the empty string at the beginning and end of
+a line, respectively. A line begins at the beginning of the string, and
+just after every newline character. A line ends at the end of the string,
+and just before every newline character. The char class NONL matches any
+character except newline, and is useful in conjunction with line-based
+pattern matching.
+
+The regexps BOW and EOW match the empty string at the beginning and end of
+a word, respectively. A word is a contiguous sequence of characters that
+are either alphanumeric or the underscore character. 
+
+The regexp (WORD  ...) surrounds the sequence (:  ...)
+with bow/eow delimiters. It is equivalent to
+    (: bow  ... eow)
+
+The regexp (WORD+  ...) matches a word whose body is one or
+more word characters also in one of the  classes. It is equivalent 
+to
+    (word (+ (& (| alphanumeric "_")
+                (|  ...))))
+For example, a word not containing x, y, or z is
+    (word+ (~ ("xyz")))
+
+The regexp WORD matches one word; it is equivalent to 
+    (word+ any)
+
+[Note: BOL and EOL are not supported by scsh's current regexp search engine,
+which is Spencer's Posix matcher. This is the only element of the notation
+that is not supported by the current scsh reference implementation.]
+
+
+** Miscellaneous elements
+
+*** Posix string notation
+
+The SRE (posix-string ), where  is a string literal (*not* 
+a general Scheme expression that evaluates to a string), allows one to use
+Posix string notation for a regexp. It's intended as backwards compatibility
+and is deprecated. Example:  (posix-string "[aeiou]+|x*|y{3,5}") matches a
+string of vowels, a possibly empty string of x's, or three to five y's.
+
+Parens are used ambiguously in Posix notation -- both for grouping
+and submatch marking. The (posix-string ) form makes the
+conservative assumption: all parens introduce submatches. 
+
+*** Deleted submatches
+
+DSM's are a subtle feature that are never required in expressions written
+by humans. They can be introduced by the simplifier when reducing
+regular expressions to simpler equivalents, and are included in the
+syntax to give it expressibility spanning the full regexp ADT. They
+may appear when unparsing simplified regular expressions that have
+been run through the simplifier; otherwise you are not likely to see them.
+Feel free to skip this section.
+
+The regexp simplifier can sometimes eliminate entire sub-expressions from a
+regexp. For example, the regexp
+    (: "foo" (** 0 0 "apple") "bar")
+can be simplified to
+    "foobar"
+since (** 0 0 "apple") will always match the empty string. The regexp
+    (| "foo"
+       (: "Richard" (|) "Nixon")
+       "bar")
+can be simplified to
+    (| "foo" "bar")
+The empty choice (|) can't match anything, so the whole
+    (: "Richard" (|) "Nixon")
+sequence can't match, and we can remove it from the choice.
+
+However, if deleting part of a regular expression removes a submatch
+form, any following submatch forms will have their numbering changed,
+which would be an error. For example, if we simplify
+    (: (** 0 0 (submatch "apple"))
+       (submatch "bar"))
+to
+    (submatch "bar")
+then the "bar" submatch changes from submatch 2 to submatch 1 -- so this
+is not a legal simplification.
+
+When the simplifier deletes a sub-regexp that contains submatches,
+it introduces a special regexp form to account for the missing,
+deleted submatches, thus keeping the submatch accounting correct.
+    (dsm 
   ...)
+is a regexp that matches the sequence (:  ...). 
 and
+ are integer constants. The DSM form introduces 
 deleted
+submatches before the body, and  deleted submatches after the
+body. If the body (:  ...) itself has body-sm submatches,
+then the total number of submatches for the DSM form is
+    (+ 
 body-sm )
+These extra, deleted submatches are never assigned string indices in any
+match values produced when matching the regexp against a string.
+
+As examples,
+    (| (: (submatch "Richard") (|) "Nixon")
+       (submatch "bar"))
+can be simplified to
+    (dsm 1 0 (submatch "bar"))
+
+The regexp
+    (: (** 0 0 (submatch "apple"))
+       (submatch "bar"))
+can be simplified to
+    (dsm 1 0 (submatch "bar"))
+
+
+** Embedding regexps within Scheme programs
+
+SRE's can be placed in a Scheme program using the (rx  ...) 
+Scheme form, which evaluates to a Scheme regexp value.
+
+
+** Static and dynamic regexps
+
+We separate SRE expressions into two classes: static and dynamic
+expressions. A *static* expression is one that has no run-time dependencies;
+it is a complete, self-contained description of a regular set. A *dynamic*
+expression is one that requires run-time computation to determine the
+particular regular set being described. There are two places where one can
+embed run-time computations in an SRE:
+    - The  or  repetition counts of **, =, and >= forms;
+    - , and ,@ forms.
+A static SRE is one that does not contain any , or ,@ forms, 
+and whose **, =, and >= forms all contain constant repetition counts.
+
+Scsh's RX macro is able, at macro-expansion time, to completely parse,
+simplify and translate any static SRE into the equivalent Posix string
+which is used to drive the underlying C-based matching engine; there is 
+no run-time overhead. Dynamic SRE's are partially simplified and then expanded
+into Scheme code that constructs the regexp at run-time.
+
+
+-------------------------------------------------------------------------------
+* Discussion and design notes
+-----------------------------
+Many considerations drove the design of the SRE notation. I took
+advantage of ideas found in the s-expression notations of Manuel 
+Serrano's Bigloo systems-programming Scheme implementation and Michael
+Sperber's Scheme 48 design. I also considered features found in many
+traditional regexp implementations, including the Posix standard, Henry 
+Spencer's package, Tom Lord's rx system, gnu regex, and Perl's system. 
+Features that didn't make it into the scsh system are not there for a reason.
+
+Lord's package provided the name for the basic SRE macro, which is
+agreeably brief.
+
+What follows is a loose collection of design notes.
+
+** No lazy repetition operators
+
+SRE notation does not provide lazy repeat forms, of the kind found in perl.
+Lazy repeat forms have problems. In principle, a regexp doesn't specify a
+matching algorithm, it specifies a *set of strings*. Lazy submatches violate
+this principle. Providing lazy submatches forces a particular matching
+algorithm: an NDFA-based backtracking search engine -- you can't use a
+fast DFA no-backtracking engine. I chose to restrict the notation to keep
+it algorithm-independent. (This isn't strictly true -- we sleaze in one area:
+submatches require an NDFA searcher, but the standard C-based DFA engines
+have clever hacks for dealing with this.)
+
+Also, lazy submatches can't be implemented with a Posix search engine,
+and I wanted to allow for this.
+
+Note that an alternative to perl's lazy repeat forms would be to have
+a flag in the match and search functions telling it to provide either the
+leftmost longest or the leftmost shortest match. This is a *global* property
+of the whole regexp, not of a particular part, and can be easily provided
+by either DFA or NDFA engines.
+
+I suspect this would handle many (most? all?) of the cases where perl hackers
+want lazy repetition operators. But it is more in the spirit of regexps,
+where one notates a *regular set* of strings with a regexp, then asks the
+matcher to find a match -- it's the matcher's business how it will choose
+from multiple matches to strings in the set.
+
+Finally, my suspicion is that the sort of things people do with lazy
+repetition operators (e.g., match delimited  ...  regions
+in html text) are abusing regexps, pushing them beyond their real
+capabilities, which is asking for trouble. Strings of balanced delimiters
+aren't regular; you should be using more powerful parsing tools if this
+is what you want to do. Don't break regexps to handle this case in a
+fragile way; design a different parsing tool. For an example of a more
+powerful parsing tool, see the elegant parser tool, READ/RP, in Serrano's
+Scheme system bigloo.
+
+
+** No named submatches
+
+One might want a feature wherein we could *name* our submatches, instead
+of referring to them by their numerical index in the regexp. Perhaps
+something like
+    (: ...
+       (named-match phone-num (= 3 digit) "-" (= 4 digit))
+       ...)
+which would somehow "bind" phone-num to the substring matching the
+seven-digit phone number. This is awkward. The problem is that binding
+introduces issues of scope -- what is the scope of an identifier "bound"
+in a regexp? Suppose the SRE is used in a Scheme form to produce a first-class
+regexp value, which can be passed around in and out of various scopes?
+Clearly, somehow using Scheme's variables isn't going to work. If one then
+turns to symbol-indexed tables, one is leaving the language-level for
+binding, and moving to run-time data values. This is inefficient and
+awkward. Furthermore, what are the precedence rules when the same identifier
+is bound multiple times in the same SRE?
+
+There's no shame in positional, indexed references. It's how parameters
+are passed in Scheme procedure calls. It's the natural mechanism that
+arises for regexps, so that's what is provided.
+
+One might consider a hairy named-submatch system where one would specify in
+a *single form* (1) an SRE with named submatches, (2) a string to match/search,
+and (3) a body of code to be conditionally evaluated in the scope of the bound
+names. This would use names in a way that was tightly integrated with Scheme,
+which is good. But you must now wrestle with some very tricky issues:
+
+- Are the names bound to substrings or pairs of start/stop indices into
+  the source text? Sometimes you want one, sometimes the other.
+
+- You have to give up on passing regexps around as first-class values.
+  SRE's are now scope-introducing, variable-binding syntax, like
+  LET or LAMBDA. So much for , and the power of dynamic regexps.
+
+- What to do about a name that appears multiple times in the regexp?
+
+I did not go down this path. Be one with the system; don't fight against it.
+(However, see the LET-MATCH, IF-MATCH, and MATCH-COND forms, which provide
+matching, positional name-binding, and control transfers in a consistent
+manner.)
+
+
+** No intersection or negation operators
+
+The SRE notation supports a general union operator, the choice form
+(|  ...). However, the rest of the set algebra -- negation, 
+intersection, and subtraction -- is restricted to character sets. Wouldn't 
+it be nice to extend these operators to apply to general regexps? After
+all, regular sets *are* closed under these operations, and they are useful, 
+so they should be in the toolkit.
+
+They aren't in for two reasons:
+1. Combinatoric explosion
+   Intersection forces you to convert your regexp to a DFA, which can entail
+   exponential growth in the state space. After the exponential explosion,
+   you work in the cross-product of the two DFA's state spaces, for another
+   multiplicative factor. It's hard to control. Negation presents similar
+   difficulties, in either time (DFA) or space (NDFA).
+
+2. Standard C engines don't provide it. (Because of reason #1.)
+
+Sure, it'd be great to have them anyway. What you probably want is
+a direct-in-Scheme DFA/NDFA toolkit. Then you can take your regexp,
+convert it to a DFA, do the intersection, and either interpret the
+result machine, or translate it to Scheme code for direct execution.
+The programmer would have to take responsibility for managing the
+potential for combinatorial explosion.
+
+It's a great idea. You do it. I was careful to design the notation to
+allow for it -- you don't even have to introduce new operators, just
+lift the char-class "typing" restriction on the existing ~, -, and &
+ops.
+
+Note that just by allowing general set operations on character classes, 
+we're still way out in front of traditional notation, which doesn't 
+provide this feature at all.
+
+
+** No name-binding forms
+
+SRE notation doesn't have a form for binding names to regexps; this
+is punted to the host language by way of the , mechanism. This
+is arguably a lack from the point of view of SRE's as a completely
+standalone, static notation, independent of their embedding language. 
+But enough is enough.
+
+
+** No SRE macros
+
+It's a shame that we can't provide a means of allowing programmers to define
+their own SRE macros, by which I do *not* mean Scheme expressions that contain
+SRE forms, but *new* classes of SRE form, beyond (: ...), (| ...) and friends.
+For example, the (WORD+  ...) form is not really primitive; it
+can be defined by way of expansion to
+    (: bow (& (| alphanum "_") (|  ...)) eow)
+A given task might profit from allowing the programmer to extend SRE's
+by way of rewriting forms into the base SRE notation. But Scheme does
+not support this -- it only provides macros for Scheme expressions.
+Too bad.
+
+
+** No back-references
+
+Completely non-regular -- there's a reason these were dropped from Posix'
+"extended" (= "modern") regexp notation.
+
+
+** No "syntax classes"
+
+"Syntax classes" are a gnu-emacs feature for describing certain character
+sets. SRE already has a powerful set of character-set operators, and the
+whole notion of "syntax class" is emacs specific. So they weren't included.
+
+
+** Range notation
+
+One might consider it more "Schemey" to have the char-class range notation
+specify the from/to pairs as two-element lists, e.g.
+    (/ (#\a #\z) (#\0 #\9))
+or maybe even
+    (/ (#\a . #\z) (#\0 . #\9))
+If we do things this way, the structure of the s-expression more closely
+mirrors the underlying structure of the form. Well, yes. But it's hard
+to read -- I claim that ripping out all the sharp signs, backslashes, dots
+and extra parens is much easier on your eyes:
+    (/"az09")
+and I am unable to see what the extra pairing overhead buys you over
+and above gratuitous notational bloat. Remember that this notation is
+designed primarily for *human* producers and consumers. The *machine*
+doesn't see the *notation*; it sees the nice, regular ADT.
+
+
+** Big notation
+
+SRE notation is baroque -- there are a lot of ways to write the same
+regexp. This is not accidental. The idea is to make a notation that is as
+expressive as possible for human-written documents. The ADT, in contrast, is
+simple and spare -- it is designed to be operated upon by programs.
+
+
+** Implementation complexity
+
+This implementation is much more complex than I'd like; there are three main
+reasons. The first reason is the strategy of parsing SRE's, not directly to
+Scheme code, but instead to regexp records, which can then be simplified and
+then unparsed to Scheme code (or other forms, such as SRE or Posix string).
+Centering the design on the ADT was the right thing to do, as it enables other
+unparsers to use the same "front-end" parser -- for example, if someone wanted
+to write a macro that expanded (static) SRE's into a LETREC directly
+implementing the DFA, it would be much easier to work from the ADT than
+directly from the SRE form. It also enables the macro to apply a lot of
+processing to the form at compile-time, such as the simplifier, giving us a
+sleazy but effective form of partial evaluation.  However... we pay a price in
+complexity to do things this way. Code that processes regexp records that
+might be used *at macro-expansion time* must be written to tolerate the
+presence of Scheme *code* in record fields that would ordinarily only contain
+Scheme *values* -- for example, the FROM and TO fields of the repeat record. I
+got a lot of code reuse by making these records do double-duty as both the
+regexp ADT *and* the compiler's syntax tree (AST) for expressions computing
+these values -- but I had to get the code that did double-duty *just right*,
+which meant being careful to add code/value checks on all accesses to these
+fields.
+
+(Note that *user* code manipulates run-time regexp values, and so will
+never see anything but values in these fields.)
+
+The second reason is the inherent difficulty in translating general character
+sets to [...] Posix character classes, which seem simple at first, but turn
+out to have very awkward special-case restrictions on the grammar -- I discuss
+this at length in a later section. The char-set rendering code is made more
+complex than it could have been because I made an effort to render them
+into concise, readable descriptions, using ranges and ^ negation where
+possible to minimise the notation.
+
+The final major influence on the code complexity is all the bookkeeping that
+is involved in submatches and DSM's. Tracking DSM's complicates just about all
+parts of the system: the data-structures, the simplifier, the parser, and so
+on. It's really amazing how this one feature comes to dominate all the
+processing. But it has to be done. Submatches are an indispensable part of the
+way we use regexps. Simplification -- the process that introduces DSM's -- is
+not an option: SRE syntax is more general than traditional syntax, and permits
+authors to write expressions that don't have representations in traditional
+syntax. Simplifying the regular expression rewrites these un-translatable
+cases into equivalent cases that *are* translatable. So we must simplify
+before translating to Posix strings, hence we are stuck with DSM's.
+
+I went to the trouble of doing a full-bore implementation to have a
+reference implementation for others to steal. So the complexity of the
+coding shouldn't throw anyone who wants to use this notation; it's all
+been implemented.
+
+
+** Should anchors be primitive regexps or operators?
+
+There are two ways to do anchors, such as BOL and EOL, in an
+s-expression syntax. One way is to have them be primitive regexps
+that match the empty string in a particular context (which is how it
+is done in SRE syntax). The alternate method, found in some other designs
+I have studied, is to have anchors be *operators* with bodies, e.g.
+    (at-bos "foo" (* " ") "bar")
+which would match a "foobar" string, anchoring the match
+to the beginning of the string. This works reasonably well for 
+beginning-of-element anchors, but with end-of-element anchors, it puts
+the operator on the wrong side -- the left side -- of the regexp body:
+    (at-eol "foo" (* " ") "bar")
+The end-of-line in this pattern occurs after the "bar", but the operator's
+way over on the other side of the regexp body. This gets especially ugly 
+when we want to delimit both sides of the body:
+    (at-eol (at-bos "foo" (* " ") "bar"))
+Too many parens; too much nesting; too hard to read. I went with the
+magic-empty-string primitive regexp model:
+    (: bos "foo" (* " ") "bar" eos)
+
+
+** Character class operators
+
+In an earlier version of this notation, I had a distinct subnotation
+for character classes, with a distinct non-| operator for char-set
+union. This provided a simple, syntactic way to separate the char-set
+set algebra from the other operations of the language, to ensure you
+didn't try to complement or intersect general regexps. The char-set
+operators were IN, -, &, ~ for union, difference, intersection, and 
+complement. Inside these operators, simple strings stood for character
+sets. So we'd write the any-vowel SRE (in "aeiou") instead of ("aeiou").
+
+Shifting over to a distinct SRE form for constant char sets -- ("aeiou") --
+allowed me throw out the whole syntactic division between char-sets and
+other regexps, replacing this division with a "type system" restricting
+non-union set operations to char-sets. This seems like a big improvement
+for two reasons:
+    - The syntax was simplified and made less context-dependent.
+    - Should we ever wish to extend the regexp system to allow
+      for set operations on general regexps, the notation doesn't
+      have to be changed or extended at all.
+
+
+** Operator names
+
+Always a painful task. Here are some random notes on my choices.
+
+Before choosing W/CASE and W/NOCASE, I considered
+    CASE-SENSITIVE and CASE-INSENSITIVE 
+and
+    CASE0 CASE1
+but the former is far too long, and the latter was insufficiently clear.
+One early reviewer (Rees) asked me where were CASE2, CASE3, et al. So I went
+with W/CASE and W/NOCASE.
+
+I considered CASE-FOLD before selecting UNCASE. UNCASE is shorter and
+seems no more or less clear.
+
+I considered &/+/~/- for the set algebra ops -- it's a nice, consistent, terse
+operator set. This would give us an R5RS-sanctioned operator for union, as an
+alternative to the slightly iffy |.  But if we use + for union, there is some
+pressure to use * for intersection, by way of analogy with mod-2
+addition/multiplication. But * is already assigned to a repetition operator.
+Furthermore, + is also already taken, by the 1..infinity repetition
+operator. Not only are these * and + assignments firmly ingrained in
+traditional syntax, I couldn't come up with a good, short alternative for the
+repetition operators, so I kept them + and *. | is also the and/or mate of &
+in C, so it's natural to pair them in an and/or intersection/union manner.
+
+I also considered and/or/not/diff for the char-set algebra ops. But
+the names are too long for such common primitives.
+
+
+** No collating elements
+
+Posix has this completely opaque feature of character sets called "collating
+elements." It's some mechanism whereby you can, in a locale-independent way,
+get a pair of characters to sort as one character, or have a German eszet
+(the one that looks like a beta) character sort like a pair of s's. This stuff,
+which I am unable to understand without bleeding from the nose and ears, is
+part of the full Posix regexp spec -- you can say things like [[=ch=]]
+and get a character class which will actually match two characters out of
+a target string, if your locale defines a collating element .
+
+My system doesn't support this at all. SRE char-class expressions are
+rendered to scsh character sets, which are then rendered into [...]
+sets containing the elements of those sets. The rendered [...] expressions
+never contain [..], [==], or [::] elements.
+
+As I discuss below, I'm willing to support a super-ASCII character type,
+such as Unicode or big-5 or latin-1. What I find objectionable is the
+idea of "collation elements" that can match more than one character in
+a row.
+
+I'm open to more suggestions on this front.
+
+
+** Character set dependencies
+
+If you wanted to do a Unicode version of my package, you'd have to redo the
+scsh character-set machinery, and also the char-set unparsers in the regexp
+backends.  If you wanted to do a Latin-1 version, you'd need to slightly tune
+scsh's primitive character sets (such as char-set:alphabetic).  The SRE->Posix
+string character-class rendering code is written to be as independent as
+possible of the character type, but it has some dependencies, and would need
+to be tuned.
+
+This is straightforward to do and makes sense in the global context in which
+we now program. However, I'm an American (= functionally monolingual), and so
+not as expert on the various internationalisation issues as a European or
+Asian would be -- so I'm punting it for now.
+
+
+** Problems with traditional regexps
+
+SRE notation was intended to fix a lot of things I didn't like about
+working with traditional notation.
+
+
+*** Traditional regexp notation doesn't scale over large regexps
+
+Traditional regexp notation has a lot of problems, and the bigger
+the regexp, the worse they get:
+- They can't be laid out to express their structure with indentation.
+- They can't be commented.
+- They don't have an abstraction mechanism -- parts can't be named
+  and used, functions can't be defined. (This can be hacked using
+  mechanisms like sprintf(), but it is awkward and error prone.)
+
+
+*** String constants
+
+There's no need to backquote special characters in SRE string constants.
+Just write them down in Scheme string-literal syntax.
+
+
+*** Traditional regexp notation doesn't scale to rich operator sets
+
+Traditional notation tries to use single characters as its operators:
+. * ( ) ^ $ and so forth. Unfortunately, the more chars you reserve as
+operators, the more backslash-quoting you have to do when you write down 
+constant strings. Eventually, you run out of special characters. Using a 
+single special character to prefix operators (as Gnu regexps do with 
+backslash) rapidly renders regexps unreadable -- especially when these
+backslashes have to be doubled to get them into the host language's string
+literals. When regexp packages such as Gnu, Perl, or Spencer start to expand 
+their operator repertoires, they are forced to adopt very unwieldy syntactic
+mechanisms. For example, Spencer's notation for beginning-of-word and 
+end-of-word boundaries are [[:<:]] and [[:>:]], a somewhat bizarre bit of
+syntactic jiggering.
+
+S-expressions, on the other hand, are a little more verbose for
+simple forms, but paying this cost up-front gets you into a general
+framework that is extremely extensible. It's easy to add many new
+operators to the SRE syntax -- as a result, SRE can be a very rich syntax.
+You choose:
+    SRE:     (w/nocase (word+ (~ ("aeiou"))))
+    Brand X: "[[:<:]]([b-df-hj-np-tv-zB-DF-HJ-NP-TV-Z])+[[:>]]"
+Note that not one of the three operators used in the SRE version is
+available in traditional notation. That tells the story right there.
+
+*** Traditional regexp [...] classes 
+
+There is a slew of special cases in the Posix grammar for [...] classes
+to shoehorn [...]'s special chars (carat, right bracket, and hyphen) into the
+notation as set elements. Examples:
+- Right bracket terminates a char class... unless it's the first
+  character following the left bracket... or the second character, 
+  following an initial ^. 
+- To put in a carat, place it at the *end* (well, perhaps *next* to the
+  end, if you also want to put in a hyphen) -- unless it's the only element,
+  then just punt the whole [...], and write it as a character. 
+- To put in a minus sign, really place it at the end, even after a carat.
+  Unless, that is, the whole char set is just the two characters carat
+  and hyphen, in which case you'd have [^-], which would mean something
+  else entirely -- so in this one case, flip the order of the two characters,
+  and put hyphen first: [-^].
+- Be sure never to accidentally place a left-bracket element next
+  to an equals sign, colon, or period, because [=, [:, and [. are
+  collating element open-delimiters in Posix regexps -- you can
+  say things like "[ABC[:lower:]123]" to get A, B, C, 1, 2, 3 and
+  all the lower-case letters. Better shuffle the class elements around
+  to avoid these juxtapositions.
+- There's no way, at all, to write the empty character class.
+  ("[]" is not syntactically legal; if you try "[^\000-\177]", you will
+  probably blow up your C regexp engine with the non-terminating nul
+  byte, and, in any event, you are being ASCII specific.)
+
+This is not even the whole set of special exceptions. You start putting
+special characters into your [...] char classes, and you walk into a
+mine field. Who designed this mess?
+
+As evidence of this complexity, the code that translates general Scheme
+character sets into this notation is the single largest and most complicated
+part of the ADT->Posix-string compiler. Better, however, that the unparser code
+puzzle out how to represent sets given all these ill-structured, error-prone
+rules than for you to have to waste time thinking about it yourself, as
+you must when you use traditional notation.
+
+Even with its baroque syntactic rule set, the [...] construct is pretty
+limited -- it lacks the compositional elements and general set operators
+that SRE char classes have. Writing a [...] form that will match any
+non-vowel letter is pretty painful because there is no set-difference
+operator:
+    [b-df-hj-np-tv-zB-DF-HJ-NP-TV-Z]
+You'd have to stare at this for a minute to figure out what it is. The
+corresponding SRE is much more transparent:
+    (- alpha ("aeiouAEIOU"))
+
+
+*** ASCII nul and newline
+
+Does . match a ASCII nul? Does it match a newline? Does [^x]? Unfortunately, 
+the same expressions have different meanings depending on the implementation
+and the flags passed to the pattern-compiler functions. In SRE notation,
+the behaviour of each element is unambiguously defined. No surprises; no
+misunderstandings.
+
+*** Newline
+
+Various regexp systems can never seem to agree on the treatment of newline. 
+Is it matched by . or [^x]? Do the anchors ^ and $ match beginning/end of
+line, or just beginning/end of string? Gnu regexps do it one way; Posix,
+another. Posix provides a compile-time flag that shifts the meaning of 
+all these constructs from string-oriented to line-oriented -- but multiplexing
+the notation in this global way means you can't do a bit of each in the
+same regexp.
+
+SRE's don't have this problem, partly because they aren't restricted to
+just using punctuation characters such as . [ ] ^ $ for operators. It was
+straightforward to add both bos/eos/any *and* bol/eol/nonl. Say what you
+mean; mean what you say.
+
+(Unfortunately, I don't have an underlying C engine that *implements*
+both eos/bos and bol/eol matching. But I've got a driver, and that's a
+start.)
+
+
+*** Submatch and grouping
+
+Parens are overloaded in Posix syntax as both grouping operators and
+as submatch markers. Some implementations extend the traditional
+syntax with awkward non-submatch-introducing grouping parentheses,
+but they are non-standard extensions and syntactically awkward.
+
+S-expressions, on the other hand, have no precedence issues, so
+grouping is distinct from the submatch operator. Removing these
+spurious submatches can have huge performance benefits, since submatch
+assignment rules out non-backtracking DFA search (roughly speaking).
+
+
+*** Grammar ambiguity
+
+The traditional syntax has a lot of squirrelly cases in the grammar
+to trip up the unwary. Some examples:
+- Is "x+*" the regexp (* (+ "x")) or is it (: (+ "x") (* ""))?
+- Is "" an empty choice (SRE (|), which never matches anything), 
+  or an empty sequence (SRE (:), which always matches the empty string)?
+
+SRE syntax doesn't have any of these problems.
+
+
+*** Grammar limitations
+
+As we've seen, there's no way to write an empty match in Posix notation,
+i.e. something that will never match.  [] is not an empty class, due to
+bizarre special-casing of right-bracket in the special context of immediately
+following a left bracket or left-bracket/carat.
+
+This may not seem important, but the generality can be handy when the
+regexp expressions are being computed, not hand-written (e.g., by a macro,
+or when unparsing an ADT value).
+
+Notations should provide coverage all the way out to the boundary cases.
+When they fail to do this, someone, sooner or later, runs into trouble.
+
+*** No exported ADT
+
+It's hard to compute regexps with the string representation -- for example,
+if you want to drop a constant string into a regexp, you have to write
+a routine to quote the special chars. Code that wants to manipulate regexps
+in terms of their structure is much harder to write in terms of the string
+form than the structured ADT values. As one of Perlis' aphorisms states,
+
+    The string is a stark data structure and everywhere it is passed 
+    there is much duplication of process. It is a perfect vehicle for 
+    hiding information.
+
+Working in terms of the grammar keeps you one step removed from the level at
+which you want to be operating, and the grammar doesn't even permit you write
+down straighforward boundary cases, such as the empty choice, the empty
+sequence, or the empty char class.
+
+
+*** Comments
+
+You can comment SRE notation. The traditional notation doesn't permit this.
+This is a big problem for large, complex regexps. (To be fair, Perl's
+non-standard syntax *does* permit comments to be interleaved with pieces 
+of a regexp.)
+
+
+*** Case sensitivity
+
+Traditional notation has no support for case-sensitivity. While the Posix
+pattern-compiler allows for a case-insensitive flag to be globally applied
+to a whole pattern, this is distinct from the notation itself -- not the
+right thing -- and does not provide for locality of scope.
+
+
+-------------------------------------------------------------------------------
+* Regexp functions
+------------------
+
+** Obsolete, deprecated procedures
+==================================
+
+These two procedures are survivors from the previous, now-obsolete scsh regexp
+interface. Old code must open the re-old-funs package to access them. They
+should not be used in new code.
+
+(string-match posix-re-string string [start]) -> match or false
+(make-regexp posix-re-string) -> re
+    Old functions for backwards compatibility. Will go away at some point in
+    the future.
+
+** Standard procedures and syntax
+=================================
+
+(rx sre ...)		Regexp macro
+    This allows you to describe a regexp value with SRE notation.
+
+(regexp? x) -> bool
+
+(regexp-search  re str [start flags]) -> false or match-data
+(regexp-search? re str [start flags]) -> boolean
+    FLAGS is bitwise-or of regexp/bos-not-bol and regexp/eos-not-eol.
+    regexp/bos-not-bol means the beginning of the string isn't a line-begin.
+    regexp/eos-not-eol is analogous. [They're currently ignored because
+    BOL and EOL aren't supported.]
+
+    Use REGEXP-SEARCH? when you don't need submatch information, as
+    it has the potential to be *significantly* faster on submatch-containing
+    regexps.
+
+    There is no longer a separate regexp "compilation" function; regexp
+    records are compiled for the C engine on demand, and the resulting
+    C structures are cached in the regexp structure after the first use.
+
+(match:start m [i]) -> int or false
+(match:end   m [i]) -> int or false
+(match:substring m [i]) -> string or false
+
+(regexp-substitute port-or-false match-data . items)
+    An item is a string (copied verbatim), integer (match index),
+    'pre (chars before the match), or 'post (chars after the match).
+    #f for port means return a string.
+
+    See the scsh manual for more details.
+
+(regexp-substitute/global port-or-false re str . items)
+    Same as above, except 'post item means recurse on post-match substring.
+    If RE doesn't match STR, returns STR.
+
+(regexp-fold re kons knil s [finish start]) -> value
+    The following definition is a bit unwieldy, but the intuition is
+    simple: this procedure uses the regexp RE to divide up string S into
+    non-matching/matching chunks, and then "folds" the procedure KONS
+    across this sequence of chunks. It is useful when you wish to operate
+    on a string in sub-units defined by some regular expression, as are
+    the related REGEXP-FOLD-RIGHT and REGEXP-FOR-EACH procedures.
+
+    Search from START (defaulting to 0) for a match to RE; call
+    this match M. Let I be the index of the end of the match
+    (that is, (match:end M 0)). Loop as follows:
+      (regexp-fold re kons (kons START M knil) s finish I)
+    If there is no match, return instead 
+      (finish START knil)
+    FINISH defaults to (lambda (i knil) knil)
+
+    In other words, we divide up S into a sequence of non-matching/matching
+    chunks:
+       NM1 M1 NM1 M2 ... NMk Mk NMlast
+    where NM1 is the initial part of S that isn't matched by the RE, M1 is the
+    first match, NM2 is the following part of S that isn't matched, M2 is the
+    second match, and so forth -- NMlast is the final non-matching chunk of
+    S. We apply KONS from left to right to build up a result, passing it one
+    non-matching/matching chunk each time: on an application (KONS i m KNIL),
+    the non-matching chunk goes from I to (match:begin m 0), and the following
+    matching chunk goes from (match:begin m 0) to (match:end m 0). The last
+    non-matching chunk NMlast is processed by FINISH. So the computation we
+    perform is
+      (final q (kons Jk MTCHk ... (kons J2 MTCH2 (kons J1 MTCH1 knil))...))
+    where Ji is the index of the start of NMi, MTCHi is a match value
+    describing Mi, and Q is the index of the beginning of NMlast.
+
+    Hint: The LET-MATCH macro is frequently useful for operating on the 
+      match value M passed to the KONS function.
+
+(regexp-fold-right re kons knil s [finish start]) -> value
+    This procedure repeatedly matches regexp RE across string S.
+    This divides S up into a sequence of matching/non-matching chunks:
+       NM0 M1 NM1 M2 NM2 ... Mk NMk
+    where NM0 is the initial part of S that isn't matched by the RE,
+    M1 is the first match, NM1 is the following part of S that isn't
+    matched, M2 is the second match, and so forth. We apply KONS from
+    right to left to build up a result
+      (final q (kons MTCH1 J1 (kons MTCH2 J2 ...(kons MTCHk JK knil)...)))
+    where MTCHi is a match value describing Mi, Ji is the index of the end of
+    NMi (or, equivalently, the beginning of Mi+1), and Q is the index of the
+    beginning of M1. In other words, KONS is passed a match, an index
+    describing the following non-matching text, and the value produced by
+    folding the following text. The FINAL function "polishes off" the fold
+    operation by handling the initial chunk of non-matching text (NM0, above).
+    FINISH defaults to (lambda (i knil) knil)
+
+    Example: To pick out all the matches to RE in S, say
+        (regexp-fold-right re
+                           (lambda (m i lis) (cons (match:substring m 0) lis))
+                           '() s)
+
+    Hint: The LET-MATCH macro is frequently useful for operating on the 
+      match value M passed to the KONS function.
+
+
+(regexp-for-each re proc s [start]) -> unspecific
+    Repeatedly match regexp RE against string S. 
+    Apply PROC to each match that is produced.
+    Matches do not overlap.
+
+    Hint: The LET-MATCH macro is frequently useful for operating on the 
+      match value M passed to the KONS function.
+
+
+(let-match match-exp mvars body ...)			Syntax
+(if-match  match-exp mvars on-match no-match)		Syntax
+    MVARS is a list of vars that is bound to the match and submatches
+    of the string; #F is allowed as a don't-care element. For example,
+	(let-match (regexp-search date s)
+                   (whole-date month day year)
+          ...body...)
+    matches the regexp against string s, then evaluates the body of the
+    let-match in a scope where M is bound to the matched string, and
+    SM2 is bound to the string matched by the second submatch.
+
+    IF-MATCH is similar, but if the match expression is false, 
+    then the no-match expression is evaluated; this would be an
+    error in LET-MATCH.
+
+(match-cond (   ...)	; As in if-match
+            (test   ...)			; As in cond
+            (test  => )			; As in cond
+            (else  ...))				; As in cond
+
+
+(flush-submatches re) -> re	; Returned value has no submatches
+(uncase re)           -> re	; Case-fold regexp
+(simplify-regexp  re) -> re	; Simplify the regexp
+(uncase-char-set cset) -> re
+(uncase-string str) -> re
+
+
+(sre->regexp sre) -> re		; S-expression parser
+(regexp->sre re)  -> sre	; S-expression unparser
+
+(posix-string->regexp string) -> re	; Posix regexp parser
+(regexp->posix-string re) -> string	; Posix regexp unparser
+    - The string parser doesn't handle the exotica of character class
+      names such as [[:alnum:]]; I wrote in in three hours.
+    - The unparser produces Spencer-specific strings for bow/eow
+      elements; otherwise, it's Posix all the way.
+
+    You can use these tools to map between scsh regexps and Posix
+    regexp strings, which can be useful if you want to do conversion
+    between SRE's and Posix form. For example, you can write a particularly
+    complex regexp in SRE form, or compute it using the ADT constructors,
+    then convert to Posix form, print it out, cut and paste it into a
+    C or emacs lisp program. Or you can import an old regexp from some other
+    program, parse it into an ADT value, render it to an SRE, print it out, 
+    then cut and paste it into a scsh program.
+
+-------------------------------------------------------------------------------
+* The scsh regexp ADT 
+---------------------
+The following functions may be used to construct and examine scsh's
+regexp abstract data type. They are in the following Scheme 48 packages:
+    re-adt-lib
+    re-lib
+    scsh
+
+** Sequences
+(re-seq? x) -> boolean				; Type predicate
+(make-re-seq . re-list) -> re			; Basic constructor
+(re-seq      . re-list) -> re			; Smart constructor
+(re-seq:elts re) -> re-list			; Accessors
+(re-seq:tsm  re) -> integer			; .
+
+** Choices
+(re-choice? x) -> boolean			; Type predicate
+(make-re-choice re-list) -> re			; Basic constructor
+(re-choice      . re-list) -> re		; Smart constructor
+(re-choice:elts . re) -> re-list		; Accessors
+(re-choice:tsm re) -> integer			; .
+
+** Repetition
+(re-repeat? x) -> boolean			; Type predicate
+(make-re-repeat  from to body)			; Basic constructor
+(re-repeat       from to body)			; Smart constructor
+(re-repeat:body re) -> re			; Accessors
+(re-repeat:from re) -> integer			; .
+(re-repeat:to   re) -> integer			; .
+(re-repeat:tsm  re) -> integer			; .
+
+** Submatches
+(re-submatch? x) -> boolean			; Type predicate
+(make-re-submatch body [pre-dsm post-dsm])	; Basic constructor
+(re-submatch      body [pre-dsm post-dsm])	; Smart constructor
+(re-submatch:body     re) -> re			; Accessors
+(re-submatch:pre-dsm  re) -> integer		; .
+(re-submatch:post-dsm re) -> integer		; .
+(re-submatch:tsm      re) -> integer		; .
+
+** String constants
+(re-string? x) -> boolean			; Type predicate
+(make-re-string chars) -> re			; Basic constructor
+(re-string chars) -> re				; Basic constructor
+(re-string:chars re) -> string			; Accessor
+
+** Char sets
+(re-char-set? x) -> boolean			; Type predicate
+(make-re-char-set cset) -> re			; Basic constructor
+(re-char-set cset) -> re			; Basic constructor
+(re-char-set:cset re) -> char-set		; Accessor
+
+** DSM
+(re-dsm? x) -> boolean				; Type predicate
+(make-re-dsm  body pre-dsm post-dsm) -> re	; Basic constructor
+(re-dsm       body pre-dsm post-dsm) -> re	; Smart constructor
+(re-dsm:body re) -> re				; Accessor
+(re-dsm:pre-dsm  re) -> integer			; .
+(re-dsm:post-dsm re) -> integer			; .
+(re-dsm:tsm      re) -> integer			; .
+
+** Primitive regexps
+re-bos re-eos re-bol re-eol re-bow re-eow	; Primitive regexps
+re-bos? re-eos?					; Type predicates
+re-bol? re-eol?
+re-bow? re-eow?
+
+trivial-re trivial-re?				; ""
+empty-re   empty-re?				; (|)
+re-any	   re-any?				; any
+re-nonl						; (~ #\newline)
+re-word						; word
+    These are non-primitive predefined regexps of general utility.
+
+(regexp? x) -> boolean
+(re-tsm re) -> integer
+
+(clean-up-cres)
+
+-------------------------------------------------------------------------------
+* Syntax-hacking tools
+----------------------
+The Scheme 48 package rx-syntax-tools exports several tools for macro
+hackers that want to use SREs in their macros. In the functions defined
+below, COMPARE and RENAME parameters are as passed to Clinger-Rees
+explicit-renaming low-level macros.
+
+(if-sre-form form conseq-form alt-form)					Syntax
+    If FORM is a legal SRE, this is equivalent to the expression
+    CONSEQ-FORM, otherwise it expands to ALT-FORM.
+    
+    This is useful for high-level macro authors who want to write a macro
+    where one field in the macro can be an SRE or possibly something
+    else. E.g., we might have a conditional form wherein if the
+    test part of one arm is an SRE, it expands to a regexp match
+    on some implied value, otherwise the form is evaluated as a boolean 
+    Scheme expression. Here's an example of how a high-level macro could
+    drop an expression bound to ?TEST-EXP into an IF-SRE-FORM:
+
+       (if-sre-form ?test-exp			; If ?TEST-EXP is a regexp,
+         (regexp-search? (rx ?test-exp) line)	; match it against the line,
+         ?test-exp)				; otw it's a boolean exp.
+
+(sre-form? form rename compare) -> boolean
+    This procedure is for low-level macros doing things equivalent to
+    IF-SRE-FORM. It returns true if the form is a legal SRE.
+
+Note that neither of these tests does a deep recursion over the form
+in the case where the form is a list. They simply check the car of the
+form for one of the legal SRE keywords.
+
+(parse-sre  sre-form  compare rename) -> re
+(parse-sres sre-forms compare rename) -> re
+    Parse SRE-FORM into an ADT. Note that if the SRE is dynamic -- 
+    contains , or ,@ forms, or has repeat operators whose
+    from/to counts are not constants -- then the returned ADT will have
+    *Scheme expressions* in the corresponding slots of the regexp records
+    instead of the corresponding integer, char-set, or regexp. In other
+    words, we use the ADT as its own AST. It's called a "hack."
+
+    PARSE-SRES parses a list of SRE forms that comprise an implicit sequence.
+
+(regexp->scheme re rename) -> Scheme-expression
+    Returns a Scheme expression that will construct the regexp RE
+    using ADT constructors such as make-re-sequence, make-re-repeat,
+    and so forth.
+
+    If the regexp is static, it will be simplified and pre-translated
+    to a Posix string as well, which will be part of the constructed
+    regexp value.
+
+(static-regexp? re) -> boolean
+    Is the regexp a static one?
+
+-------------------------------------------------------------------------------
+* Acknowledgements
+------------------
+If you want to know precise details on obscure features of Posix regexps,
+and their associated algorithms, you have to ask Tom Lord or Henry Spencer.
+I did.
+
+Alan Bawden was the one who proposed making | a "polymorphic" char-class/regexp
+union operator, with an inference pass to disambiguate, thus leaving the SRE
+syntax open to full intersection/union/difference/complement extension. I
+found this to be an amazingly clever idea. Alan also explained some esoteric
+points concerning low-level macros to me.
diff --git a/scsh/rx/let-match.scm b/scsh/rx/let-match.scm
new file mode 100644
index 0000000..6f6efb7
--- /dev/null
+++ b/scsh/rx/let-match.scm
@@ -0,0 +1,121 @@
+;;; These are some macros to support using regexp matching.
+
+(define-structure let-match-package
+  (export (let-match  :syntax)
+	  (if-match   :syntax)
+	  (match-cond :syntax))
+  (for-syntax (open scheme
+		    signals))	; For ERROR
+
+  (open scsh scheme)
+  (access signals) ; for ERROR
+
+  (begin
+
+;;; (let-match m mvars body ...)
+;;; Bind the vars in MVARS to the match & submatch strings of match data M,
+;;; and eval the body forms. #F is allowed in the MVARS list, as a don't-care 
+;;; parameter.
+;;;
+;;; (if-match m mvars conseq alt)
+;;; The same as LET-MATCH -- eval the CONSEQ form in the scope of the
+;;; bound MVARS. However, if the match data M evaluates to false, instead
+;;; of blowing up, we execute the ALT form instead.
+
+(define-syntax let-match
+  (lambda (exp r c)
+    (if (< (length exp) 3)
+	(error "No match-vars list in LET-MATCH" exp))
+    (let ((m (cadr exp))		; The match expression
+	  (mvars (caddr exp))		; The match vars
+	  (body (cdddr exp))		; The expression's body forms
+
+	  (%begin           (r 'begin))
+	  (%match:substring (r 'match:substring))
+	  (%let*            (r 'let*)))
+
+      (cond ((null? mvars) `(,%begin ,@body))
+
+	    ((pair? mvars)
+	     (let* ((msv (or (car mvars) (r 'match-val))) ; "match-struct var"
+		    (sm-bindings (let recur ((i 0) (vars (cdr mvars)))
+				   (if (pair? vars)
+				       (let ((var (car vars))
+					     (bindings (recur (+ i 1) (cdr vars))))
+					 (if var
+					     (cons `(,var (,%match:substring ,msv ,i))
+						   bindings)
+					     bindings))
+				       '()))))
+	       `(,%let* ((,msv ,m) ,@sm-bindings) ,@body)))
+
+
+	    (else (error "Illegal match-vars list in LET-MATCH" mvars exp))))))
+
+(define-syntax if-match
+  (syntax-rules ()
+    ((if-match match-exp mvars on-match no-match)
+     (cond (match-exp => (lambda (m) (let-match m mvars on-match)))
+	   (else no-match)))))
+
+;;; (MATCH-COND (   ...)
+;;;             (TEST   ...)
+;;;             (TEST  => )
+;;;             (ELSE  ...))
+;;;
+;;; The first clause is as-in IF-MATCH; the next three clauses are as-in COND.
+;;;
+;;; It would be slicker if we could *add* extra clauses to the syntax
+;;; of COND, but Scheme macros aren't extensible this way.
+ 
+;;; Two defs. The other expander produces prettier output -- one COND
+;;; rather than a mess of nested IF's.
+;(define-syntax match-cond
+;  (syntax-rules (else test =>)
+;    ((match-cond (else body ...) clause2 ...) (begin body ...))
+;
+;    ((match-cond) (cond))
+;
+;    ((match-cond (test exp => proc) clause2 ...)
+;     (let ((v exp)) (if v (proc v) (match-cond clause2 ...))))
+;
+;    ((match-cond (test exp body ...) clause2 ...)
+;     (if exp (begin body ...) (match-cond clause2 ...)))
+;
+;    ((match-cond (test exp) clause2 ...)
+;     (or exp (match-cond clause2 ...)))
+;
+;    ((match-cond (match-exp mvars body ...) clause2 ...)
+;     (if-match match-exp mvars (begin body ...)
+;	       (match-cond clause2 ...)))))
+
+(define-syntax match-cond
+  (syntax-rules ()
+    ((match-cond clause ...) (match-cond-aux () clause ...))))
+
+(define-syntax match-cond-aux
+  (syntax-rules (test else)
+
+   ;; No more clauses.
+   ((match-cond-aux (cond-clause ...))
+    (cond cond-clause ...))
+
+   ;; (TEST . )
+   ((match-cond-aux (cond-clause ...)
+		    (test . another-cond-clause) clause2 ...)
+    (match-cond-aux (cond-clause ... another-cond-clause)
+		    clause2 ...))
+   
+   ;; (ELSE  ...)
+   ((match-cond-aux (cond-clause ...)
+		    (else body ...) clause2 ...)
+    (match-cond-aux (cond-clause ... (else body ...))))
+
+   ;; (   ...)
+   ((match-cond-aux (cond-clause ...)
+		    (match-exp mvars body ...) clause2 ...)
+    (match-cond-aux (cond-clause ... (match-exp => (lambda (m)
+						     (let-match m mvars
+						       body ...))))
+		    clause2 ...))))
+))
diff --git a/scsh/rx/loadem.scm b/scsh/rx/loadem.scm
new file mode 100644
index 0000000..e33dc74
--- /dev/null
+++ b/scsh/rx/loadem.scm
@@ -0,0 +1,7 @@
+;;; ,exec ,load loadem.scm
+
+(config '(load "packages2.scm"))
+(config '(load "cond-package.scm"))
+;(map load-package '(rx-lib re-basics re-low-exports re-high-tools
+;			   sre-parser-package re-posix-parsers sre-syntax-tools
+;			   rx-syntax))
diff --git a/scsh/rx/modules.scm b/scsh/rx/modules.scm
new file mode 100644
index 0000000..fdb2735
--- /dev/null
+++ b/scsh/rx/modules.scm
@@ -0,0 +1,26 @@
+(define-structure re-package (export)
+  (open scsh
+	formats
+	define-record-types	; re
+	defrec-package		; re
+	scsh-utilities		;
+	define-foreign-syntax	; re-low
+	weak			; re-low
+	let-opt			; re
+	sort			; posixstr
+	receiving		; all of them
+	scheme)
+
+  (files "/usr/home/shivers/src/scm/conditionals.scm"
+	 re
+	 re-low
+	 simp
+	 re-high
+	 parse
+	 posixstr
+	 spencer
+	 ;re-syntax
+	 )
+
+  (optimize auto-integrate)
+  )
diff --git a/scsh/rx/oldfuns.scm b/scsh/rx/oldfuns.scm
new file mode 100644
index 0000000..d1904dc
--- /dev/null
+++ b/scsh/rx/oldfuns.scm
@@ -0,0 +1,21 @@
+;;; These functions were dropped from the regexp API when I shifted scsh's
+;;; regexps over to SREs. They are retained for backwards compatibility.
+;;; 	-Olin 8/98
+
+(define (string-match re str . maybe-start)
+  (apply regexp-search (->regexp re) str maybe-start))
+
+(define make-regexp posix-string->regexp)
+
+(define regexp-exec regexp-search)
+
+(define (->regexp str-or-re)
+  (cond ((string? str-or-re) (posix-string->regexp str-or-re))
+	((regexp? str-or-re) str-or-re)
+	(else (error ->regexp
+		     "Value must be either a Posix regexp string or a regexp value"
+		     str-or-re))))
+
+(define (regexp-quote str)
+  (receive (s lev pcount tvec) (regexp->posix-string (re-string str))
+    s))
diff --git a/scsh/rx/packages-old.scm b/scsh/rx/packages-old.scm
new file mode 100644
index 0000000..46e6d86
--- /dev/null
+++ b/scsh/rx/packages-old.scm
@@ -0,0 +1,235 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+(define-interface re-low-exports-interface		; User-level stuff
+  (export regexp-match?
+	  match:start match:end match:substring
+	  clean-up-cres))
+
+;;; TOO MUCH STUFF HERE
+(define-interface re-low-internals-interface		; For scsh internals
+  (export make-regexp-match
+	  regexp-match:string set-regexp-match:string
+	  regexp-match:start  set-regexp-match:start
+	  regexp-match:end    set-regexp-match:end
+
+	  cre? new-cre
+	  cre:string          set-cre:string
+	  cre:bytes           set-cre:bytes
+	  cre:bytes           set-cre:bytes
+	  cre:tvec set-cre:tvec
+
+	  cre-search cre-search?))
+  
+(define-structures ((re-low-exports re-low-exports-interface)
+		    (re-low-internals re-low-internals-interface))
+  (open scsh
+	scsh-utilities
+	defrec-package
+	let-opt
+	define-foreign-syntax
+	weak		
+	receiving
+	scheme)
+  (files re-low)
+  (optimize auto-integrate))
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Do these guys really need to open the scsh package?
+
+(define-interface basic-re-interface
+  (export re-dsm? make-re-dsm
+	  re-dsm:body
+	  re-dsm:pre-dsm 
+	  re-dsm:tsm 
+	  re-dsm:posix set-re-dsm:posix
+	  re-dsm:post-dsm
+	  re-dsm open-dsm
+
+	  re-seq? %%make-re-seq %make-re-seq make-re-seq re-seq
+	  re-seq:elts
+	  re-seq:tsm
+	  re-seq:posix set-re-seq:posix
+
+	  re-choice? %%make-re-choice %make-re-choice make-re-choice re-choice
+	  re-choice:elts
+	  re-choice:tsm
+	  re-choice:posix set-re-choice:posix
+
+	  re-repeat? %%make-re-repeat %make-re-repeat make-re-repeat re-repeat
+	  re-repeat:from
+	  re-repeat:to
+	  re-repeat:body
+	  re-repeat:tsm
+	  re-repeat:posix set-re-repeat:posix
+
+	  re-submatch?
+	  %%make-re-submatch %make-re-submatch make-re-submatch re-submatch
+	  re-submatch:body
+	  re-submatch:pre-dsm
+	  re-submatch:tsm
+	  re-submatch:posix set-re-submatch:posix
+	  re-submatch:post-dsm
+
+	  re-string? make-re-string re-string
+	  re-string:chars set-re-string:chars
+	  re-string:posix set-re-string:posix
+
+	  re-trivial re-trivial?
+
+	  re-char-set? make-re-char-set re-char-set
+	  re-char-set:cset set-re-char-set:cset 
+	  re-char-set:posix set-re-char-set:posix
+
+	  ;; Constructors for the Scheme unparser
+	  make-re-string/posix
+	  %make-re-seq/posix
+	  %make-re-choice/posix
+	  make-re-char-set/posix
+	  %make-re-repeat/posix
+	  %make-re-dsm/posix
+	  %make-re-submatch/posix
+
+	  re-empty re-empty?
+	  re-bos re-bos?	  re-eos re-eos?
+	  re-bol re-bol?	  re-eol re-eol?
+	  re-bow re-bow?	  re-eow re-eow?
+
+	  re-any re-any?
+
+	  re-nonl
+	  re-word
+
+	  re?
+	  re-tsm
+
+	  flush-submatches	; Can be in code produced by RX expander.
+	  uncase		; Can be in code produced by RX expander.
+	  uncase-char-set	; Can be in code produced by RX expander.
+	  uncase-string
+
+	  char-set-empty?
+	  char-set-full?))
+
+;;; Stuff that could appear in code produced by (rx ...)
+(define-interface rx-lib-interface
+  (export coerce-dynamic-regexp
+	  coerce-dynamic-charset
+	  spec->char-set
+	  flush-submatches
+	  uncase
+	  uncase-char-set
+	  uncase-string))
+
+(define-structure rx-lib rx-lib-interface
+  (open scsh conditionals re-basics scheme)
+  (files rx-lib)
+  (optimize auto-integrate))
+
+(define-structures ((re-basics basic-re-interface)
+		    (re-simp-package (export simplify-regexp)))
+  (open scsh
+	re-low-internals	; new-cre
+	conditionals
+	scsh-utilities
+	define-record-types
+	defrec-package
+	let-opt
+	receiving
+	scheme)
+  (files re simp)
+  (optimize auto-integrate))
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-interface re-high-interface
+  (export regexp-search regexp-search?
+	  regexp-substitute regexp-substitute/global))
+
+(define-interface re-high-tools-interface (export compile-regexp))
+
+(define-structures ((re-high-exports re-high-interface)
+		    (re-high-tools   re-high-tools-interface))
+  (open scsh
+	scsh-utilities
+	conditionals
+	
+	;; compile-regexp needs:
+	re-low-internals	; new-cre
+	re-simp-package		; simplify-regexp
+	re-posix-parsers	; regexp->posix-string
+
+	re-basics		; re-tsm
+
+	let-opt
+	receiving
+	scheme)
+  (files re-high)
+  (optimize auto-integrate))
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+
+(define-interface sre-parser-interface
+  (export sre->regexp regexp->sre
+	  parse-sre parse-sres regexp->scheme
+	  char-set->in-pair
+	  static-regexp?))
+
+(define-interface posix-re-interface
+  (export regexp->posix-string		; posixstr.scm
+	  posix-string->regexp		; spencer
+	  ))
+
+;;; The Posix-string stuff needs char-set->in-pair from parse.scm
+;;; The SRE parser needs the Posix string parser for POSIX-STRING SRE's.
+
+(define-structures ((sre-parser-package sre-parser-interface)
+		    (re-posix-parsers posix-re-interface))
+  (open scsh
+	conditionals
+	re-low-internals	; cre:string cre:tvec
+	re-basics
+	re-simp-package
+	sort			; Posix renderer
+	scsh-utilities
+	receiving
+	scheme)
+  (files parse			; sre-parser-package
+	 posixstr spencer)	; re-posix-parsers
+  (optimize auto-integrate))
+
+
+;;; re-syntax provides 2 structures:
+;;;     re-syntax (exports (rx :syntax))
+;;;     re-syntax-tools (exports (if-sre-form :syntax) sre-form?)
+
+(define-interface sre-syntax-tools-interface
+  (export expand-rx sre-form?))
+
+(define-structure sre-syntax-tools sre-syntax-tools-interface
+  (open scheme
+	receiving
+	conditionals
+	re-posix-parsers	; regexp->posix-string
+	sre-parser-package	; static-regexp? parse-sres 
+	re-high-tools		; compile-regexp
+	re-basics		; For the posix-cacher and code-producer
+	re-simp-package)
+  (files re-syntax)
+  (optimize auto-integrate))
+
+(define-structure rx-syntax (export (rx :syntax)
+				    (if-sre-form :syntax))
+  
+  (open re-basics
+	rx-lib
+	scheme)
+  (for-syntax (open sre-syntax-tools scheme))
+  (begin (define-syntax rx expand-rx)
+	 (define-syntax if-sre-form
+	   (lambda (exp r c)
+	     (if (sre-form? (cadr exp) r c)
+		 (caddr exp)
+		 (cadddr exp)))))
+  (optimize auto-integrate))
diff --git a/scsh/rx/packages-old2.scm b/scsh/rx/packages-old2.scm
new file mode 100644
index 0000000..22a2906
--- /dev/null
+++ b/scsh/rx/packages-old2.scm
@@ -0,0 +1,185 @@
+;;; Module definitions for the scsh regexp system.
+;;;     -Olin  8/98
+
+(define-interface basic-re-interface
+  (export re-dsm? make-re-dsm
+	  re-dsm:body
+	  re-dsm:pre-dsm 
+	  re-dsm:tsm 
+	  re-dsm:posix set-re-dsm:posix
+	  re-dsm:post-dsm
+	  re-dsm open-dsm
+
+	  re-seq? %%make-re-seq %make-re-seq make-re-seq re-seq
+	  re-seq:elts
+	  re-seq:tsm
+	  re-seq:posix set-re-seq:posix
+
+	  re-choice? %%make-re-choice %make-re-choice make-re-choice re-choice
+	  re-choice:elts
+	  re-choice:tsm
+	  re-choice:posix set-re-choice:posix
+
+	  re-repeat? %%make-re-repeat %make-re-repeat make-re-repeat re-repeat
+	  re-repeat:from
+	  re-repeat:to
+	  re-repeat:body
+	  re-repeat:tsm
+	  re-repeat:posix set-re-repeat:posix
+
+	  re-submatch?
+	  %%make-re-submatch %make-re-submatch make-re-submatch re-submatch
+	  re-submatch:body
+	  re-submatch:pre-dsm
+	  re-submatch:tsm
+	  re-submatch:posix set-re-submatch:posix
+	  re-submatch:post-dsm
+
+	  re-string? make-re-string re-string
+	  re-string:chars set-re-string:chars
+	  re-string:posix set-re-string:posix
+
+	  re-trivial re-trivial?
+
+	  re-char-set? make-re-char-set re-char-set
+	  re-char-set:cset set-re-char-set:cset 
+	  re-char-set:posix set-re-char-set:posix
+
+	  re-empty re-empty?
+	  re-bos re-bos?	  re-eos re-eos?
+	  re-bol re-bol?	  re-eol re-eol?
+	  re-bow re-bow?	  re-eow re-eow?
+
+	  re-any re-any?
+
+	  re-nonl
+	  re-word
+
+	  regexp?
+	  re-tsm
+
+	  flush-submatches	; Can be in code produced by RX expander.
+	  uncase		; Can be in code produced by RX expander.
+	  uncase-char-set	; Can be in code produced by RX expander.
+	  uncase-string
+	  ))
+
+
+;;; These guys were made obsolete by the new SRE package and exist for
+;;; backwards compatibility only.
+(define-interface re-old-funs-interface
+  (export string-match make-regexp regexp-exec ->regexp regexp-quote))
+
+
+(define-interface re-internals-interface
+  (export make-re-string/posix		; Constructors for the Scheme unparser
+	  %make-re-seq/posix
+	  %make-re-choice/posix
+	  make-re-char-set/posix
+	  %make-re-repeat/posix
+	  %make-re-dsm/posix
+	  %make-re-submatch/posix))
+
+
+(define-interface posix-re-interface
+  (export regexp->posix-string		; posixstr.scm
+	  posix-string->regexp		; spencer
+	  ))
+
+(define-interface re-exports-interface
+  (compound-interface posix-re-interface
+		      basic-re-interface
+		      (export regexp-match?
+			      match:start match:end match:substring
+			      clean-up-cres
+			      regexp-search regexp-search?
+			      regexp-substitute regexp-substitute/global
+			      sre->regexp regexp->sre
+			      )))
+
+
+(define-structures ((re-exports re-exports-interface)
+		    (re-internals re-internals-interface)
+		    (sre-syntax-tools (export expand-rx sre-form?))
+		    )
+  (open scsh-utilities
+	defrec-package
+	define-foreign-syntax
+	weak		
+	;re-posix-parsers	; regexp->posix-string
+	let-opt
+	sort			; Posix renderer
+	conditionals
+	define-record-types
+	defrec-package
+	receiving
+	scsh
+	scheme)
+  (files re-low re simp re-high
+	 parse posixstr spencer re-syntax)
+  (optimize auto-integrate)
+  )
+
+;;; Stuff that could appear in code produced by (rx ...)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-interface rx-lib-interface
+  (compound-interface (export coerce-dynamic-regexp
+			      coerce-dynamic-charset
+			      spec->char-set
+			      flush-submatches
+			      uncase
+			      uncase-char-set
+			      uncase-string)
+		      re-internals-interface))
+
+(define-structure rx-lib rx-lib-interface
+  (open re-internals
+	conditionals re-exports scsh scheme)
+  (files rx-lib)
+  (optimize auto-integrate))
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-structure rx-syntax (export (rx :syntax)
+				    (if-sre-form :syntax))
+  (open re-exports
+	rx-lib
+	scheme)
+  (for-syntax (open sre-syntax-tools scheme))
+  (begin (define-syntax rx expand-rx)
+	 (define-syntax if-sre-form
+	   (lambda (exp r c)
+	     (if (sre-form? (cadr exp) r c)
+		 (caddr exp)
+		 (cadddr exp)))))
+  (optimize auto-integrate))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-structure re-old-funs re-old-funs-interface
+  (open re-exports scsh scheme)
+  (files oldfuns))
+
+
+
+;;; File	Exports
+;;; ----	-------
+;;; parse	sre->regexp regexp->sre  
+;;;             parse-sre parse-sres regexp->scheme
+;;;             char-set->in-pair
+;;; posixstr	regexp->posix-string
+;;; re-high	compile-regexp regexp-search regexp-search? 
+;;;             regexp-substitute regexp-substitute/global
+;;; re-low	match:start match:end match:substring
+;;;             CRE record, new-cre, compile-posix-re->c-struct
+;;;             cre-search cre-search? clean-up-cres
+;;; re-syntax	sre-form? if-sre-form expand-rx
+;;; re.scm	The ADT. flush-submatches uncase uncase-char-set
+;;;             char-set-full? char-set-empty?
+;;;             re-char-class? static-char-class?
+;;; rx-lib	coerce-dynamic-regexp coerce-dynamic-charset spec->char-set
+;;; simp	simplify-regexp
+;;; spencer	posix-string->regexp
diff --git a/scsh/rx/packages.scm b/scsh/rx/packages.scm
new file mode 100644
index 0000000..06a10c3
--- /dev/null
+++ b/scsh/rx/packages.scm
@@ -0,0 +1,309 @@
+;;; Module definitions for the scsh regexp system.
+;;; This is a sleazy modularisation -- we just load everything into
+;;; scsh-level-0, and export from there.
+;;;     -Olin  8/98
+
+(define-interface basic-re-interface
+  (export (re-dsm? (proc (:value) :boolean))
+	  (make-re-dsm (proc (:value :exact-integer :exact-integer) :value))
+	  (re-dsm:body (proc (:value) :value))
+	  (re-dsm:pre-dsm (proc (:value) :exact-integer))
+	  (re-dsm:tsm (proc (:value) :exact-integer))
+	  (re-dsm:posix (proc (:value) :value))
+	  (set-re-dsm:posix (proc (:value :value) :unspecific))
+	  ((re-dsm:post-dsm re-dsm) (proc (:value) :exact-integer))
+	  (open-dsm (proc (:value) (some-values :value :exact-integer)))
+
+	  (re-seq? (proc (:value) :boolean))
+	  (%%make-re-seq (proc (:value :exact-integer :value) :value))
+	  (%make-re-seq (proc (:value :exact-integer) :value))
+	  ((re-seq make-re-seq) (proc (:value) :value))
+	  (re-seq:elts (proc (:value) :value))
+	  (re-seq:tsm (proc (:value) :exact-integer))
+	  (re-seq:posix (proc (:value) :value))
+	  (set-re-seq:posix (proc (:value :value) :unspecific))
+
+	  (re-choice? (proc (:value) :boolean))
+	  (%%make-re-choice (proc (:value :exact-integer :value) :value))
+	  (%make-re-choice (proc (:value :exact-integer) :value))
+	  ((make-re-choice re-choice) (proc (:value) :value))
+	  (re-choice:elts (proc (:value) :value))
+	  (re-choice:tsm (proc (:value) :exact-integer))
+	  (re-choice:posix (proc (:value) :value))
+	  (set-re-choice:posix (proc (:value :value) :unspecific))
+
+	  (re-repeat? (proc (:value) :boolean))
+	  (%%make-re-repeat (proc (:exact-integer :value :value
+				   :exact-integer :value)
+				  :value))
+	  (%make-re-repeat (proc (:exact-integer :value :value :exact-integer )
+				 :value))
+	  ((re-repeat make-re-repeat)
+	   (proc (:exact-integer :value :value) :value))
+	  ((re-repeat:from re-repeat:tsm)
+	   (proc (:value) :exact-integer))
+	  (re-repeat:to (proc (:value) :value))
+	  ((re-repeat:body re-repeat:posix)
+	   (proc (:value) :value))
+	  (set-re-repeat:posix (proc (:value :value) :unspecific))
+
+	  (re-submatch? (proc (:value) :boolean))
+	  (%%make-re-submatch (proc (:value :exact-integer :exact-integer :value)
+				    :value))
+	  (%make-re-submatch (proc (:value :exact-integer :exact-integer) :value))
+	  ((make-re-submatch re-submatch)
+	   (proc (:value &opt :exact-integer :exact-integer) :value))
+ 
+	  (re-submatch:body (proc (:value) :value))
+	  ((re-submatch:pre-dsm re-submatch:tsm re-submatch:post-dsm)
+	   (proc (:value) :exact-integer))
+	  (re-submatch:posix (proc (:value) :value))
+	  (set-re-submatch:posix (proc (:value :value) :unspecific))
+
+	  (re-string? (proc (:value) :boolean))
+	  ((make-re-string re-string) (proc (:string) :value))
+	  (re-string:chars (proc (:value) :string))
+	  (set-re-string:chars (proc (:value :string) :unspecific))
+	  (re-string:posix (proc (:value) :value))
+	  (set-re-string:posix (proc (:value :value) :unspecific))
+
+	  re-trivial
+	  (re-trivial? (proc (:value) :boolean))
+
+	  (re-char-set? (proc (:value) :boolean))
+	  ((make-re-char-set re-char-set) (proc (:value) :value))
+	  (re-char-set:cset (proc (:value) :value))
+	  (set-re-char-set:cset (proc (:value :value) :unspecific))
+	  (re-char-set:posix (proc (:value) :value))
+	  (set-re-char-set:posix (proc (:value :value) :unspecific))
+
+	  re-empty
+	  (re-empty? (proc (:value) :boolean))
+	  re-bos	  re-eos
+	  re-bol 	  re-eol
+	  re-bow 	  re-eow
+
+	  ((re-bos? re-eos? re-bol? re-eol? re-bow? re-eow? re-any?)
+	   (proc (:value) :boolean))
+
+	  re-any
+	  re-nonl
+	  re-word
+
+	  (regexp? (proc (:value) :boolean))
+	  (re-tsm (proc (:value) :exact-integer))
+
+	  ;; These guys can be in code produced by RX expander.
+	  (flush-submatches (proc (:value) :value))
+	  (uncase (proc (:value) :value))
+	  (uncase-char-set (proc (:value) :value))
+	  (uncase-string (proc (:string) :value))
+	  ))
+
+
+;;; These guys were made obsolete by the new SRE package and exist for
+;;; backwards compatibility only.
+(define-interface re-old-funs-interface
+  (export
+   (string-match (proc (:value :string &opt :exact-integer) :value))
+   (make-regexp  (proc (:string) :value))
+   (regexp-exec  (proc (:value :string &opt :exact-integer) :value))
+   (->regexp     (proc (:value) :value))
+   (regexp-quote (proc (:string) :value))))
+
+
+(define-interface re-internals-interface
+  ;; These are constructors for the Scheme unparser
+  (export
+   (make-re-string/posix (proc (:string :string :vector) :value))
+   ((%make-re-seq/posix %make-re-choice/posix)
+    (proc (:value :exact-integer :string :vector) :value))
+   (make-re-char-set/posix (proc (:value :string :vector) :value))
+   (%make-re-repeat/posix (proc (:exact-integer :value :value :exact-integer :string :vector)
+				:value))
+   (%make-re-dsm/posix (proc (:value :exact-integer :exact-integer :string :vector)
+			     :value))
+   (%make-re-submatch/posix (proc (:value :exact-integer :exact-integer :string :vector) :value))))
+
+
+(define re-match-internals-interface
+  (export (regexp-match:string (proc (:value) :string))
+	  (regexp-match:start  (proc (:value) :vector))
+	  (regexp-match:end    (proc (:value) :vector))))
+
+
+(define-interface posix-re-interface
+  (export (regexp->posix-string (proc (:value) :string))	; posixstr.scm
+	  (posix-string->regexp (proc (:string) :value))	; spencer
+	  ))
+
+(define-interface re-subst-interface
+  (export
+   (regexp-substitute (proc (:value :value &rest :value) :value))
+   (regexp-substitute/global (proc (:value :value :string &rest :value) :value))))
+
+(define-interface re-folders-interface
+  (export
+   (regexp-fold (proc (:value (proc (:exact-integer :value :value) :value)
+			      :value
+			      :string
+			      &opt (proc (:exact-integer :value) :value)
+			      :exact-integer)
+		      :value))
+   (regexp-fold (proc (:value (proc (:value :exact-integer :value) :value)
+			      :value
+			      :string
+			      &opt (proc (:exact-integer :value) :value)
+			      :exact-integer)
+		      :value))
+   (regexp-for-each (proc (:value (proc (:value) :unspecific)
+				  :string &opt :exact-integer)
+			  :unspecific))))
+
+(define-interface re-level-0-interface
+  (compound-interface posix-re-interface
+		      basic-re-interface
+		      (export (regexp-match? (proc (:value) :boolean))
+			      (match:start (proc (:value :exact-integer) :value))
+			      (match:end   (proc (:value :exact-integer) :value))
+			      (match:substring (proc (:value :exact-integer) :value))
+			      (clean-up-cres (proc () :unspecific))
+			      (regexp-search (proc (:value :string &opt :exact-integer)
+						   :value))
+			      (regexp-search? (proc (:value :string &opt :exact-integer)
+						   :boolean))
+			      (sre->regexp (proc (:value) :value))
+			      (regexp->sre (proc (:value) :value))
+			      )))
+
+
+(define-structures ((re-level-0 re-level-0-interface)
+		    (re-match-internals re-match-internals-interface)
+		    (re-internals re-internals-interface)
+		    (sre-syntax-tools (export expand-rx static-regexp?
+					      sre-form?
+					      parse-sre parse-sres
+					      sre->regexp regexp->sre  
+					      regexp->scheme)))
+  (open scsh-utilities
+	defrec-package
+	define-foreign-syntax
+	weak		
+	;re-posix-parsers	; regexp->posix-string
+	let-opt
+	sort			; Posix renderer
+	conditionals
+	define-record-types
+	defrec-package
+	receiving
+	char-set-package
+	error-package
+	ascii
+	string-lib		; string-fold
+	scheme)
+  (files re-low re simp re-high
+	 parse posixstr spencer re-syntax)
+  (optimize auto-integrate)
+  )
+
+
+;;; Stuff that could appear in code produced by (rx ...)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-interface rx-lib-interface
+  (compound-interface (export coerce-dynamic-regexp
+			      coerce-dynamic-charset
+			      spec->char-set
+			      flush-submatches
+			      uncase
+			      uncase-char-set
+			      uncase-string)
+		      re-internals-interface))
+
+(define-structure rx-lib rx-lib-interface
+  (open re-internals
+	conditionals
+	re-level-0
+	char-set-package
+	scsh-utilities	; fold
+	error-package
+	ascii
+	scheme)
+  (files rx-lib)
+  (optimize auto-integrate))
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-interface rx-syntax-interface (export (rx :syntax)
+					      (if-sre-form :syntax)))
+
+(define-structure rx-syntax rx-syntax-interface
+  (open re-level-0
+	char-set-package
+	rx-lib
+	scheme)
+  (for-syntax (open sre-syntax-tools scheme))
+  (begin (define-syntax rx expand-rx)
+	 (define-syntax if-sre-form
+	   (lambda (exp r c)
+	     (if (sre-form? (cadr exp) r c)
+		 (caddr exp)
+		 (cadddr exp)))))
+  (optimize auto-integrate))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-structure re-old-funs re-old-funs-interface
+  (open re-level-0 error-package receiving scheme)
+  (files oldfuns)
+  (optimize auto-integrate))
+
+
+
+(define-structure re-subst re-subst-interface
+  (open re-level-0
+	re-match-internals
+	scsh-utilities	; fold & some string utilities that need to be moved.
+	scsh-level-0	; write-string
+	string-lib	; string-copy!
+	scheme)
+  (files re-subst)
+  (optimize auto-integrate))
+
+
+(define-structure re-folders re-folders-interface
+  (open re-level-0 let-opt conditionals error-package scheme)
+  (files re-fold)
+  (optimize auto-integrate))
+
+
+(define-interface re-exports-interface
+  (compound-interface re-level-0-interface
+		      rx-syntax-interface
+		      re-subst-interface
+		      re-folders-interface))
+
+(define-structure re-exports re-exports-interface
+  (open rx-syntax re-level-0 re-subst re-folders)
+  (optimize auto-integrate))
+
+
+;;; File	Exports
+;;; ----	-------
+;;; parse	sre->regexp regexp->sre  
+;;;             parse-sre parse-sres regexp->scheme
+;;;             char-set->in-pair static-regexp?
+;;; posixstr	regexp->posix-string
+;;; re-high	compile-regexp regexp-search regexp-search? 
+;;; re-subst	regexp-substitute regexp-substitute/global
+;;; re-low	match:start match:end match:substring
+;;;             CRE record, new-cre, compile-posix-re->c-struct
+;;;             cre-search cre-search? clean-up-cres
+;;; re-syntax	sre-form? if-sre-form expand-rx
+;;; re.scm	The ADT. flush-submatches uncase uncase-char-set
+;;;             char-set-full? char-set-empty?
+;;;             re-char-class? static-char-class?
+;;; rx-lib	coerce-dynamic-regexp coerce-dynamic-charset spec->char-set
+;;; simp	simplify-regexp
+;;; spencer	posix-string->regexp
diff --git a/scsh/rx/parse.scm b/scsh/rx/parse.scm
new file mode 100644
index 0000000..fed54e0
--- /dev/null
+++ b/scsh/rx/parse.scm
@@ -0,0 +1,667 @@
+;;; Regexp support for Scheme
+;;;     Olin Shivers, January 1997, May 1998.
+
+;;; Todo:
+;;; - Better unparsers for (word ...) and (word+ ...).
+;;; - Unparse char-sets into set-diff SREs -- find a char set that's a
+;;;   tight bound, then get the difference.  This would really pretty up
+;;;   things like (- alpha "aeiou")
+
+;;; Exports:
+;;; (sre->regexp sre)		SRE->ADT parser
+;;; (regexp->sre re)		ADT->SRE unparser
+;;;
+;;; Procedures that parse sexp regexps and translate ADTs for low-level macros:
+;;; (parse-sre  sre  rename compare)
+;;; (parse-sres sres rename compare)
+;;; (regexp->scheme re rename)
+;;;
+;;; (char-set->in-pair cset)	Char-set unparsing utility
+
+;;; Character-set dependencies:
+;;; The only stuff in here dependent on the implementation's character type
+;;; is the char-set parsing and unparsing, which deal with ranges of 
+;;; characters. We assume an 8-bit ASCII superset.
+
+;;; Imports:
+;;; ? for COND, and SWITCHQ conditional form.
+;;; every
+
+;;; This code is much hairier than it would otherwise be because of the
+;;; the presence of , forms, which put a static/dynamic duality over
+;;; a lot of the processing -- we have to be prepared to handle either
+;;; re's or Scheme epressions that produce re's; char-sets or Scheme 
+;;; expressions that produce char-sets. It's a pain.
+;;;
+;;; See comments in re.scm ADT code about building regexp trees that have
+;;; code in the record fields instead of values.
+;;;
+;;; The macro expander works by parsing the regexp form into an re record,
+;;; and simplifying it. If the record is completely static, it is then 
+;;; translated, at macro-expand time, into a Posix regex string. If the
+;;; regexp needs runtime values -- e.g, the computed from and to fields in 
+;;;     (** "ha, " (- min 1) (+ max 1))
+;;; -- the expander instead produces Scheme ADT constructors to build
+;;; the regexp at run-time.
+
+
+;;; Parser
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Is a parsed regexp completely determined statically, or does it
+;;; have dynamic components (e.g., a ,@ or a computed char-set)
+;;; in the form of embedded code in some of the regexp's fields?
+
+(define (static-regexp? re)
+  (? ((re-seq?    re)   (every static-regexp? (re-seq:elts    re)))
+     ((re-choice? re)   (every static-regexp? (re-choice:elts re)))
+
+     ((re-char-set? re) (char-set? (re-char-set:cset re)))    ; Might be code.
+
+     ((re-repeat? re)		; FROM & TO fields might be code.
+      (let ((to (re-repeat:to re)))
+	(and (integer? (re-repeat:from re))
+	     (or (not to) (integer? to))
+	     (static-regexp? (re-repeat:body re)))))
+
+     ((re-dsm? re)      (static-regexp? (re-dsm:body re)))
+     ((re-submatch? re) (static-regexp? (re-submatch:body re)))
+
+     (else (or (re-bos? re) (re-eos? re)	; Otw, if it's not 
+	       (re-bol? re) (re-eol? re)	; one of these,
+	       (re-bow? re) (re-eow? re)	; then it's Scheme code.
+	       (re-string? re))))) 
+               
+
+;;; Two useful standard char sets
+(define nonl-chars (char-set-invert (char-set #\newline)))
+(define word-chars (char-set-union (char-set #\_) char-set:alphanumeric))
+
+;;; Little utility that should be moved to scsh's utilities.scm
+(define (partition pred lis)
+  (let recur ((in '()) (out '()) (lis lis))
+    (if (pair? lis)
+	(let ((head (car lis))
+	      (tail (cdr lis)))
+	  (if (pred head)
+	      (recur (cons head in) out             tail)
+	      (recur in             (cons head out) tail)))
+	(values in out))))
+
+
+(define (sre->regexp sre)
+  (parse-sre sre (lambda (x) x) equal?))
+
+
+;;; Parse a sexp regexp into a regexp value, which may be "dynamic" --
+;;; i.e., some slots may be filled with the Scheme code that will produce
+;;; their true vaues.
+;;;
+;;; R & C are rename and compare functions for low-level macro expanders.
+
+;;; These two guys are little front-ends for the main routine.
+
+(define (parse-sre sre r c) (parse-sre/context sre #t #f r c))
+
+(define (parse-sres sres r c)
+  (re-seq (map (lambda (sre) (parse-sre sre r c)) sres)))
+
+
+;;; (parse-sre/context sre case-sensitive? cset? r c)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This is the main entry point. Parse SRE, given the lexical case-sensitivity
+;;; flag CASE-SENSITIVE?. If CSET? is true, SRE *must* be parseable as a
+;;; char-class SRE, and this function returns a character set, *not* a
+;;; regexp value. If CSET? is false, SRE can be any SRE, and this function
+;;; returns a regexp value. R and C are low-level macro rename and compare
+;;; functions.
+
+(define (parse-sre/context sre case-sensitive? cset? r c)
+  (let ((%bos (r 'bos))		(%eos (r 'eos))
+	(%bol (r 'bol))		(%eol (r 'eol))
+	(%bow (r 'bow))		(%eow (r 'eow))
+
+	(%word (r 'word))
+
+	(%flush-submatches       (r 'flush-submatches))
+	(%coerce-dynamic-charset (r 'coerce-dynamic-charset))
+	(%coerce-dynamic-regexp  (r 'coerce-dynamic-regexp)))
+
+    (let recur ((sre             sre)
+		(case-sensitive? case-sensitive?)
+		(cset?           cset?))
+
+      ;; Parse the sequence of regexp expressions SEQ with a lexical
+      ;; case-sensitivity context of CS?.
+      (define (parse-seq/context seq cs?)
+	(if cset?
+	    (if (= 1 (length seq))
+		(recur (car sre) cs? #t)
+		(error "Non-singleton sequence not allowed in char-class context."
+		       seq))
+	    (re-seq (map (lambda (sre) (recur sre cs? cset?))
+			 seq))))
+	  
+      (define (parse-seq seq) (parse-seq/context seq case-sensitive?))
+      (define (parse-char-class sre) (recur sre case-sensitive? #t))
+
+      (define (non-cset)	; Blow up if cset? is true.
+	(if cset? (error "Illegal SRE in char-class context." sre)))
+
+      (? ((char? sre)   (parse-char-re   sre case-sensitive? cset?))
+	 ((string? sre) (parse-string-re sre case-sensitive? cset?))
+
+	 ((c sre %bos) (non-cset) re-bos)
+	 ((c sre %eos) (non-cset) re-eos)
+
+	 ((c sre %bol) (non-cset) re-bol)
+	 ((c sre %eol) (non-cset) re-eol)
+
+	 ((c sre %bow)  (non-cset) re-bow)
+	 ((c sre %eow)  (non-cset) re-eow)
+	 ((c sre %word) (non-cset) re-word)
+
+	 ((pair? sre)
+	  (case (car sre)
+	    ((*)  (non-cset) (re-repeat 0 #f (parse-seq (cdr sre))))
+	    ((+)  (non-cset) (re-repeat 1 #f (parse-seq (cdr sre))))
+	    ((?)  (non-cset) (re-repeat 0  1 (parse-seq (cdr sre))))
+	    ((=)  (non-cset) (let ((n (cadr sre)))
+			       (re-repeat n n (parse-seq (cddr sre)))))
+	    ((>=) (non-cset) (re-repeat (cadr sre) #f (parse-seq (cddr sre))))
+	    ((**) (non-cset) (re-repeat (cadr sre) (caddr sre)
+					(parse-seq (cdddr sre))))
+
+	    ;; Choice is special wrt cset? because it's "polymorphic".
+	    ;; Note that RE-CHOICE guarantees to construct a char-set
+	    ;; or single-char string regexp if all of its args are char 
+	    ;; classes.
+	    ((| or) (let ((elts (map (lambda (sre)
+				       (recur sre case-sensitive? cset?))
+				     (cdr sre))))
+		      (if cset?
+			  (assoc-cset-op char-set-union 'char-set-union elts r)
+			  (re-choice elts))))
+
+	    ((: seq) (non-cset) (parse-seq (cdr sre)))
+
+	    ((word)  (non-cset) (parse-seq `(,%bow ,@(cdr sre) ,%eow)))
+	    ((word+)
+	     (recur `(,(r 'word) (,(r '+) (,(r '&) (,(r '|) ,(r 'alphanum) "_")
+						   (,(r '|) . ,(cdr sre)))))
+		    case-sensitive?
+		    cset?))
+	    
+	    ((submatch) (non-cset) (re-submatch (parse-seq (cdr sre))))
+	    ((dsm)      (non-cset) (re-dsm (parse-seq (cdddr sre))
+					   (cadr sre)
+					   (caddr sre)))
+
+	    ;; We could be more aggressive and push the uncase op down into
+	    ;; partially-static regexps, but enough is enough.
+	    ((uncase)
+	     (let ((re-or-cset (parse-seq (cdr sre))))	; Depending on CSET?.
+	       (if cset?
+
+		   (if (re-char-set? re-or-cset)	; A char set or code
+		       (uncase-char-set re-or-cset)	; producing a char set.
+		       `(,(r 'uncase) ,re-or-cset))
+
+		   (if (static-regexp? re-or-cset)	; A regexp or code
+		       (uncase re-or-cset)		; producing a regexp.
+		       `(,(r 'uncase)
+			 ,(regexp->scheme (simplify-regexp re-or-cset) r))))))
+
+	    ;; These just change the lexical case-sensitivity context.
+	    ((w/nocase) (parse-seq/context (cdr sre) #f))
+	    ((w/case)   (parse-seq/context (cdr sre) #t))
+
+	    ;; , and ,@
+	    ((unquote)
+	     (let ((exp (cadr sre)))
+	       (if cset?
+		   `(,%coerce-dynamic-charset ,exp)
+		   `(,%flush-submatches (,%coerce-dynamic-regexp ,exp)))))
+	    ((unquote-splicing)
+	     (let ((exp (cadr sre)))
+	       (if cset?
+		   `(,%coerce-dynamic-charset ,exp)
+		   `(,%coerce-dynamic-regexp ,exp))))
+
+	    ((~) (let* ((cs (assoc-cset-op char-set-union 'char-set-union
+					   (map parse-char-class (cdr sre))
+					   r))
+			(cs (if (char-set? cs)
+				(char-set-invert cs)
+				`(,(r 'char-set-invert) ,cs))))
+		   (if cset? cs (make-re-char-set cs))))
+
+	    ((&) (let ((cs (assoc-cset-op char-set-intersection 'char-set-intersection
+					  (map parse-char-class (cdr sre))
+					  r)))
+		   (if cset? cs (make-re-char-set cs))))
+
+	    ((-) (if (pair? (cdr sre))
+		     (let* ((cs1 (parse-char-class (cadr sre)))
+			    (cs2 (assoc-cset-op char-set-union 'char-set-union
+						(map parse-char-class (cddr sre))
+						r))
+			    (cs (if (and (char-set? cs1) (char-set? cs2))
+				    (char-set-difference cs1 cs2)
+				    `(,(r 'char-set-difference)
+				      ,(if (char-set? cs1)
+					   (char-set->scheme cs1 r)
+					   cs1)
+				      . ,(if (char-set? cs2)
+					     (list (char-set->scheme cs2 r))
+					     (cdr cs2))))))
+		       (if cset? cs (make-re-char-set cs)))
+		     (error "SRE set-difference operator (- ...) requires at least one argument")))
+
+	    ((/) (let ((cset (range-class->char-set (cdr sre) case-sensitive?)))
+		   (if cset? cset (make-re-char-set cset))))
+
+	    ((posix-string)
+	     (if (and (= 1 (length (cdr sre)))
+		      (string? (cadr sre)))
+		 (posix-string->regexp (cadr sre))
+		 (error "Illegal (posix-string ...) SRE body." sre)))
+
+	    (else (if (every string? sre)	; A set spec -- ("wxyz").
+		      (let* ((cs (apply char-set-union
+					(map string->char-set sre)))
+			     (cs (if case-sensitive? cs (uncase-char-set cs))))
+			(if cset? cs (make-re-char-set cs)))
+
+		      (error "Illegal SRE" sre)))))
+
+	 ;; It must be a char-class name (ANY, ALPHABETIC, etc.)
+	 (else (let ((cs (case sre
+			   ((any)			char-set:full)
+			   ((nonl)			nonl-chars)
+			   ((lower-case lower)		char-set:lower-case)
+			   ((upper-case upper)		char-set:upper-case)
+			   ((alphabetic alpha)		char-set:alphabetic)
+			   ((numeric digit num)	char-set:numeric)
+			   ((alphanumeric alnum alphanum) char-set:alphanumeric)
+			   ((punctuation punct)	char-set:punctuation)
+			   ((graphic graph)		char-set:graphic)
+			   ((blank)			char-set:blank)
+			   ((whitespace space white)	char-set:whitespace)
+			   ((printing print)		char-set:printing)
+			   ((control cntrl)		char-set:control)
+			   ((hex-digit xdigit hex)	char-set:hex-digit)
+			   ((ascii)			char-set:ascii)
+			   (else (error "Illegal regular expression" sre)))))
+		 (if cset? cs (make-re-char-set cs))))))))
+
+
+;;; In a CSET? true context, S must be a 1-char string; convert to a char set
+;;;     according to CASE-SENSITIVE? setting.
+;;; In a CSET? false context, convert S to a string re (CASE-SENSITIVE? true),
+;;;     or a sequence of char-sets (CASE-SENSITIVE? false).
+
+(define (parse-string-re s case-sensitive? cset?)
+  (if (= 1 (string-length s))
+      (parse-char-re (string-ref s 0) case-sensitive? cset?)
+      (if cset?
+	  (error "Non-singleton string not allowed in char-class context." s)
+	  ((if case-sensitive? make-re-string uncase-string) s))))
+
+(define (parse-char-re c case-sensitive? cset?)
+  (if case-sensitive?
+      (if cset? (char-set c) (make-re-string (string c)))
+      (let ((cset (char-set (char-upcase c) (char-downcase c))))
+	(if cset? cset (make-re-char-set cset)))))
+
+
+;;; "Apply" the associative char-set function OP to the char-sets ELTS.
+;;; If any of the ELTS is Scheme code instead of a real char set, then
+;;; we instead produce Scheme code for the op, using OP-NAME as the name
+;;; of the function, and R for the macro renamer function.
+
+(define (assoc-cset-op op op-name elts r)
+  (receive (csets code-chunks) (partition char-set? elts)
+    (if (pair? code-chunks)
+	(? ((pair? csets)
+	    `(,(r op-name) ,(char-set->scheme (apply op csets) r)
+			   . ,code-chunks))
+	   ((pair? (cdr code-chunks)) `(,(r op-name) . ,code-chunks))
+	   (else (car code-chunks))) ; Just one.
+	(apply op csets))))
+
+;;; Parse a (/  ...) char-class into a character set in
+;;; case-sensitivity context CS?.
+;;; Each  can be a character or a string of characters.
+
+(define (range-class->char-set range-specs cs?)
+  (let* ((specs (apply string-append
+		       (map (lambda (spec) (if (char? spec) (string spec) spec))
+			    range-specs)))
+	 (len (string-length specs))
+	 (cset (char-set-copy char-set:empty)))
+    (if (odd? len)
+	(error "Unmatched range specifier" range-specs)
+	(let lp ((i (- len 1)) (cset cset))
+	  (if (< i 0)
+	      (if cs? cset (uncase-char-set cset)) ; Case fold if necessary.
+	      (lp (- i 2)
+		  (char-set-union!
+		      cset
+		      (ascii-range->char-set (char->ascii (string-ref specs (- i 1)))
+					     (+ 1 (char->ascii (string-ref specs i)))))))))))
+
+;;; (regexp->scheme re r)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Translate a regexp value RE into raw Scheme code that will create it, with
+;;; calls to the regexp ADT constructor functions. R is a renaming function
+;;; provided by low-level macro expanders.
+
+(define (regexp->scheme re r)
+  (let ((%re-bos (r 're-bos))	(%re-eos (r 're-eos))
+	(%re-bol (r 're-bol))	(%re-eol (r 're-eol))
+	(%re-bow (r 're-bow))	(%re-eow (r 're-eow))
+	(%list   (r 'list)))
+
+  (let recur ((re re))
+    ;; If (fetch-posix re) = #f, produce (OP . ARGS);
+    ;; Otherwise, produce (OP/POSIX ,@ARGS ').
+    (define (doit op op/posix args fetch-posix)
+      (? ((fetch-posix re) =>
+	  (lambda (psx) `(,(r op/posix) ,@args
+					',(cre:string psx) ',(cre:tvec psx))))
+					
+	 (else `(,(r op) . ,args))))
+
+    (? ((re-string? re)   (if (re-trivial? re) (r 're-trivial) ; Special hack
+			      (doit 'make-re-string 'make-re-string/posix
+				    `(,(re-string:chars re))
+				    re-string:posix)))
+
+       ((re-seq? re)      (doit '%make-re-seq '%make-re-seq/posix
+				`((,%list . ,(map recur (re-seq:elts re)))
+				  ,(re-seq:tsm re))
+				re-seq:posix))
+
+       ((re-choice? re)   (doit '%make-re-choice '%make-re-choice/posix
+				`((,%list . ,(map recur (re-choice:elts re)))
+				  ,(re-choice:tsm re))
+				re-choice:posix))
+
+       ((re-char-set? re) (if (re-any? re) (r 're-any) ; Special hack for ANY.
+			      (doit 'make-re-char-set 'make-re-char-set/posix
+				    `(,(char-set->scheme (re-char-set:cset re) r))
+				    re-char-set:posix)))
+
+       ((re-repeat? re)   (doit '%make-re-repeat '%make-re-repeat/posix
+				`(,(re-repeat:from re)
+				  ,(re-repeat:to re)
+				  ,(recur (re-repeat:body re))
+				  ,(re-repeat:tsm re))
+				re-repeat:posix))
+
+       ((re-dsm? re)      (doit '%make-re-dsm '%make-re-dsm/posix
+				`(,(recur (re-dsm:body re))
+				  ,(re-dsm:pre-dsm  re)
+				  ,(re-dsm:tsm re))
+				re-dsm:posix))
+
+       ((re-submatch? re) (doit '%make-re-submatch '%make-re-submatch/posix
+				`(,(recur (re-submatch:body re))
+				  ,(re-submatch:pre-dsm re)
+				  ,(re-submatch:tsm re))
+				re-submatch:posix))
+
+       ((re-bos? re) %re-bos)
+       ((re-eos? re) %re-eos)
+       ((re-bol? re) %re-bol)
+       ((re-eol? re) %re-eol)
+       ((re-bow? re) %re-bow)
+       ((re-eow? re) %re-eow)
+
+       (else re)))))
+
+
+
+;;; Classify a character set.
+;;; We pass in a char set CS and 15 parameters, one for each of the
+;;; standard char sets. If we can classify CS as any of these char
+;;; sets, we return the corresponding parameter's value, otw #f.
+;;;
+;;; This is gratuitously optimised by probing cset with a couple of
+;;; witness chars (a,A,1,space), and doing an initial filter based
+;;; on these witnesses.
+
+(define (try-classify-char-set cs
+			       full nonl lower upper alpha num alphanum
+			       punct graph white print ctl hex blank ascii)
+  (let ((a     (char-set-contains? cs #\a))
+	(biga  (char-set-contains? cs #\A))
+	(one   (char-set-contains? cs #\1))
+	(space (char-set-contains? cs #\space)))
+
+    (if a
+	(if biga
+	    (if space 
+		(and one (switch char-set= cs
+			   ((char-set:full)		full)
+			   ((nonl-chars)		nonl)
+			   ((char-set:printing)		print)
+			   ((char-set:ascii)		ascii)
+			   (else #f)))
+		(if one
+		    (switch char-set= cs
+		      ((char-set:alphanumeric)	alphanum)
+		      ((char-set:graphic)	graph)
+		      ((char-set:hex-digit)	hex)
+		      (else #f))
+		    (and (char-set= cs char-set:alphabetic) alpha)))
+	    (and (char-set= cs char-set:lower-case) lower)) ; a, not A
+
+	(if biga
+	    (and (not space) (char-set= cs char-set:upper-case) upper)
+	    (if one
+		(and (not space) (char-set= cs char-set:numeric) num)
+		(if space
+		    (switch char-set= cs
+		      ((char-set:whitespace) white)
+		      ((char-set:blank)      blank)
+		      (else #f))
+		    (switch char-set= cs
+		      ((char-set:punctuation)	punct)
+		      ((char-set:control)	ctl)
+		      (else #f))))))))
+		
+
+(define (char-set->scheme cs r)
+  (let ((try (lambda (cs)
+	       (try-classify-char-set cs
+				      'char-set:full         'nonl-chars
+				      'char-set:lower-case   'char-set:upper-case
+				      'char-set:alphabetic   'char-set:numeric
+				      'char-set:alphanumeric 'char-set:punctuation
+				      'char-set:graphic      'char-set:whitespace
+				      'char-set:printing     'char-set:control
+				      'char-set:hex-digit    'char-set:blank
+				      'char-set:ascii))))
+    (? ((not (char-set? cs)) cs) ; Dynamic -- *already* Scheme code.
+       ((char-set-empty? cs) (r 'char-set:empty))
+       ((try cs) => r)
+       ((try (char-set-invert cs)) =>
+	(lambda (name) `(,(r 'char-set-invert) ,name)))
+
+       (else
+	(receive (loose+ ranges+) (char-set->in-pair cs)
+	  (receive (loose- ranges-) (char-set->in-pair (char-set-invert cs))
+	    (let ((makeit (r 'spec->char-set)))
+	      (if (< (+ (length loose-) (* 12 (length ranges-)))
+		     (+ (length loose+) (* 12 (length ranges+))))
+		  `(,makeit #f ,(list->string loose-) ',ranges-)
+		  `(,makeit #t ,(list->string loose+) ',ranges+)))))))))
+
+
+
+;;; This code needs work.
+
+(define (char-set->sre cs r)
+  (if (char-set? cs)
+      (let ((try (lambda (cs)
+		   (try-classify-char-set cs
+					  'any          'nonl
+					  'lower-case   'upper-case
+					  'alphabetic   'numeric
+					  'alphanumeric 'punctuation
+					  'graphic      'whitespace
+					  'printing     'control
+					  'hex-digit    'blank
+					  'ascii)))
+	    (nchars (char-set-size cs)))
+	(? ((zero? nchars) `(,(r '|)))
+	   ((= 1 nchars) (apply string (char-set-members cs)))
+	   ((try cs) => r)
+	   ((try (char-set-invert cs)) =>
+	    (lambda (name) `(,(r '~) ,name)))
+	   (else (receive (cs rp comp?) (char-set->in-sexp-spec cs)
+		   (let ((args (append (? ((string=? cs "") '())
+					  ((= 1 (string-length cs)) `(,cs))
+					  (else `((,cs))))
+				       (if (string=? rp "") '()
+					   (list `(,(r '/) ,rp))))))
+		     (if (and (= 1 (length args)) (not comp?))
+			 (car args)
+			 `(,(r (if comp? '~ '|)) . ,args)))))))
+
+      `(,(r 'unquote) ,cs))) ; dynamic -- ,
+
+
+;;; Unparse an re into a *list* of SREs (representing a sequence).
+;;; This is for rendering the bodies of DSM, SUBMATCH, **, *, =, >=, and &'s,
+;;; that is, forms whose body is an implicit sequence.
+
+(define (regexp->sres/renamer re r)
+    (if (re-seq? re)
+	(let ((elts (re-seq:elts re)))
+	  (if (pair? elts)
+	      (map (lambda (re) (regexp->sre/renamer re r)) elts)
+	      (let ((tsm  (re-seq:tsm  re))
+		    (%dsm (r 'dsm)))
+		(if (zero? tsm) '() `((,%dsm ,tsm 0))))))   ; Empty sequence
+	(list (regexp->sre/renamer re r))))		    ; Not a seq
+
+
+(define (regexp->sre/renamer re r)
+  (let recur ((re re))
+    (? ((re-string? re) (re-string:chars re))
+
+       ((re-seq? re)    `(,(r ':) . ,(regexp->sres/renamer re r)))
+	 
+       ((re-choice? re)
+	(let ((elts (re-choice:elts re))
+	      (%| (r '|)))
+	  (if (pair? elts)
+	      `(,%| . ,(map recur elts))
+	      (let ((tsm  (re-choice:tsm  re)))
+		(if (zero? tsm) `(,%|) `(,(r 'dsm) ,tsm 0 (,%|)))))))
+
+       ((re-char-set? re) (char-set->sre (re-char-set:cset re) r))
+
+       ((re-repeat? re)
+	(let ((from (re-repeat:from re))
+	      (to (re-repeat:to re))
+	      (bodies (regexp->sres/renamer (re-repeat:body re) r)))
+	  (? ((and (eqv? from 0) (not to))    `(,(r '*) . ,bodies))
+	     ((and (eqv? from 0) (eqv? to 1)) `(,(r '?) . ,bodies))
+	     ((and (eqv? from 1) (not to))    `(,(r '+) . ,bodies))
+	     ((eqv? from to)		      `(,(r '=) ,to . bodies))
+	     (to                           `(,(r '**) ,from ,to . ,bodies))
+	     (else                         `(,(r '>=) ,from . ,bodies)))))
+
+       ((re-dsm? re)
+	`(,(r 'dsm) ,(re-dsm:pre-dsm re) ,(re-dsm:post-dsm re)
+		    . ,(regexp->sres/renamer (re-dsm:body re) r)))
+
+       ((re-submatch? re)
+	`(,(r 'submatch) . ,(regexp->sres/renamer (re-submatch:body re) r)))
+
+       ((re-bos? re) (r 'bos))
+       ((re-eos? re) (r 'eos))
+       ((re-bol? re) (r 'bol))
+       ((re-eol? re) (r 'eol))
+       ((re-bow? re) (r 'bow))
+       ((re-eow? re) (r 'eow))
+
+       (else re))))			; Presumably it's code.
+
+(define (regexp->sre re) (regexp->sre/renamer re (lambda (x) x)))
+
+;;; Character class unparsing
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This is the code that takes char-sets and converts them into forms suitable
+;;; for char-class SRE's or [...] Posix strings.
+
+;;; Map a char-set to an (| ("...") (/"...")) or (~ ("...") (/"...")) SRE.
+;;; We try it both ways, and return whichever is shortest.
+;;; We return three values: 
+;;; - a string of chars that are members in the set;
+;;; - a string of chars that, taken in pairs specifying ranges,
+;;;   give the rest of the members of the set.
+;;; - A boolean COMP?, which says whether the set should be complemented
+;;;   (~ ...) or taken as-is (| ...).
+;;;
+;;; E.g., ["!?.", "AZaz09", #t]
+
+(define (char-set->in-sexp-spec cset)
+  (let ((->sexp-pair (lambda (cset)
+		       (receive (loose ranges) (char-set->in-pair cset)
+			 (values (apply string loose)
+				 (apply string
+					(fold-right (lambda (r lis)
+						      `(,(car r) ,(cdr r) . ,lis))
+						    '() ranges)))))))
+    (receive (cs+ rp+) (->sexp-pair cset)
+      (receive (cs- rp-) (->sexp-pair (char-set-invert cset))
+	(if (< (+ (string-length cs-) (string-length rp-))
+	       (+ (string-length cs+) (string-length rp+)))
+	    (values cs- rp- #t)
+	    (values cs+ rp+ #f))))))
+
+;;; Return 2 values characterizing the char set in a run-length encoding:
+;;; - LOOSE		List of singleton chars -- elts of the set.
+;;; - RANGES		List of (from . to) char ranges.
+;;;
+;;; E.g., [(#\! #\? #\.) 
+;;;        ((#\A . #\Z) (#\a . #\z) (#\0 . #\9))]
+
+(define (char-set->in-pair cset)
+  (let ((add-range (lambda (from to loose ranges)
+		     (if from (case (- to from)
+				((0) (values (cons (ascii->char from) loose)
+					     ranges))
+				((1) (values `(,(ascii->char from)
+					       ,(ascii->char to)
+					       . ,loose)
+					     ranges))
+				((2) (values `(,(ascii->char from)
+					       ,(ascii->char (+ from 1))
+					       ,(ascii->char to)
+					       . ,loose)
+					     ranges))
+				(else (values loose
+					      `((,(ascii->char from) .
+						 ,(ascii->char to))
+						. ,ranges))))
+			 (values loose ranges)))))
+
+    (let lp ((i 127) (from #f) (to #f) (loose '()) (ranges '()))
+      (if (< i 0)
+	  (add-range from to loose ranges)
+
+	  (let ((i-1 (- i 1)))
+	    (if (char-set-contains? cset (ascii->char i))
+		(if from
+		    (lp i-1 i to loose ranges)	; Continue the run.
+		    (lp i-1 i i  loose ranges))	; Start a new run.
+
+		;; If there's a run going, finish it off.
+		(receive (loose ranges) (add-range from to loose ranges)
+		  (lp i-1 #f #f loose ranges))))))))
diff --git a/scsh/rx/posixstr.scm b/scsh/rx/posixstr.scm
new file mode 100644
index 0000000..a195ed7
--- /dev/null
+++ b/scsh/rx/posixstr.scm
@@ -0,0 +1,618 @@
+;;; Regexp-ADT -> Posix-string translator.
+;;; Olin Shivers January 1997, May 1998.
+
+;;; - If the regexp value contains nul character constants, or character sets 
+;;;   that contain the nul character, they will show up in the Posix string
+;;;   we produce. Spencer's C regexp engine can handle regexp strings that
+;;;   contain nul bytes, but this might blow up other implementations -- that
+;;;   is, the nul byte might prematurely terminate the C string passed to the
+;;;   regexp engine.
+;;; 
+;;; - The code is ASCII-specific in only one place: the expression for
+;;;   a regexp that matches nothing is the 6-char pattern "[^\000-\177]",
+;;;   which assumes a 7-bit character code. Note that the static simplifier
+;;;   can remove *all* occurences of this "empty regexp" except for the 
+;;;   un-simplifiable case of a single, top-level empty regexp, e.g. 
+;;;       (rx (in))
+;;;   We can handle this one special case specially, so we shouldn't *ever*
+;;;   have to produce this ASCII-specific pattern.
+
+;;; Exports: regexp->posix-string
+
+;;; Todo: A dumb, simple char-set renderer.
+
+;;; These functions translate static regular expressions into Posix regexp
+;;; strings. They generally return four values:
+;;;   - string (regexp)
+;;; 
+;;;   - syntax level: 0 parenthesized exp, 1 piece, 2 branch, 3 top
+;;;     ("piece", "branch" and "top" are Spencer's terms):
+;;;     + A parenthesized exp is syntactically equivalent to a piece.
+;;;       (But it's useful to know when an exp is parenthesized for
+;;;       eliminating redundant submatch-generated parens.)
+;;;     + A piece is something that would bind to a following * 
+;;;       ("a" but not "aa").
+;;;     + A branch is a sequence of pieces -- something that would bind to a |
+;;;       ("ab*d" but not "ab*|d"). That is, a branch is not allowed to contain
+;;;       top-level |'s.
+;;;     + Top is for a sequence of branches -- "a|b*c|d".
+;;; 
+;;;   - paren count in the returned string.
+;;;
+;;;   - Vector of parens numbers used for submatching. The first paren is
+;;;     numbered 1. #F means a dead submatch -- one we can tell statically
+;;;     will never match anything.
+
+;;; Non-R4RS imports:
+;;; ? = COND
+;;; Multiple-value return: VALUES RECEIVE CALL-WITH-VALUES
+;;; SORT-LIST
+
+
+;;; Useful little utility -- pad vector V with 
+;;; PRE initial and POST following #f's.
+
+(define (pad-vector pre post v)
+  (if (= pre post 0) v
+      (let* ((vlen (vector-length v))
+	     (alen (+ pre post vlen))
+	     (ans (make-vector alen #f)))
+	(do ((from (- vlen 1)      (- from 1))
+	     (to   (+ pre vlen -1) (- to 1)))
+	    ((< from 0))
+	  (vector-set! ans to (vector-ref v from)))
+	ans)))
+
+(define (n-falses n) (make-vector n #f))
+
+
+;;; There's no representation for regexps that never match anything (e.g.,
+;;; (|)) in strict Posix notation. When we get one of these, we treat it
+;;; specially, producing [#f #f #f #f].
+;;;
+;;; We can always detect these empty regexps, because they always simplify
+;;; to one of these two values:
+;;; - (make-re-char-set char-set:empty)
+;;; - (dsm m n (make-re-char-set char-set:empty))
+
+(define (simple-empty-re? re)
+  (or (and (re-char-set? re)
+	   (char-set-empty? (re-char-set:cset re)))
+      (and (re-dsm? re)
+	   (simple-empty-re? (re-dsm:body re)))))
+
+
+;;; Top-level
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define (regexp->posix-string re)
+  ;; We *must* simplify, to guarantee correct translation.
+  (let ((re (simplify-regexp re))) 
+    (if (simple-empty-re? re) (values #f #f #f #f)
+	(translate-regexp re))))
+
+
+(define (translate-regexp re)
+  (? ((re-string? re) (translate-string (re-string:chars re)))
+
+     ((re-repeat? re)   (translate-repeat   re))
+     ((re-choice? re)   (translate-choice   re))
+     ((re-seq? re)      (translate-seq      re))
+     ((re-char-set? re) (translate-char-set (re-char-set:cset re)))
+
+     ((re-submatch? re) (translate-submatch re))
+
+     ((re-bos? re) (values "^" 1 0 '#()))
+     ((re-eos? re) (values "$" 1 0 '#()))
+
+     ((re-bol? re) (error "Beginning-of-line regexp not supported in this implementation."))
+     ((re-eol? re) (error "End-of-line regexp not supported in this implementation."))
+
+     ((re-bow? re) (values "[[:<:]]" 1 0 '#())) ; These two are 
+     ((re-eow? re) (values "[[:>:]]" 1 0 '#())) ; Spencer-specific.
+
+     ((re-dsm? re) (let ((pre-dsm (re-dsm:pre-dsm re))
+			 (body    (re-dsm:body re)))
+		     (translate-dsm body pre-dsm
+				    (- (re-dsm:tsm re)
+				       (+ pre-dsm (re-tsm body))))))
+
+     (else (error "Illegal regular expression" re))))
+
+
+;;; Translate reloc-elt ELT = (N . RE) from a sequence or choice
+;;; into a Posix string.
+;;; - Relocate the submatch indices by PREV-PCOUNT.
+;;;   (That is, assume rendering preceding elts used PREV-PCOUNT parens.)
+;;; - Assume preceding elements allocated PREV-SMCOUNT submatches
+;;;   (we may have to pad our returned submatches string with some
+;;;   initial #F's to account for dead submatches PREV-SMCOUNT through N.)
+;;; - If SUB-LEV3? is true, the result string is guaranteed to be < level 3.
+;;;   This is used by the & and | translators.
+;;; - Returns the usual 4 values plus the final submatch count including
+;;;   this regexp.
+
+(define (translate-elt elt prev-pcount prev-smcount sub-lev3?)
+  (let ((offset (car elt))
+	(re     (cdr elt)))
+
+    (receive (s level pcount submatches) (translate-regexp re)
+
+      ;; Relocate submatch indices by OFFSET and force level <3, if needed:
+      (receive (s level pcount submatches)
+               (if (and sub-lev3? (= level 3))
+		   (values (string-append "(" s ")")
+			   0
+			   (+ pcount 1)
+			   (mapv (lambda (sm) (and sm (+ prev-pcount 1 sm)))
+				 submatches))
+		   (values s level pcount
+			   (mapv (lambda (sm) (and sm (+ prev-pcount sm)))
+				 submatches)))
+
+	;; Tack onto submatches as many initial #F's as needed to bump
+	;; the previous submatches count from PREV-SMCOUNT to OFFSET.
+	(values s level pcount
+		(pad-vector (- offset prev-smcount) 0 submatches)
+		(+ offset (re-tsm re)))))))
+      
+
+
+;;; Force the string to be level < 3 by parenthesizing it if necessary.
+
+(define (paren-if-necessary s lev pcount submatches)
+  (if (< lev 3)
+      (values s lev pcount submatches)
+      (values (string-append "(" s ")")
+	      0
+	      (+ pcount 1)
+	      (mapv (lambda (sm) (and sm (+ 1 sm)))
+		    submatches))))
+
+
+
+;;; (: re1 ... ren)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define (translate-seq re)
+  (let ((elts (re-seq:elts re))
+	(tsm  (re-seq:tsm  re)))
+    (let recur ((elts elts) (prev-pcount 0) (prev-smcount 0))
+      ;; Render a sequence tail ELTS, assuming the previous elements translated
+      ;; to a string with PREV-PCOUNT parens, and allocated PREV-SMCOUNT
+      ;; submatches.
+      (if (pair? elts)
+	  (let* ((elt  (car elts))
+		 (elts (cdr elts)))
+
+	    (receive (s1 level1 pcount1 submatches1)
+		     (translate-regexp elt)
+
+	      (receive (s1 level1 pcount1 submatches1)
+		       (paren-if-necessary s1 level1 pcount1 submatches1)
+
+		(receive (s level pcount submatches)
+		         (recur elts
+				(+ pcount1 prev-pcount)
+				(+ prev-smcount (re-tsm elt)))
+
+		  (values (string-append s1 s)
+			  2
+			  (+ pcount1 pcount)
+			  (vector-append (mapv (lambda (sm) (+ sm prev-smcount))
+					       submatches1)
+					 submatches))))))
+
+	    (values "" 2 0 '#()))))) ; Empty seq
+
+
+
+;;; (| re1 ... ren)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define (translate-choice re)
+  (let ((elts (re-choice:elts re))
+	(tsm  (re-choice:tsm  re)))
+    (if (pair? elts)
+	(let recur ((elts elts) (prev-pcount 0) (prev-smcount 0))
+	  ;; ELTS is a non-empty choice tail. Render it, assuming the
+	  ;; previous elements translated to a string with PREV-PCOUNT parens,
+          ;; and allocated PREV-SMCOUNT submatches.
+	  (let ((elt (car elts))  (tail (cdr elts)))
+	    (receive (s1 level1 pcount1 submatches1) (translate-regexp elt)
+	      (let ((submatches1 (mapv (lambda (sm) (and sm (+ sm prev-pcount)))
+				       submatches1)))
+		(if (pair? tail)
+		    (receive (s level pcount submatches)
+			(recur tail
+			       (+ pcount1 prev-pcount)
+			       (+ prev-smcount (re-tsm elt)))
+		      (values (string-append s1 "|" s) 3
+			      (+ pcount1 pcount)
+			      (vector-append submatches1 submatches)))
+		    
+		    (values s1 level1 pcount1 submatches1))))))
+
+	(values "[^\000-\377]" 1 0 (n-falses tsm)))))	; Empty choice.
+
+
+
+;;; Repeated cases: * + ? and {n,m} ranges.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define (translate-repeat re)
+  (let ((from (re-repeat:from re))
+	(to   (re-repeat:to   re))
+	(body (re-repeat:body re))
+	(tsm  (re-repeat:tsm  re)))
+
+    (? ((and to (> from to))				; Unsatisfiable
+	(values "[^\000-\377]" 1 0 (n-falses tsm))) 
+
+       ((and to (= from to 1)) (translate-seq body))	; RE{1,1} => RE
+
+       ((and to (= to 0))				; RE{0,0} => ""
+	(values "" 2 0 (n-falses tsm)))
+
+       (else						; General case
+	(receive (s level pcount submatches) (translate-regexp body)
+	  (receive (s level pcount submatches)	; Coerce S to level <2.
+	           (if (> level 1)
+		       (values (string-append "(" s ")")
+			       0
+			       (+ pcount 1)
+			       (mapv (lambda (i) (and i (+ i 1))) submatches))
+		       (values s level pcount submatches))
+
+	    (values (if to
+			(? ((and (= from 0) (= to 1)) (string-append s "?"))
+			   ((= from to)
+			    (string-append s "{" (number->string to) "}"))
+			   (else
+			    (string-append s "{" (number->string from)
+					   "," (number->string to) "}")))
+			(? ((= from 0) (string-append s "*"))
+			   ((= from 1) (string-append s "+"))
+			   (else (string-append s "{" (number->string from) ",}"))))
+		    1 pcount submatches)))))))
+
+
+
+;;; Submatch
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define (translate-submatch re)
+  (let ((body    (re-submatch:body re))
+	(pre-dsm (re-submatch:pre-dsm  re)))
+
+    ;; Translate the body, along with any leading or trailing dead submatches.
+    (receive (s level pcount submatches)
+	     (translate-dsm body
+			    pre-dsm
+			    (- (re-submatch:tsm re)
+			       (+ 1 pre-dsm (re-tsm body))))
+	
+      ;; If the whole expression isn't already wrapped in a paren, wrap it.
+      ;; This outer paren becomes the new submatch -- add to submatches list.
+      (if (= level 0)
+	  (values s 0 pcount (vector-append '#(1) submatches))
+	  (values (string-append "(" s ")")
+		  0
+		  (+ pcount 1)
+		  (mapv! (lambda (i) (and i (+ i 1)))		; Excuse me.
+			 (vector-append '#(0) submatches)))))))
+
+;;; Translating DSM
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Translate the body, and paste enough #F's before and after the submatches
+;;; list to account for extra dead submatches.
+
+(define (translate-dsm body pre-dsm post-dsm)
+  (receive (s level pcount submatches) (translate-regexp body)
+    (values s level pcount (pad-vector pre-dsm post-dsm submatches))))
+
+;;; Constant regexps
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Convert a string into a regexp pattern that matches that string exactly --
+;;; quote the special chars with backslashes.
+
+(define translate-string
+  (let ((specials (string->char-set "[.*?()|\\$^+")))
+    (lambda (s)
+      (let ((len (string-length s)))
+	(if (zero? len)
+	    (values "()" 0 1 '#()) ; Special case ""
+
+	    (let* ((len2 (string-fold (lambda (c len) ; Length of answer str
+					(+ len (if (char-set-contains? specials c) 2 1)))
+				      0 s))
+		   (s2 (make-string len2)))		; Answer string
+
+	      ;; Copy the chars over to S2.
+	      (string-fold (lambda (c i)
+			     ;; Write char C at index I, return the next index.
+			     (let ((i (cond ((char-set-contains? specials c)
+					     (string-set! s2 i #\\)
+					     (+ i 1))
+					    (else i))))
+			       (string-set! s2 i c)
+			       (+ i 1)))
+			   0 s)
+	      (values s2 (if (= len 1) 1 2)
+		      0 '#())))))))
+
+
+
+;;; Translating char-sets to [...] strings
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This is the nastiest code in the system. We make an effort to return
+;;; succinct encodings of the char-sets, in the event these encodings are
+;;; being shown to humans.
+;;; - A singleton set is rendered as that char.
+;;; - A full set is rendered as "."
+;;; - An empty set is rendered as [^\000-\177].
+;;; - Otherwise, render it both as a [...] and as a [^...] spec, and
+;;;   take whichever is shortest.
+
+;;; Take a char set, and return the standard 
+;;;     [regexp-string, level, pcount, submatches]
+;;; quadruple.
+;;;
+
+(define (translate-char-set cset)
+  (if (char-set-full? cset) (values "." 1 0 '#())		; Full set
+
+      (let ((nchars (char-set-size cset))
+	    (->bracket-string (lambda (cset in?)
+				(receive (loose ranges) (char-set->in-pair cset)
+				  (hack-bracket-spec loose ranges in?)))))
+	
+	(? ((= 0 nchars) (values "[^\000-\177]" 1 0 '#()))	; Empty set
+	     
+	   ((= 1 nchars)					; Singleton set
+	    (translate-string (string (car (char-set-members cset)))))
+
+	   ;; General case. Try both [...] and [^...].
+	   (else (let ((s- (->bracket-string cset #t))
+		       (s+ (->bracket-string (char-set-invert cset) #f)))
+		   (values (if (< (string-length s-) (string-length s+))
+			       s- s+)
+			   1 0 '#())))))))
+
+
+;;; Commentary
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Hacking special chars in character-class strings:
+;;; ] - ^	]...^-
+;;; ] -		]...-
+;;; ]   ^	]...^
+;;; ]   	]...
+;;;   - ^	...^-	(or doubleton screw-case)
+;;;   -		...-
+;;;     ^	...^	(or singleton screw-case)
+;;;
+;;; Two screw cases: 
+;;;   "^-" must be converted to "-^" for IN.
+;;;   "^" must be converted to non-class "^" for IN.
+
+;;; Rendering a general char-set into a correct Posix [...] bracket expression
+;;; is a complete mess.
+;;;
+;;; The rules on bracket expressions:
+;;; - ] terminates the exp unless it is the first char 
+;;;   (after an optional leading ^).
+;;; - .*[\ are not special in bracket expressions.
+;;; - However, [. [= and [: *are* special, so you can't follow an
+;;;   open bracket by one of .=: -- argh. See below.
+;;; - ^ isn't special unless it's the first char.
+;;; - - is special unless it's first (after an optional ^), last,
+;;;   or as the ending char in a range (e.g., a--).
+
+;;; This means:
+;;; - You must ensure that ] doesn't begin or terminate a range.
+;;; - You must ensure that .=: don't follow [
+;;;   + This can happen in the loose char list;
+;;;   + This can happen in the range list -- consider the pair of
+;;;     ranges "x-[.-%" Handle this by prohibiting [ as a range-terminator.
+;;;   + It can happen at the loose/range boundary: %[:-?
+
+;;; First, run-length encode the set into loose and range-pairs.
+;;; If the set is a singleton set, then punt the whole [...] effort,
+;;; and do it as a simple char.
+
+;;; Repeat until stable:
+;;; - Sort the ranges in this order:
+;;;     1. other ranges;
+;;;     2. ranges that begin with ^	(not priority)
+;;;     3. ranges that begin with .=:	(priority)
+;;;     4. ranges that end with [	(priority)
+;;;   This eliminates [. [= [: problems in the ranges, and
+;;;   minimises the chances of the problem at the loose/range boundary.
+;;;   and problems with initial ^ chars.
+;;; - Sort the loose chars so that ] is first, then -, then .=:, then [,
+;;;   then others, then ^. This eliminates [. [= [: problems in the loose 
+;;;   chars, and minimises the chances of the problem at the loose/range
+;;;   boundary.
+;;; - Shrink ranges by moving an opening or closing range char into the
+;;;   loose-char set:
+;;;   + If ] opens or closes a range, shrink it out.
+;;;   + If any range opens with -, shrink it out.
+;;;   + If the first range opens with .=:, and the last loose char is [,
+;;;     shrink it out.
+;;;   + If there are no loose chars, the first range begins with ^, and
+;;;     we're doing an IN range, shrink out the ^.
+;;;   + Shrinking a range down to <3 chars means move it's elts into the
+;;;     loose char set.
+;;; - If both [ and - are in the loose char set, 
+;;;   pull - out as special end-hypen.
+
+;;; Finally, we have to hack things so that ^ doesn't begin an IN sequence.
+;;; - If it's a NOT-IN sequence, no worries.
+;;; - If ^ is the opening loose char, then it's the only loose char.
+;;;   If there are ranges, move it to the end of the string.
+;;;   If there are no ranges, then just punt the char-class and convert
+;;;   it to a singleton ^. In fact, do this up-front, for any singleton 
+;;;   set.
+;;;
+;;; If the special end-hyphen flag is set, add - to the end of the string.
+
+;;; This general approach -- starting out with maximal ranges, and then
+;;; shrinking them to avoid other syntax violations -- has the advantage
+;;; of not relying on the details of the ASCII encodings.
+
+;;; Ordering ranges:
+;;;     1. other ranges (ordered by start char)
+;;;     2. ranges that begin with ^	(not priority)
+;;;     3. ranges that begin with .=:	
+;;;     4. ranges that end with [	(priority over #2 & #3)
+
+(define (range< r1 r2)
+  (let ((r1-start (car r1)) (r1-end (cdr r1))
+	(r2-start (car r2)) (r2-end (cdr r2)))
+    (or (char=? r2-end #\[)	; Range ending with [ comes last.
+	(and (not (char=? r1-end #\[))
+
+	     ;; Range begin with one of .=: comes next-to-last
+	     (or (char=? r2-start #\.) (char=? r2-start #\=) (char=? r2-start #\:)
+		 (and (not (char=? r1-start #\.))
+		      (not (char=? r1-start #\=))
+		      (not (char=? r1-start #\:))
+
+		      ;; Range beginning with ^ comes before that.
+		      (or (char=? r1-start #\^)
+			  (and (not (char=? r2-start #\^))
+			       
+			       ;; Other ranges are ordered by start char.
+			       (< (char->ascii r1-start)
+				  (char->ascii r2-start))))))))))
+
+;;; Order loose chars:
+;;;   ]   is first,
+;;;   -   is next, 
+;;;   .=: are next, 
+;;;   [   is next,
+;;;   then others (ordered by ascii val)
+;;;   ^   is last.
+
+
+(define (loose<= c1 c2)
+  (or (char=? c1 #\])				; ] is first,
+      (and (not (char=? c2 #\]))
+
+	   (or (char=? c1 #\-)			; - is next,
+	       (and (not (char=? c2 #\-))
+
+		    ;; .=: are next,
+		    (or (char=? c1 #\.) (char=? c1 #\=) (char=? c1 #\:)
+			(and (not (char=? c2 #\.))
+			     (not (char=? c2 #\=))
+			     (not (char=? c2 #\:))
+
+			     (or (char=? c1 #\[)	; [ is next,
+				 (and (not (char=? c2 #\[))
+
+				      (or (char=? c2 #\^)	; ^ is last,
+					  (and (not (char=? c1 #\^))
+
+					       ;; other chars by ASCII.
+					       (<= (char->ascii c1)
+						   (char->ascii c2)))))))))))))
+
+;;; Returns (1) a list of 0-3 loose chars, (2) a list of 0 or 1 ranges.
+
+(define (shrink-range-start r)
+  (let ((start (char->ascii (car r)))
+	(end   (char->ascii (cdr r))))
+    (shrink-range-finish-up start (+ start 1) end)))
+
+(define (shrink-range-end r)
+  (let ((start (char->ascii (car r)))
+	(end   (char->ascii (cdr r))))
+    (shrink-range-finish-up end start (- end 1))))
+
+(define (shrink-range-finish-up c start end)
+  (? ((> start end) (values (list (ascii->char c)) '()))	; Empty range
+
+   ((= start end)		; Collapse singleton range.
+      (values (list (ascii->char c) (ascii->char start))
+	      '()))
+
+     ((= (+ start 1) end)	; Collapse doubleton range.
+      (values (list (ascii->char c) (ascii->char start) (ascii->char end))
+	      '()))
+
+     (else (values (list (ascii->char c))
+		   (list (cons (ascii->char start) (ascii->char end)))))))
+
+
+;;; We assume the bracket-spec is not a singleton, not empty, and not complete.
+;;; (These cases get rendered as the letter, [^\000-\177], and ".", 
+;;; respectively.) We assume the loose chars and the ranges are all disjoint.
+
+(define (hack-bracket-spec loose ranges in?)
+  (let lp ((loose0 loose) (ranges0 ranges) (end-hyphen? #f))
+    ;; Repeat until stable:
+    (let ((loose  (sort-list loose0  loose<=))	; Sort loose chars and ranges.
+	  (ranges (sort-list ranges0 range<)))
+
+      ;; If ] opens or closes a range, shrink it out.
+      ;; If - opens a range, shrink it out.
+      (receive (loose ranges)
+	       (let recur ((ranges ranges))
+		 (if (pair? ranges)
+		     (let* ((range (car ranges))
+			    (start (car range))
+			    (end   (cdr range))
+			    (ranges (cdr ranges)))
+		       (receive (new-loose new-ranges) (recur ranges)
+			 (receive (new-loose0 new-ranges0)
+			          (? ((char=? #\] start)
+				      (shrink-range-start range))
+
+				     ((char=? #\] end)
+				      (shrink-range-end range))
+
+				     ((char=? #\- start)
+				      (shrink-range-start range))
+
+				     (else (values '() (list range))))
+			   (values (append new-loose0  new-loose)
+				   (append new-ranges0 new-ranges)))))
+		     (values loose '())))
+
+	(? ((or (not (equal? loose0  loose))	; Loop if anything changed.
+		(not (equal? ranges0 ranges)))
+	    (lp loose ranges end-hyphen?))
+
+	   ;; If the first range opens with .=:, and the last loose char is [,
+	   ;; shrink it out & loop.
+	   ((and (pair? ranges)
+		 (memv (caar ranges) '(#\. #\= #\:))
+		 (pair? loose)
+		 (char=? #\[ (car (reverse loose))))
+	    (receive (new-loose new-ranges)
+		     (shrink-range-start (car ranges))
+	      (lp (append new-loose loose) (append new-ranges (cdr ranges)) end-hyphen?)))
+
+	   ;; If there are no loose chars, the first range begins with ^, and
+	   ;; we're doing an IN range, shrink out the ^.
+	   ((and in? (null? loose) (pair? ranges) (char=? #\^ (caar ranges)))
+	    (receive (new-loose new-ranges) (shrink-range-start (car ranges))
+	      (lp (append new-loose loose) (append new-ranges ranges) end-hyphen?)))
+
+	   ;; If both [ and - are in the loose char set,
+	   ;; pull - out as special end-hypen.
+	   ((and (pair? loose)
+		 (pair? (cdr loose))
+		 (char=? (car loose) #\[)
+		 (char=? (car loose) #\-))
+	    (lp (cons (car loose) (cddr loose)) ranges #t))
+
+	   ;; No change! Build the answer...
+	   (else (string-append (if in? "[" "[^")
+				(list->string loose)
+				(apply string-append
+				       (map (lambda (r) (string (car r) #\- (cdr r)))
+					    ranges))
+				"]")))))))
diff --git a/scsh/rx/re-fold.scm b/scsh/rx/re-fold.scm
new file mode 100644
index 0000000..2a85879
--- /dev/null
+++ b/scsh/rx/re-fold.scm
@@ -0,0 +1,114 @@
+;;; Regexp "fold" combinators					-*- scheme -*-
+;;; Copyright (c) 1998 by Olin Shivers.
+
+;;; REGEXP-FOLD       re kons knil s [finish start] -> value
+;;; REGEXP-FOLD-RIGHT re kons knil s [finish start] -> value
+;;; REGEXP-FOR-EACH re proc s [start] -> unspecific
+
+;;; Non-R4RS imports: let-optionals :optional error ?
+
+;;; regexp-fold re kons knil s [finish start] -> value
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; The following definition is a bit unwieldy, but the intuition is
+;;; simple: this procedure uses the regexp RE to divide up string S into
+;;; non-matching/matching chunks, and then "folds" the procedure KONS
+;;; across this sequence of chunks.
+;;;
+;;; Search from START (defaulting to 0) for a match to RE; call
+;;; this match M. Let I be the index of the end of the match
+;;; (that is, (match:end M 0)). Loop as follows:
+;;;   (regexp-fold re kons (kons START M knil) s finish I)
+;;; If there is no match, return instead 
+;;;   (finish START knil)
+;;; FINISH defaults to (lambda (i knil) knil)
+;;;
+;;; In other words, we divide up S into a sequence of non-matching/matching
+;;; chunks:
+;;;    NM1 M1 NM1 M2 ... NMk Mk NMlast
+;;; where NM1 is the initial part of S that isn't matched by the RE, M1 is the
+;;; first match, NM2 is the following part of S that isn't matched, M2 is the
+;;; second match, and so forth -- NMlast is the final non-matching chunk of
+;;; S. We apply KONS from left to right to build up a result, passing it one
+;;; non-matching/matching chunk each time: on an application (KONS i m KNIL),
+;;; the non-matching chunk goes from I to (match:begin m 0), and the following
+;;; matching chunk goes from (match:begin m 0) to (match:end m 0). The last
+;;; non-matching chunk NMlast is processed by FINISH. So the computation we
+;;; perform is
+;;;   (final q (kons Jk MTCHk ... (kons J2 MTCH2 (kons J1 MTCH1 knil))...))
+;;; where Ji is the index of the start of NMi, MTCHi is a match value
+;;; describing Mi, and Q is the index of the beginning of NMlast.
+
+(define (regexp-fold re kons knil s . maybe-finish+start)
+  (let-optionals maybe-finish+start ((finish (lambda (i x) x))
+				     (start 0))
+    (if (> start (string-length s))
+	(error "Illegal START parameter"
+	       regexp-fold re kons knil s finish start))
+    (let lp ((i start) (val knil))
+      (? ((regexp-search re s i) =>
+	  (lambda (m)
+	    (let ((next-i (match:end m 0)))
+	      (if (= next-i (match:start m 0))
+		  (error "An empty-string regexp match has put regexp-fold into an infinite loop."
+			 re s start next-i)
+		  (lp next-i (kons i m val))))))
+	 (else (finish i val))))))
+
+;;; regexp-fold-right re kons knil s [finish start] -> value
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This procedure repeatedly matches regexp RE across string S.
+;;; This divides S up into a sequence of matching/non-matching chunks:
+;;;    NM0 M1 NM1 M2 NM2 ... Mk NMk
+;;; where NM0 is the initial part of S that isn't matched by the RE,
+;;; M1 is the first match, NM1 is the following part of S that isn't
+;;; matched, M2 is the second match, and so forth. We apply KONS from
+;;; right to left to build up a result
+;;;   (final q (kons MTCH1 J1 (kons MTCH2 J2 ...(kons MTCHk JK knil)...)))
+;;; where MTCHi is a match value describing Mi, Ji is the index of the end of
+;;; NMi (or, equivalently, the beginning of Mi+1), and Q is the index of the
+;;; beginning of M1. In other words, KONS is passed a match, an index
+;;; describing the following non-matching text, and the value produced by
+;;; folding the following text. The FINAL function "polishes off" the fold
+;;; operation by handling the initial chunk of non-matching text (NM0, above).
+;;; FINISH defaults to (lambda (i knil) knil)
+
+(define (regexp-fold-right re kons knil s . maybe-finish+start)
+  (let-optionals maybe-finish+start ((finish (lambda (i x) x))
+				     (start 0))
+    (if (> start (string-length s))
+	(error "Illegal START parameter" regexp-fold-right re kons knil s
+	       finish start))
+
+    (? ((regexp-search re s start) =>
+	(lambda (m)
+	  (finish (match:start m 0)
+		  (let recur ((last-m m))
+		    (? ((regexp-search re s (match:end last-m 0)) =>
+			(lambda (m)
+			  (let ((i (match:start m 0)))
+			    (if (= i (match:end m 0))
+				(error "An empty-string regexp match has put regexp-fold-right into an infinite loop."
+				       re s start i)
+				(kons last-m i (recur m))))))
+			  (else (kons last-m (string-length s) knil)))))))
+	  (else (finish (string-length s) knil)))))
+
+;;; regexp-for-each re proc s [start] -> unspecific
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Repeatedly match regexp RE against string S. 
+;;; Apply PROC to each match that is produced.
+;;; Matches do not overlap.
+
+(define (regexp-for-each re proc s . maybe-start)
+  (let ((start (:optional maybe-start 0)))
+    (if (> start (string-length s))
+	(apply error "Illegal START parameter" regexp-for-each re proc s start)
+	(let lp ((i start))
+	  (? ((regexp-search re s i) =>
+	      (lambda (m)
+		(let ((next-i (match:end m 0)))
+		  (if (= (match:start m 0) next-i)
+		      (error "An empty-string regexp match has put regexp-for-each into an infinite loop."
+			     re proc s start next-i))
+		  (proc m)
+		  (lp next-i)))))))))
diff --git a/scsh/rx/re-high.scm b/scsh/rx/re-high.scm
new file mode 100644
index 0000000..f9b1388
--- /dev/null
+++ b/scsh/rx/re-high.scm
@@ -0,0 +1,62 @@
+;;; Regular expression matching for scsh
+;;; Copyright (c) 1998 by Olin Shivers.
+
+
+;;; Translates the re to a Posix string, and returns a CRE record,
+;;; but doesn't actually compile the Posix string into a C regex_t struct.
+;;; Uses the :POSIX field to cache the CRE record.
+
+(define (compile-regexp re)
+  (let* ((compile (lambda () (receive (s lev pcount tvec)
+				 (regexp->posix-string re)
+			       (new-cre s tvec))))
+
+	 (check-cache (lambda (fetch set)
+			(or (fetch re)		; Already cached.
+			    (let ((cre (compile)))	; Compile it,
+			      (set re cre)		; cache it,
+			      cre)))))			; and return it.
+
+    (? ((re-seq? re)
+	(check-cache re-seq:posix set-re-seq:posix))
+       ((re-choice? re)
+	(check-cache re-choice:posix set-re-choice:posix))
+       ((re-repeat? re)
+	(check-cache re-repeat:posix set-re-repeat:posix))
+       ((re-char-set? re)
+	(check-cache re-char-set:posix set-re-char-set:posix))
+       ((re-string? re)
+	(check-cache re-string:posix set-re-string:posix))
+       ((re-submatch? re)
+	(check-cache re-submatch:posix set-re-submatch:posix))
+       ((re-dsm? re)
+	(check-cache re-dsm:posix set-re-dsm:posix))
+
+       ((re-bos? re) (or bos-cre (set! bos-cre (compile))))
+       ((re-eos? re) (or eos-cre (set! eos-cre (compile))))
+
+       ((re-bol? re) (error "BOL regexp not supported in this implementation."))
+       ((re-eol? re) (error "EOL regexp not supported in this implementation."))
+
+       ((re-bow? re) (or bow-cre (set! bow-cre (compile))))
+       ((re-eow? re) (or eow-cre (set! eow-cre (compile))))
+
+       (else (error "compile-regexp -- not a regexp" re)))))
+
+(define bos-cre #f)
+(define eos-cre #f)
+(define bow-cre #f)
+(define eow-cre #f)
+
+
+
+(define (regexp-search re str . maybe-start)
+  (let* ((tsm (re-tsm re))
+	 (svec (make-vector (+ 1 tsm) #f))
+	 (evec (make-vector (+ 1 tsm) #f))
+	 (cre (compile-regexp re)))
+    (cre-search cre svec evec str (:optional maybe-start 0))))
+	
+
+(define (regexp-search? re str . maybe-start)
+  (cre-search? (compile-regexp re) str (:optional maybe-start 0)))
diff --git a/scsh/rx/re-low.c b/scsh/rx/re-low.c
new file mode 100644
index 0000000..1c58762
--- /dev/null
+++ b/scsh/rx/re-low.c
@@ -0,0 +1,61 @@
+/* This is an Scheme48/C interface file, 
+** automatically generated by cig.
+*/
+
+#include 
+#include  /* For malloc. */
+#include "libcig.h"
+
+/* Make sure foreign-function stubs interface to the C funs correctly: */
+#include 
+#include "../regexp/regex.h"
+#include "re1.h"
+
+scheme_value df_compile_re(long nargs, scheme_value *args)
+{
+    extern int compile_re(scheme_value , int , regex_t* *);
+    scheme_value ret1;
+    int r1;
+    regex_t* r2;
+
+    cig_check_nargs(3, nargs, "compile_re");
+    r1 = compile_re(args[2], EXTRACT_BOOLEAN(args[1]), &r2);
+    ret1 = ENTER_FIXNUM(r1);
+    AlienVal(VECTOR_REF(*args,0)) = (long) r2;
+    return ret1;
+    }
+
+scheme_value df_re_search(long nargs, scheme_value *args)
+{
+    extern scheme_value re_search(const regex_t *, scheme_value , int , scheme_value , int , scheme_value , scheme_value );
+    scheme_value ret1;
+    scheme_value r1;
+
+    cig_check_nargs(7, nargs, "re_search");
+    r1 = re_search((const regex_t *)AlienVal(args[6]), args[5], EXTRACT_FIXNUM(args[4]), args[3], EXTRACT_FIXNUM(args[2]), args[1], args[0]);
+    ret1 = r1;
+    return ret1;
+    }
+
+scheme_value df_re_errint2str(long nargs, scheme_value *args)
+{
+    extern const char *re_errint2str(int , const regex_t *);
+    scheme_value ret1;
+    const char *r1;
+
+    cig_check_nargs(3, nargs, "re_errint2str");
+    r1 = re_errint2str(EXTRACT_FIXNUM(args[2]), (const regex_t *)AlienVal(args[1]));
+    ret1 = VECTOR_REF(*args,0);
+    {AlienVal(CAR(ret1)) = (long) r1; CDR(ret1) = strlen_or_false(r1);}
+    return ret1;
+    }
+
+scheme_value df_free_re(long nargs, scheme_value *args)
+{
+    extern void free_re(regex_t* );
+
+    cig_check_nargs(1, nargs, "free_re");
+    free_re((regex_t* )AlienVal(args[0]));
+    return SCHFALSE;
+    }
+
diff --git a/scsh/rx/re-low.scm b/scsh/rx/re-low.scm
new file mode 100644
index 0000000..1cebb22
--- /dev/null
+++ b/scsh/rx/re-low.scm
@@ -0,0 +1,152 @@
+;;; Regular expression matching for scsh
+;;; Copyright (c) 1994 by Olin Shivers.
+
+(foreign-source
+  "/* Make sure foreign-function stubs interface to the C funs correctly: */"
+  "#include "
+  "#include \"../regexp/regex.h\""
+  "#include \"re1.h\""
+  "" ""
+  )
+
+;;; Match data for regexp matches.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-record regexp-match
+  string	; The string against which we matched
+  start		; vector of starting indices
+  end)		; vector of ending indices
+
+(define (match:start match . maybe-index)
+  (vector-ref (regexp-match:start match)
+	      (:optional maybe-index 0)))
+
+(define (match:end match . maybe-index)
+  (vector-ref (regexp-match:end match)
+	      (:optional maybe-index 0)))
+
+(define (match:substring match . maybe-index)
+  (let* ((i (:optional maybe-index 0))
+	 (start (vector-ref (regexp-match:start match) i)))
+    (and start (substring (regexp-match:string match)
+			  start
+			  (vector-ref (regexp-match:end match) i)))))
+
+;;; Compiling regexps
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; There's no legal Posix string expressing the empty match (e.g., (|))
+;;; that will never match anything. So when we have one of these, we set
+;;; the STRING field to #f. The matchers will spot this case and handle it
+;;; specially.
+
+;;; We compile the string two ways, on demand -- one for cre-search, and
+;;; one for cre-search?.
+
+(define-record cre	; A compiled regular expression
+  string		; The Posix string form of the regexp or #F.
+  max-paren		; Max paren in STRING needed for submatches.
+  (bytes    #f)		; Pointer to the compiled form, in the C heap, or #F.
+  (bytes/nm #f)		; Same as BYTES, but compiled with no-submatch.
+  tvec			; Translation vector for the submatches
+  ((disclose self) (list "cre" (cre:string self))))
+
+(define (new-cre str tvec) (make-cre str (max-live-posix-submatch tvec) tvec))
+
+(define (max-live-posix-submatch tvec)
+  (vfold (lambda (sm mlpsm) (if sm (max mlpsm sm) mlpsm)) 0 tvec))
+
+(define (compile-posix-re->c-struct re-string sm?)
+  (receive (errcode c-struct) (%compile-re re-string sm?)
+    (if (zero? errcode) c-struct
+	(error errcode (%regerror-msg errcode c-struct)
+	       compile-posix-re->c-struct re-string sm?))))
+
+(define-foreign %compile-re (compile_re (string-desc pattern) (bool submatches?))
+  integer ; 0 or error code
+  (C regex_t*))
+
+
+;;; Searching with compiled regexps
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; cre-search returns match info; cre-search? is just a predicate.
+
+(define (cre-search cre start-vec end-vec str start)
+  (let ((re-str (cre:string cre)))	;; RE-STR = #F => empty match.
+    (and re-str
+	 (let* ((C-bytes (or (cre:bytes cre)
+			     (let ((C-bytes (compile-posix-re->c-struct re-str #t)))
+			       (set-cre:bytes cre C-bytes)
+			       (register-re-c-struct cre C-bytes)
+			       C-bytes)))
+		(retcode (%cre-search C-bytes str start
+				      (cre:tvec cre)
+				      (cre:max-paren cre)
+				      start-vec end-vec)))
+	   (if (integer? retcode)
+	       (error retcode (%regerror-msg retcode C-bytes)
+		      cre-search cre start-vec end-vec str start)
+	       (and retcode (make-regexp-match str start-vec end-vec)))))))
+
+(define (cre-search? cre str start)
+  (let ((re-str (cre:string cre)))	;; RE-STR = #F => empty match.
+    (and re-str
+	 (let* ((C-bytes (or (cre:bytes/nm cre)
+			     (let ((C-bytes (compile-posix-re->c-struct re-str #f)))
+			       (set-cre:bytes/nm cre C-bytes)
+			       (register-re-c-struct cre C-bytes)
+			       C-bytes)))
+		(retcode (%cre-search C-bytes str start '#() -1 '#() '#())))
+	   (if (integer? retcode)
+	       (error retcode (%regerror-msg retcode C-bytes)
+		      cre-search? cre str start)
+	       retcode)))))
+
+(define-foreign %cre-search
+  (re_search ((C "const regex_t *~a") compiled-regexp)
+	      (string-desc str)
+	      (integer start)
+	      (vector-desc tvec) (integer max-psm)
+	      (vector-desc svec) (vector-desc evec))
+  desc)	; 0 success, #f no-match, or non-zero int error code.
+
+
+;;; Generate an error msg from an error code.
+
+(define-foreign %regerror-msg (re_errint2str (integer errcode)
+					     ((C "const regex_t *~a") re))
+  string)
+
+
+;;; Reclaiming compiled regexp storage
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Avert your eyes from the unsightly crock.
+;;;
+;;; S48 0.36 doesn't have finalizers, so we don't have a way to free
+;;; the C regexp_t structure when its CRE record is gc'd. So our current
+;;; lame approximation is to keep track of all the CRE's with a list of
+;;;     (cre-weak-pointer . regex_t*)
+;;; pairs. From time to time, we should walk the list. If we deref the
+;;; weak pointer and discover the CRE's been GC'd, we free the regex_t
+;;; struct.
+;;;
+;;; Note this code is completely thread unsafe.
+
+;;; Free the space used by a compiled regexp.
+(define-foreign %free-re (free_re ((C regex_t*) re)) ignore)
+
+(define *master-cre-list* '())
+
+;;; Whenever we make a new CRE, use this proc to add it to the master list.
+(define (register-re-c-struct cre c-bytes)
+  (set! *master-cre-list* (cons (cons (make-weak-pointer cre) c-bytes)
+				*master-cre-list*)))
+
+(define (clean-up-cres)
+  (set! *master-cre-list*
+	(fold (lambda (elt lis)
+		(if (weak-pointer-ref (car elt)) ; Still alive
+		    (cons elt lis)
+		    (begin (%free-re (cdr elt))
+			   lis)))
+	      '()
+	      *master-cre-list*)))
diff --git a/scsh/rx/re-subst.scm b/scsh/rx/re-subst.scm
new file mode 100644
index 0000000..a55b82b
--- /dev/null
+++ b/scsh/rx/re-subst.scm
@@ -0,0 +1,142 @@
+;;; Substitution ops with regexps
+;;; Copyright (c) 1998 by Olin Shivers.
+
+;;; These function have to be in a separate package because they use
+;;; the scsh I/O function WRITE-STRING. The rest of the regexp system
+;;; has no dependencies on scsh system code, and is defined independently
+;;; of scsh -- which scsh, in turn, relies upon: pieces of scsh-level-0
+;;; use the regexp basics. So we have to split this code out to avoid
+;;; a circular dependency in the modules: scsh-level-0 needs the regexp
+;;; package which needs WRITE-STRING, which comes from the regexp package.
+
+(define (regexp-substitute port match . items)
+  (let* ((str (regexp-match:string match))
+	 (sv (regexp-match:start match))
+	 (ev (regexp-match:end match))
+	 (range (lambda (item)			; Return start & end of
+		  (cond ((integer? item)	; ITEM's range in STR.
+			 (values (vector-ref sv item)
+				 (vector-ref ev item)))
+			((eq? 'pre item) (values 0 (vector-ref sv 0)))
+			((eq? 'post item) (values (vector-ref ev 0)
+						  (string-length str)))
+			(else (error "Illegal substitution item."
+				     item
+				     regexp-substitute))))))
+    (if port
+
+	;; Output port case.
+	(for-each (lambda (item)
+		    (if (string? item) (write-string item port)
+			(receive (si ei) (range item)
+			  (write-string str port si ei))))
+		  items)
+
+	;; Here's the string case. Make two passes -- one to
+	;; compute the length of the target string, one to fill it in.
+	(let* ((len (fold (lambda (item i)
+			    (+ i (if (string? item) (string-length item)
+				     (receive (si ei) (range item) (- ei si)))))
+			  0 items))
+	       (ans (make-string len)))
+
+	  (fold (lambda (item index)
+		  (cond ((string? item)
+			 (string-copy! ans index item)
+			 (+ index (string-length item)))
+			(else (receive (si ei) (range item)
+				(string-copy! ans index str si ei)
+				(+ index (- ei si))))))
+		0 items)
+	  ans))))
+
+
+
+(define (regexp-substitute/global port re str . items)
+  (let ((str-len (string-length str))
+	(range (lambda (start sv ev item)	; Return start & end of
+		 (cond ((integer? item)		; ITEM's range in STR.
+			(values (vector-ref sv item)
+				(vector-ref ev item)))
+		       ((eq? 'pre item) (values start (vector-ref sv 0)))
+		       (else (error "Illegal substitution item."
+				    item
+				    regexp-substitute/global)))))
+	(num-posts (fold (lambda (item count)
+			   (+ count (if (eq? item 'post) 1 0)))
+			 0 items)))
+
+    (if (and port (< num-posts 2))
+
+	;; Output port case, with zero or one POST items.
+	(let recur ((start 0))
+	  (if (<= start str-len)
+	      (let ((match (regexp-search re str start)))
+		(if match
+		    (let* ((sv (regexp-match:start match))
+			   (ev (regexp-match:end match))
+			   (s (vector-ref sv 0))
+			   (e (vector-ref ev 0))
+			   (empty? (= s e)))
+		      (for-each (lambda (item)
+				  (cond ((string? item) (write-string item port))
+
+					((procedure? item) (write-string (item match) port))
+
+					((eq? 'post0 item)
+					 (if (and empty? (< s str-len))
+					     (write-char (string-ref str s) port)))
+
+					((eq? 'post item)
+					 (recur (if empty? (+ 1 e) e)))
+
+					(else (receive (si ei)
+						  (range start sv ev item)
+						(write-string str port si ei)))))
+				items))
+
+		    (write-string str port start))))) ; No match.
+
+	;; Either we're making a string, or >1 POST.
+	(let* ((pieces (let recur ((start 0))
+			 (if (> start str-len) '()
+			     (let ((match (regexp-search re str start))
+				   (cached-post #f))
+			       (if match
+				   (let* ((sv (regexp-match:start match))
+					  (ev (regexp-match:end match))
+					  (s (vector-ref sv 0))
+					  (e (vector-ref ev 0))
+					  (empty? (= s e)))
+				     (fold (lambda (item pieces)
+					     (cond ((string? item)
+						    (cons item pieces))
+
+						   ((procedure? item)
+						    (cons (item match) pieces))
+
+						   ((eq? 'post0 item)
+						    (if (and empty? (< s str-len))
+							(cons (string (string-ref str s))
+							      pieces)
+							pieces))
+
+						   ((eq? 'post item)
+						    (if (not cached-post)
+							(set! cached-post
+							      (recur (if empty? (+ e 1) e))))
+						    (append cached-post pieces))
+
+						   (else (receive (si ei)
+							     (range start sv ev item)
+							   (cons (substring str si ei)
+								 pieces)))))
+					   '() items))
+
+				   ;; No match. Return str[start,end].
+				   (list (if (zero? start) str 
+					     (substring str start (string-length str)))))))))
+			     
+	       (pieces (reverse pieces)))
+	  (if port (for-each (lambda (p) (write-string p port)) pieces)
+	      (apply string-append pieces))))))
diff --git a/scsh/rx/re-syntax.scm b/scsh/rx/re-syntax.scm
new file mode 100644
index 0000000..41f2763
--- /dev/null
+++ b/scsh/rx/re-syntax.scm
@@ -0,0 +1,115 @@
+;;; SRE syntax support for regular expressions
+;;; Olin Shivers, June 1998.
+
+;;; Export SRE-FORM?, EXPAND-RX
+
+;;; Is the form an SRE expression?
+;;; We only shallowly check the initial keyword of a compound form.
+
+(define (sre-form? exp r same?)			; An SRE is
+  (let ((kw? (lambda (x kw) (same? x (r kw)))))		
+    (or (string? exp)				; "foo"
+	(and (pair? exp)
+	     (let ((head (car exp)))
+	       (or (every string? exp)		; ("aeiou")
+		   (kw? head '*)		; (*  re ...)
+		   (kw? head '+)		; (+  re ...)
+		   (kw? head '?)		; (?  re ...)
+		   (kw? head '=)		; (=  n re ...)
+		   (kw? head '>=)		; (>= n re ...)
+		   (kw? head '**)		; (** m n re ...)
+
+		   (kw? head '|)		; (| re ...)
+		   (kw? head 'or)		; (| re ...)
+		   (kw? head ':)		; (: re ...)
+		   (kw? head 'seq)		; (: re ...)
+
+		   (kw? head '-)		; (- re ...)
+		   (kw? head '&)		; (& re ...)
+		   (kw? head '~)		; (~ re ...)
+
+		   (kw? head 'submatch)		; (submatch re ...)
+		   (kw? head 'dsm)		; (dsm pre post re ...)
+
+		   (kw? head 'uncase)		; (uncase re ...)
+		   (kw? head 'w/case)		; (w/case re ...)
+		   (kw? head 'w/nocase)		; (w/nocase re ...)
+
+		   (kw? head 'unquote)		; ,exp
+		   (kw? head 'unquote-splicing)	; ,@exp
+
+		   (kw? head 'posix-string)	; (posix-string string)
+
+		   (kw? head 'word+)		; (word+ re ...)
+		   (kw? head 'word))))		; (word re ...)
+
+	(kw? exp 'any)				; any
+	(kw? exp 'nonl)				; nonl
+	(kw? exp 'word)				; word
+	(kw? exp 'bos) (kw? exp 'eos)		; bos / eos
+	(kw? exp 'bol) (kw? exp 'eol)		; bol / eol
+	(kw? exp 'bow) (kw? exp 'eow)		; bow / eow
+
+	(kw? exp 'lower-case)	(kw? exp 'lower); The char class names
+	(kw? exp 'upper-case)	(kw? exp 'upper)
+	(kw? exp 'alphabetic)	(kw? exp 'alpha)
+	(kw? exp 'numeric)	(kw? exp 'num)		(kw? exp 'digit)
+	(kw? exp 'alphanumeric)	(kw? exp 'alphanum)	(kw? exp 'alnum)
+	(kw? exp 'blank)
+	(kw? exp 'control)	(kw? exp 'cntrl)
+	(kw? exp 'printing)	(kw? exp 'print)
+	(kw? exp 'punctuation)	(kw? exp 'punct)
+	(kw? exp 'hex-digit)	(kw? exp 'hex)		(kw? exp 'xdigit)
+	(kw? exp 'graphic)	(kw? exp 'graph)
+	(kw? exp 'whitespace)	(kw? exp 'white)	(kw? exp 'space)
+	(kw? exp 'ascii))))
+
+
+;;; (if-sre-form form conseq-form alt-form)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; If FORM is an SRE, expand into CONSEQ-FORM, otherwise ALT-FORM.
+;;; This is useful for expanding a subform of a macro that can
+;;; be either a regexp or something else, e.g.
+;;;     (if-sre-form test			; If TEST is a regexp,
+;;;       (regexp-search? (rx test) line)	; match it against the line,
+;;;       (test line))				; otw it's a predicate.
+
+;;; The macro is actually defined directly in the module file.
+;;; (define-syntax if-sre-form
+;;;   (lambda (exp r c)
+;;;     (if (sre-form? (cadr exp) r c)
+;;;         (caddr exp)
+;;;         (cadddr exp))))
+
+
+;;; (RX re ...)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; The basic SRE form.
+
+(define (expand-rx exp r c)
+    (let ((re (simplify-regexp (parse-sres (cdr exp) r c))))
+
+      ;; If it's static, pre-compute the Posix string & tvec now,
+      ;; so the re->scheme unparser will find it and toss it into 
+      ;; the constructor. We do this only for the top-level regexp.
+      (if (static-regexp? re) (compile-regexp re))
+
+      (regexp->scheme re r)))
+
+
+;(define-syntax rx (syntax-rules () ((rx stuff ...) (really-rx stuff ...))))
+;(define-syntax really-rx
+;  (syntax-rules () ((really-rx stuff ...) (rx/cs stuff ...))))
+;
+;(define-syntax rx/cs (lambda (exp r c) (expand-rx exp #t r c)))
+;(define-syntax rx/ci (lambda (exp r c) (expand-rx exp #f r c)))
+;
+;(define-syntax case-sensitive
+;  (lambda (exp r c)
+;    (let ((%ls (r 'let-syntax))
+;	  (%really-rx (r 'really-rx))
+;	  (%sr (r 'syntax-rules))
+;	  (%rx/cs (r 'rx/cs)))
+;    `(,ls ((,%really-rx (,sr () ((,%really-rx stuff ...) (,%rx/cs stuff ...)))))
+;       . ,(cdr exp)))))
+				 
diff --git a/scsh/rx/re.scm b/scsh/rx/re.scm
new file mode 100644
index 0000000..bf03461
--- /dev/null
+++ b/scsh/rx/re.scm
@@ -0,0 +1,592 @@
+;;; The regexp data type
+;;;     Olin Shivers, January 1997, May 1998.
+
+;;;       A DSM around a choice gets absorbed into the choice's first elt.
+;;;         But this prevents it from being moved out into a containing
+;;;         choice or seq elt, or outer DSM. Fix.
+
+;;; A regexp is a: dsm, submatch, seq, choice, repeat, 
+;;;                char-set, string, bos, eos
+
+;;; Deleted sub-match regexp
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This stands for a regexp containing TSM submatches, of which
+;;; PRE-DSM come first as dead submatches, then the regexp BODY with its 
+;;; submatches, then POST-DSM as dead submatches.
+
+(define-record-type re-dsm :re-dsm
+  (%%make-re-dsm body pre-dsm tsm posix)
+  re-dsm?
+  (body     re-dsm:body)		; A Regexp
+  (pre-dsm  re-dsm:pre-dsm)		; Integer -- initial dead submatches
+  (tsm      re-dsm:tsm)					; Total submatch count
+  (posix    re-dsm:posix set-re-dsm:posix))		; Posix bits
+
+(define (%make-re-dsm body pre-dsm tsm) (%%make-re-dsm body pre-dsm tsm #f))
+
+;;; This is only used in code that the (RX ...) macro produces 
+;;; for static regexps.
+(define (%make-re-dsm/posix body pre-dsm tsm posix-str tvec)
+  (%%make-re-dsm body pre-dsm tsm (new-cre posix-str tvec)))
+
+(define (make-re-dsm body pre-dsm post-dsm)
+  (%make-re-dsm body pre-dsm (+ post-dsm pre-dsm (re-tsm body))))
+
+;;; "Virtual field" for the RE-DSM record -- how many dead submatches 
+;;; come after the body:
+
+(define (re-dsm:post-dsm re)		; Number of post-body DSM's =
+  (- (re-dsm:tsm re)			;   total submatches
+     (+ (re-dsm:pre-dsm re)		;   minus pre-body dead submatches
+	(re-tsm (re-dsm:body re)))))	;   minus body's submatches.
+
+;;; Slightly smart DSM constructor:
+;;; - Absorb this DSM into an inner dsm, or submatch.
+;;; - Punt unnecessary DSM's.
+
+(define (re-dsm body pre-dsm post-dsm)
+  (let ((tsm (+ pre-dsm (re-tsm body) post-dsm)))
+    (receive (body1 pre-dsm1) (open-dsm body)
+      (let ((pre-dsm (+ pre-dsm pre-dsm1)))
+
+	(? ((= tsm (re-tsm body1)) body1)		; Trivial DSM
+
+	   ((re-submatch? body1)			; Absorb into submatch.
+	    (%make-re-submatch (re-submatch:body body1)
+			       (+ pre-dsm (re-submatch:pre-dsm body1))
+			       tsm))
+
+	   (else (%make-re-dsm body1 pre-dsm tsm)))))))	; Non-trivial DSM
+
+;;; Take a regexp RE and return an equivalent (re', pre-dsm) pair of values.
+;;; Recurses into DSM records. It is the case that 
+;;;   (<= (+ pre-dsm (re-tsm re')) (re-tsm re))
+;;; The post-dsm value is (- (re-tsm re) (re-tsm re') pre-dsm).
+
+(define (open-dsm re)
+  (let lp ((re re) (pre-dsm 0))
+    (if (re-dsm? re)
+	(lp (re-dsm:body re) (+ pre-dsm (re-dsm:pre-dsm re)))
+	(values re pre-dsm))))
+
+
+
+;;; Sequence: (& re ...)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-record-type re-seq :re-seq
+  (%%make-re-seq elts tsm posix)
+  re-seq?
+  (elts   re-seq:elts)				; Regexp list 
+  (tsm    re-seq:tsm)				; Total submatch count
+  (posix  re-seq:posix set-re-seq:posix))	; Posix record
+
+(define (%make-re-seq elts tsm) (%%make-re-seq elts tsm #f))
+
+;;; This is only used in code that (RE ...) macro produces for static regexps.
+(define (%make-re-seq/posix elts tsm posix-str tvec)
+  (%%make-re-seq elts tsm (new-cre posix-str tvec)))
+
+(define (make-re-seq res)
+  (%make-re-seq res
+		(fold (lambda (re sm-count) (+ (re-tsm re) sm-count))
+		      0 res)))
+
+;;; Slightly smart sequence constructor:
+;;; - Flattens nested sequences
+;;; - Drops trivial "" elements
+;;; - Empty sequence => ""
+;;; - Singleton sequence is reduced to its one element.
+;;; - We don't descend into DSM's; too much work for this routine.
+
+(define (re-seq res)
+  (let ((res (let recur ((res res)) 	; Flatten nested seqs & drop ""'s.
+	       (if (pair? res)
+		   (let* ((re (car res))
+			  (tail (recur (cdr res))))
+		     (? ((re-seq? re)		; Flatten nested seqs
+			 (append (recur (re-seq:elts re)) tail))
+			((re-trivial? re) tail)	; Drop trivial elts
+			(else (cons re tail))))
+		   '()))))
+
+    (if (pair? res)
+	(if (pair? (cdr res))
+	    (make-re-seq res)		; General case
+	    (car res))			; Singleton sequence
+	re-trivial)))			; Empty seq -- ""
+
+
+;;; Choice: (| re ...)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define-record-type re-choice :re-choice
+  (%%make-re-choice elts tsm posix)
+  re-choice?
+  (elts re-choice:elts)				; List of rel-items
+  (tsm  re-choice:tsm)				; Total submatch count
+  (posix  re-choice:posix set-re-choice:posix))	; Posix string
+
+(define (%make-re-choice elts tsm) (%%make-re-choice elts tsm #f))
+
+;;; This is only used in code that (RE ...) macro produces for static regexps.
+(define (%make-re-choice/posix elts tsm posix-str tvec)
+  (%%make-re-choice elts tsm (new-cre posix-str tvec)))
+
+(define (make-re-choice res)
+  (%make-re-choice res
+		   (fold (lambda (re sm-count) (+ (re-tsm re) sm-count))
+			 0 res)))
+
+;;; Slightly smart choice constructor:
+;;; - Flattens nested choices
+;;; - Drops empty (impossible) elements
+;;; - Empty choice => empty-match
+;;; - Singleton choice is reduced to its one element.
+;;; - We don't descend into DSM's; too much work for this routine.
+;;;
+;;; This routine guarantees to preserve char-classness -- if it is applied
+;;; to a list of char-class regexps (char-set and singleton-string re's),
+;;; it will return a char-class regexp.
+
+(define (re-choice res)
+  (let ((res (let recur ((res res)) 	; Flatten nested choices
+	       (if (pair? res)		; & drop empty re's.
+		   (let* ((re (car res))
+			  (tail (recur (cdr res))))
+		     (? ((re-choice? re)	    ; Flatten nested choices
+			 (append (recur (re-choice:elts re)) tail))
+			((re-empty? re) tail)	    ; Drop empty re's.
+			(else (cons re tail))))
+		   '()))))
+    ;; If all elts are char-class re's, fold them together.
+    (if (every static-char-class? res)
+	(let ((cset (apply char-set-union
+			   (map (lambda (elt)
+				  (if (re-char-set? elt)
+				      (re-char-set:cset elt)
+				      (string->char-set (re-string:chars elt))))
+				res))))
+	  (if (= 1 (char-set-size cset))
+	      (make-re-string (apply string (char-set-members cset)))
+	      (make-re-char-set cset)))
+
+	(if (pair? res)
+	    (if (pair? (cdr res))
+		(make-re-choice res)	; General case
+		(car res))		; Singleton sequence
+	    re-empty))))		; Empty choice = ("")
+
+;;; Repetition (*,?,+,=,>=,**)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; The repeat record's body contains all of the repeat record's submatches --
+;;; there is no pre-dsm field allowing for initial & trailing dead submatches.
+;;; This is not a limit on expressiveness because repeat commutes with dsm --
+;;; we can always move submatches that come before and after body to an outer
+;;; DSM. Hence
+;;;     (= (re-repeat:tsm re) (re-tsm (re-repeat:body re)))
+
+(define-record-type re-repeat :re-repeat
+  (%%make-re-repeat from to body tsm posix)
+  re-repeat?
+  (from  re-repeat:from)	; Integer    (Macro expander abuses.)
+  (to    re-repeat:to)		; Integer or #f for infinite	(Macro expander abuses.)
+  (body  re-repeat:body)	; Regexp
+  (tsm   re-repeat:tsm)		; Total submatch count
+  (posix re-repeat:posix set-re-repeat:posix))		; Posix record
+
+(define (%make-re-repeat from to body tsm)
+  (%%make-re-repeat from to body tsm #f))
+
+;;; This is only used in code that (RE ...) macro produces for static regexps.
+(define (%make-re-repeat/posix from to body tsm posix-str tvec)
+  (%%make-re-repeat from to body tsm (new-cre posix-str tvec)))
+
+(define (make-re-repeat from to body)
+  (%make-re-repeat  (check-arg (lambda (from)
+				 (or (not (integer? from)) ; Dynamic
+				     (>= from 0)))
+			       from
+			       make-re-repeat)
+		    (check-arg (lambda (to)
+				 (or (not (integer? to)) ; #f or dynamic
+				     (and (integer? to) (>= to 0))))
+			       to
+			       make-re-repeat)
+		    body
+		    (re-tsm body)))
+
+;;; Slightly smart repeat constructor
+;;; - Flattens nested repeats.
+;;; - re{1,1}, re{0,0}, and re{m,n} where m>n reduced.
+;;; - If re is empty-match: from=0 => "", from>0 => empty-match.
+;;; - If re is eos, bos, or "", and to <= from, reduce to simply re.
+;;; - Commutes into DSM records.
+
+(define (re-repeat from to body)
+  (receive (re pre-dsm) (reduce-repeat from to body 0)
+    (re-dsm re pre-dsm (- (re-tsm body) (+ pre-dsm (re-tsm re))))))
+
+;;; This guy does all the work (and is also called by the repeat simplifier)
+
+(define (reduce-repeat from to body pre-dsm)
+  (receive (from to body1 pre-dsm)
+           ;; Collapse nested repeats and dsm's:
+           (let iter ((from from) (to to) (body body) (dsm0 pre-dsm))
+	     (receive (body body-dsm0) (open-dsm body)
+	       (let ((dsm0 (+ dsm0 body-dsm0)))
+		 (if (and (integer? from)		; Stop if FROM or TO
+			  (or (not to) (integer? to))	; are code.
+			  (re-repeat? body))
+		     (let ((bfrom (re-repeat:from body))
+			   (bto (re-repeat:to body))
+			   (bbody (re-repeat:body body)))
+		       (if (or (not (integer? bfrom))        ; Stop if bfrom or
+			       (and bto (not (integer? bto)))) ; bto are code.
+			   (values from to body dsm0)
+			   (iter (* from bfrom)
+				 (and to bto (* to bto))
+				 bbody
+				 dsm0)))
+		     (values from to body dsm0)))))
+
+    (? ((and (eqv? from 1) (eqv? to 1))		; re{1,1} => re
+	(values body1 pre-dsm))
+
+       ((and (eqv? from 0) (eqv? to 0))		; re{0,0} => ""
+	(values re-trivial (+ (re-tsm body1) pre-dsm)))
+
+       ;; re{m,n} => re-empty when m>n:
+       ((and (integer? from) (integer? to) (> from to))
+	(values re-empty (+ (re-tsm body1) pre-dsm)))
+
+       ;; Reduce the body = re-empty case.
+       ((and (re-empty? body1) (integer? from))		; (+ (in)) => (in)
+	(values (if (> from 0) re-empty re-trivial)	; (* (in)) => ""
+		pre-dsm))
+
+       ;; If BODY1 is eos, bos, or "", and m<=n, reduce to simply BODY1.
+       ((and (integer? from)
+	     (or (and (integer? to) (<= from to)) (not to))
+	     (or (re-eos? body1)
+		 (re-bos? body1)
+		 (and (re-string? body1)
+		      (string=? "" (re-string:chars body1)))))
+	(values body1 pre-dsm))
+
+       (else (values (make-re-repeat from to body1)	; general case
+		     pre-dsm)))))
+
+
+
+;;; Submatch
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; A submatch record introduces a new submatch. This is followed by
+;;; PRE-DSM dead submatches (caused by simplifying the body), then the
+;;; BODY, then perhaps more dead submatches, all for a total of TSM
+;;; submatches.
+
+(define-record-type re-submatch :re-submatch
+  (%%make-re-submatch body pre-dsm tsm posix)
+  re-submatch?
+  (body re-submatch:body)	; Regexp
+  (pre-dsm re-submatch:pre-dsm)	; Deleted submatches preceding the body
+  (tsm  re-submatch:tsm)	; Total submatch count for the record
+  (posix re-submatch:posix set-re-submatch:posix)) ; Posix string
+
+(define (%make-re-submatch body pre-dsm tsm)
+  (%%make-re-submatch body pre-dsm tsm #f))
+
+;;; This is only used in code that (RE ...) macro produces for static regexps.
+(define (%make-re-submatch/posix body pre-dsm tsm posix-str tvec)
+  (%%make-re-submatch body pre-dsm tsm (new-cre posix-str tvec)))
+
+
+;;; "Virtual field" for the RE-SUBMATCH record -- how many dead submatches 
+;;; come after the body:
+
+(define (re-submatch:post-dsm re)	 ; Number of post-body DSM's =
+  (- (re-submatch:tsm re)		 ;   total submatches
+     (+ 1				 ;   minus *this* submatch
+	(re-submatch:pre-dsm re)	 ;   minus pre-body dead submatches
+	(re-tsm (re-submatch:body re)))));   minus body's submatches.
+
+(define (make-re-submatch body . maybe-pre+post-dsm)
+  (let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
+    (%make-re-submatch body pre-dsm (+ pre-dsm 1 (re-tsm body) post-dsm))))
+
+;;; Slightly smart submatch constructor
+;;; - DSM's unpacked
+;;; - If BODY is the re-empty, we'll never match, so just produce a DSM.
+
+(define (re-submatch body . maybe-pre+post-dsm)
+  (let-optionals maybe-pre+post-dsm ((pre-dsm 0) (post-dsm 0))
+    (let ((tsm (+ 1 pre-dsm (re-tsm body) post-dsm)))
+      (receive (body1 pre-dsm1) (open-dsm body)
+	(if (re-empty? body1)
+	    (re-dsm re-empty tsm 0)
+	    (%make-re-submatch body1 (+ pre-dsm pre-dsm1) tsm))))))
+
+
+
+;;; Other regexps : string, char-set, bos & eos
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Also, re-empty and re-trivial.
+
+(define-record re-string
+  chars			; String
+  (posix #f)		; Posix record
+  ((disclose self) (list "re-string" (re-string:chars self))))
+
+(define re-string make-re-string)	; For consistency w/other re makers.
+
+;;; This is only used in code that (RE ...) macro produces for static regexps.
+(define (make-re-string/posix chars posix-str tvec)
+  (let ((re (make-re-string chars)))
+    (set-re-string:posix re (new-cre posix-str tvec))
+    re))
+
+;;; Matches the empty string anywhere.
+(define re-trivial (make-re-string/posix "" "" '#()))
+
+(define (re-trivial? re)
+  (and (re-string? re) (zero? (string-length (re-string:chars re)))))
+
+(define-record re-char-set
+  cset			; A character set	(Macro expander abuses.)
+  (posix    #f))	; Posix record
+
+(define re-char-set make-re-char-set)	; For consistency w/other re makers.
+
+;;; This is only used in code that (RE ...) macro produces for static regexps.
+(define (make-re-char-set/posix cs posix-str tvec)
+  (let ((re (make-re-char-set cs)))
+    (set-re-char-set:posix re (new-cre posix-str tvec))
+    re))
+
+;;; Never matches
+;;; NEED TO OPTIMIZE - PRE-SET POSIX FIELD.
+(define re-empty (make-re-char-set char-set:empty))
+
+(define (re-empty? re)
+  (and (re-char-set? re)
+       (let ((cs (re-char-set:cset re)))
+	 (and (char-set? cs) ; Might be code...
+	      (char-set-empty? cs)))))
+
+(define-record re-bos)	(define re-bos (make-re-bos))
+(define-record re-eos)  (define re-eos (make-re-eos))
+
+(define-record re-bol)  (define re-bol (make-re-bol))
+(define-record re-eol)  (define re-eol (make-re-eol))
+
+(define-record re-bow)  (define re-bow (make-re-bow))
+(define-record re-eow)  (define re-eow (make-re-eow))
+
+
+(define re-any (make-re-char-set/posix char-set:full "." '#()))
+
+(define (re-any? re)
+  (and (re-char-set? re)
+       (let ((cs (re-char-set:cset re)))
+	 (and (char-set? cs) ; Might be code...
+	      (char-set-full? cs)))))
+
+(define re-nonl (make-re-char-set/posix (char-set #\newline) "[^\n]" '#()))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define (regexp? x)
+  (or (re-seq? x)      (re-choice? x)   (re-repeat? x)
+      (re-char-set? x) (re-string? x)
+      (re-bos? x)      (re-eos? x)
+      (re-bol? x)      (re-eol? x)
+      (re-bow? x)      (re-eow? x)
+      (re-submatch? x) (re-dsm? x)))
+
+
+;;; Return the total number of submatches bound in RE.
+
+(define (re-tsm re)
+  (? ((re-seq? re)      (re-seq:tsm re))
+     ((re-choice? re)   (re-choice:tsm re))
+     ((re-repeat? re)   (re-repeat:tsm re))
+     ((re-dsm? re)      (re-dsm:tsm re))
+     ((re-submatch? re) (re-submatch:tsm re))
+     (else 0)))
+
+
+(define re-word
+  (let ((wcs (char-set-union char-set:alphanumeric	; Word chars
+			     (char-set #\_))))
+    (make-re-seq (list re-bow
+		       (make-re-repeat 1 #f (make-re-char-set wcs))
+		       re-eow))))
+
+
+;;; (flush-submatches re)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Return regular expression RE with all submatch-binding elements
+;;; stripped out -- (= 0 (re-tsm (flush-submatches re))).
+
+(define (flush-submatches re)
+  (? ((zero? (re-tsm re)) re)		; RE has no submatches.
+
+     ((re-seq?    re) (re-seq    (map flush-submatches (re-seq:elts    re))))
+     ((re-choice? re) (re-choice (map flush-submatches (re-choice:elts re))))
+
+     ((re-repeat? re) (re-repeat (re-repeat:from re)
+				 (re-repeat:to re)
+				 (flush-submatches (re-repeat:body re))))
+		  
+     ((re-submatch? re) (flush-submatches (re-submatch:body re)))
+     ((re-dsm? re)      (flush-submatches (re-dsm:body      re)))
+
+     (else re)))
+
+
+;;; Map F over ELTS. (F x) returns two values -- the "real" return value,
+;;; and a "changed?" flag. If CHANGED? is false, then the "real" return value
+;;; should be identical to the original argument X. MAP/CHANGED constructs
+;;; the mapped list sharing as long an unchanged tail as possible with the
+;;; list ELTS; if F changes no argument, MAP/CHANGED returns exactly the list
+;;; ELTS. MAP/CHANGED returns two values: the mapped list, and a changed? 
+;;; flag for the entire list.
+
+(define (map/changed f elts)
+  (let recur ((elts elts))
+    (if (pair? elts)
+	(let ((elt (car elts)))
+	  (receive (new-elts elts-changed?) (recur (cdr elts))
+	    (receive (new-elt elt-changed?) (f elt)
+	      (if (or elts-changed? elt-changed?)
+		  (values (cons new-elt new-elts) #t)
+		  (values elts #f)))))
+	  (values '() #f))))
+
+
+(define (uncase re)
+  (receive (new-re changed?)
+           (let recur ((re re))
+	     (? ((re-seq? re)
+		 (let ((elts (re-seq:elts re)))
+		   (receive (new-elts elts-changed?)
+		            (map/changed recur elts)
+		     (if elts-changed?
+			 (values (%make-re-seq new-elts (re-seq:tsm re)) #t)
+			 (values re #f)))))
+
+		((re-choice? re)
+		 (let ((elts (re-choice:elts re)))
+		   (receive (new-elts elts-changed?)
+			    (map/changed recur elts)
+		     (if elts-changed?
+			 (values (re-choice new-elts) #t)
+			 (values re #f)))))
+
+		  ((re-char-set? re)
+		   (let* ((cs (re-char-set:cset re))
+			  (new-cs (uncase-char-set cs))) ; Better not be code.
+		     (if (char-set= cs new-cs)
+			 (values re #f)
+			 (values (make-re-char-set new-cs) #t))))
+
+		  ((re-repeat? re)
+		   (receive (new-body body-changed?) (recur (re-repeat:body re))
+		     (if body-changed?
+			 (values (re-repeat (re-repeat:from re)
+					    (re-repeat:to re)
+					    new-body)
+				 #t)
+			 (values re #f))))
+
+		  ((re-submatch? re)
+		   (receive (new-body body-changed?) (recur (re-submatch? re))
+		     (if body-changed?
+			 (values (%make-re-submatch new-body
+						    (re-submatch:pre-dsm re)
+						    (re-submatch:tsm     re))
+				 #t)
+			 (values re #f))))
+		  
+		  ((re-string? re)
+		   (let ((cf-re (uncase-string (re-string:chars re))))
+		     (if (re-string? cf-re)
+			 (values re    #f)
+			 (values cf-re #t))))
+
+		  (else (values re #f))))
+    new-re))
+		  
+     
+;;; (uncase-char-set cs)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Return a char-set cs' such that cs' contains every char c in cs in both
+;;; its upcase and downcase form.
+
+(define (uncase-char-set cs)
+  (char-set-fold (lambda (c new-cset)
+		   (char-set-adjoin! new-cset
+				     (char-downcase c)
+				     (char-upcase c)))
+		 (char-set-copy char-set:empty)
+		 cs))
+
+
+;;; I actually make an effort to keep this a re-string
+;;; if possible (if the string contains no case-sensitive
+;;; characters). Returns a regexp matching the string in
+;;; a case-insensitive fashion.
+
+(define (uncase-string s)
+  ;; SEQ is a list of chars and doubleton char-sets.
+  (let* ((seq (string-fold-right (lambda (c lis)
+				   (cons (? ((char-lower-case? c) (char-set c (char-upcase   c)))
+					    ((char-upper-case? c) (char-set c (char-downcase c)))
+					    (else c))
+					 lis))
+			    '() s))
+
+	 ;; Coalesce adjacent chars together into a string.
+	 (fixup (lambda (chars seq)
+		  (if (pair? chars)
+		      (cons (make-re-string (list->string (reverse chars)))
+			    seq)
+		      seq)))
+
+	 (new-seq (let recur ((seq seq) (chars '()))
+		    (if (pair? seq)
+			(let ((elt (car seq))
+			      (seq (cdr seq)))
+			  (if (char? elt)
+			      (recur seq (cons elt chars))
+			      (fixup chars (cons (make-re-char-set elt)
+						 (recur seq '())))))
+			(fixup chars '())))))
+
+    (if (= 1 (length new-seq)) (car new-seq)
+	(make-re-seq new-seq))))
+
+
+		     
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+(define char-set-full?
+  (let ((allchars-nchars (char-set-size char-set:full)))
+    (lambda (cs) (= allchars-nchars (char-set-size cs)))))
+
+(define (char-set-empty? cs) (zero? (char-set-size cs)))
+
+
+;;; A "char-class" re is either a char-set re or a string re whose string
+;;; has only one character.
+
+(define (re-char-class? re)
+  (or (re-char-set? re)
+      (and (re-string? re)
+	   (= 1 (string-length (re-string:chars re))))))
+
+(define (static-char-class? re)
+  (or (and (re-char-set? re)
+	   (char-set? (re-char-set:cset re)))	; This might be code.
+      (and (re-string? re)			; But never this, so no check.
+	   (= 1 (string-length (re-string:chars re))))))
diff --git a/scsh/rx/re1.c b/scsh/rx/re1.c
new file mode 100644
index 0000000..d5d020d
--- /dev/null
+++ b/scsh/rx/re1.c
@@ -0,0 +1,173 @@
+/* Scheme48 interface to Henry Spencer's Posix regular expression package.
+** Copyright (c) 1993, 1994, 1998 by Olin Shivers.
+*/
+
+/* Todo:
+**   not_eol not_bol support on searchers
+**   error code -> err msg
+**   regex freeing
+**   regexp-string -> regex_t caching
+**   make filter_stringvec return an error code.
+*/
+
+#include 
+#include 
+#include "../regexp/regex.h"
+#include "../cstuff.h"
+
+/* Make sure our exports match up w/the implementation: */
+#include "re1.h"
+
+/*
+** Compile regexp into a malloc'd struct.
+** The flag sm_p is true if we want to compile for submatches.
+** On success, store pointer to struct into cr and return 0.
+** On failure, free the struct, store NULL into cr, 
+**   and return a non-zero error code.
+*/
+
+int compile_re(scheme_value re_str, int sm_p, regex_t **cr)
+{
+    char *s = &STRING_REF(re_str, 0);
+    int len = STRING_LENGTH(re_str);
+    int err;
+    regex_t *re = Alloc(regex_t);
+
+    if( !re ) return -1;
+    
+    re->re_endp = s + len;
+    err = regcomp(re, s, REG_EXTENDED | REG_PEND
+		                      | (sm_p ? 0 : REG_NOSUB));
+    if( err ) {Free(re); *cr=0;}
+    else *cr=re;
+
+    return err;
+    }
+
+/* Do a regex search of RE through string STR, beginning at STR[START].
+** - STR is passed as a Scheme value as it is allowed to contain nul bytes.
+**
+** - trans_vec contains the translation from the user's "virtual" submatches to
+**   the actual submatches the engine will report:
+**   - trans_vec[i] = #F means user submatch #i is a dead submatch.
+**   - trans_vec[i] = j  means user submatch #i corresponds to paren #j in re.
+**
+**   Indexing fence-posts are a little complicated due to the fact that you
+**   get an extra match elt back from the matcher -- match 0 is not a 
+**   paren-based *sub*match, but rather the match info for the whole thing.
+**
+**   So, here is how it works: 
+**     length(start_vec) = length(end_vec) = length(trans_vec) + 1
+**   because trans_vec doesn't have a translation for submatch 0, which
+**   is SRE submatch #0 => Posix submatch #0. For SRE submatch #i (1, 2, ...),
+**   we want the submatch associated with Posix paren # trans_vec[i-1].
+**
+** - MAX_PSM is the maximum paren in which we have submatch interest -- the
+**   max element in TRANS_VEC. Any parens after paren #MAX_PSM are just for
+**   grouping, not for marking submatches. We only have to allocate MAX_PSM+1
+**   elements in the submatch vector we pass into the search engine. If
+**   MAX_PSM = -1, then we don't even want the whole-match match bounds, which
+**   is really good -- the search engine can really fly in this case.
+**
+**   If we match, map re's submatches over to the exported start_vec and 
+**   end_vec match vectors using trans_vec.
+**
+** Return 0 on success; #f if no match; non-zero integer error code otherwise.
+*/
+
+scheme_value re_search(const regex_t *re, scheme_value str, int start,
+		       scheme_value trans_vec, int max_psm,
+		       scheme_value start_vec, scheme_value end_vec)
+{
+  char *s = &STRING_REF(str,0);		/* Passed as a scheme_value because */
+  int len = STRING_LENGTH(str);		/* it might contain nul bytes.      */
+
+  int vlen = VECTOR_LENGTH(start_vec);
+  int retval;
+
+  regmatch_t static_pmatch[10], *pm;
+
+  /* If max_psm+1 > 10, we can't use static_pmatch. */
+  if( max_psm < 10 ) pm = static_pmatch;
+  else {
+      pm = Malloc(regmatch_t, max_psm+1);/* Add 1 for the whole-match info. */
+      if( !pm ) return ENTER_FIXNUM(-1);
+      }
+  
+  pm[0].rm_so = start;
+  pm[0].rm_eo = len;
+
+  retval = regexec(re, s, max_psm+1, pm, REG_STARTEND);	/* Do it. */
+
+  /* We matched and have match-bound info, so translate it over. */
+  if( !retval && max_psm >= 0 ) { 
+      int i;
+
+      VECTOR_REF(start_vec,0) = ENTER_FIXNUM(pm[0].rm_so);    /* whole-match */
+      VECTOR_REF(end_vec,0)   = ENTER_FIXNUM(pm[0].rm_eo);
+
+      for( i=vlen-1; --i >= 0; ) {			      /* submatches  */
+	  scheme_value j_scm = VECTOR_REF(trans_vec,i);
+	  if( j_scm != SCHFALSE ) {
+	      int j = EXTRACT_FIXNUM(j_scm);
+	      int k = pm[j].rm_so,
+		  l = pm[j].rm_eo;
+	      VECTOR_REF(start_vec,i+1) = (k != -1) ? ENTER_FIXNUM(k) : SCHFALSE;
+	      VECTOR_REF(end_vec,  i+1) = (l != -1) ? ENTER_FIXNUM(l) : SCHFALSE;
+	      }
+	  }
+      }
+
+  if( max_psm >= 10 ) Free(pm);
+
+  if( retval==REG_NOMATCH ) return SCHFALSE;
+  if( ! retval ) return SCHTRUE;
+  return ENTER_FIXNUM(retval);
+  }
+
+
+
+/* Filter a vector of strings by regexp RE_STR.
+** Stringvec is a NULL-terminated vector of strings;
+** filter it in-place, copying the survivors back to compact them.
+** Put the number of survivors in nummatch.
+*/
+
+int filter_stringvec(scheme_value re_str, char const **stringvec)
+{
+  int re_len     = STRING_LENGTH(re_str);/* Passed as a scheme_value because */
+  char *re_chars = &STRING_REF(re_str,0);/* it might contain nul bytes.      */
+  regex_t re;
+
+  char const **p, **q;
+
+  /* REG_NOSUB -- We just want to know if it matches or not. */
+  re.re_endp = re_chars + re_len;
+  if( regcomp(&re, re_chars, REG_EXTENDED | REG_PEND | REG_NOSUB) ) {
+      return 0;
+      }
+
+  for(p=q=stringvec; *p; p++) {
+      char const *s = *p;
+      if( ! regexec(&re, s, 0, 0, 0) ) *q++ = s;
+      }
+
+  regfree(&re);
+  return q-stringvec;
+  }
+
+
+const char *re_errint2str(int errcode, const regex_t *re)
+{
+  int size = regerror(errcode, re, 0, 0);
+  char *s = Malloc(char,size);
+  if(s) regerror(errcode, re, s, size);
+  return s;
+  }
+
+
+void free_re(regex_t *re)
+{
+  regfree(re);
+  Free(re);
+  }
diff --git a/scsh/rx/re1.h b/scsh/rx/re1.h
new file mode 100644
index 0000000..141dd75
--- /dev/null
+++ b/scsh/rx/re1.h
@@ -0,0 +1,15 @@
+/* Exports from re1.c */
+
+int compile_re(scheme_value sre, int sm_p, regex_t **cr);
+
+scheme_value re_search(const regex_t *re, scheme_value str, int start,
+		       scheme_value trans_vec, int max_psm,
+		       scheme_value start_vec, scheme_value end_vec);
+
+/* Filter a vector of strings by a regexp. */
+int filter_stringvec(scheme_value re_str, char const **stringvec);
+
+/* Error code -> error msg */
+const char *re_errint2str(int errcode, const regex_t *re);
+
+void free_re(regex_t *re);	/* Free the malloc'd regexp. */
diff --git a/scsh/rx/regress.scm b/scsh/rx/regress.scm
new file mode 100644
index 0000000..4f91417
--- /dev/null
+++ b/scsh/rx/regress.scm
@@ -0,0 +1,5 @@
+;;; We keep the regression tests here.
+;;; If I had any.
+
+(define sre+posix
+  '())
diff --git a/scsh/rx/rx-lib.scm b/scsh/rx/rx-lib.scm
new file mode 100644
index 0000000..9bb245b
--- /dev/null
+++ b/scsh/rx/rx-lib.scm
@@ -0,0 +1,40 @@
+;;; Procedures that appear in code produced by (RX ...).
+
+;;; In sexp syntax, a , or ,@ form may evaluate to a string, char,
+;;; char-set, or regexp value. Coerce one of these to a regexp value.
+
+(define (coerce-dynamic-regexp x)
+  (? ((string? x)   (make-re-string x))
+     ((char? x)     (make-re-string (string x)))
+     ((char-set? x) (make-re-char-set x))
+     ((regexp? x) x)
+     (else (error "Cannot coerce value to regular expression." x))))
+
+;;; In a char-set context (e.g., as an operand of the SRE - operator), 
+;;; a , or form must be coercable to a char-set.
+
+(define (coerce-dynamic-charset x)
+  (? ((string? x)
+      (if (= 1 (string-length x)) (string->char-set x)
+	  (error "Multi-char string not allowed as , or ,@ SRE in char-class context."
+		 x)))
+     ((char? x)     (char-set x))
+     ((char-set? x) x)
+     ((re-char-set? x) (re-char-set:cset x))
+     (else (error "Cannot coerce value to character set" x))))
+
+
+(define (spec->char-set in? loose ranges)
+  (let ((doit (lambda (loose ranges)
+		(fold (lambda (r cset)
+			(let ((from (char->ascii (car r)))
+			      (to (char->ascii (cdr r))))
+			  (do ((i from (+ i 1))
+			       (cs cset (char-set-adjoin! cs (ascii->char i))))
+			      ((> i to) cs))))
+		      (string->char-set loose)
+		      ranges))))
+    (if in?
+	(doit loose ranges)
+	(char-set-invert! (doit loose ranges)))))
+
diff --git a/scsh/rx/rx-notes b/scsh/rx/rx-notes
new file mode 100644
index 0000000..5b65ed0
--- /dev/null
+++ b/scsh/rx/rx-notes
@@ -0,0 +1,14 @@
+Notes on Tom Lord's rx regexp package.
+
+API info nodes should give C prototypes. re_compile_pattern doesn't.
+
+Many thread-reentrancy problems:
+    - syntax var is a shared global. Needs to be passed as arg to 
+      pattern compiler -- which could easily be done by simply having
+      the client pre-set the pat->syntax field.
+
+    - regs field should not be a part of the pattern -- you can't
+      use the pattern in multiple concurrent matches.
+
+      Similarly for pat->no_sub, pat->not_bol, pat->not-eol. These fields are
+      properly part of of the client's request, not the pattern.
diff --git a/scsh/rx/scope b/scsh/rx/scope
new file mode 100644
index 0000000..a63cfc6
--- /dev/null
+++ b/scsh/rx/scope
@@ -0,0 +1,41 @@
+(define-syntax color (syntax-rules () ((color) 'green)))
+
+(define-syntax with-blue
+  (lambda (exp r c)
+    `(,(r 'let-syntax) ((color (,(r 'syntax-rules) ()
+                                 ((color) 'blue))))
+       . ,(cdr exp))))
+
+(with-blue (color))
+
+;;; This has a problem --  WITH-BLUE is not hygenic:
+(let ((color (lambda () 'foo)))
+  (with-blue (color)))
+
+=> blue
+
+;;; Let's fix this by adding a layer of indirection --
+;;; 1. (color) ==> (hidden-color)
+;;; 2. with-blue frobs the definition of *our* hidden-color
+
+(define-syntax hidden-color (lambda (exp r c) `(,(r 'quote) green)))
+(define-syntax color        (lambda (exp r c) `(,(r 'hidden-color))))
+
+(define-syntax with-blue
+    (lambda (exp r c)
+      `(,(r 'let-syntax) 
+            ((,(r 'hidden-color) (,(r 'syntax-rules) ()
+                                   ((,(r 'hidden-color)) (,(r 'quote) blue)))))
+	. ,(cdr exp))))
+
+;;; Without all the renaming, the above is
+;;; (let-syntax ((hidden-color (syntax-rules () ((hidden-color 'blue)))))
+;;;   body ...)
+;;; where *all* symbols on the first line are renamed, *including* 
+;;; hidden-color, so we should be redefining the same hidden-color
+;;; to which (color) expands.
+
+;;; It doesn't work:
+(with-blue (color))
+=> green
+
diff --git a/scsh/rx/simp.notes b/scsh/rx/simp.notes
new file mode 100644
index 0000000..85d245b
--- /dev/null
+++ b/scsh/rx/simp.notes
@@ -0,0 +1,43 @@
+The simplifier produces regexps with some simple invariants:
+
+- DSM's are only top-level, never appearing in the body of a DSM,
+  repeat, sequence, choice, or submatch.
+
+- A repeat's body is not a repeat, trivial match, or empty match.
+
+- A choice's body contains more than one element; no element is
+  - a choice,
+  - a DSM, or
+  - an empty-match.
+  
+- A choice contains 0 or 1 char-set, bos, and eos elements.
+
+- A sequence's body contains more than one element; no element is
+  - a sequence,
+  - a DSM, 
+  - a trivial match, or
+  - an empty-match
+  
+- There are no empty matches in the regexp unless the entire regexp
+  is either an empty match, or a dsm whose body is an empty match.
+  (This is good, because there is no way to write an empty match
+  in Posix notation in a char-set independent way -- you have to
+  use the six-char "[^\000-\177]" for ASCII.)
+
+To see these invariants:
+
+- We can always bubble up empty matches:
+  - If a sequence has one, the whole sequence is reduced to an empty match.
+  - They can be deleted from a choice; if the choice reduces to 0 elements,
+    the choice can be reduced to an empty match.
+  - A repeat of an empty match is either an empty match or a trivial match,
+    depending upon whether FROM is >0 or 0, respectively.
+  - DSM of an empty match: the DSM itself can be bubbled upwards (see below).
+
+- We can always bubble up DSM regexps:
+  - If an elt of a choice or sequence is a DSM, it can be "absorbed"
+    into the element's relocation offset.
+  - Repeat commutes with DSM.
+  - A DSM body can be "absorbed" into a submatch record by increasing the
+    submatch's DSM0 count.
+  - Nested DSM's can be collapsed together.
diff --git a/scsh/rx/simp.scm b/scsh/rx/simp.scm
new file mode 100644
index 0000000..8d15347
--- /dev/null
+++ b/scsh/rx/simp.scm
@@ -0,0 +1,403 @@
+;;; Olin Shivers, June 1998
+;;; Copyright (c) 1998 by the Scheme Underground.
+
+;;; One export: (simplify-regexp re) -> re
+
+;;; Regexp simplifier
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; (| (in c1 ...) (in c2 ...) re ...) => (| (in c1 ... c2 ...) re ...)
+;;; (| (not-in c1 ...) (not-in c2 ...)) => (| (not-in [intersect (c1 ...)
+;;; 							     (c2 ...)])
+;;; A run of BOS's or a run of EOS's in a sequence may be elided.
+;;; Nested exponents can be collapsed (*, +, ?) -- multiply the "from's"
+;;;   together; multiply the "to's" together.
+;;; Exponent range [1,1] simplifies, as does [0,0].
+;;; Uniquify branches
+;;; Adjacent literals in a sequence can be collapsed
+;;; A singleton-char char class can be collapsed into a constant
+;;; Nested choices can be collapsed
+;;; Nested sequences can be collapsed
+;;; An empty sequence (:) can be turned into an empty-string match "".
+;;; Singleton choices and sequences can be reduced to their body.
+;;;
+;;; The simplifier is carefully written so that it won't blow up
+;;; when applied to a dynamic regexp -- that is, 
+;;; - a chunk of Scheme code that produces a regexp instead of
+;;;   an actual regexp value;
+;;; - a repeat regexp whose FROM or TO fields are chunks of Scheme code
+;;;   rather than integers; 
+;;; - a char-set regexp whose CSET field is a chunk of Scheme code rather
+;;;   than an actual char-set value.
+;;; This is useful because the RX macro can build such a regexp as part
+;;; of its expansion process.
+
+(define (simplify-regexp re)
+  (receive (simp-re pre-dsm) (simp-re re)
+    (re-dsm simp-re pre-dsm (- (re-tsm re) (+ (re-tsm simp-re) pre-dsm)))))
+
+(define (simp-re re)
+  (? ((re-string? re) (values re 0))
+     ((re-seq? re)    (simp-seq re))
+     ((re-choice? re) (simp-choice re))
+
+     ;; Singleton char-sets reduce to the character.
+     ;; Bear in mind the cset field might be Scheme code instead 
+     ;; of an actual char set if the regexp is dynamic.
+     ((re-char-set? re)
+      (values (let ((cs (re-char-set:cset re)))
+		(if (and (char-set? cs)
+			 (= 1 (char-set-size cs)))
+		    (make-re-string (string (car (char-set-members cs))))
+		    re))
+	      0))
+
+     ((re-repeat? re) (simp-repeat re))
+
+     ((re-submatch? re) (simp-submatch re))
+     ((re-dsm?      re) (simp-dsm      re))
+
+     (else (values re 0))))
+
+
+
+;;; If the body of a submatch is the empty re, reduce it to the empty re.
+
+(define (simp-submatch re)
+  (let ((tsm     (re-submatch:tsm     re))
+	(pre-dsm (re-submatch:pre-dsm re)))
+    (receive (body1 pre-dsm1) (simp-re (re-submatch:body re))
+      (if (re-empty? body1)
+	  (values re-empty tsm)
+	  (values (%make-re-submatch body1 (+ pre-dsm pre-dsm1) tsm)
+		  0)))))
+
+;;; - Flatten nested DSM's.
+;;; - Return pre-dsm field and body field as the two return values.
+
+(define (simp-dsm re)
+  (receive (body pre-dsm1) (simp-re (re-dsm:body re))
+    (values body (+ (re-dsm:pre-dsm re) pre-dsm1))))
+
+
+
+;;; Simplifying sequences
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; - Collapse nested sequences and DSM's.
+;;; - Merge adjacent strings, identical adjacent anchors (bos, eos, etc.).
+;;; - Bubble DSM's forwards past elts that don't contain live submatches.
+;;;   (Going past live submatches would switch the submatch indexes around,
+;;;   which would be an error). This helps to coalesce DSMs and if we bring
+;;;   them all the way to the front, we can pop them off and make them a 
+;;;   pre-dsm for the entire seq record. 
+;;; - If an elt is the re-empty, reduce the whole re to the empty re.
+;;; - Reduce singleton and empty seq.
+
+(define (simp-seq re)
+  (let ((tsm (re-seq:tsm re))
+	(elts (map simplify-regexp (re-seq:elts re))))
+    (if (pair? elts)
+
+	(call-with-current-continuation
+	 (lambda (abort)
+	   (receive (pre-dsm head tail) (simp-seq1 elts abort tsm)
+	     (values (if (pair? tail)
+			 (%make-re-seq (cons head tail) (- tsm pre-dsm))
+			 head)					; Singleton seq
+		     pre-dsm))))
+		     
+	(values re-trivial 0))))				; Empty seq
+
+
+;;; Simplify the non-empty sequence ELTS.
+;;; - Return the result split out into three values: 
+;;;   [head-elt-pre-dsm, head-elt, tail].
+;;; - If any elt is the empty (impossible) re, abort by calling
+;;;   (abort elt tsm). TSM is otherwise unused.
+
+(define (simp-seq1 elts abort tsm)
+  (let recur ((elt (car elts)) (elts (cdr elts)))
+    (receive (elt pre-dsm) (open-dsm elt)
+      (? ((re-seq? elt)					; Flatten nested seqs.
+	  (let ((sub-elts (re-seq:elts elt)))
+	    (recur (re-dsm (car sub-elts) pre-dsm 0)
+		   (append (cdr sub-elts) elts))))
+		  					
+	 ((re-empty? elt) (abort elt tsm))		; Bomb out on the empty
+							; (impossible) re.
+	 ((pair? elts)
+	  (receive (next-pre-dsm next tail)		; Simplify the tail,
+	           (recur (car elts) (cdr elts))	; then think about
+							; the head:
+	    ;; This guy is called when we couldn't find any other
+	    ;; simplification. If ELT contains live submatches, then
+	    ;; there really is nothing to be done at this step -- just
+	    ;; assemble the pieces together and return them. If ELT
+	    ;; *doesn't* contain any live submatches, do the same, but
+	    ;; bubble its following next-pre-dsm submatches forwards.
+	    (define (no-simp)
+	      (if (has-live-submatches? elt)
+		  (values pre-dsm elt (cons (re-dsm next next-pre-dsm 0) tail))
+		  (values (+ pre-dsm next-pre-dsm) elt (cons next tail))))
+
+	    ;; Coalesces two adjacent bol's, two adjacent eol's, etc.
+	    (define (coalesce-anchor anchor?)
+	      (if (and (anchor? elt) (anchor? next))
+		  (values (+ pre-dsm next-pre-dsm) elt tail)
+		  (no-simp)))
+
+	    (? ((re-trivial? elt)	; Drop trivial re's.
+		(values (+ pre-dsm next-pre-dsm) next tail))
+
+	       ;; Coalesce adjacent strings
+	       ((re-string? elt)
+		(if (re-string? next)
+		    (values (+ pre-dsm next-pre-dsm)
+			    (make-re-string (string-append (re-string:chars elt)
+							   (re-string:chars next)))
+			    tail)
+		    (no-simp)))
+
+	       ;; Coalesce adjacent bol/eol/bos/eos/bow/eow's.
+	       ((re-bol? elt) (coalesce-anchor re-bol?))
+	       ((re-eol? elt) (coalesce-anchor re-eol?))
+	       ((re-bos? elt) (coalesce-anchor re-bos?))
+	       ((re-eos? elt) (coalesce-anchor re-eos?))
+	       ((re-bow? elt) (coalesce-anchor re-bow?))
+	       ((re-eow? elt) (coalesce-anchor re-eow?))
+	       (else (no-simp)))))
+
+	 (else (values pre-dsm elt '()))))))
+
+
+
+;;; Simplifying choices
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; - Collapse nested choices and DSM's.
+;;; - Delete re-empty's.
+;;; - Merge sets; merge identical anchors (bos, eos, etc.).
+;;;   But you can't merge across an element that contains a live submatch,
+;;;   see below.
+;;; - A singleton string "c" is included into the char-set merge as a 
+;;;   singleton set.
+;;; - Bubble DSM's forwards past elts that don't contain live submatches.
+;;;   (Going past live submatches would switch the submatch indexes around,
+;;;   which would be an error). This helps to coalesce DSMs and if we bring
+;;;   them all the way to the front, we can pop them off and make them a 
+;;;   pre-dsm for the entire seq record. 
+;;; - Reduce singleton and empty choice.
+;;;
+;;; You have to be careful simplifying choices -- you can't merge two sets
+;;; that appear on different sides of an element containing a live submatch.
+;;; The problem is that the assignment of submatches breaks ties left-to-right.
+;;; So these aren't the same:
+;;;     (| (submatch "x") any)    (| any (submatch "x"))
+;;; The first assigns the submatch, the second doesn't -- the ANY gets credit.
+;;; We want to collapse multiple char-sets, bos's, and eos's, but we have
+;;; to deal with this issue. So
+;;; - When we coalesce anchors, we retain the *leftmost* one.
+;;; - We coalesce sets that appear between live-submatch boundaries.
+;;;   When we do this, we subtract from the set any char that was in
+;;;   an earlier coalesced char-set. If this gets us down to the empty set,
+;;;   we drop it. If it gets us down to a singleton set, we convert it into
+;;;   a singleton string.
+;;; Whew. I had to think about this one.
+
+(define (simp-choice re)
+  (let ((tsm (re-choice:tsm re)))
+
+    (receive (pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
+	     (simp-choice1 (map simplify-regexp (re-choice:elts re)))
+
+      (let ((tail (assemble-boundary-tail char-set:empty cset
+					  bos? eos? bol? eol? bow? eow?
+					  #f #f #f #f #f #f
+					  tail)))
+	(values (if (pair? tail)
+		    (if (pair? (cdr tail))
+			(%make-re-choice tail (- tsm pre-dsm))
+			(car tail))		; Singleton choice
+		    re-empty)			; Empty choice
+		pre-dsm)))))		
+
+
+
+;;; Given the return values from simp-choice1, this tacks all
+;;; the various pieces (CSET, BOS?, EOS?, etc.) onto the front of
+;;; TAIL. However, elements are not added onto TAIL that are already
+;;; described by PREV-CSET, PREV-BOS?, etc. -- they will be added onto
+;;; some earlier bit of the final result.
+
+(define (assemble-boundary-tail prev-cset cset
+				bos? eos? bol? eol? bow? eow? 
+				prev-bos? prev-eos?
+				prev-bol? prev-eol?
+				prev-bow? prev-eow? 
+				tail)
+  (let* ((cset (char-set-difference cset prev-cset))
+	 (numchars (char-set-size cset))
+	 (tail (if (and eos? (not prev-eos?)) (cons re-eos tail) tail))
+	 (tail (if (and eol? (not prev-eol?)) (cons re-eol tail) tail))
+	 (tail (if (and eow? (not prev-eow?)) (cons re-eow tail) tail))
+	 (tail (if (and bow? (not prev-bow?)) (cons re-bow tail) tail))
+	 (tail (if (and bol? (not prev-bol?)) (cons re-bol tail) tail))
+	 (tail (if (and bos? (not prev-bos?)) (cons re-bos tail) tail))
+	 (tail (? ((zero? numchars) tail)	; Drop empty char set.
+		  ((= 1 numchars)		; {c} => "c"
+		   (cons (make-re-string (string (car (char-set-members cset))))
+			 tail))
+		  (else (cons (make-re-char-set cset) tail)))))
+    tail))
+
+
+;;; Simplify the non-empty list of choices ELTS.
+;;; Return the result split out into the values
+;;;     [pre-dsm, cset, bos?, eos?, bol?, eol?, bow?, eow?, tail]
+
+(define (simp-choice1 elts)
+  (let recur ((elts elts)
+
+	      (prev-cset char-set:empty)	; Chars we've already seen.
+
+	      (prev-bos? #f) (prev-eos? #f)	; These flags say if we've
+	      (prev-bol? #f) (prev-eol? #f)	; already seen one of these
+	      (prev-bow? #f) (prev-eow? #f))	; anchors.
+			       
+    
+    (if (pair? elts)
+	(let ((elt  (car elts))
+	      (elts (cdr elts)))
+	  (receive (elt pre-dsm) (open-dsm elt)
+	    (if (re-choice? elt)
+
+		;; Flatten nested choices.
+		(let ((sub-elts (re-seq:elts elt)))
+		  (receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
+		           (recur (append sub-elts elts)
+				  prev-cset
+				  prev-bos? prev-eos?
+				  prev-bol? prev-eol?
+				  prev-bow? prev-eow?)
+		    (values (+ pre-dsm tail-pre-dsm)
+			    cset bos? eos? bol? eol? bow? eow? tail)))
+		  
+		;; Simplify the tail, then think about the head.
+		(receive (tail-pre-dsm cset bos? eos? bol? eol? bow? eow? tail)
+		         (recur elts
+				(? ((and (re-string? elt)
+					 (= 1 (string-length (re-string:chars elt))))
+				    (char-set-union prev-cset
+						    (string->char-set (re-string:chars elt))))
+
+				   ;; The cset might be a Scheme exp.
+				   ((and (re-char-set? elt)
+					 (char-set? (re-char-set:cset elt)))
+				    (char-set-union prev-cset
+						    (re-char-set:cset elt)))
+
+				   (else prev-cset))
+				(or prev-bos? (re-bos? elt))
+				(or prev-eos? (re-eos? elt))
+				(or prev-bol? (re-bol? elt))
+				(or prev-eol? (re-eol? elt))
+				(or prev-bow? (re-bow? elt))
+				(or prev-eow? (re-eow? elt)))
+
+		  ;; This guy is called when we couldn't find any other
+		  ;; simplification. If ELT contains live submatches, then we
+		  ;; are at a merge boundary, and have to take all the
+		  ;; TAIL-PRE-DSM, CSET, BOS?, EOS?, ... stuff we've collected
+		  ;; and tack them onto TAIL as elements, then put ELT on
+		  ;; front.  Otherwise, we can commute TAIL-PRE-DSM, CSET,
+		  ;; BOS?, etc. with ELT, since it contains no live
+		  ;; submatches, so just tack ELT onto TAIL.
+
+		  (define (no-simp)
+		    (if (has-live-submatches? elt)
+			(let ((tail (assemble-boundary-tail prev-cset cset
+							    bos? eos?
+							    bol? eol?
+							    bow? eow? 
+							    prev-bos? prev-eos?
+							    prev-bol? prev-eol?
+							    prev-bow? prev-eow?
+							    tail)))
+			  (values pre-dsm char-set:empty #f #f #f #f #f #f
+				  (if (pair? tail)
+				      ;; Tack tail-pre-dsm onto
+				      ;; TAIL's first elt.
+				      (cons elt
+					    (cons (re-dsm (car tail)
+							  tail-pre-dsm 0)
+						  (cdr tail)))
+
+				      ;; Squirrel case: TAIL is empty, so use 
+				      ;; TAIL-PRE-DSM as ELT's post-dsm.
+				      (list (re-dsm elt 0 tail-pre-dsm)))))
+
+			;; ELT has no live submatches, so we can commute all
+			;; the recursion state forwards past it.
+			(values (+ pre-dsm tail-pre-dsm)
+				cset bos? eos? bol? eol? bow? eow?
+				(cons elt tail))))
+
+	    (? ((and (re-char-set? elt)
+		     (char-set? (re-char-set:cset elt))) ; Might be Scheme code
+		(values (+ pre-dsm tail-pre-dsm)
+			(char-set-union cset (re-char-set:cset elt))
+			bos? eos? bol? eol? bow? eow? tail))
+
+	       ;; Treat a singleton string "c" as a singleton set {c}.
+	       ((and (re-string? elt) (= 1 (string-length (re-string:chars elt))))
+		(values (+ pre-dsm tail-pre-dsm)
+			(char-set-union cset (string->char-set (re-string:chars elt)))
+			bos? eos? bol? eol? bow? eow? tail))
+
+	       ;; Coalesce bol/eol/bos/eos/bow/eow's.
+	       ((re-bos? elt) (values (+ pre-dsm tail-pre-dsm) cset
+				      #t   eos? bol? eol? bow? eow? tail))
+	       ((re-eos? elt) (values (+ pre-dsm tail-pre-dsm) cset
+				      bos? #t   bol? eol? bow? eow? tail))
+	       ((re-bol? elt) (values (+ pre-dsm tail-pre-dsm) cset
+				      bos? eos? #t   eol? bow? eow? tail))
+	       ((re-eol? elt) (values (+ pre-dsm tail-pre-dsm) cset
+				      bos? eos? bol? #t   bow? eow? tail))
+	       ((re-bow? elt) (values (+ pre-dsm tail-pre-dsm) cset
+				      bos? eos? bol? eol? #t   eow? tail))
+	       ((re-eow? elt) (values (+ pre-dsm tail-pre-dsm) cset
+				      bos? eos? bol? eol? bow? #t   tail))
+
+	       (else (no-simp)))))))
+
+	(values 0 char-set:empty #f #f #f #f #f #f '()))))
+
+
+
+(define (simp-repeat re)
+  (let ((from (re-repeat:from re))
+	(to   (re-repeat:to   re))
+	(body (re-repeat:body re)))
+    (receive (simp-body pre-dsm) (simp-re body) ; Simplify body.
+      ;; The fancy reductions are all handled by REDUCE-REPEAT.
+      (reduce-repeat from to simp-body pre-dsm))))
+
+
+
+;;; Does RE contain a live submatch?
+;;; If RE is dynamic, we can't tell, so we err conservatively, 
+;;; which means we say "yes."
+
+(define (has-live-submatches? re)
+  (or (re-submatch? re)
+      (? ((re-seq?    re) (every has-live-submatches? (re-seq:elts    re)))
+	 ((re-choice? re) (every has-live-submatches? (re-choice:elts re)))
+	 ((re-repeat? re) (has-live-submatches? (re-repeat:body re)))
+	 ((re-dsm?    re) (has-live-submatches? (re-dsm:body    re)))
+
+	 ;; If it's not one of these things, then this isn't a regexp -- it's
+	 ;; a chunk of Scheme code producing a regexp, and we conservatively
+	 ;; return #T -- the expression *might* produce a regexp containing
+	 ;; a live submatch:
+	 (else (not (or (re-char-set? re) (re-string? re)
+			(re-bos? re) (re-eos? re) 
+			(re-bol? re) (re-eol? re) 
+			(re-bow? re) (re-eow? re)))))))
diff --git a/scsh/rx/spencer.scm b/scsh/rx/spencer.scm
new file mode 100644
index 0000000..2656d48
--- /dev/null
+++ b/scsh/rx/spencer.scm
@@ -0,0 +1,171 @@
+;;; Parse Spencer-style regexps into the regexp ADT.
+;;; Olin Shivers, July 1998.
+
+;;; One export: (posix-string->regexp s)
+
+;;; Need better error checking on {m,n} brace parsing.
+
+(define (parse-posix-regexp-string s)
+  (receive (re i) (parse-posix-exp s 0)
+    (if (= i (string-length s)) re
+	(error "Illegal Posix regexp -- terminated early" s i))))
+
+(define posix-string->regexp parse-posix-regexp-string)
+
+;;; An complete expression is a sequence of |-separated branches.
+
+(define (parse-posix-exp s i)
+  (let ((len (string-length s)))
+    (if (< i len)
+	(let lp ((i i) (branches '()))
+	  (receive (branch i) (parse-posix-branch s i)
+	    (let ((branches (cons branch branches)))
+	      (if (and (< i len)
+		       (char=? #\| (string-ref s i)))
+		  (lp (+ i 1) branches)
+		  (values (re-choice (reverse branches)) i)))))
+	(values re-trivial i))))
+
+
+;;; A branch is a sequence of pieces -- stuff that goes in-between |'s.
+
+(define (parse-posix-branch s i)
+  (let ((len (string-length s)))
+    (let lp ((i i) (pieces '()))
+      (if (< i len)
+	  (receive (piece i) (parse-posix-piece s i)
+	    (let ((pieces (cons piece pieces)))
+	      (if (< i len)
+		  (case (string-ref s i)
+		    ((#\) #\|) (values (re-seq (reverse pieces)) i))
+		    (else (lp i pieces)))
+		  (values (re-seq (reverse pieces)) i))))
+
+	  (values (re-seq (reverse pieces)) i)))))
+
+
+;;; A piece is an atom possibly followed by a * ? + or {...} multiplier.
+;;; I.e. an element of a branch sequence.
+
+(define (parse-posix-piece s i)
+  (let ((len (string-length s)))
+    (receive (atom i) (parse-posix-atom s i)
+      (if (< i len)
+	  (case (string-ref s i)
+	    ((#\* #\+ #\?)
+	     (receive (from to) (case (string-ref s i)
+				  ((#\*) (values 0 #f))
+				  ((#\+) (values 1 #f))
+				  ((#\?) (values 0 1)))
+	       (values (re-repeat from to atom) (+ i 1))))
+
+	    ((#\{) (receive (from to i) (parse-posix-braces s (+ i 1))
+		     (values (re-repeat from to atom) i)))
+
+	    (else (values atom i)))
+
+	  (values atom i)))))
+
+
+;;; An atom is something that would bind to a following * operator --
+;;; a letter, [...] charset, ^, $, or (...).
+
+(define (parse-posix-atom s i)
+  (let ((len (string-length s)))
+    (if (< i (string-length s))
+	(let ((c (string-ref s i)))
+	  (case c
+	    ((#\^) (values re-bos (+ i 1)))
+	    ((#\$) (values re-eos (+ i 1)))
+	    ((#\.) (values re-any (+ i 1)))
+	
+	    ((#\[) (parse-posix-bracket s (+ i 1)))
+
+	    ((#\() (receive (re i) (parse-posix-exp s (+ i 1))
+		     (if (and (< i len) (char=? #\) (string-ref s i)))
+			 (values (re-submatch re) (+ i 1))
+			 (error "Regexp subexpression has no terminating close parenthesis" s i))))
+
+	    ((#\\) (let ((i (+ i 1)))
+		     (if (< i len)
+			 (values (make-re-string (string (string-ref s i)))
+				 (+ i 1))
+			 (error "Regexps may not terminate with a backslash" s))))
+
+	    ((#\) #\| #\* #\+ #\? #\{)  (values re-trivial i))
+	
+	    (else (values (make-re-string (string c)) (+ i 1)))))
+
+	(values re-trivial i))))
+
+
+;;; Parse a [...] or [^...] bracket expression into a regexp.
+;;; I is the index of the char following the left bracket.
+
+(define db-cset (char-set #\. #\= #\:)) ; Not allowed after a #\[.
+
+(define (parse-posix-bracket s i)
+  (let ((len (string-length s)))
+    (if (>= i len) (error "Missing close right bracket in regexp" s i)
+
+	(receive (negate? i0) (let ((c (string-ref s i)))
+				(if (char=? c #\^)
+				    (values #t (+ i 1))
+				    (values #f i)))
+	  (let lp ((i i0) (cset (char-set-copy char-set:empty)))
+	    (if (>= i len) (error "Missing close right bracket in regexp" s i)
+
+		(let ((c (string-ref s i))
+		      (i1 (+ i 1)))
+		  (case c
+		    ((#\[)
+		     ;; We don't handle [..] [==] [::] frobs.
+		     (if (and (< i1 len)
+			      (char-set-contains? db-cset (string-ref s i1)))
+			 (error "double-bracket regexps not supported." s i)
+			 (lp i1 (char-set-adjoin! cset #\[))))
+
+		    ((#\]) (if (= i i0)
+			       (lp i1 (char-set-adjoin! cset #\]))
+			       (let ((cset (if negate?
+					       (char-set-invert! cset)
+					       cset)))
+				 (values (make-re-char-set cset) i1))))
+
+		    ((#\-) (if (or (= i i0) ; first char or last char
+				   (and (< i1 len)
+					(char=? #\] (string-ref s i1))))
+			       (lp i1 (char-set-adjoin! cset #\-))
+			       (error "Illegal - in [...] regexp" s i)))
+
+		    ;; Regular letter -- either alone, or startpoint of a range.
+		    (else (if (and (< (+ i1 1) len)
+				   (char=? #\- (string-ref s i1)))
+
+			      ;; Range
+			      (let* ((i-tochar (+ i1 1))
+				     (to (char->ascii (string-ref s i-tochar))))
+				(do ((j (char->ascii c) (+ j 1))
+				     (cset cset (char-set-adjoin! cset (ascii->char j))))
+				    ((> j to) (lp (+ i-tochar 1) cset))))
+
+			      ;; Just a letter
+			      (lp i1 (char-set-adjoin! cset c))))))))))))
+
+
+;;; Parse out a [from,to] repetition pair from a {m,n} {m} or {m,} expression.
+;;; I is the index of the char following the left brace.
+
+(define (parse-posix-braces s i)
+  (let ((comma (string-index s #\,) i)
+	(rb (string-index s #\} i)))
+    (if rb
+	(if (and comma (< comma rb))
+	    (values (string->number (substring s i comma))
+		    (and (not (= (+ comma 1) rb))
+			 (string->number (substring s (+ comma 1) rb)))
+		    (+ rb 1))
+	    (let ((m (string->number (substring s i rb))))
+	      (values m m (+ rb 1))))
+	(error "Missing close brace in regexp" s i))))
+	
diff --git a/scsh/rx/test.scm b/scsh/rx/test.scm
new file mode 100644
index 0000000..caf64c8
--- /dev/null
+++ b/scsh/rx/test.scm
@@ -0,0 +1,50 @@
+;;; Test routines
+;;; ,open re-posix-parsers sre-parser-package re-simp-package pp
+
+(define (test-string)
+  (let lp ()
+    (write-char #\newline)
+    (let ((re-s (read-line)))
+      (if (not (eof-object? re-s))
+	  (let ((re (posix-string->regexp re-s)))
+	    (print-re re)
+	    (lp))))))
+
+(define (test-sre)
+  (let lp ()
+    (write-char #\newline)
+    (let ((sre (read)))
+      (if (not (eof-object? sre))
+	  (let ((re (sre->regexp sre)))
+	    (print-re re)
+	    (lp))))))
+
+
+(define (print-re re)
+  (let ((simp-re (simplify-regexp re)))
+    (cond ((static-regexp? re)
+	   (receive (s lev pcount tvec) (regexp->posix-string re)
+	     (format #t "plain: ~a\n       lev=~a pcount=~a tvec=~a\n"
+		     s lev pcount tvec))
+	   (receive (s lev pcount tvec) (regexp->posix-string simp-re)
+	     (format #t "simp: ~a\n      lev=~a pcount=~a tvec=~a\n"
+		     s lev pcount tvec))))
+    (p (regexp->sre re))
+    (p (regexp->sre simp-re))))
+
+(define (test-match)
+  (let lp ()
+    (write-string "sre: ")
+    (let ((sre (read)))
+      (if (not (eof-object? sre))
+	  (let ((re (sre->regexp sre)))
+	    (let lp2 ()
+	      (let ((line (read-line)))
+		(cond ((not (eof-object? line))
+		       (cond ((regexp-search re line) =>
+			      (lambda (m)
+				(format #t "Hit at [~a,~a).\n"
+					(match:start m)
+					(match:end m)))))
+		       (lp2))
+		      (else (lp))))))))))
diff --git a/scsh/rx/todo b/scsh/rx/todo
new file mode 100644
index 0000000..51d886f
--- /dev/null
+++ b/scsh/rx/todo
@@ -0,0 +1,88 @@
+- scsh integration
+  Affected: fr nawk filemtch glob rdelim re scsh-interfaces scsh-package
+
+- Naming conventions. "re" vs. "regexp", should I have "smart" versions
+  of make-re-string, etc.
+
+- Remove all "reduce" forms from scsh, replace with foldl, foldr forms.
+  - Check FPS, network code
+
+- The match fun should allow you to state the beginning of string is not a
+  real bos & likewise for eos. Similarly for bol & eol.
+  execution flag: 
+    -- REG_NOTBOL -- beginning of string doesn't count as ^ match.
+    -- REG_NOTEOL -- end       of string doesn't count as $ match.
+
+- Hack awk, expect, chat, dir-match for new regexp system
+  Current:
+  (awk (test body ...)
+       (:range test1 test2 body ...)
+       (else body ...)
+       (test => proc)
+       (test ==> vars body ...))
+
+  test ::=
+    integer
+    expression
+    string
+
+
+  New:
+  (else body ...)
+  (:range test1 test2 body ...)
+  (after body ...)
+  (test => proc)
+  (test ==> vars body ...)
+  (test body ...)
+
+  test ::= integer | sre | (WHEN exp) | exp
+
+-------------------------------------------------------------------------------
+Must disallow, due to Posix' RE_CONTEXT_INVALID_OPS
+    ...^*...
+    *... ...(*... ...|*...
+    |... ...| ...|$... ...||... ...(|...
+
+    That is: 
+    1. Do simplification below to remove repeats of zero-length matches.
+    2. An empty elt of a choice renders as ().
+    3. ...|$... Hack it: If first char of a rendered choice elt is $, prefix
+       with ().
+
+    Simplify ^{0,n} -> ""
+             ^{m,n} -> ^     (0 (in)  (m>n)
+	     Similarly for bos/eos bol/eol bow/eow ""
+
+    Spencer says:
+       A repetition operator (?, *, +, or bounds)  cannot  follow
+       another repetition operator.  A repetition operator cannot
+       begin an expression or subexpression or follow `^' or `|'.
+
+       `|'  cannot  appear  first or last in a (sub)expression or
+       after another `|', i.e. an operand of  `|'  cannot  be  an
+       empty  subexpression.   An  empty parenthesized subexpres-
+       sion, `()', is legal and matches an empty (sub)string.  An
+       empty string is not a legal RE.
+
+
+Fix the printer and reader so control chars are printed as
+    \ddd; do syntax for control-char input
+
+-------------------------------------------------------------------------------
+Less important:
+- Support for searching vs. matching
+- Case-scope hacking (needs s48 0.51 CODE-QUOTE)
+- simp caching
+- Better char-set->sre renderer
+  First, bound the cset with tightest possible superset,
+  then look for negations.
+
+Possible interesting extensions:
+- An ADT->DFA compiler
+- A DFA->Scheme-code compiler
+- An ADT interpreter
+- A pattern notation for matching against s-expressions.
+  This would be handy for specifying the grammar of Scheme macros,
+  for example.
+- Only allocate svec and evec if we match?