Updated documentation for the new release. Mostly new material for the
SRE regexp system.
This commit is contained in:
shivers 1999-09-08 15:18:25 +00:00
parent 239592923f
commit 0d3f69124f
7 changed files with 276 additions and 289 deletions


@ -77,7 +77,7 @@ characters.
\subsection{Parsing fields}
\defun {field-splitter} {[regexp num-fields]} \proc
\defun {field-splitter} {[field num-fields]} \proc
\defunx {infix-splitter} {[delim num-fields handle-delim]} \proc
\defunx {suffix-splitter} {[delim num-fields handle-delim]} \proc
\defunx {sloppy-suffix-splitter} {[delim num-fields handle-delim]} \proc
@ -107,10 +107,10 @@ These functions return a parser function that can be used as follows:
Defaults:
\begin{tightinset}
\begin{tabular}{l@{\quad=\quad }ll}
\var{delim} & \verb!"[ \t\n]+|$"! & (suffix delimiter: white space or eos) \\
\multicolumn{1}{l}{} & \verb|"[ \t\n]+"| & (infix delimiter: white space) \\
\var{re} & \verb|"[^ \t\n]+"| & (non-white-space) \\
\var{delim} & \ex{(rx (| (+ white) eos))} & (suffix delimiter: white space or eos) \\
\multicolumn{1}{l}{} & \ex{(rx (+ white))} & (infix delimiter: white space) \\
\var{field} & \ex{(rx (+ (~ white)))} & (non-white-space) \\
\var{num-fields} & \verb|#f| & (as many fields as possible) \\
@ -120,15 +120,30 @@ These functions return a parser function that can be used as follows:
{\ldots}which means: break the string at white space, discarding the
white space, and parse as many fields as possible.
The regular expression \var{delim} is used to match field delimiters.
It can be either a string or a compiled regexp structure (see the
\ex{make-regexp} procedure). In the separator case, it defaults to a
regular expression matching white space; in the terminator case,
The \var{delim} parameter is a regular expression matching the text
that occurs between fields.
See chapter~\ref{chapt:sre} for information on regular expressions,
and the \ex{rx} form used to specify them.
In the separator case,
it defaults to a pattern matching white space;
in the terminator case,
it defaults to white space or end-of-string.
The regular expression \var{re} is a regular expression used
The \var{field} parameter is a regular expression used
to match fields. It defaults to non-white-space.
The \var{delim} pattern may also be given as a string,
character, or char-set; these are coerced to regular expressions.
So the following expressions are all equivalent,
each producing a function that splits strings apart at colons:
\begin{inset}
\begin{verbatim}
(infix-splitter (rx ":"))
(infix-splitter ":")
(infix-splitter #\:)
(infix-splitter (char-set #\:))\end{verbatim}
\end{inset}
The boolean \var{handle-delim} determines what to do with delimiters.
\begin{tightinset}\begin{tabular}{ll}
\ex{'trim} & Delimiters are thrown away after parsing. (default) \\
@ -178,7 +193,7 @@ These functions return a parser function that can be used as follows:
It is an error if a non-empty record does not end with a delimiter.
To make the last delimiter optional, make sure the delimiter regexp
matches the end-of-string (regexp \ex{"\$"}).
matches the end-of-string (sre \ex{eos}).
\item [\ex{infix-splitter}]
Delimiters are interpreted as element \emph{separators}. If comma is the
@ -222,7 +237,8 @@ These functions return a parser function that can be used as follows:
initial delimiter string if the string begins with one instead of parsing
an initial empty field. This can be used, for example, to field-split a
sequence of English text at white-space boundaries, where the string may
begin or end with white space, by using regex \verb!"[ \t]+|$"!.
begin or end with white space, by using the regexp
\codex{(rx (| (+ white) eos))}
(But you would be better off using \ex{field-splitter} in this case.)
\end{description}
\end{desc}
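The difference between separators and terminators shows up at the end
of a record. A sketch (a hypothetical transcript, assuming the default
\ex{'trim} delimiter policy):
\begin{code}
;;; Infix: a trailing colon separates off a final empty field.
((infix-splitter ":") "a:b:")   ; => ("a" "b" "")
;;; Suffix: the trailing colon merely terminates the last field.
((suffix-splitter ":") "a:b:")  ; => ("a" "b")\end{code}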
@ -318,25 +334,26 @@ Record & : suffix & \verb!:|$! suffix & : infix & non-: field \\
; wandy:3xuncWdpKhR.:73:22:Wandy Saetan:/usr/wandy:/bin/csh
;;; Two ls -l output readers
(field-reader (infix-splitter "[ \\t]+" 8))
(field-reader (infix-splitter "[ \\t]+" -7))
(field-reader (infix-splitter (rx (+ white)) 8))
(field-reader (infix-splitter (rx (+ white)) -7))
; -rw-r--r-- 1 shivers 22880 Sep 24 12:45 scsh.scm
;;; Internet hostname reader
(field-reader (field-splitter "[^.]+"))
(field-reader (field-splitter (rx (+ (~ ".")))))
; stat.sinica.edu.tw
;;; Internet IP address reader
(field-reader (field-splitter "[^.]+" 4))
(field-reader (field-splitter (rx (+ (~ "."))) 4))
; 18.24.0.241
;;; Line of integers
(let ((parser (field-splitter "[+-]?[0-9]+")))
(let ((parser (field-splitter (rx (? ("+-")) (+ digit)))))
(field-reader (\l{s} (map string->number (parser s))))
; 18 24 0 241
;;; Same as above.
(let ((reader (field-reader (field-splitter "[+-]?[0-9]+"))))
(let ((reader (field-reader (field-splitter (rx (? ("+-"))
(+ digit))))))
(\lx{maybe-port} (map string->number (apply reader maybe-port))))
; Yale beat harvard 26 to 7.\end{centercode}
\caption{Some examples of \protect\ex{field-reader}}
@ -349,8 +366,9 @@ Record & : suffix & \verb!:|$! suffix & : infix & non-: field \\
\subsection{Forward-progress guarantees and empty-string matches}
A loop that pulls text off a string by repeatedly matching a regexp
against that string can conceivably get stuck in an infinite loop if
the regexp matches the empty string. For example, the regexps \verb|^|,
\verb|$|, \verb|.*|, and \verb!foo|[^f]*! can all match the empty string.
the regexp matches the empty string. For example, the SREs
\ex{bos}, \ex{eos}, \ex{(* any)}, and \ex{(| "foo" (* (~ "f")))}
can all match the empty string.
The routines in this package that iterate through strings with regular
expressions are careful to handle this empty-string case.
@ -369,10 +387,10 @@ progress, and the loop is guaranteed to terminate.
This has the effect you want with field parsing. For example, if you split
a string with the empty pattern, you will explode the string into its
individual characters:
\codex{((suffix-splitter "") "foo") {\evalto} ("" "f" "o" "o")}
\codex{((suffix-splitter (rx)) "foo") {\evalto} ("" "f" "o" "o")}
However, even though this boundary case is handled correctly, we don't
recommend using it. Say what you mean---just use a field splitter:
\codex{((field-splitter ".") "foo") {\evalto} ("f" "o" "o")}
\codex{((field-splitter (rx any)) "foo") {\evalto} ("f" "o" "o")}
Or, more efficiently,
\codex{((\l{s} (map string (string->list s))) "foo")}
@ -478,15 +496,25 @@ it checks them all.
The \var{test} form can be one of:
\begin{inset}
\begin{tabular}{lp{0.8\linewidth}}
integer: & The test is true for that iteration of the loop.
\begin{tabular}{lp{0.6\linewidth}}
\var{integer}: & The test is true for that iteration of the loop.
The first iteration is \#1. \\
string: & The string is a regular expression. The test is
true if the regexp matches the record.\\
\var{sre}: & A regular expression in SRE notation
             (see chapter~\ref{chapt:sre}).
             The test succeeds if the pattern
             matches the record.
             In particular, note that any string is an SRE. \\
expression & If not an integer or a string, the test form is
a Scheme expression that is evaluated.
\ex{(when \var{expr})}: &
The body of a \ex{when} test is evaluated as a
Scheme boolean expression in the inner scope of the
\ex{awk} form. \\
\var{expr}: & If the form is none of the above, it is treated as
a Scheme expression---in practice, the \ex{when}
keyword is only needed in cases where SRE/Scheme
expression ambiguity might occur.
\end{tabular}
\end{inset}
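As an illustrative sketch, here is an \ex{awk} form using each kind of
test on hypothetical input:
\begin{code}
(awk (read-line) (line) ()
  (1           (display "first line") (newline)) ; Integer test.
  ((: bos "#") (display "comment") (newline))    ; SRE test.
  ((when (> (string-length line) 80))            ; Scheme test.
   (display "long line") (newline)))\end{code}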
@ -526,7 +554,7 @@ it checks them all.
\itum{\ex{(\var{test} => \var{exp})}}
If evaluating \ex{test} produces a true value,
apply \var{exp} to that value.
If \var{test} is a regular-expression string, then \var{exp} is applied
If \var{test} is a regular expression, then \var{exp} is applied
to the match data structure returned by the regexp match routine.
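For example, the following sketch prints the text bound by a submatch
whenever its test fires (the \ex{match:substring} accessor from the
match-data interface is assumed):
\begin{code}
(awk (read-line) (line) ()
  (((: "user=" (submatch (+ alpha))) =>
    (lambda (m) (display (match:substring m 1)) (newline)))))\end{code}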
\itum{\ex{(after \vari{body}1 \ldots)}}
@ -562,9 +590,10 @@ of input stream.
(call-with-input-file "/etc/passwd"
(lambda (port)
(awk (read-passwd port) (record fields) ()
("^S" (format #t "~a's home directory is ~a~%"
($ fields 0)
($ fields 5)))))))\end{code}
((: bos "S")
(format #t "~a's home directory is ~a~%"
($ fields 0)
($ fields 5)))))))\end{code}
\begin{code}
;;; Read a series of integers from stdin. This expression evaluates
@ -581,8 +610,8 @@ of input stream.
\begin{code}
;;; Count the number of non-comment lines of code in my Scheme source.
(awk (read-line) (line) ((nlines 0))
("^[ \\t]*;" nlines) ; A comment line.
(else (+ nlines 1))) ; Not a comment line.\end{code}
((: bos (* white) ";") nlines) ; A comment line.
(else (+ nlines 1))) ; Not a comment line.\end{code}
\begin{code}
;;; Read numbers, counting the evens and odds.
@ -600,10 +629,10 @@ of input stream.
(#t (max max-len (string-length line))))\end{code}
\begin{code}
;;; (This could also be done with REDUCE-PORT:)
(reduce-port (current-input-port) read-line
(lambda (line maxlen) (max (string-length line) maxlen))
0)\end{code}
;;; (This could also be done with PORT-FOLDL:)
(port-foldl (current-input-port) read-line
(lambda (line maxlen) (max (string-length line) maxlen))
0)\end{code}
\begin{code}
;;; Print every line longer than 80 chars.
@ -615,7 +644,7 @@ of input stream.
\begin{code}
;;; Strip blank lines from input.
(awk (read-line) (line) ()
("." (display line) (newline)))\end{code}
((~ white) (display line) (newline)))\end{code}
\begin{code}
;;; Sort the entries in /etc/passwd by login name.
@ -629,3 +658,14 @@ of input stream.
;;; Prefix line numbers to the input stream.
(awk (read-line) (line) lineno ()
(#t (format #t "~d:\\t~a~%" lineno line)))\end{code}
\section{Backwards compatibility}
Previous scsh releases provided an \ex{awk} form with a different syntax,
designed around regular expressions written in Posix notation as strings,
rather than SREs.
This form is still available in a separate module for old code.
It'll be documented in the next release of this manual. Dig around
in the sources for it.


@ -38,7 +38,7 @@
% For multiletter vars in math mode:
\newcommand{\var}[1]{\mbox{\frenchspacing\it{#1}}}
\newcommand{\vari}[2]{${\mbox{\it{#1}}}_{#2}$}
\newcommand{\vari}[2]{\ensuremath{\mbox{\it{#1}}_{#2}}}
%% What you frequently want when you say \tt:
\def\ttchars{\catcode``=13\@noligs\frenchspacing}


@ -1,9 +1,9 @@
%&latex -*- latex -*-
\title{Scsh Reference Manual}
\subtitle{For scsh release 0.5}
\subtitle{For scsh release 0.5.2}
\author{Olin Shivers and Brian D.~Carlstrom}
\date{April 11, 1997}
\date{September 1999}
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


@ -27,6 +27,7 @@
\include{syscalls}
\include{network}
\include{strings}
\include{sre}
\include{rdelim}
\include{awk}
\include{miscprocs}

View File

@ -324,12 +324,12 @@ run/sexp* $\equiv$ read $\circ$ run/port*
run/sexps* $\equiv$ port->sexp-list $\circ$ run/port*\end{code}
\end{desc}
\defun{reduce-port}{port reader op . seeds} {\object\star}
\defun{port-foldl}{port reader op . seeds} {\object\star}
\begin{desc}
This procedure can be used to perform a variety of iterative operations
over an input stream.
It repeatedly uses \var{reader} to read an object from \var{port}.
If the first read returns eof, then the entire \ex{reduce-port}
If the first read returns eof, then the entire \ex{port-foldl}
operation returns the seeds as multiple values.
If the first read operation returns some other value $v$, then
\var{op} is applied to $v$ and the seeds:
@ -340,11 +340,15 @@ reading a new value from the port, and so forth.
For example, \ex{(port->list \var{reader} \var{port})}
could be defined as
\codex{(reverse (reduce-port \var{port} \var{reader} cons '()))}
\codex{(reverse (port-foldl \var{port} \var{reader} cons '()))}
An imperative way to look at \ex{reduce-port} is to say that it
An imperative way to look at \ex{port-foldl} is to say that it
abstracts the idea of a loop over a stream of values read from
some port, where the seed values express the loop state.
\remark{This procedure was formerly named \texttt{\indx{reduce-port}}.
The old binding is still provided, but is deprecated and will
probably vanish in a future release.}
\end{desc}
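Since \var{op} returns the new seeds as multiple values,
\ex{port-foldl} can thread several accumulators through the loop at
once. A sketch:
\begin{code}
;;; Count lines and characters on stdin in a single pass.
(port-foldl (current-input-port) read-line
            (lambda (line nlines nchars)
              (values (+ nlines 1)
                      (+ nchars (string-length line))))
            0 0)\end{code}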


@ -5,10 +5,11 @@ Scsh provides a set of procedures for processing strings and characters.
The procedures provided match regular expressions, search strings,
parse file-names, and manipulate sets of characters.
Also see chapters \ref{chapt:rdelim} and \ref{chapt:fr-awk}
on record I/O, field parsing, and the awk loop.
The procedures documented there allow you to read character-delimited
records from ports, use regular expressions to split the records into fields
Also see chapters \ref{chapt:sre}, \ref{chapt:rdelim} and \ref{chapt:fr-awk}
on regular expressions, record I/O, field parsing, and the awk loop.
The procedures documented there allow you to search and pattern-match strings,
read character-delimited records from ports,
use regular expressions to split the records into fields
(for example, splitting a string at every occurrence of colon or white-space),
and loop over streams of these records in a convenient way.
@ -19,213 +20,7 @@ and loop over streams of these records in a convenient way.
Strings are the basic communication medium for {\Unix} processes, so a
shell language must have reasonable facilities for manipulating them.
\subsection{Regular expressions}
\label{sec:regexps}
The following functions perform regular expression matching.
The code uses Henry Spencer's regular expression package.
\begin{defundesc}{string-match} {regexp string [start]} {match or false}
Search \var{string} starting at position \var{start}, looking for a match
for \var{regexp}. If a match is found, return a match structure describing
the match, otherwise {\sharpf}. \var{Start} defaults to 0.
\var{regexp} may be a compiled regexp structure or a string defining
a regular expression, which will be compiled to a regexp structure.
\end{defundesc}
\begin{defundesc} {regexp-match?} {obj} \boolean
Is the object a regular expression match?
\end{defundesc}
\begin{defundesc} {match:start} {match [match-number]} {{\fixnum} or false}
Returns the start position of the match denoted by \var{match-number}.
The whole regexp is 0. Each further number represents positions
enclosed by \ex{(\ldots)} sections. \var{Match-number} defaults to 0.
If the regular expression matches as a whole,
but a particular parenthesized sub-expression does not match, then
\ex{match:start} returns {\sharpf}.
\end{defundesc}
\begin{defundesc} {match:end} {match [match-number]} \fixnum
Returns the end position of the match denoted by \var{match-number}.
\var{Match-number} defaults to 0 (the whole match).
If the regular expression matches as a whole,
but a particular parenthesized sub-expression does not match, then
\ex{match:end} returns {\sharpf}.
\end{defundesc}
\begin{defundesc} {match:substring} {match [match-number]} {{\str} or false}
Returns the substring matched by match \var{match-number}.
\var{Match-number} defaults to 0 (the whole match).
If there was no match, returns false.
\end{defundesc}
Regular expression matching compiles patterns into special data
structures which can be efficiently used to match against strings.
The overhead of compiling patterns that will be used for multiple
searches can be avoided by these lower-level routines:
%
\begin{defundesc} {make-regexp} {str} {re}
Generate a compiled regular expression from the given string.
\end{defundesc}
\begin{defundesc} {regexp?} {obj} \boolean
Is the object a regular expression?
\end{defundesc}
\begin{defundesc} {regexp-exec} {regexp str [start]} {match or false}
Apply the regular expression \var{regexp} to the string \var{str} starting
at position \var{start}. If the match succeeds it returns a regexp-match,
otherwise {\sharpf}. \var{Start} defaults to 0.
\end{defundesc}
\begin{defundesc} {->regexp} {regexp-or-string} {regexp}
Coerce the input value into a compiled regular expression:
strings are compiled; regexp structures are passed through unchanged.
\end{defundesc}
\defun{regexp-quote}{str}{\str}
\begin{desc}
Returns a regular expression that matches the string \var{str} exactly.
In other words, it quotes the regular expression, prepending backslashes
to all the special regexp characters in \var{str}.
\begin{code}
(regexp-quote "*Hello* world.")
{\evalto}"\\\\*Hello\\\\* world\\\\."\end{code}
\end{desc}
\defun{regexp-substitute}{port match . items}{{\str} or \undefined}
\begin{desc}
This procedure can be used to perform string substitutions based on
regular expression matches.
The results of the substitution can be either output to a port or
returned as a string.
The \var{match} argument is a regular expression match structure
that controls the substitution.
If \var{port} is an output port, the \var{items} are written out to
the port:
\begin{itemize}
\item If an item is a string, it is copied directly to the port.
\item If an item is an integer, the corresponding submatch from \var{match}
is written to the port.
\item If an item is \ex{'pre},
the prefix of the matched string (the text preceding the match)
is written to the port.
\item If an item is \ex{'post},
the suffix of the matched string is written.
\end{itemize}
If \var{port} is {\sharpf}, nothing is written, and a string is constructed
and returned instead.
\end{desc}
\defun{regexp-substitute/global}{port regexp string . items}
{{\str} or \undefined}
\begin{desc}
This procedure is similar to \ex{regexp-substitute},
but can be used to perform repeated match/substitute operations over
a string.
It has the following differences with \ex{regexp-substitute}:
\begin{itemize}
\item It takes a regular expression and string to be matched as
parameters, instead of a completed match structure.
\item If the regular expression doesn't match the string, this
procedure is the identity transform---it returns or outputs the
string.
\item If an item is \ex{'post}, the procedure recurses on the suffix string
(the text from \var{string} following the match).
Including a \ex{'post} in the list of items is how one gets multiple
match/substitution operations.
\item If an item is a procedure, it is applied to the match structure for
a given match.
The procedure returns a string to be used in the result.
\end{itemize}
The \var{regexp} parameter can be either a compiled regular expression or
a string specifying a regular expression.
Some examples:
{\small
\begin{widecode}
;;; Replace occurrences of "Cotton" with "Jin".
(regexp-substitute/global #f "Cotton" s
'pre "Jin" 'post)
;;; mm/dd/yy -> dd/mm/yy date conversion.
(regexp-substitute/global #f "([0-9]+)/([0-9]+)/([0-9]+)" ; mm/dd/yy
s ; Source string
'pre 2 "/" 1 "/" 3 'post)
;;; "9/29/61" -> "Sep 29, 1961" date conversion.
(regexp-substitute/global #f "([0-9]+)/([0-9]+)/([0-9]+)" ; mm/dd/yy
s ; Source string
'pre
;; Sleazy converter -- ignores "year 2000" issue, and blows up if
;; month is out of range.
(lambda (m)
(let ((mon (vector-ref '#("Jan" "Feb" "Mar" "Apr" "May" "Jun"
"Jul" "Aug" "Sep" "Oct" "Nov" "Dec")
(- (string->number (match:substring m 1)) 1)))
(day (match:substring m 2))
(year (match:substring m 3)))
(string-append mon " " day ", 19" year)))
'post)
;;; Remove potentially offensive substrings from string S.
(regexp-substitute/global #f "Windows|tcl|Intel" s
'pre 'post)\end{widecode}}
\end{desc}
\subsection{Other string manipulation facilities}
\defun {index} {string char [start]} {{\fixnum} or false}
\defunx {rindex} {string char [start]} {{\fixnum} or false}
\begin{desc}
These procedures search through \var{string} looking for an occurrence
of character \var{char}. \ex{index} searches left-to-right; \ex{rindex}
searches right-to-left.
\ex{index} returns the smallest index $i$ of \var{string} greater
than or equal to \var{start} such that $\var{string}[i] = \var{char}$.
The default for \var{start} is zero. If there is no such match,
\ex{index} returns false.
\ex{rindex} returns the largest index $i$ of \var{string} less than
\var{start} such that $\var{string}[i] = \var{char}$.
The default for \var{start} is \ex{(string-length \var{string})}.
If there is no such match, \ex{rindex} returns false.
\end{desc}
I should probably snarf all the MIT Scheme string functions, and stick them
in a package. {\Unix} programs need to mung character strings a lot.
MIT string match commands:
\begin{tightcode}
[sub]string-match-{forward,backward}[-ci]
[sub]string-{prefix,suffix}[-ci]?
[sub]string-find-{next,previous}-char[-ci]
[sub]string-find-{next,previous}-char-in-set
[sub]string-replace[!]
\ldots\etc\end{tightcode}
These are not currently provided.
\begin{defundesc} {substitute-env-vars} {fname} \str
Replace occurrences of environment variables with their values.
An environment variable is denoted by a dollar sign followed by
alphanumeric chars and underscores, or is surrounded by braces.
\begin{exampletable}
\splitline{\ex{(substitute-env-vars "\$USER/.login")}}
{\ex{"shivers/.login"}} \\
\cd{(substitute-env-vars "$\{USER\}_log")} & \cd{"shivers_log"}
\end{exampletable}
\end{defundesc}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Manipulating file-names}
\label{sec:filenames}
@ -559,6 +354,53 @@ defined in the previous section,
is also frequently useful for expanding file-names.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Other string manipulation facilities}
\defun {index} {string char [start]} {{\fixnum} or false}
\defunx {rindex} {string char [start]} {{\fixnum} or false}
\begin{desc}
These procedures search through \var{string} looking for an occurrence
of character \var{char}. \ex{index} searches left-to-right; \ex{rindex}
searches right-to-left.
\ex{index} returns the smallest index $i$ of \var{string} greater
than or equal to \var{start} such that $\var{string}[i] = \var{char}$.
The default for \var{start} is zero. If there is no such match,
\ex{index} returns false.
\ex{rindex} returns the largest index $i$ of \var{string} less than
\var{start} such that $\var{string}[i] = \var{char}$.
The default for \var{start} is \ex{(string-length \var{string})}.
If there is no such match, \ex{rindex} returns false.
\end{desc}
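For example, on a file-name string:
\begin{verbatim}
(index  "/usr/local/bin" #\/)   ; => 0
(rindex "/usr/local/bin" #\/)   ; => 10
\end{verbatim}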
I should probably snarf all the MIT Scheme string functions, and stick them
in a package. {\Unix} programs need to mung character strings a lot.
MIT string match commands:
\begin{tightcode}
[sub]string-match-{forward,backward}[-ci]
[sub]string-{prefix,suffix}[-ci]?
[sub]string-find-{next,previous}-char[-ci]
[sub]string-find-{next,previous}-char-in-set
[sub]string-replace[!]
\ldots\etc\end{tightcode}
These are not currently provided.
\begin{defundesc} {substitute-env-vars} {fname} \str
Replace occurrences of environment variables with their values.
An environment variable is denoted by a dollar sign followed by
alphanumeric chars and underscores, or is surrounded by braces.
\begin{exampletable}
\splitline{\ex{(substitute-env-vars "\$USER/.login")}}
{\ex{"shivers/.login"}} \\
\cd{(substitute-env-vars "$\{USER\}_log")} & \cd{"shivers_log"}
\end{exampletable}
\end{defundesc}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{ASCII encoding}
@ -596,47 +438,36 @@ assumption into your code if you can help it.\footnote{
Is the object \var{x} a character set?
\end{desc}
\defun{char-set=}{cs1 cs2}\boolean
\defun{char-set=}{\vari{cs}1 \vari{cs}2\ldots}\boolean
\begin{desc}
Are the character sets \var{cs1} and \var{cs2} equal?
Are the character sets equal?
\end{desc}
\defun{char-set<=}{cs1 cs2}\boolean
\defun{char-set<=}{\vari{cs}1 \vari{cs}2\ldots}\boolean
\begin{desc}
Returns true if character set \var{cs1} is a subset of character set \var{cs2}.
Returns true if every character set \vari{cs}{i} is
a subset of character set \vari{cs}{i+1}.
\end{desc}
\defun{reduce-char-set}{kons knil cs}\object
\defun{char-set-fold}{kons knil cs}\object
\begin{desc}
This is the fundamental iterator for character sets.
Reduces the function \var{kons} across the character set \var{cs} using
Applies the function \var{kons} across the character set \var{cs} using
initial state value \var{knil}.
That is, if \var{cs} is the empty set, the procedure returns \var{knil}.
Otherwise, some element \var{c} of \var{cs} is chosen; let \var{cs'} be
the remaining, unchosen characters.
The procedure returns
\begin{tightcode}
(reduce-char-set \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode}
(char-set-fold \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode}
For example, we could define \ex{char-set-members} (see below)
as
\begin{tightcode}
(lambda (cs) (reduce-char-set cons '() cs))\end{tightcode}
\end{desc}
(lambda (cs) (char-set-fold cons '() cs))\end{tightcode}
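As another sketch, we can count a set's elements by folding an
increment function across it:
\begin{verbatim}
(char-set-fold (lambda (c count) (+ count 1))
               0
               (char-set #\a #\e #\i))   ; => 3
\end{verbatim}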
\subsection{Side effects}
\defun{set-char-set!}{cs char in?}{\undefined}
\begin{desc}
This side-effects character set \var{cs}.
If \var{in?} is true, character \var{char} is added to the set.
Otherwise, it is deleted from the set.
Use of this procedure is deprecated, since it could damage other procedures
that retain pointers to existing character sets.
You should use \ex{set-char-set!} in contexts where it is guaranteed that
there are no other pointers to the character set being modified.
(For example, functions that create character sets can use this function
to efficiently construct the character set, after which time the set is
used in a pure-functional, shared manner.)
\remark{This procedure was formerly named \texttt{\indx{reduce-char-set}}.
The old binding is still provided, but is deprecated and will
probably vanish in a future release.}
\end{desc}
\defun{char-set-for-each}{p cs}{\undefined}
@ -646,11 +477,7 @@ Note that the order in which \var{p} is applied to the characters in the
set is not specified, and may even change from application to application.
\end{desc}
\defun{copy-char-set}{cs}{char-set}
\begin{desc}
Returns a copy of the character set \var{cs}.
\end{desc}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Creating character sets}
\defun{char-set}{\vari{char}1\ldots}{char-set}
@ -680,6 +507,7 @@ Returns a character set containing every character whose {\Ascii}
code lies in the half-open range $[\var{lower},\var{upper})$.
\end{desc}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Querying character sets}
\defun {char-set-members}{char-set}{character-list}
\begin{desc}
@ -706,7 +534,23 @@ We sought to escape the dilemma by shifting to a new name.}
Returns the number of elements in character set \var{cs}.
\end{desc}
\subsection{Character set algebra}
\defun{char-set-every?}{pred cs}\boolean
\defunx{char-set-any?}{pred cs}\object
\begin{desc}
The \ex{char-set-every?} procedure returns true if predicate \var{pred}
is true of every character in the character set \var{cs}.
Likewise, \ex{char-set-any?} applies \var{pred} to every character in
character set \var{cs}, and returns the first true value it finds.
If no character produces a true value, it returns false.
The order in which these procedures sequence through the elements of
\var{cs} is not specified.
\end{desc}
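A sketch, using the standard character predicates:
\begin{verbatim}
(char-set-every? char-alphabetic? (char-set #\a #\b #\c)) ; => true
(char-set-any?   char-numeric?    (char-set #\a #\3))     ; => true value
\end{verbatim}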
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Character-set algebra}
\defun {char-set-invert}{char-set}{char-set}
\defunx{char-set-union}{\vari{char-set}1\ldots}{char-set}
\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
@ -719,6 +563,14 @@ to the left; the difference function requires at least one argument, while
union and intersection may be applied to zero arguments.
\end{desc}
\defun {char-set-adjoin}{cs \vari{char}1\ldots}{char-set}
\defunx{char-set-delete}{cs \vari{char}1\ldots}{char-set}
\begin{desc}
Add/delete the \vari{char}i characters to/from character set \var{cs}.
\end{desc}
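For example:
\begin{verbatim}
(char-set-members (char-set-adjoin (char-set #\a) #\b))
    ; => a list of #\a and #\b, in unspecified order
\end{verbatim}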
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Standard character sets}
\label{sec:std-csets}
Several character sets are predefined for convenience:
@ -727,25 +579,25 @@ Several character sets are predefined for convenience:
\newcommand{\entry}[1]{\ex{#1}\index{#1}}
\begin{tabular}{|ll|}
\hline
\entry{char-set:alphabetic} & Alphabetic chars \\
\entry{char-set:lower-case} & Lower-case alphabetic chars \\
\entry{char-set:upper-case} & Upper-case alphabetic chars \\
\entry{char-set:alphabetic} & Alphabetic chars \\
\entry{char-set:numeric} & Decimal digits: 0--9 \\
\entry{char-set:alphanumeric} & Alphabetic or numeric \\
\entry{char-set:graphic} & Printing characters except space \\
\entry{char-set:printing} & Printing characters including space \\
\entry{char-set:whitespace} & Whitespace characters \\
\entry{char-set:blank} & Blank characters \\
\entry{char-set:control} & Control characters \\
\entry{char-set:punctuation} & Punctuation characters \\
\entry{char-set:hex-digit} & A hexadecimal digit: 0--9, A--F, a--f \\
\entry{char-set:blank} & Blank characters \\
\entry{char-set:ascii} & A character in the ASCII set. \\
\entry{char-set:empty} & Empty set \\
\entry{char-set:full} & All characters \\
\hline
\end{tabular}
\end{center}
The first twelve of these correspond to the character classes defined in
The first eleven of these correspond to the character classes defined in
Posix.
Note that there may be characters in \ex{char-set:alphabetic} that are
neither upper or lower case---this might occur in implementations that
@ -788,3 +640,87 @@ char-set:punctuation & \verb|!"#$%&'()*+,-./:;<=>|\verb#?@[\]^_`{|}~# \\
\begin{desc}
These predicates are defined in terms of the above character sets.
\end{desc}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Linear-update character-set operations}
These procedures have a hybrid pure-functional/side-effecting semantics:
they are allowed, but not required, to side-effect one of their parameters
in order to construct their result.
An implementation may legally implement these procedures as pure,
side-effect-free functions, or it may implement them using side effects,
depending upon which is more efficient or simpler to
implement in terms of the underlying representation.
What this means is that clients of these procedures \emph{may not} rely
upon these procedures working by side effect.
For example, this is not guaranteed to work:
\begin{verbatim}
(let ((cs (char-set #\a #\b #\c)))
(char-set-adjoin! cs #\d)
cs) ; Could be either {a,b,c} or {a,b,c,d}.
\end{verbatim}
However, this is well-defined:
\begin{verbatim}
(let ((cs (char-set #\a #\b #\c)))
(char-set-adjoin! cs #\d)) ; {a,b,c,d}
\end{verbatim}
So clients of these procedures write in a functional style, but must
additionally be sure that, when the procedure is called, there are no
other live pointers to the potentially-modified character set (hence the term
``linear update'').
There are two benefits to this convention:
\begin{itemize}
\item Implementations are free to provide the most efficient possible
implementation, either functional or side-effecting.
\item Programmers may nonetheless continue to assume that character sets
are purely functional data structures: they may be reliably shared
without needing to be copied, uniquified, and so forth.
\end{itemize}
In practice, these procedures are most useful for efficiently constructing
character sets in a side-effecting manner, in some limited local context,
before passing the character set outside the local construction scope to be
used in a functional manner.
Scsh provides no assistance in checking the linearity of the
potentially side-effected parameters passed to these functions---there
is no linear type checker or run-time mechanism for detecting violations.
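The intended idiom is to confine the linear-update calls to a local
construction scope, as in this sketch:
\begin{verbatim}
;;; Build the set {a,e,i} with linear-update operations, then use the
;;; result purely functionally. Note that we always take the RETURN
;;; VALUE of char-set-adjoin!; we never rely on cs being updated.
(define vowels
  (let ((cs (char-set #\a)))
    (char-set-adjoin! (char-set-adjoin! cs #\e) #\i)))
\end{verbatim}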
\defun{char-set-copy}{cs}{char-set}
\begin{desc}
Returns a copy of the character set \var{cs}.
``Copy'' means that if either the input parameter or the
result value of this procedure is passed to one of the linear-update
procedures described below, the other character set is guaranteed
not to be altered.
(A system that provides pure-functional implementations of the rest of
the linear-operator suite could implement this procedure as the
identity function.)
\end{desc}
\defun{char-set-adjoin!}{cs \vari{char}1\ldots}{char-set}
\begin{desc}
Add the \vari{char}i characters to character set \var{cs}, and
return the result.
This procedure is allowed, but not required, to side-effect \var{cs}.
\end{desc}
\defun{char-set-delete!}{cs \vari{char}1\ldots}{char-set}
\begin{desc}
Remove the \vari{char}i characters from character set \var{cs}, and
return the result.
This procedure is allowed, but not required, to side-effect \var{cs}.
\end{desc}
\defun {char-set-invert!}{char-set}{char-set}
\defunx{char-set-union!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
\defunx{char-set-intersection!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
\defunx{char-set-difference!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
\begin{desc}
These procedures implement set complement, union, intersection, and difference
for character sets.
They are allowed, but not required, to side-effect their first parameter.
The union, intersection, and difference operations are n-ary, associating
to the left.
\end{desc}


@ -1464,6 +1464,12 @@ All wild-card characters in \var{str} are quoted with a backslash.
\begin{defundesc}{file-match}{root dot-files? \vari{pat}1 \vari{pat}2 {\ldots} \vari{pat}n}{string list}
\note{This procedure is deprecated, and will probably either go away or
be substantially altered in a future release. New code should not
call this procedure. The problem is that it relies upon
Posix-notation regular expressions; the rest of scsh has been
converted over to the new SRE notation.}
\ex{file-match} provides a more powerful file-matching service, at the
expense of a less convenient notation. It is intermediate in
power between most shell matching machinery and recursive \ex{find(1)}.