From 0d3f69124f331d7b19ca12407bffa1dcdc9e1cb4 Mon Sep 17 00:00:00 2001 From: shivers Date: Wed, 8 Sep 1999 15:18:25 +0000 Subject: [PATCH] Updated documentation for the new release. Mostly new material for the SRE regexp system. --- doc/scsh-manual/awk.tex | 118 ++++++--- doc/scsh-manual/decls.tex | 2 +- doc/scsh-manual/front.tex | 4 +- doc/scsh-manual/man.tex | 1 + doc/scsh-manual/procnotation.tex | 12 +- doc/scsh-manual/strings.tex | 422 +++++++++++++------------------ doc/scsh-manual/syscalls.tex | 6 + 7 files changed, 276 insertions(+), 289 deletions(-) diff --git a/doc/scsh-manual/awk.tex b/doc/scsh-manual/awk.tex index 4331163..a9bba0a 100644 --- a/doc/scsh-manual/awk.tex +++ b/doc/scsh-manual/awk.tex @@ -77,7 +77,7 @@ characters. \subsection{Parsing fields} -\defun {field-splitter} {[regexp num-fields]} \proc +\defun {field-splitter} {[field num-fields]} \proc \defunx {infix-splitter} {[delim num-fields handle-delim]} \proc \defunx {suffix-splitter} {[delim num-fields handle-delim]} \proc \defunx {sloppy-suffix-splitter} {[delim num-fields handle-delim]} \proc @@ -107,10 +107,10 @@ These functions return a parser function that can be used as follows: Defaults: \begin{tightinset} \begin{tabular}{l@{\quad=\quad }ll} - \var{delim} & \verb!"[ \t\n]+|$"! & (suffix delimiter: white space or eos) \\ - \multicolumn{1}{l}{} & \verb|"[ \t\n]+"| & (infix delimiter: white space) \\ - - \var{re} & \verb|"[^ \t\n]+"| & (non-white-space) \\ + \var{delim} & \ex{(rx (| (+ white) eos))} & (suffix delimiter: white space or eos) \\ + \multicolumn{1}{l}{} & \ex{(rx (+ white))} & (infix delimiter: white space) \\ + + \var{field} & \verb|(rx (+ (~ white)))| & (non-white-space) \\ \var{num-fields} & \verb|#f| & (as many fields as possible) \\ @@ -120,15 +120,30 @@ These functions return a parser function that can be used as follows: {\ldots}which means: break the string at white space, discarding the white space, and parse as many fields as possible. - The regular expression \var{delim} is used to match field delimiters. - It can be either a string or a compiled regexp structure (see the - \ex{make-regexp} procedure). In the separator case, it defaults to a - regular expression matching white space; in the terminator case, + The \var{delim} parameter is a regular expression matching the text + that occurs between fields. + See chapter~\ref{chapt:sre} for information on regular expressions, + and the \ex{rx} form used to specify them. + In the separator case, + it defaults to a pattern matching white space; + in the terminator case, it defaults to white space or end-of-string. - The regular expression \var{re} is a regular expression used + The \var{field} parameter is a regular expression used to match fields. It defaults to non-white-space. + The \var{delim} patterns may also be given as a string, + character, or char-set, which are coerced to regular expressions. + So the following expressions are all equivalent, + each producing a function that splits strings apart at colons: +\begin{inset} +\begin{verbatim} +(infix-splitter (rx ":")) +(infix-splitter ":") +(infix-splitter #\:) +(infix-splitter (char-set #\:))\end{verbatim} +\end{inset} + The boolean \var{handle-delim} determines what to do with delimiters. \begin{tightinset}\begin{tabular}{ll} \ex{'trim} & Delimiters are thrown away after parsing. (default) \\ @@ -178,7 +193,7 @@ These functions return a parser function that can be used as follows: It is an error if a non-empty record does not end with a delimiter. 
To make the last delimiter optional, make sure the delimiter regexp - matches the end-of-string (regexp \ex{"\$"}). + matches the end-of-string (sre \ex{eos}). \item [\ex{infix-splitter}] Delimiters are interpreted as element \emph{separators}. If comma is the @@ -222,7 +237,8 @@ These functions return a parser function that can be used as follows: initial delimiter string if the string begins with one instead of parsing an initial empty field. This can be used, for example, to field-split a sequence of English text at white-space boundaries, where the string may - begin or end with white space, by using regex \verb!"[ \t]+|$"!. + begin or end with white space, by using regex +\begin{code}{(rx (| (+ white) eos))}\end{code} (But you would be better off using \ex{field-splitter} in this case.) \end{description} \end{desc} @@ -318,25 +334,26 @@ Record & : suffix & \verb!:|$! suffix & : infix & non-: field \\ ; wandy:3xuncWdpKhR.:73:22:Wandy Saetan:/usr/wandy:/bin/csh ;;; Two ls -l output readers -(field-reader (infix-splitter "[ \\t]+" 8)) -(field-reader (infix-splitter "[ \\t]+" -7)) +(field-reader (infix-splitter (rx (+ white)) 8)) +(field-reader (infix-splitter (rx (+ white)) -7)) ; -rw-r--r-- 1 shivers 22880 Sep 24 12:45 scsh.scm ;;; Internet hostname reader -(field-reader (field-splitter "[^.]+")) +(field-reader (field-splitter (rx (+ (~ "."))))) ; stat.sinica.edu.tw ;;; Internet IP address reader -(field-reader (field-splitter "[^.]+" 4)) +(field-reader (field-splitter (rx (+ (~ "."))) 4)) ; 18.24.0.241 ;;; Line of integers -(let ((parser (field-splitter "[+-]?[0-9]+"))) +(let ((parser (field-splitter (rx (? ("+-")) (+ digit))))) (field-reader (\l{s} (map string->number (parser s)))) ; 18 24 0 241 ;;; Same as above. -(let ((reader (field-reader (field-splitter "[+-]?[0-9]+")))) +(let ((reader (field-reader (field-splitter (rx (? ("+-")) + (+ digit)))))) (\lx{maybe-port} (map string->number (apply reader maybe-port)))) ; Yale beat harvard 26 to 7.\end{centercode} \caption{Some examples of \protect\ex{field-reader}} @@ -349,8 +366,9 @@ Record & : suffix & \verb!:|$! suffix & : infix & non-: field \\ \subsection{Forward-progress guarantees and empty-string matches} A loop that pulls text off a string by repeatedly matching a regexp against that string can conceivably get stuck in an infinite loop if -the regexp matches the empty string. For example, the regexps \verb|^|, -\verb|$|, \verb|.*|, and \verb!foo|[^f]*! can all match the empty string. +the regexp matches the empty string. For example, the SREs +\ex{bos}, \ex{eos}, \ex{(* any)}, and \ex{(| "foo" (* (~ "f")))} +can all match the empty string. The routines in this package that iterate through strings with regular expressions are careful to handle this empty-string case. @@ -369,10 +387,10 @@ progress, and the loop is guaranteed to terminate. This has the effect you want with field parsing. For example, if you split a string with the empty pattern, you will explode the string into its individual characters: - \codex{((suffix-splitter "") "foo") {\evalto} ("" "f" "o" "o")} + \codex{((suffix-splitter (rx)) "foo") {\evalto} ("" "f" "o" "o")} However, even though this boundary case is handled correctly, we don't recommend using it. Say what you mean---just use a field splitter: - \codex{((field-splitter ".") "foo") {\evalto} ("f" "o" "o")} + \codex{((field-splitter (rx any)) "foo") {\evalto} ("f" "o" "o")} Or, more efficiently, \codex{((\l{s} (map string (string->list s))) "foo")} @@ -478,15 +496,25 @@ it checks them all. 
The \var{test} form can be one of: \begin{inset} - \begin{tabular}{lp{0.8\linewidth}} - integer: & The test is true for that iteration of the loop. + \begin{tabular}{lp{0.6\linewidth}} + \var{integer}: & The test is true for that iteration of the loop. The first iteration is \#1. \\ - string: & The string is a regular expression. The test is - true if the regexp matches the record.\\ + \var{sre}: & A regular expression, in SRE notation + (see chapter~\ref{chapt:sre}) can be used as + a test. The test is successful if the pattern + matches the record. + In particular, note that any string is an SRE. \\ - expression & If not an integer or a string, the test form is - a Scheme expression that is evaluated. + \ex{(when \var{expr})}: & + The body of a \ex{when} test is evaluated as a + Scheme boolean expression in the inner scope of the + \ex{awk} form. \\ + + \var{expr}: & If the form is none of the above, it is treated as + a Scheme expression---in practice, the \ex{when} + keyword is only needed in cases where SRE/Scheme + expression ambiguity might occur. \end{tabular} \end{inset} @@ -526,7 +554,7 @@ it checks them all. \itum{\ex{(\var{test} => \var{exp})}} If evaluating \ex{test} produces a true value, apply \var{exp} to that value. - If \var{test} is a regular-expression string, then \var{exp} is applied + If \var{test} is a regular expression, then \var{exp} is applied to the match data structure returned by the regexp match routine. \itum{\ex{(after \vari{body}1 \ldots)}} @@ -562,9 +590,10 @@ of input stream. (call-with-input-file "/etc/passwd" (lambda (port) (awk (read-passwd port) (record fields) () - ("^S" (format #t "~a's home directory is ~a~%" - ($ fields 0) - ($ fields 5)))))))\end{code} + ((: bos "S") + (format #t "~a's home directory is ~a~%" + ($ fields 0) + ($ fields 5)))))))\end{code} \begin{code} ;;; Read a series of integers from stdin. This expression evaluates @@ -581,8 +610,8 @@ of input stream. \begin{code} ;;; Count the number of non-comment lines of code in my Scheme source. (awk (read-line) (line) ((nlines 0)) - ("^[ \\t]*;" nlines) ; A comment line. - (else (+ nlines 1))) ; Not a comment line.\end{code} + ((: bos (* white) ";") nlines) ; A comment line. + (else (+ nlines 1))) ; Not a comment line.\end{code} \begin{code} ;;; Read numbers, counting the evens and odds. @@ -600,10 +629,10 @@ of input stream. (#t (max max-len (string-length line))))\end{code} \begin{code} -;;; (This could also be done with REDUCE-PORT:) -(reduce-port (current-input-port) read-line - (lambda (line maxlen) (max (string-length line) maxlen)) - 0)\end{code} +;;; (This could also be done with PORT-FOLDL:) +(port-foldl (current-input-port) read-line + (lambda (line maxlen) (max (string-length line) maxlen)) + 0)\end{code} \begin{code} ;;; Print every line longer than 80 chars. @@ -615,7 +644,7 @@ of input stream. \begin{code} ;;; Strip blank lines from input. (awk (read-line) (line) () - ("." (display line) (newline)))\end{code} + ((~ white) (display line) (newline)))\end{code} \begin{code} ;;; Sort the entries in /etc/passwd by login name. @@ -629,3 +658,14 @@ of input stream. ;;; Prefix line numbers to the input stream. (awk (read-line) (line) lineno () (#t (format #t "~d:\\t~a~%" lineno line)))\end{code} + + +\section{Backwards compatibility} + +Previous scsh releases provided an \ex{awk} form with a different syntax, +designed around regular expressions written in Posix notation as strings, +rather than SREs. + +This form is still available in a separate module for old code. 
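+To give a sense of the difference, the comment-counting example from
+earlier in this chapter looked like this under the old notation, with a
+Posix regexp string as the test:
+\begin{code}
+(awk (read-line) (line) ((nlines 0))
+  ("^[ \\t]*;" nlines)   ; A comment line.
+  (else (+ nlines 1)))   ; Not a comment line.\end{code}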
+It'll be documented in the next release of this manual. Dig around +in the sources for it. diff --git a/doc/scsh-manual/decls.tex b/doc/scsh-manual/decls.tex index 0a3ae03..a391681 100644 --- a/doc/scsh-manual/decls.tex +++ b/doc/scsh-manual/decls.tex @@ -38,7 +38,7 @@ % For multiletter vars in math mode: \newcommand{\var}[1]{\mbox{\frenchspacing\it{#1}}} -\newcommand{\vari}[2]{${\mbox{\it{#1}}}_{#2}$} +\newcommand{\vari}[2]{\ensuremath{\mbox{\it{#1}}_{#2}}} %% What you frequently want when you say \tt: \def\ttchars{\catcode``=13\@noligs\frenchspacing} diff --git a/doc/scsh-manual/front.tex b/doc/scsh-manual/front.tex index 3f8b5bd..94beeea 100644 --- a/doc/scsh-manual/front.tex +++ b/doc/scsh-manual/front.tex @@ -1,9 +1,9 @@ %&latex -*- latex -*- \title{Scsh Reference Manual} -\subtitle{For scsh release 0.5} +\subtitle{For scsh release 0.5.2} \author{Olin Shivers and Brian D.~Carlstrom} -\date{April 11, 1997} +\date{September 1999} \maketitle %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/doc/scsh-manual/man.tex b/doc/scsh-manual/man.tex index 318cbaa..39da8a5 100644 --- a/doc/scsh-manual/man.tex +++ b/doc/scsh-manual/man.tex @@ -27,6 +27,7 @@ \include{syscalls} \include{network} \include{strings} +\include{sre} \include{rdelim} \include{awk} \include{miscprocs} diff --git a/doc/scsh-manual/procnotation.tex b/doc/scsh-manual/procnotation.tex index ce76480..0be94cb 100644 --- a/doc/scsh-manual/procnotation.tex +++ b/doc/scsh-manual/procnotation.tex @@ -324,12 +324,12 @@ run/sexp* $\equiv$ read $\circ$ run/port* run/sexps* $\equiv$ port->sexp-list $\circ$ run/port*\end{code} \end{desc} -\defun{reduce-port}{port reader op . seeds} {\object\star} +\defun{port-foldl}{port reader op . seeds} {\object\star} \begin{desc} This procedure can be used to perform a variety of iterative operations over an input stream. It repeatedly uses \var{reader} to read an object from \var{port}. -If the first read returns eof, then the entire \ex{reduce-port} +If the first read returns eof, then the entire \ex{port-foldl} operation returns the seeds as multiple values. If the first read operation returns some other value $v$, then \var{op} is applied to $v$ and the seeds: @@ -340,11 +340,15 @@ reading a new value from the port, and so forth. For example, \ex{(port->list \var{reader} \var{port})} could be defined as - \codex{(reverse (reduce-port \var{port} \var{reader} cons '()))} + \codex{(reverse (port-foldl \var{port} \var{reader} cons '()))} -An imperative way to look at \ex{reduce-port} is to say that it +An imperative way to look at \ex{port-foldl} is to say that it abstracts the idea of a loop over a stream of values read from some port, where the seed values express the loop state. + +\remark{This procedure was formerly named \texttt{\indx{reduce-port}}. + The old binding is still provided, but is deprecated and will + probably vanish in a future release.} \end{desc} diff --git a/doc/scsh-manual/strings.tex b/doc/scsh-manual/strings.tex index 5d0e303..a87c66f 100644 --- a/doc/scsh-manual/strings.tex +++ b/doc/scsh-manual/strings.tex @@ -5,10 +5,11 @@ Scsh provides a set of procedures for processing strings and characters. The procedures provided match regular expressions, search strings, parse file-names, and manipulate sets of characters. -Also see chapters \ref{chapt:rdelim} and \ref{chapt:fr-awk} -on record I/O, field parsing, and the awk loop. 
-The procedures documented there allow you to read character-delimited -records from ports, use regular expressions to split the records into fields +Also see chapters \ref{chapt:sre}, \ref{chapt:rdelim} and \ref{chapt:fr-awk} +on regular-expressions, record I/O, field parsing, and the awk loop. +The procedures documented there allow you to search and pattern-match strings, +read character-delimited records from ports, +use regular expressions to split the records into fields (for example, splitting a string at every occurrence of colon or white-space), and loop over streams of these records in a convenient way. @@ -19,213 +20,7 @@ and loop over streams of these records in a convenient way. Strings are the basic communication medium for {\Unix} processes, so a shell language must have reasonable facilities for manipulating them. -\subsection{Regular expressions} -\label{sec:regexps} - -The following functions perform regular expression matching. -The code uses Henry Spencer's regular expression package. - -\begin{defundesc}{string-match} {regexp string [start]} {match or false} - Search \var{string} starting at position \var{start}, looking for a match - for \var{regexp}. If a match is found, return a match structure describing - the match, otherwise {\sharpf}. \var{Start} defaults to 0. - - \var{regexp} may be a compiled regexp structure or a string defining - a regular expression, which will be compiled to a regexp structure. -\end{defundesc} - -\begin{defundesc} {regexp-match?} {obj} \boolean - Is the object a regular expression match? -\end{defundesc} - -\begin{defundesc} {match:start} {match [match-number]} {{\fixnum} or false} - Returns the start position of the match denoted by \var{match-number}. - The whole regexp is 0. Each further number represents positions - enclosed by \ex{(\ldots)} sections. \var{Match-number} defaults to 0. - - If the regular expression matches as a whole, - but a particular parenthesized sub-expression does not match, then - \ex{match:start} returns {\sharpf}. -\end{defundesc} - -\begin{defundesc} {match:end} {match [match-number]} \fixnum - Returns the end position of the match denoted by \var{match-number}. - \var{Match-number} defaults to 0 (the whole match). - - If the regular expression matches as a whole, - but a particular parenthesized sub-expression does not match, then - \ex{match:end} returns {\sharpf}. -\end{defundesc} - -\begin{defundesc} {match:substring} {match [match-number]} {{\str} or false} - Returns the substring matched by match \var{match-number}. - \var{Match-number} defaults to 0 (the whole match). - If there was no match, returns false. -\end{defundesc} - -Regular expression matching compiles patterns into special data -structures which can be efficiently used to match against strings. -The overhead of compiling patterns that will be used for multiple -searches can be avoided by these lower-level routines: -% -\begin{defundesc} {make-regexp} {str} {re} - Generate a compiled regular expression from the given string. -\end{defundesc} - -\begin{defundesc} {regexp?} {obj} \boolean - Is the object a regular expression? -\end{defundesc} - -\begin{defundesc} {regexp-exec} {regexp str [start]} {match or false} - Apply the regular expression \var{regexp} to the string \var{str} starting - at position \var{start}. If the match succeeds it returns a regexp-match, - otherwise {\sharpf}. \var{Start} defaults to 0. 
-\end{defundesc} - -\begin{defundesc} {->regexp} {regexp-or-string} {regexp} - Coerce the input value into a compiled regular expression: - strings are compiled; regexp structures are passed through unchanged. -\end{defundesc} - -\defun{regexp-quote}{str}{\str} -\begin{desc} -Returns a regular expression that matches the string \var{str} exactly. -In other words, it quotes the regular expression, prepending backslashes -to all the special regexp characters in \var{str}. -\begin{code} -(regexp-quote "*Hello* world.") - {\evalto}"\\\\*Hello\\\\* world\\\\."\end{code} -\end{desc} - -\defun{regexp-substitute}{port match . items}{{\str} or \undefined} -\begin{desc} -This procedure can be used to perform string substitutions based on -regular expression matches. -The results of the substitution can be either output to a port or -returned as a string. - -The \var{match} argument is a regular expression match structure -that controls the substitution. -If \var{port} is an output port, the \var{items} are written out to -the port: -\begin{itemize} - \item If an item is a string, it is copied directly to the port. - \item If an item is an integer, the corresponding submatch from \var{match} - is written to the port. - \item If an item is \ex{'pre}, - the prefix of the matched string (the text preceding the match) - is written to the port. - \item If an item is \ex{'post}, - the suffix of the matched string is written. -\end{itemize} - -If \var{port} is {\sharpf}, nothing is written, and a string is constructed -and returned instead. -\end{desc} - -\defun{regexp-substitute/global}{port regexp string . items} - {{\str} or \undefined} -\begin{desc} -This procedure is similar to \ex{regexp-substitute}, -but can be used to perform repeated match/substitute operations over -a string. -It has the following differences with \ex{regexp-substitute}: -\begin{itemize} - \item It takes a regular expression and string to be matched as - parameters, instead of a completed match structure. - \item If the regular expression doesn't match the string, this - procedure is the identity transform---it returns or outputs the - string. - \item If an item is \ex{'post}, the procedure recurses on the suffix string - (the text from \var{string} following the match). - Including a \ex{'post} in the list of items is how one gets multiple - match/substitution operations. - \item If an item is a procedure, it is applied to the match structure for - a given match. - The procedure returns a string to be used in the result. - \end{itemize} -The \var{regexp} parameter can be either a compiled regular expression or -a string specifying a regular expression. - -Some examples: -{\small -\begin{widecode} -;;; Replace occurrences of "Cotton" with "Jin". -(regexp-substitute/global #f "Cotton" s - 'pre "Jin" 'post) - -;;; mm/dd/yy -> dd/mm/yy date conversion. -(regexp-substitute/global #f "([0-9]+)/([0-9]+)/([0-9]+)" ; mm/dd/yy - s ; Source string - 'pre 2 "/" 1 "/" 3 'post) - -;;; "9/29/61" -> "Sep 29, 1961" date conversion. -(regexp-substitute/global #f "([0-9]+)/([0-9]+)/([0-9]+)" ; mm/dd/yy - s ; Source string - - 'pre - ;; Sleazy converter -- ignores "year 2000" issue, and blows up if - ;; month is out of range. 
- (lambda (m) - (let ((mon (vector-ref '#("Jan" "Feb" "Mar" "Apr" "May" "Jun" - "Jul" "Aug" "Sep" "Oct" "Nov" "Dec") - (- (string->number (match:substring m 1)) 1))) - (day (match:substring m 2)) - (year (match:substring m 3))) - (string-append mon " " day ", 19" year))) - 'post) - -;;; Remove potentially offensive substrings from string S. -(regexp-substitute/global #f "Windows|tcl|Intel" s - 'pre 'post)\end{widecode}} - -\end{desc} - -\subsection{Other string manipulation facilities} - -\defun {index} {string char [start]} {{\fixnum} or false} -\defunx {rindex} {string char [start]} {{\fixnum} or false} -\begin{desc} - These procedures search through \var{string} looking for an occurrence - of character \var{char}. \ex{index} searches left-to-right; \ex{rindex} - searches right-to-left. - - \ex{index} returns the smallest index $i$ of \var{string} greater - than or equal to \var{start} such that $\var{string}[i] = \var{char}$. - The default for \var{start} is zero. If there is no such match, - \ex{index} returns false. - - \ex{rindex} returns the largest index $i$ of \var{string} less than - \var{start} such that $\var{string}[i] = \var{char}$. - The default for \var{start} is \ex{(string-length \var{string})}. - If there is no such match, \ex{rindex} returns false. -\end{desc} - -I should probably snarf all the MIT Scheme string functions, and stick them -in a package. {\Unix} programs need to mung character strings a lot. - -MIT string match commands: -\begin{tightcode} -[sub]string-match-{forward,backward}[-ci] -[sub]string-{prefix,suffix}[-ci]? -[sub]string-find-{next,previous}-char[-ci] -[sub]string-find-{next,previous}-char-in-set -[sub]string-replace[!] -\ldots\etc\end{tightcode} -These are not currently provided. - -\begin{defundesc} {substitute-env-vars} {fname} \str - Replace occurrences of environment variables with their values. - An environment variable is denoted by a dollar sign followed by - alphanumeric chars and underscores, or is surrounded by braces. - - \begin{exampletable} - \splitline{\ex{(substitute-env-vars "\$USER/.login")}} - {\ex{"shivers/.login"}} \\ - \cd{(substitute-env-vars "$\{USER\}_log")} & \cd{"shivers_log"} - \end{exampletable} -\end{defundesc} - +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Manipulating file-names} \label{sec:filenames} @@ -559,6 +354,53 @@ defined in the previous section, is also frequently useful for expanding file-names. +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Other string manipulation facilities} + +\defun {index} {string char [start]} {{\fixnum} or false} +\defunx {rindex} {string char [start]} {{\fixnum} or false} +\begin{desc} + These procedures search through \var{string} looking for an occurrence + of character \var{char}. \ex{index} searches left-to-right; \ex{rindex} + searches right-to-left. + + \ex{index} returns the smallest index $i$ of \var{string} greater + than or equal to \var{start} such that $\var{string}[i] = \var{char}$. + The default for \var{start} is zero. If there is no such match, + \ex{index} returns false. + + \ex{rindex} returns the largest index $i$ of \var{string} less than + \var{start} such that $\var{string}[i] = \var{char}$. + The default for \var{start} is \ex{(string-length \var{string})}. + If there is no such match, \ex{rindex} returns false. +\end{desc} + +I should probably snarf all the MIT Scheme string functions, and stick them +in a package. 
{\Unix} programs need to mung character strings a lot. + +MIT string match commands: +\begin{tightcode} +[sub]string-match-{forward,backward}[-ci] +[sub]string-{prefix,suffix}[-ci]? +[sub]string-find-{next,previous}-char[-ci] +[sub]string-find-{next,previous}-char-in-set +[sub]string-replace[!] +\ldots\etc\end{tightcode} +These are not currently provided. + +\begin{defundesc} {substitute-env-vars} {fname} \str + Replace occurrences of environment variables with their values. + An environment variable is denoted by a dollar sign followed by + alphanumeric chars and underscores, or is surrounded by braces. + + \begin{exampletable} + \splitline{\ex{(substitute-env-vars "\$USER/.login")}} + {\ex{"shivers/.login"}} \\ + \cd{(substitute-env-vars "$\{USER\}_log")} & \cd{"shivers_log"} + \end{exampletable} +\end{defundesc} + + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{ASCII encoding} @@ -596,47 +438,36 @@ assumption into your code if you can help it.\footnote{ Is the object \var{x} a character set? \end{desc} -\defun{char-set=}{cs1 cs2}\boolean +\defun{char-set=}{\vari{cs}1 \vari{cs}2\ldots}\boolean \begin{desc} -Are the character sets \var{cs1} and \var{cs2} equal? +Are the character sets equal? \end{desc} -\defun{char-set<=}{cs1 cs2}\boolean +\defun{char-set<=}{\vari{cs}1 \vari{cs}2\ldots}\boolean \begin{desc} -Returns true if character set \var{cs1} is a subset of character set \var{cs2}. +Returns true if every character set \vari{cs}{i} is +a subset of character set \vari{cs}{i+1}. \end{desc} -\defun{reduce-char-set}{kons knil cs}\object +\defun{char-set-fold}{kons knil cs}\object \begin{desc} This is the fundamental iterator for character sets. -Reduces the function \var{kons} across the character set \var{cs} using +Applies the function \var{kons} across the character set \var{cs} using initial state value \var{knil}. That is, if \var{cs} is the empty set, the procedure returns \var{knil}. Otherwise, some element \var{c} of \var{cs} is chosen; let \var{cs'} be the remaining, unchosen characters. The procedure returns \begin{tightcode} -(reduce-char-set \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode} +(char-set-fold \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode} For example, we could define \ex{char-set-members} (see below) as \begin{tightcode} -(lambda (cs) (reduce-char-set cons '() cs))\end{tightcode} -\end{desc} +(lambda (cs) (char-set-fold cons '() cs))\end{tightcode} -\subsection{Side effects} -\defun{set-char-set!}{cs char in?}{\undefined} -\begin{desc} -This side-effects character set \var{cs}. -If \var{in?} is true, character \var{char} is added to the set. -Otherwise, it is deleted from the set. - -Use of this procedure is deprecated, since it could damage other procedures -that retain pointers to existing character sets. -You should use \ex{set-char-set!} in contexts where it is guaranteed that -there are no other pointers to the character set being modified. -(For example, functions that create character sets can use this function -to efficiently construct the character set, after which time the set is -used in a pure-functional, shared manner.) +\remark{This procedure was formerly named \texttt{\indx{reduce-char-set}}. 
+ The old binding is still provided, but is deprecated and will + probably vanish in a future release.} \end{desc} \defun{char-set-for-each}{p cs}{\undefined} @@ -646,11 +477,7 @@ Note that the order in which \var{p} is applied to the characters in the set is not specified, and may even change from application to application. \end{desc} -\defun{copy-char-set}{cs}{char-set} -\begin{desc} -Returns a copy of the character set \var{cs}. -\end{desc} - +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Creating character sets} \defun{char-set}{\vari{char}1\ldots}{char-set} @@ -680,6 +507,7 @@ Returns a character set containing every character whose {\Ascii} code lies in the half-open range $[\var{lower},\var{upper})$. \end{desc} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Querying character sets} \defun {char-set-members}{char-set}{character-list} \begin{desc} @@ -706,7 +534,23 @@ We sought to escape the dilemma by shifting to a new name.} Returns the number of elements in character set \var{cs}. \end{desc} -\subsection{Character set algebra} +\defun{char-set-every?}{pred cs}\boolean +\defunx{char-set-any?}{pred cs}\object +\begin{desc} +The \ex{char-set-every?} procedure returns true if predicate \var{pred} +returns true of every character in the character set \var{cs}. + +Likewise, \ex{char-set-any?} applies \var{pred} to every character in +character set \var{cs}, and returns the first true value it finds. +If no character produces a true value, it returns false. + +The order in which these procedures sequence through the elements of +\var{cs} is not specified. +\end{desc} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Character-set algebra} \defun {char-set-invert}{char-set}{char-set} \defunx{char-set-union}{\vari{char-set}1\ldots}{char-set} \defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set} @@ -719,6 +563,14 @@ to the left; the difference function requires at least one argument, while union and intersection may be applied to zero arguments. \end{desc} +\defun {char-set-adjoin}{cs \vari{char}1\ldots}{char-set} +\defunx{char-set-delete}{cs \vari{char}1\ldots}{char-set} +\begin{desc} +Add/delete the \vari{char}i characters to/from character set \var{cs}. 
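+For example, using \ex{char-set=} (described above) to check the results:
+\begin{inset}
+\begin{verbatim}
+(char-set= (char-set-adjoin (char-set #\a #\b) #\c)
+           (char-set #\a #\b #\c))                ; => #t
+
+(char-set= (char-set-delete (char-set #\a #\b #\c) #\b)
+           (char-set #\a #\c))                    ; => #t\end{verbatim}
+\end{inset}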
+\end{desc} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Standard character sets} \label{sec:std-csets} Several character sets are predefined for convenience: @@ -727,25 +579,25 @@ Several character sets are predefined for convenience: \newcommand{\entry}[1]{\ex{#1}\index{#1}} \begin{tabular}{|ll|} \hline -\entry{char-set:alphabetic} & Alphabetic chars \\ \entry{char-set:lower-case} & Lower-case alphabetic chars \\ \entry{char-set:upper-case} & Upper-case alphabetic chars \\ +\entry{char-set:alphabetic} & Alphabetic chars \\ \entry{char-set:numeric} & Decimal digits: 0--9 \\ \entry{char-set:alphanumeric} & Alphabetic or numeric \\ \entry{char-set:graphic} & Printing characters except space \\ \entry{char-set:printing} & Printing characters including space \\ \entry{char-set:whitespace} & Whitespace characters \\ -\entry{char-set:blank} & Blank characters \\ \entry{char-set:control} & Control characters \\ \entry{char-set:punctuation} & Punctuation characters \\ \entry{char-set:hex-digit} & A hexadecimal digit: 0--9, A--F, a--f \\ +\entry{char-set:blank} & Blank characters \\ \entry{char-set:ascii} & A character in the ASCII set. \\ \entry{char-set:empty} & Empty set \\ \entry{char-set:full} & All characters \\ \hline \end{tabular} \end{center} -The first twelve of these correspond to the character classes defined in +The first eleven of these correspond to the character classes defined in Posix. Note that there may be characters in \ex{char-set:alphabetic} that are neither upper or lower case---this might occur in implementations that @@ -788,3 +640,87 @@ char-set:punctuation & \verb|!"#$%&'()*+,-./:;<=>|\verb#?@[\]^_`{|}~# \\ \begin{desc} These predicates are defined in terms of the above character sets. \end{desc} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Linear-update character-set operations} +These procedures have a hybrid pure-functional/side-effecting semantics: +they are allowed, but not required, to side-effect one of their parameters +in order to construct their result. +An implementation may legally implement these procedures as pure, +side-effect-free functions, or it may implement them using side effects, +depending upon the details of what is the most efficient or simple to +implement in terms of the underlying representation. + +What this means is that clients of these procedures \emph{may not} rely +upon these procedures working by side effect. +For example, this is not guaranteed to work: +\begin{verbatim} +(let ((cs (char-set #\a #\b #\c))) + (char-set-adjoin! cs #\d) + cs) ; Could be either {a,b,c} or {a,b,c,d}. +\end{verbatim} +However, this is well-defined: +\begin{verbatim} +(let ((cs (char-set #\a #\b #\c))) + (char-set-adjoin! cs #\d)) ; {a,b,c,d} +\end{verbatim} +So clients of these procedures write in a functional style, but must +additionally be sure that, when the procedure is called, there are no +other live pointers to the potentially-modified character set (hence the term +``linear update''). + +There are two benefits to this convention: +\begin{itemize} +\item Implementations are free to provide the most efficient possible + implementation, either functional or side-effecting. +\item Programmers may nonetheless continue to assume that character sets + are purely functional data structures: they may be reliably shared + without needing to be copied, uniquified, and so forth. 
+\end{itemize}
+
+In practice, these procedures are most useful for efficiently constructing
+character sets in a side-effecting manner, in some limited local context,
+before passing the character set outside the local construction scope to be
+used in a functional manner.
+
+Scsh provides no assistance in checking the linearity of the potentially
+side-effected parameters passed to these functions --- there's no linear
+type checker or run-time mechanism for detecting violations.
+
+\defun{char-set-copy}{cs}{char-set}
+\begin{desc}
+Returns a copy of the character set \var{cs}.
+``Copy'' means that if either the input parameter or the
+result value of this procedure is passed to one of the linear-update
+procedures described below, the other character set is guaranteed
+not to be altered.
+(A system that provides pure-functional implementations of the rest of
+the linear-operator suite could implement this procedure as the
+identity function.)
+\end{desc}
+
+\defun{char-set-adjoin!}{cs \vari{char}1\ldots}{char-set}
+\begin{desc}
+Add the \vari{char}i characters to character set \var{cs}, and
+return the result.
+This procedure is allowed, but not required, to side-effect \var{cs}.
+\end{desc}
+
+\defun{char-set-delete!}{cs \vari{char}1\ldots}{char-set}
+\begin{desc}
+Remove the \vari{char}i characters from character set \var{cs}, and
+return the result.
+This procedure is allowed, but not required, to side-effect \var{cs}.
+\end{desc}
+
+\defun {char-set-invert!}{char-set}{char-set}
+\defunx{char-set-union!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
+\defunx{char-set-intersection!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
+\defunx{char-set-difference!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
+\begin{desc}
+These procedures implement set complement, union, intersection, and difference
+for character sets.
+They are allowed, but not required, to side-effect their first parameter.
+The union, intersection, and difference operations are n-ary, associating
+to the left.
+\end{desc}
diff --git a/doc/scsh-manual/syscalls.tex b/doc/scsh-manual/syscalls.tex
index 1f1257c..cc25f93 100644
--- a/doc/scsh-manual/syscalls.tex
+++ b/doc/scsh-manual/syscalls.tex
@@ -1464,6 +1464,12 @@ All wild-card characters in \var{str} are quoted with a backslash.
 \begin{defundesc}{file-match}{root dot-files? \vari{pat}1 \vari{pat}2 {\ldots}
                               \vari{pat}n}{string list}
+  \note{This procedure is deprecated, and will probably either go away or
+        be substantially altered in a future release. New code should not
+        call this procedure. The problem is that it relies upon
+        Posix-notation regular expressions; the rest of scsh has been
+        converted over to the new SRE notation.}
+
   \ex{file-match} provides a more powerful file-matching service, at the
   expense of a less convenient notation. It is intermediate in power between
   most shell matching machinery and recursive \ex{find(1)}.