Updated documentation for the new release. Mostly new material for the
SRE regexp system.
This commit is contained in:
parent
239592923f
commit
0d3f69124f
|
@ -77,7 +77,7 @@ characters.
|
|||
|
||||
\subsection{Parsing fields}
|
||||
|
||||
\defun {field-splitter} {[regexp num-fields]} \proc
|
||||
\defun {field-splitter} {[field num-fields]} \proc
|
||||
\defunx {infix-splitter} {[delim num-fields handle-delim]} \proc
|
||||
\defunx {suffix-splitter} {[delim num-fields handle-delim]} \proc
|
||||
\defunx {sloppy-suffix-splitter} {[delim num-fields handle-delim]} \proc
|
||||
|
@ -107,10 +107,10 @@ These functions return a parser function that can be used as follows:
|
|||
Defaults:
|
||||
\begin{tightinset}
|
||||
\begin{tabular}{l@{\quad=\quad }ll}
|
||||
\var{delim} & \verb!"[ \t\n]+|$"! & (suffix delimiter: white space or eos) \\
|
||||
\multicolumn{1}{l}{} & \verb|"[ \t\n]+"| & (infix delimiter: white space) \\
|
||||
|
||||
\var{re} & \verb|"[^ \t\n]+"| & (non-white-space) \\
|
||||
\var{delim} & \ex{(rx (| (+ white) eos))} & (suffix delimiter: white space or eos) \\
|
||||
\multicolumn{1}{l}{} & \ex{(rx (+ white))} & (infix delimiter: white space) \\
|
||||
|
||||
\var{field} & \verb|(rx (+ (~ white)))| & (non-white-space) \\
|
||||
|
||||
\var{num-fields} & \verb|#f| & (as many fields as possible) \\
|
||||
|
||||
|
@ -120,15 +120,30 @@ These functions return a parser function that can be used as follows:
|
|||
{\ldots}which means: break the string at white space, discarding the
|
||||
white space, and parse as many fields as possible.
|
||||
|
||||
The regular expression \var{delim} is used to match field delimiters.
|
||||
It can be either a string or a compiled regexp structure (see the
|
||||
\ex{make-regexp} procedure). In the separator case, it defaults to a
|
||||
regular expression matching white space; in the terminator case,
|
||||
The \var{delim} parameter is a regular expression matching the text
|
||||
that occurs between fields.
|
||||
See chapter~\ref{chapt:sre} for information on regular expressions,
|
||||
and the \ex{rx} form used to specify them.
|
||||
In the separator case,
|
||||
it defaults to a pattern matching white space;
|
||||
in the terminator case,
|
||||
it defaults to white space or end-of-string.
|
||||
|
||||
The regular expression \var{re} is a regular expression used
|
||||
The \var{field} parameter is a regular expression used
|
||||
to match fields. It defaults to non-white-space.
|
||||
|
||||
The \var{delim} patterns may also be given as a string,
|
||||
character, or char-set, which are coerced to regular expressions.
|
||||
So the following expressions are all equivalent,
|
||||
each producing a function that splits strings apart at colons:
|
||||
\begin{inset}
|
||||
\begin{verbatim}
|
||||
(infix-splitter (rx ":"))
|
||||
(infix-splitter ":")
|
||||
(infix-splitter #\:)
|
||||
(infix-splitter (char-set #\:))\end{verbatim}
|
||||
\end{inset}
|
||||
|
||||
The boolean \var{handle-delim} determines what to do with delimiters.
|
||||
\begin{tightinset}\begin{tabular}{ll}
|
||||
\ex{'trim} & Delimiters are thrown away after parsing. (default) \\
|
||||
|
@ -178,7 +193,7 @@ These functions return a parser function that can be used as follows:
|
|||
|
||||
It is an error if a non-empty record does not end with a delimiter.
|
||||
To make the last delimiter optional, make sure the delimiter regexp
|
||||
matches the end-of-string (regexp \ex{"\$"}).
|
||||
matches the end-of-string (sre \ex{eos}).
|
||||
|
||||
\item [\ex{infix-splitter}]
|
||||
Delimiters are interpreted as element \emph{separators}. If comma is the
|
||||
|
@ -222,7 +237,8 @@ These functions return a parser function that can be used as follows:
|
|||
initial delimiter string if the string begins with one instead of parsing
|
||||
an initial empty field. This can be used, for example, to field-split a
|
||||
sequence of English text at white-space boundaries, where the string may
|
||||
begin or end with white space, by using regex \verb!"[ \t]+|$"!.
|
||||
begin or end with white space, by using regex
|
||||
\codex{(rx (| (+ white) eos))}
|
||||
(But you would be better off using \ex{field-splitter} in this case.)
|
||||
\end{description}
|
||||
\end{desc}
|
||||
|
@ -318,25 +334,26 @@ Record & : suffix & \verb!:|$! suffix & : infix & non-: field \\
|
|||
; wandy:3xuncWdpKhR.:73:22:Wandy Saetan:/usr/wandy:/bin/csh
|
||||
|
||||
;;; Two ls -l output readers
|
||||
(field-reader (infix-splitter "[ \\t]+" 8))
|
||||
(field-reader (infix-splitter "[ \\t]+" -7))
|
||||
(field-reader (infix-splitter (rx (+ white)) 8))
|
||||
(field-reader (infix-splitter (rx (+ white)) -7))
|
||||
; -rw-r--r-- 1 shivers 22880 Sep 24 12:45 scsh.scm
|
||||
|
||||
;;; Internet hostname reader
|
||||
(field-reader (field-splitter "[^.]+"))
|
||||
(field-reader (field-splitter (rx (+ (~ ".")))))
|
||||
; stat.sinica.edu.tw
|
||||
|
||||
;;; Internet IP address reader
|
||||
(field-reader (field-splitter "[^.]+" 4))
|
||||
(field-reader (field-splitter (rx (+ (~ "."))) 4))
|
||||
; 18.24.0.241
|
||||
|
||||
;;; Line of integers
|
||||
(let ((parser (field-splitter "[+-]?[0-9]+")))
|
||||
(let ((parser (field-splitter (rx (? ("+-")) (+ digit)))))
|
||||
(field-reader (\l{s} (map string->number (parser s))))
|
||||
; 18 24 0 241
|
||||
|
||||
;;; Same as above.
|
||||
(let ((reader (field-reader (field-splitter "[+-]?[0-9]+"))))
|
||||
(let ((reader (field-reader (field-splitter (rx (? ("+-"))
|
||||
(+ digit))))))
|
||||
(\lx{maybe-port} (map string->number (apply reader maybe-port))))
|
||||
; Yale beat harvard 26 to 7.\end{centercode}
|
||||
\caption{Some examples of \protect\ex{field-reader}}
|
||||
|
@ -349,8 +366,9 @@ Record & : suffix & \verb!:|$! suffix & : infix & non-: field \\
|
|||
\subsection{Forward-progress guarantees and empty-string matches}
|
||||
A loop that pulls text off a string by repeatedly matching a regexp
|
||||
against that string can conceivably get stuck in an infinite loop if
|
||||
the regexp matches the empty string. For example, the regexps \verb|^|,
|
||||
\verb|$|, \verb|.*|, and \verb!foo|[^f]*! can all match the empty string.
|
||||
the regexp matches the empty string. For example, the SREs
|
||||
\ex{bos}, \ex{eos}, \ex{(* any)}, and \ex{(| "foo" (* (~ "f")))}
|
||||
can all match the empty string.
|
||||
|
||||
The routines in this package that iterate through strings with regular
|
||||
expressions are careful to handle this empty-string case.
|
||||
|
@ -369,10 +387,10 @@ progress, and the loop is guaranteed to terminate.
|
|||
This has the effect you want with field parsing. For example, if you split
|
||||
a string with the empty pattern, you will explode the string into its
|
||||
individual characters:
|
||||
\codex{((suffix-splitter "") "foo") {\evalto} ("" "f" "o" "o")}
|
||||
\codex{((suffix-splitter (rx)) "foo") {\evalto} ("" "f" "o" "o")}
|
||||
However, even though this boundary case is handled correctly, we don't
|
||||
recommend using it. Say what you mean---just use a field splitter:
|
||||
\codex{((field-splitter ".") "foo") {\evalto} ("f" "o" "o")}
|
||||
\codex{((field-splitter (rx any)) "foo") {\evalto} ("f" "o" "o")}
|
||||
Or, more efficiently,
|
||||
\codex{((\l{s} (map string (string->list s))) "foo")}
|
||||
|
||||
|
@ -478,15 +496,25 @@ it checks them all.
|
|||
|
||||
The \var{test} form can be one of:
|
||||
\begin{inset}
|
||||
\begin{tabular}{lp{0.8\linewidth}}
|
||||
integer: & The test is true for that iteration of the loop.
|
||||
\begin{tabular}{lp{0.6\linewidth}}
|
||||
\var{integer}: & The test is true for that iteration of the loop.
|
||||
The first iteration is \#1. \\
|
||||
|
||||
string: & The string is a regular expression. The test is
|
||||
true if the regexp matches the record.\\
|
||||
\var{sre}: & A regular expression, in SRE notation
|
||||
(see chapter~\ref{chapt:sre}), can be used as
|
||||
a test. The test is successful if the pattern
|
||||
matches the record.
|
||||
In particular, note that any string is an SRE. \\
|
||||
|
||||
expression & If not an integer or a string, the test form is
|
||||
a Scheme expression that is evaluated.
|
||||
\ex{(when \var{expr})}: &
|
||||
The body of a \ex{when} test is evaluated as a
|
||||
Scheme boolean expression in the inner scope of the
|
||||
\ex{awk} form. \\
|
||||
|
||||
\var{expr}: & If the form is none of the above, it is treated as
|
||||
a Scheme expression---in practice, the \ex{when}
|
||||
keyword is only needed in cases where SRE/Scheme
|
||||
expression ambiguity might occur.
|
||||
\end{tabular}
|
||||
\end{inset}
|
||||
|
||||
|
@ -526,7 +554,7 @@ it checks them all.
|
|||
\itum{\ex{(\var{test} => \var{exp})}}
|
||||
If evaluating \ex{test} produces a true value,
|
||||
apply \var{exp} to that value.
|
||||
If \var{test} is a regular-expression string, then \var{exp} is applied
|
||||
If \var{test} is a regular expression, then \var{exp} is applied
|
||||
to the match data structure returned by the regexp match routine.
|
||||
|
||||
\itum{\ex{(after \vari{body}1 \ldots)}}
|
||||
|
@ -562,9 +590,10 @@ of input stream.
|
|||
(call-with-input-file "/etc/passwd"
|
||||
(lambda (port)
|
||||
(awk (read-passwd port) (record fields) ()
|
||||
("^S" (format #t "~a's home directory is ~a~%"
|
||||
($ fields 0)
|
||||
($ fields 5)))))))\end{code}
|
||||
((: bos "S")
|
||||
(format #t "~a's home directory is ~a~%"
|
||||
($ fields 0)
|
||||
($ fields 5)))))))\end{code}
|
||||
|
||||
\begin{code}
|
||||
;;; Read a series of integers from stdin. This expression evaluates
|
||||
|
@ -581,8 +610,8 @@ of input stream.
|
|||
\begin{code}
|
||||
;;; Count the number of non-comment lines of code in my Scheme source.
|
||||
(awk (read-line) (line) ((nlines 0))
|
||||
("^[ \\t]*;" nlines) ; A comment line.
|
||||
(else (+ nlines 1))) ; Not a comment line.\end{code}
|
||||
((: bos (* white) ";") nlines) ; A comment line.
|
||||
(else (+ nlines 1))) ; Not a comment line.\end{code}
|
||||
|
||||
\begin{code}
|
||||
;;; Read numbers, counting the evens and odds.
|
||||
|
@ -600,10 +629,10 @@ of input stream.
|
|||
(#t (max max-len (string-length line))))\end{code}
|
||||
|
||||
\begin{code}
|
||||
;;; (This could also be done with REDUCE-PORT:)
|
||||
(reduce-port (current-input-port) read-line
|
||||
(lambda (line maxlen) (max (string-length line) maxlen))
|
||||
0)\end{code}
|
||||
;;; (This could also be done with PORT-FOLDL:)
|
||||
(port-foldl (current-input-port) read-line
|
||||
(lambda (line maxlen) (max (string-length line) maxlen))
|
||||
0)\end{code}
|
||||
|
||||
\begin{code}
|
||||
;;; Print every line longer than 80 chars.
|
||||
|
@ -615,7 +644,7 @@ of input stream.
|
|||
\begin{code}
|
||||
;;; Strip blank lines from input.
|
||||
(awk (read-line) (line) ()
|
||||
("." (display line) (newline)))\end{code}
|
||||
((~ white) (display line) (newline)))\end{code}
|
||||
|
||||
\begin{code}
|
||||
;;; Sort the entries in /etc/passwd by login name.
|
||||
|
@ -629,3 +658,14 @@ of input stream.
|
|||
;;; Prefix line numbers to the input stream.
|
||||
(awk (read-line) (line) lineno ()
|
||||
(#t (format #t "~d:\\t~a~%" lineno line)))\end{code}
|
||||
|
||||
|
||||
\section{Backwards compatibility}
|
||||
|
||||
Previous scsh releases provided an \ex{awk} form with a different syntax,
|
||||
designed around regular expressions written in Posix notation as strings,
|
||||
rather than SREs.
|
||||
|
||||
This form is still available in a separate module for old code.
|
||||
It'll be documented in the next release of this manual. Dig around
|
||||
in the sources for it.
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
|
||||
% For multiletter vars in math mode:
|
||||
\newcommand{\var}[1]{\mbox{\frenchspacing\it{#1}}}
|
||||
\newcommand{\vari}[2]{${\mbox{\it{#1}}}_{#2}$}
|
||||
\newcommand{\vari}[2]{\ensuremath{\mbox{\it{#1}}_{#2}}}
|
||||
|
||||
%% What you frequently want when you say \tt:
|
||||
\def\ttchars{\catcode``=13\@noligs\frenchspacing}
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
%&latex -*- latex -*-
|
||||
|
||||
\title{Scsh Reference Manual}
|
||||
\subtitle{For scsh release 0.5}
|
||||
\subtitle{For scsh release 0.5.2}
|
||||
\author{Olin Shivers and Brian D.~Carlstrom}
|
||||
\date{April 11, 1997}
|
||||
\date{September 1999}
|
||||
|
||||
\maketitle
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
\include{syscalls}
|
||||
\include{network}
|
||||
\include{strings}
|
||||
\include{sre}
|
||||
\include{rdelim}
|
||||
\include{awk}
|
||||
\include{miscprocs}
|
||||
|
|
|
@ -324,12 +324,12 @@ run/sexp* $\equiv$ read $\circ$ run/port*
|
|||
run/sexps* $\equiv$ port->sexp-list $\circ$ run/port*\end{code}
|
||||
\end{desc}
|
||||
|
||||
\defun{reduce-port}{port reader op . seeds} {\object\star}
|
||||
\defun{port-foldl}{port reader op . seeds} {\object\star}
|
||||
\begin{desc}
|
||||
This procedure can be used to perform a variety of iterative operations
|
||||
over an input stream.
|
||||
It repeatedly uses \var{reader} to read an object from \var{port}.
|
||||
If the first read returns eof, then the entire \ex{reduce-port}
|
||||
If the first read returns eof, then the entire \ex{port-foldl}
|
||||
operation returns the seeds as multiple values.
|
||||
If the first read operation returns some other value $v$, then
|
||||
\var{op} is applied to $v$ and the seeds:
|
||||
|
@ -340,11 +340,15 @@ reading a new value from the port, and so forth.
|
|||
|
||||
For example, \ex{(port->list \var{reader} \var{port})}
|
||||
could be defined as
|
||||
\codex{(reverse (reduce-port \var{port} \var{reader} cons '()))}
|
||||
\codex{(reverse (port-foldl \var{port} \var{reader} cons '()))}
|
||||
|
||||
An imperative way to look at \ex{reduce-port} is to say that it
|
||||
An imperative way to look at \ex{port-foldl} is to say that it
|
||||
abstracts the idea of a loop over a stream of values read from
|
||||
some port, where the seed values express the loop state.
|
||||
|
||||
\remark{This procedure was formerly named \texttt{\indx{reduce-port}}.
|
||||
The old binding is still provided, but is deprecated and will
|
||||
probably vanish in a future release.}
|
||||
\end{desc}
|
||||
|
||||
|
||||
|
|
|
@ -5,10 +5,11 @@ Scsh provides a set of procedures for processing strings and characters.
|
|||
The procedures provided match regular expressions, search strings,
|
||||
parse file-names, and manipulate sets of characters.
|
||||
|
||||
Also see chapters \ref{chapt:rdelim} and \ref{chapt:fr-awk}
|
||||
on record I/O, field parsing, and the awk loop.
|
||||
The procedures documented there allow you to read character-delimited
|
||||
records from ports, use regular expressions to split the records into fields
|
||||
Also see chapters \ref{chapt:sre}, \ref{chapt:rdelim}, and \ref{chapt:fr-awk}
|
||||
on regular expressions, record I/O, field parsing, and the awk loop.
|
||||
The procedures documented there allow you to search and pattern-match strings,
|
||||
read character-delimited records from ports,
|
||||
use regular expressions to split the records into fields
|
||||
(for example, splitting a string at every occurrence of colon or white-space),
|
||||
and loop over streams of these records in a convenient way.
|
||||
|
||||
|
@ -19,213 +20,7 @@ and loop over streams of these records in a convenient way.
|
|||
Strings are the basic communication medium for {\Unix} processes, so a
|
||||
shell language must have reasonable facilities for manipulating them.
|
||||
|
||||
\subsection{Regular expressions}
|
||||
\label{sec:regexps}
|
||||
|
||||
The following functions perform regular expression matching.
|
||||
The code uses Henry Spencer's regular expression package.
|
||||
|
||||
\begin{defundesc}{string-match} {regexp string [start]} {match or false}
|
||||
Search \var{string} starting at position \var{start}, looking for a match
|
||||
for \var{regexp}. If a match is found, return a match structure describing
|
||||
the match, otherwise {\sharpf}. \var{Start} defaults to 0.
|
||||
|
||||
\var{regexp} may be a compiled regexp structure or a string defining
|
||||
a regular expression, which will be compiled to a regexp structure.
|
||||
\end{defundesc}
|
||||
|
||||
\begin{defundesc} {regexp-match?} {obj} \boolean
|
||||
Is the object a regular expression match?
|
||||
\end{defundesc}
|
||||
|
||||
\begin{defundesc} {match:start} {match [match-number]} {{\fixnum} or false}
|
||||
Returns the start position of the match denoted by \var{match-number}.
|
||||
The whole regexp is 0. Each further number represents positions
|
||||
enclosed by \ex{(\ldots)} sections. \var{Match-number} defaults to 0.
|
||||
|
||||
If the regular expression matches as a whole,
|
||||
but a particular parenthesized sub-expression does not match, then
|
||||
\ex{match:start} returns {\sharpf}.
|
||||
\end{defundesc}
|
||||
|
||||
\begin{defundesc} {match:end} {match [match-number]} \fixnum
|
||||
Returns the end position of the match denoted by \var{match-number}.
|
||||
\var{Match-number} defaults to 0 (the whole match).
|
||||
|
||||
If the regular expression matches as a whole,
|
||||
but a particular parenthesized sub-expression does not match, then
|
||||
\ex{match:end} returns {\sharpf}.
|
||||
\end{defundesc}
|
||||
|
||||
\begin{defundesc} {match:substring} {match [match-number]} {{\str} or false}
|
||||
Returns the substring matched by match \var{match-number}.
|
||||
\var{Match-number} defaults to 0 (the whole match).
|
||||
If there was no match, returns false.
|
||||
\end{defundesc}
|
||||
|
||||
Regular expression matching compiles patterns into special data
|
||||
structures which can be efficiently used to match against strings.
|
||||
The overhead of compiling patterns that will be used for multiple
|
||||
searches can be avoided by these lower-level routines:
|
||||
%
|
||||
\begin{defundesc} {make-regexp} {str} {re}
|
||||
Generate a compiled regular expression from the given string.
|
||||
\end{defundesc}
|
||||
|
||||
\begin{defundesc} {regexp?} {obj} \boolean
|
||||
Is the object a regular expression?
|
||||
\end{defundesc}
|
||||
|
||||
\begin{defundesc} {regexp-exec} {regexp str [start]} {match or false}
|
||||
Apply the regular expression \var{regexp} to the string \var{str} starting
|
||||
at position \var{start}. If the match succeeds it returns a regexp-match,
|
||||
otherwise {\sharpf}. \var{Start} defaults to 0.
|
||||
\end{defundesc}
|
||||
|
||||
\begin{defundesc} {->regexp} {regexp-or-string} {regexp}
|
||||
Coerce the input value into a compiled regular expression:
|
||||
strings are compiled; regexp structures are passed through unchanged.
|
||||
\end{defundesc}
|
||||
|
||||
\defun{regexp-quote}{str}{\str}
|
||||
\begin{desc}
|
||||
Returns a regular expression that matches the string \var{str} exactly.
|
||||
In other words, it quotes the regular expression, prepending backslashes
|
||||
to all the special regexp characters in \var{str}.
|
||||
\begin{code}
|
||||
(regexp-quote "*Hello* world.")
|
||||
{\evalto}"\\\\*Hello\\\\* world\\\\."\end{code}
|
||||
\end{desc}
|
||||
|
||||
\defun{regexp-substitute}{port match . items}{{\str} or \undefined}
|
||||
\begin{desc}
|
||||
This procedure can be used to perform string substitutions based on
|
||||
regular expression matches.
|
||||
The results of the substitution can be either output to a port or
|
||||
returned as a string.
|
||||
|
||||
The \var{match} argument is a regular expression match structure
|
||||
that controls the substitution.
|
||||
If \var{port} is an output port, the \var{items} are written out to
|
||||
the port:
|
||||
\begin{itemize}
|
||||
\item If an item is a string, it is copied directly to the port.
|
||||
\item If an item is an integer, the corresponding submatch from \var{match}
|
||||
is written to the port.
|
||||
\item If an item is \ex{'pre},
|
||||
the prefix of the matched string (the text preceding the match)
|
||||
is written to the port.
|
||||
\item If an item is \ex{'post},
|
||||
the suffix of the matched string is written.
|
||||
\end{itemize}
|
||||
|
||||
If \var{port} is {\sharpf}, nothing is written, and a string is constructed
|
||||
and returned instead.
|
||||
\end{desc}
|
||||
|
||||
\defun{regexp-substitute/global}{port regexp string . items}
|
||||
{{\str} or \undefined}
|
||||
\begin{desc}
|
||||
This procedure is similar to \ex{regexp-substitute},
|
||||
but can be used to perform repeated match/substitute operations over
|
||||
a string.
|
||||
It has the following differences with \ex{regexp-substitute}:
|
||||
\begin{itemize}
|
||||
\item It takes a regular expression and string to be matched as
|
||||
parameters, instead of a completed match structure.
|
||||
\item If the regular expression doesn't match the string, this
|
||||
procedure is the identity transform---it returns or outputs the
|
||||
string.
|
||||
\item If an item is \ex{'post}, the procedure recurses on the suffix string
|
||||
(the text from \var{string} following the match).
|
||||
Including a \ex{'post} in the list of items is how one gets multiple
|
||||
match/substitution operations.
|
||||
\item If an item is a procedure, it is applied to the match structure for
|
||||
a given match.
|
||||
The procedure returns a string to be used in the result.
|
||||
\end{itemize}
|
||||
The \var{regexp} parameter can be either a compiled regular expression or
|
||||
a string specifying a regular expression.
|
||||
|
||||
Some examples:
|
||||
{\small
|
||||
\begin{widecode}
|
||||
;;; Replace occurrences of "Cotton" with "Jin".
|
||||
(regexp-substitute/global #f "Cotton" s
|
||||
'pre "Jin" 'post)
|
||||
|
||||
;;; mm/dd/yy -> dd/mm/yy date conversion.
|
||||
(regexp-substitute/global #f "([0-9]+)/([0-9]+)/([0-9]+)" ; mm/dd/yy
|
||||
s ; Source string
|
||||
'pre 2 "/" 1 "/" 3 'post)
|
||||
|
||||
;;; "9/29/61" -> "Sep 29, 1961" date conversion.
|
||||
(regexp-substitute/global #f "([0-9]+)/([0-9]+)/([0-9]+)" ; mm/dd/yy
|
||||
s ; Source string
|
||||
|
||||
'pre
|
||||
;; Sleazy converter -- ignores "year 2000" issue, and blows up if
|
||||
;; month is out of range.
|
||||
(lambda (m)
|
||||
(let ((mon (vector-ref '#("Jan" "Feb" "Mar" "Apr" "May" "Jun"
|
||||
"Jul" "Aug" "Sep" "Oct" "Nov" "Dec")
|
||||
(- (string->number (match:substring m 1)) 1)))
|
||||
(day (match:substring m 2))
|
||||
(year (match:substring m 3)))
|
||||
(string-append mon " " day ", 19" year)))
|
||||
'post)
|
||||
|
||||
;;; Remove potentially offensive substrings from string S.
|
||||
(regexp-substitute/global #f "Windows|tcl|Intel" s
|
||||
'pre 'post)\end{widecode}}
|
||||
|
||||
\end{desc}
|
||||
|
||||
\subsection{Other string manipulation facilities}
|
||||
|
||||
\defun {index} {string char [start]} {{\fixnum} or false}
|
||||
\defunx {rindex} {string char [start]} {{\fixnum} or false}
|
||||
\begin{desc}
|
||||
These procedures search through \var{string} looking for an occurrence
|
||||
of character \var{char}. \ex{index} searches left-to-right; \ex{rindex}
|
||||
searches right-to-left.
|
||||
|
||||
\ex{index} returns the smallest index $i$ of \var{string} greater
|
||||
than or equal to \var{start} such that $\var{string}[i] = \var{char}$.
|
||||
The default for \var{start} is zero. If there is no such match,
|
||||
\ex{index} returns false.
|
||||
|
||||
\ex{rindex} returns the largest index $i$ of \var{string} less than
|
||||
\var{start} such that $\var{string}[i] = \var{char}$.
|
||||
The default for \var{start} is \ex{(string-length \var{string})}.
|
||||
If there is no such match, \ex{rindex} returns false.
|
||||
\end{desc}
|
||||
|
||||
I should probably snarf all the MIT Scheme string functions, and stick them
|
||||
in a package. {\Unix} programs need to mung character strings a lot.
|
||||
|
||||
MIT string match commands:
|
||||
\begin{tightcode}
|
||||
[sub]string-match-{forward,backward}[-ci]
|
||||
[sub]string-{prefix,suffix}[-ci]?
|
||||
[sub]string-find-{next,previous}-char[-ci]
|
||||
[sub]string-find-{next,previous}-char-in-set
|
||||
[sub]string-replace[!]
|
||||
\ldots\etc\end{tightcode}
|
||||
These are not currently provided.
|
||||
|
||||
\begin{defundesc} {substitute-env-vars} {fname} \str
|
||||
Replace occurrences of environment variables with their values.
|
||||
An environment variable is denoted by a dollar sign followed by
|
||||
alphanumeric chars and underscores, or is surrounded by braces.
|
||||
|
||||
\begin{exampletable}
|
||||
\splitline{\ex{(substitute-env-vars "\$USER/.login")}}
|
||||
{\ex{"shivers/.login"}} \\
|
||||
\cd{(substitute-env-vars "$\{USER\}_log")} & \cd{"shivers_log"}
|
||||
\end{exampletable}
|
||||
\end{defundesc}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Manipulating file-names}
|
||||
\label{sec:filenames}
|
||||
|
||||
|
@ -559,6 +354,53 @@ defined in the previous section,
|
|||
is also frequently useful for expanding file-names.
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Other string manipulation facilities}
|
||||
|
||||
\defun {index} {string char [start]} {{\fixnum} or false}
|
||||
\defunx {rindex} {string char [start]} {{\fixnum} or false}
|
||||
\begin{desc}
|
||||
These procedures search through \var{string} looking for an occurrence
|
||||
of character \var{char}. \ex{index} searches left-to-right; \ex{rindex}
|
||||
searches right-to-left.
|
||||
|
||||
\ex{index} returns the smallest index $i$ of \var{string} greater
|
||||
than or equal to \var{start} such that $\var{string}[i] = \var{char}$.
|
||||
The default for \var{start} is zero. If there is no such match,
|
||||
\ex{index} returns false.
|
||||
|
||||
\ex{rindex} returns the largest index $i$ of \var{string} less than
|
||||
\var{start} such that $\var{string}[i] = \var{char}$.
|
||||
The default for \var{start} is \ex{(string-length \var{string})}.
|
||||
If there is no such match, \ex{rindex} returns false.
|
||||
\end{desc}
|
||||
|
||||
I should probably snarf all the MIT Scheme string functions, and stick them
|
||||
in a package. {\Unix} programs need to mung character strings a lot.
|
||||
|
||||
MIT string match commands:
|
||||
\begin{tightcode}
|
||||
[sub]string-match-{forward,backward}[-ci]
|
||||
[sub]string-{prefix,suffix}[-ci]?
|
||||
[sub]string-find-{next,previous}-char[-ci]
|
||||
[sub]string-find-{next,previous}-char-in-set
|
||||
[sub]string-replace[!]
|
||||
\ldots\etc\end{tightcode}
|
||||
These are not currently provided.
|
||||
|
||||
\begin{defundesc} {substitute-env-vars} {fname} \str
|
||||
Replace occurrences of environment variables with their values.
|
||||
An environment variable is denoted by a dollar sign followed by
|
||||
alphanumeric chars and underscores, or is surrounded by braces.
|
||||
|
||||
\begin{exampletable}
|
||||
\splitline{\ex{(substitute-env-vars "\$USER/.login")}}
|
||||
{\ex{"shivers/.login"}} \\
|
||||
\cd{(substitute-env-vars "$\{USER\}_log")} & \cd{"shivers_log"}
|
||||
\end{exampletable}
|
||||
\end{defundesc}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\section{ASCII encoding}
|
||||
|
||||
|
@ -596,47 +438,36 @@ assumption into your code if you can help it.\footnote{
|
|||
Is the object \var{x} a character set?
|
||||
\end{desc}
|
||||
|
||||
\defun{char-set=}{cs1 cs2}\boolean
|
||||
\defun{char-set=}{\vari{cs}1 \vari{cs}2\ldots}\boolean
|
||||
\begin{desc}
|
||||
Are the character sets \var{cs1} and \var{cs2} equal?
|
||||
Are the character sets equal?
|
||||
\end{desc}
|
||||
|
||||
\defun{char-set<=}{cs1 cs2}\boolean
|
||||
\defun{char-set<=}{\vari{cs}1 \vari{cs}2\ldots}\boolean
|
||||
\begin{desc}
|
||||
Returns true if character set \var{cs1} is a subset of character set \var{cs2}.
|
||||
Returns true if every character set \vari{cs}{i} is
|
||||
a subset of character set \vari{cs}{i+1}.
|
||||
\end{desc}
|
||||
|
||||
\defun{reduce-char-set}{kons knil cs}\object
|
||||
\defun{char-set-fold}{kons knil cs}\object
|
||||
\begin{desc}
|
||||
This is the fundamental iterator for character sets.
|
||||
Reduces the function \var{kons} across the character set \var{cs} using
|
||||
Applies the function \var{kons} across the character set \var{cs} using
|
||||
initial state value \var{knil}.
|
||||
That is, if \var{cs} is the empty set, the procedure returns \var{knil}.
|
||||
Otherwise, some element \var{c} of \var{cs} is chosen; let \var{cs'} be
|
||||
the remaining, unchosen characters.
|
||||
The procedure returns
|
||||
\begin{tightcode}
|
||||
(reduce-char-set \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode}
|
||||
(char-set-fold \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode}
|
||||
For example, we could define \ex{char-set-members} (see below)
|
||||
as
|
||||
\begin{tightcode}
|
||||
(lambda (cs) (reduce-char-set cons '() cs))\end{tightcode}
|
||||
\end{desc}
|
||||
(lambda (cs) (char-set-fold cons '() cs))\end{tightcode}
|
||||
|
||||
\subsection{Side effects}
|
||||
\defun{set-char-set!}{cs char in?}{\undefined}
|
||||
\begin{desc}
|
||||
This side-effects character set \var{cs}.
|
||||
If \var{in?} is true, character \var{char} is added to the set.
|
||||
Otherwise, it is deleted from the set.
|
||||
|
||||
Use of this procedure is deprecated, since it could damage other procedures
|
||||
that retain pointers to existing character sets.
|
||||
You should use \ex{set-char-set!} in contexts where it is guaranteed that
|
||||
there are no other pointers to the character set being modified.
|
||||
(For example, functions that create character sets can use this function
|
||||
to efficiently construct the character set, after which time the set is
|
||||
used in a pure-functional, shared manner.)
|
||||
\remark{This procedure was formerly named \texttt{\indx{reduce-char-set}}.
|
||||
The old binding is still provided, but is deprecated and will
|
||||
probably vanish in a future release.}
|
||||
\end{desc}
|
||||
|
||||
\defun{char-set-for-each}{p cs}{\undefined}
|
||||
|
@ -646,11 +477,7 @@ Note that the order in which \var{p} is applied to the characters in the
|
|||
set is not specified, and may even change from application to application.
|
||||
\end{desc}
|
||||
|
||||
\defun{copy-char-set}{cs}{char-set}
|
||||
\begin{desc}
|
||||
Returns a copy of the character set \var{cs}.
|
||||
\end{desc}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Creating character sets}
|
||||
|
||||
\defun{char-set}{\vari{char}1\ldots}{char-set}
|
||||
|
@ -680,6 +507,7 @@ Returns a character set containing every character whose {\Ascii}
|
|||
code lies in the half-open range $[\var{lower},\var{upper})$.
|
||||
\end{desc}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Querying character sets}
|
||||
\defun {char-set-members}{char-set}{character-list}
|
||||
\begin{desc}
|
||||
|
@ -706,7 +534,23 @@ We sought to escape the dilemma by shifting to a new name.}
|
|||
Returns the number of elements in character set \var{cs}.
|
||||
\end{desc}
|
||||
|
||||
\subsection{Character set algebra}
|
||||
\defun{char-set-every?}{pred cs}\boolean
|
||||
\defunx{char-set-any?}{pred cs}\object
|
||||
\begin{desc}
|
||||
The \ex{char-set-every?} procedure returns true if predicate \var{pred}
|
||||
returns true of every character in the character set \var{cs}.
|
||||
|
||||
Likewise, \ex{char-set-any?} applies \var{pred} to every character in
|
||||
character set \var{cs}, and returns the first true value it finds.
|
||||
If no character produces a true value, it returns false.
|
||||
|
||||
The order in which these procedures sequence through the elements of
|
||||
\var{cs} is not specified.
|
||||
\end{desc}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Character-set algebra}
|
||||
\defun {char-set-invert}{char-set}{char-set}
|
||||
\defunx{char-set-union}{\vari{char-set}1\ldots}{char-set}
|
||||
\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
|
||||
|
@ -719,6 +563,14 @@ to the left; the difference function requires at least one argument, while
|
|||
union and intersection may be applied to zero arguments.
|
||||
\end{desc}
|
||||
|
||||
\defun {char-set-adjoin}{cs \vari{char}1\ldots}{char-set}
|
||||
\defunx{char-set-delete}{cs \vari{char}1\ldots}{char-set}
|
||||
\begin{desc}
|
||||
Add/delete the \vari{char}i characters to/from character set \var{cs}.
|
||||
\end{desc}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Standard character sets}
|
||||
\label{sec:std-csets}
|
||||
Several character sets are predefined for convenience:
|
||||
|
@ -727,25 +579,25 @@ Several character sets are predefined for convenience:
|
|||
\newcommand{\entry}[1]{\ex{#1}\index{#1}}
|
||||
\begin{tabular}{|ll|}
|
||||
\hline
|
||||
\entry{char-set:alphabetic} & Alphabetic chars \\
|
||||
\entry{char-set:lower-case} & Lower-case alphabetic chars \\
|
||||
\entry{char-set:upper-case} & Upper-case alphabetic chars \\
|
||||
\entry{char-set:alphabetic} & Alphabetic chars \\
|
||||
\entry{char-set:numeric} & Decimal digits: 0--9 \\
|
||||
\entry{char-set:alphanumeric} & Alphabetic or numeric \\
|
||||
\entry{char-set:graphic} & Printing characters except space \\
|
||||
\entry{char-set:printing} & Printing characters including space \\
|
||||
\entry{char-set:whitespace} & Whitespace characters \\
|
||||
\entry{char-set:blank} & Blank characters \\
|
||||
\entry{char-set:control} & Control characters \\
|
||||
\entry{char-set:punctuation} & Punctuation characters \\
|
||||
\entry{char-set:hex-digit} & A hexadecimal digit: 0--9, A--F, a--f \\
|
||||
\entry{char-set:blank} & Blank characters \\
|
||||
\entry{char-set:ascii}       & A character in the {\Ascii} set \\
|
||||
\entry{char-set:empty} & Empty set \\
|
||||
\entry{char-set:full} & All characters \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
The first twelve of these correspond to the character classes defined in
|
||||
The first eleven of these correspond to the character classes defined in
|
||||
Posix.
|
||||
Note that there may be characters in \ex{char-set:alphabetic} that are
|
||||
neither upper nor lower case---this might occur in implementations that
|
||||
|
@ -788,3 +640,87 @@ char-set:punctuation & \verb|!"#$%&'()*+,-./:;<=>|\verb#?@[\]^_`{|}~# \\
|
|||
\begin{desc}
|
||||
These predicates are defined in terms of the above character sets.
|
||||
\end{desc}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\subsection{Linear-update character-set operations}
|
||||
These procedures have a hybrid pure-functional/side-effecting semantics:
|
||||
they are allowed, but not required, to side-effect one of their parameters
|
||||
in order to construct their result.
|
||||
An implementation may legally implement these procedures as pure,
|
||||
side-effect-free functions, or it may implement them using side effects,
|
||||
depending upon the details of what is the most efficient or simple to
|
||||
implement in terms of the underlying representation.
|
||||
|
||||
What this means is that clients of these procedures \emph{may not} rely
|
||||
upon these procedures working by side effect.
|
||||
For example, this is not guaranteed to work:
|
||||
\begin{verbatim}
|
||||
(let ((cs (char-set #\a #\b #\c)))
|
||||
(char-set-adjoin! cs #\d)
|
||||
cs) ; Could be either {a,b,c} or {a,b,c,d}.
|
||||
\end{verbatim}
|
||||
However, this is well-defined:
|
||||
\begin{verbatim}
|
||||
(let ((cs (char-set #\a #\b #\c)))
|
||||
(char-set-adjoin! cs #\d)) ; {a,b,c,d}
|
||||
\end{verbatim}
|
||||
So clients of these procedures write in a functional style, but must
|
||||
additionally be sure that, when the procedure is called, there are no
|
||||
other live pointers to the potentially-modified character set (hence the term
|
||||
``linear update'').
|
||||
|
||||
There are two benefits to this convention:
|
||||
\begin{itemize}
|
||||
\item Implementations are free to provide the most efficient possible
|
||||
implementation, either functional or side-effecting.
|
||||
\item Programmers may nonetheless continue to assume that character sets
|
||||
are purely functional data structures: they may be reliably shared
|
||||
without needing to be copied, uniquified, and so forth.
|
||||
\end{itemize}
|
||||
|
||||
In practice, these procedures are most useful for efficiently constructing
|
||||
character sets in a side-effecting manner, in some limited local context,
|
||||
before passing the character set outside the local construction scope to be
|
||||
used in a functional manner.
|
||||
|
||||
Scsh provides no assistance in checking the linearity of the potentially
|
||||
side-effected parameters passed to these functions --- there's no linear
|
||||
type checker or run-time mechanism for detecting violations.
|
||||
|
||||
\defun{char-set-copy}{cs}{char-set}
|
||||
\begin{desc}
|
||||
Returns a copy of the character set \var{cs}.
|
||||
``Copy'' means that if either the input parameter or the
|
||||
result value of this procedure is passed to one of the linear-update
|
||||
procedures described below, the other character set is guaranteed
|
||||
not to be altered.
|
||||
(A system that provides pure-functional implementations of the rest of
|
||||
the linear-operator suite could implement this procedure as the
|
||||
identity function.)
|
||||
\end{desc}
|
||||
|
||||
\defun{char-set-adjoin!}{cs \vari{char}1\ldots}{char-set}
|
||||
\begin{desc}
|
||||
Add the \vari{char}i characters to character set \var{cs}, and
|
||||
return the result.
|
||||
This procedure is allowed, but not required, to side-effect \var{cs}.
|
||||
\end{desc}
|
||||
|
||||
\defun{char-set-delete!}{cs \vari{char}1\ldots}{char-set}
|
||||
\begin{desc}
|
||||
Remove the \vari{char}i characters from character set \var{cs}, and
|
||||
return the result.
|
||||
This procedure is allowed, but not required, to side-effect \var{cs}.
|
||||
\end{desc}
|
||||
|
||||
\defun {char-set-invert!}{char-set}{char-set}
|
||||
\defunx{char-set-union!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
|
||||
\defunx{char-set-intersection!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
|
||||
\defunx{char-set-difference!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
|
||||
\begin{desc}
|
||||
These procedures implement set complement, union, intersection, and difference
|
||||
for character sets.
|
||||
They are allowed, but not required, to side-effect their first parameter.
|
||||
The union, intersection, and difference operations are n-ary, associating
|
||||
to the left.
|
||||
\end{desc}
|
||||
|
|
|
@ -1464,6 +1464,12 @@ All wild-card characters in \var{str} are quoted with a backslash.
|
|||
|
||||
|
||||
\begin{defundesc}{file-match}{root dot-files? \vari{pat}1 \vari{pat}2 {\ldots} \vari{pat}n}{string list}
|
||||
\note{This procedure is deprecated, and will probably either go away or
|
||||
be substantially altered in a future release. New code should not
|
||||
call this procedure. The problem is that it relies upon
|
||||
Posix-notation regular expressions; the rest of scsh has been
|
||||
converted over to the new SRE notation.}
|
||||
|
||||
\ex{file-match} provides a more powerful file-matching service, at the
|
||||
expense of a less convenient notation. It is intermediate in
|
||||
power between most shell matching machinery and recursive \ex{find(1)}.
|
||||
|
|
Loading…
Reference in New Issue