Updated documentation for the new release. Mostly new material for the
SRE regexp system.
This commit is contained in:
		
							parent
							
								
									239592923f
								
							
						
					
					
						commit
						0d3f69124f
					
				|  | @ -77,7 +77,7 @@ characters. | |||
| 
 | ||||
| \subsection{Parsing fields} | ||||
| 
 | ||||
| \defun  {field-splitter}  {[regexp num-fields]}             \proc | ||||
| \defun  {field-splitter}  {[field num-fields]}             \proc | ||||
| \defunx {infix-splitter}  {[delim num-fields handle-delim]} \proc | ||||
| \defunx {suffix-splitter} {[delim num-fields handle-delim]} \proc | ||||
| \defunx {sloppy-suffix-splitter} {[delim num-fields handle-delim]} \proc | ||||
|  | @ -107,10 +107,10 @@ These functions return a parser function that can be used as follows: | |||
|     Defaults: | ||||
| \begin{tightinset} | ||||
| \begin{tabular}{l@{\quad=\quad }ll} | ||||
|  \var{delim} & \verb!"[ \t\n]+|$"!   & (suffix delimiter: white space or eos) \\ | ||||
|  \multicolumn{1}{l}{}   	     & \verb|"[ \t\n]+"|     & (infix  delimiter: white space) \\ | ||||
|      | ||||
|  \var{re}    & \verb|"[^ \t\n]+"|    & (non-white-space) \\ | ||||
|  \var{delim} & \ex{(rx (| (+ white) eos))}    & (suffix delimiter: white space or eos) \\ | ||||
|  \multicolumn{1}{l}{}   	     & \ex{(rx (+ white))}   & (infix  delimiter: white space) \\ | ||||
| 
 | ||||
|  \var{field}    & \verb|(rx (+ (~ white)))|   & (non-white-space) \\ | ||||
|      | ||||
|  \var{num-fields} & \verb|#f| & (as many fields as possible) \\ | ||||
|      | ||||
|  | @ -120,15 +120,30 @@ These functions return a parser function that can be used as follows: | |||
|     {\ldots}which means: break the string at white space, discarding the | ||||
|      white space, and parse as many fields as possible. | ||||
| 
 | ||||
|     The regular expression \var{delim} is used to match field delimiters. | ||||
|     It can be either a string or a compiled regexp structure (see the | ||||
|     \ex{make-regexp} procedure). In the separator case, it defaults to a | ||||
|     regular expression matching white space; in the terminator case, | ||||
|     The \var{delim} parameter is a regular expression matching the text | ||||
|     that occurs between fields. | ||||
|     See chapter~\ref{chapt:sre} for information on regular expressions, | ||||
|     and the \ex{rx} form used to specify them. | ||||
|     In the separator case,  | ||||
|     it defaults to a pattern matching white space;  | ||||
|     in the terminator case, | ||||
|     it defaults to white space or end-of-string. | ||||
| 
 | ||||
|     The regular expression \var{re} is a regular expression used | ||||
|     The \var{field} parameter is a regular expression used | ||||
|     to match fields. It defaults to non-white-space. | ||||
| 
 | ||||
|     The \var{delim} pattern may also be given as a string,  | ||||
|     character, or char-set, which is coerced to a regular expression. | ||||
|     So the following expressions are all equivalent, | ||||
|     each producing a function that splits strings apart at colons: | ||||
| \begin{inset} | ||||
| \begin{verbatim} | ||||
| (infix-splitter (rx ":")) | ||||
| (infix-splitter ":") | ||||
| (infix-splitter #\:) | ||||
| (infix-splitter (char-set #\:))\end{verbatim} | ||||
| \end{inset} | ||||
| 
 | ||||
|     The boolean \var{handle-delim} determines what to do with delimiters. | ||||
|     \begin{tightinset}\begin{tabular}{ll} | ||||
|     \ex{'trim} &	Delimiters are thrown away after parsing. (default) \\ | ||||
|  | @ -178,7 +193,7 @@ These functions return a parser function that can be used as follows: | |||
| 
 | ||||
|     It is an error if a non-empty record does not end with a delimiter. | ||||
|     To make the last delimiter optional, make sure the delimiter regexp | ||||
|     matches the end-of-string (regexp \ex{"\$"}). | ||||
|     matches the end-of-string (sre \ex{eos}). | ||||
| 
 | ||||
|     \item [\ex{infix-splitter}] | ||||
|     Delimiters are interpreted as element \emph{separators}. If comma is the | ||||
|  | @ -222,7 +237,8 @@ These functions return a parser function that can be used as follows: | |||
|     initial delimiter string if the string begins with one instead of parsing  | ||||
|     an initial empty field. This can be used, for example, to field-split a | ||||
|     sequence of English text at white-space boundaries, where the string may | ||||
|     begin or end with white space, by using regex \verb!"[ \t]+|$"!. | ||||
|     begin or end with white space, by using the regexp | ||||
| \begin{code}{(rx (| (+ white) eos))}\end{code} | ||||
|     (But you would be better off using \ex{field-splitter} in this case.) | ||||
|     \end{description} | ||||
|     \end{desc} | ||||
|  | @ -318,25 +334,26 @@ Record & : suffix & \verb!:|$! suffix & : infix & non-: field \\ | |||
|     ; wandy:3xuncWdpKhR.:73:22:Wandy Saetan:/usr/wandy:/bin/csh | ||||
| 
 | ||||
| ;;; Two ls -l output readers | ||||
| (field-reader (infix-splitter "[ \\t]+" 8)) | ||||
| (field-reader (infix-splitter "[ \\t]+" -7)) | ||||
| (field-reader (infix-splitter (rx (+ white)) 8)) | ||||
| (field-reader (infix-splitter (rx (+ white)) -7)) | ||||
|     ; -rw-r--r--  1 shivers    22880 Sep 24 12:45 scsh.scm | ||||
| 
 | ||||
| ;;; Internet hostname reader | ||||
| (field-reader (field-splitter "[^.]+")) | ||||
| (field-reader (field-splitter (rx (+ (~ "."))))) | ||||
|     ; stat.sinica.edu.tw | ||||
| 
 | ||||
| ;;; Internet IP address reader | ||||
| (field-reader (field-splitter "[^.]+" 4)) | ||||
| (field-reader (field-splitter (rx (+ (~ "."))) 4)) | ||||
|     ; 18.24.0.241 | ||||
| 
 | ||||
| ;;; Line of integers | ||||
| (let ((parser (field-splitter "[+-]?[0-9]+"))) | ||||
| (let ((parser (field-splitter (rx (? ("+-")) (+ digit))))) | ||||
|   (field-reader (\l{s} (map string->number (parser s)))) | ||||
|     ; 18 24 0 241 | ||||
| 
 | ||||
| ;;; Same as above. | ||||
| (let ((reader (field-reader (field-splitter "[+-]?[0-9]+")))) | ||||
| (let ((reader (field-reader (field-splitter (rx (? ("+-"))  | ||||
|                                                 (+ digit)))))) | ||||
|   (\lx{maybe-port} (map string->number (apply reader maybe-port)))) | ||||
|     ; Yale beat harvard 26 to 7.\end{centercode} | ||||
| \caption{Some examples of \protect\ex{field-reader}} | ||||
|  | @ -349,8 +366,9 @@ Record & : suffix & \verb!:|$! suffix & : infix & non-: field \\ | |||
| \subsection{Forward-progress guarantees and empty-string matches} | ||||
| A loop that pulls text off a string by repeatedly matching a regexp  | ||||
| against that string can conceivably get stuck in an infinite loop if  | ||||
| the regexp matches the empty string. For example, the regexps \verb|^|,  | ||||
| \verb|$|, \verb|.*|, and \verb!foo|[^f]*! can all match the empty string. | ||||
| the regexp matches the empty string. For example, the SREs | ||||
| \ex{bos}, \ex{eos}, \ex{(* any)}, and \ex{(| "foo" (* (~ "f")))} | ||||
| can all match the empty string. | ||||
| 
 | ||||
| The routines in this package that iterate through strings with regular | ||||
| expressions are careful to handle this empty-string case. | ||||
|  | @ -369,10 +387,10 @@ progress, and the loop is guaranteed to terminate. | |||
| This has the effect you want with field parsing. For example, if you split | ||||
| a string with the empty pattern, you will explode the string into its | ||||
| individual characters: | ||||
|     \codex{((suffix-splitter "") "foo") {\evalto} ("" "f" "o" "o")} | ||||
|     \codex{((suffix-splitter (rx)) "foo") {\evalto} ("" "f" "o" "o")} | ||||
| However, even though this boundary case is handled correctly, we don't | ||||
| recommend using it. Say what you mean---just use a field splitter: | ||||
|     \codex{((field-splitter ".") "foo") {\evalto} ("f" "o" "o")} | ||||
|     \codex{((field-splitter (rx any)) "foo") {\evalto} ("f" "o" "o")} | ||||
| Or, more efficiently, | ||||
|     \codex{((\l{s} (map string (string->list s))) "foo")} | ||||
| 
 | ||||
|  | @ -478,15 +496,25 @@ it checks them all. | |||
| 
 | ||||
|     	The \var{test} form can be one of: | ||||
| 	\begin{inset} | ||||
|         \begin{tabular}{lp{0.8\linewidth}} | ||||
|     	  integer: &	The test is true for that iteration of the loop. | ||||
|         \begin{tabular}{lp{0.6\linewidth}} | ||||
|     	  \var{integer}: &	The test is true for that iteration of the loop. | ||||
|                         The first iteration is \#1. \\ | ||||
|      | ||||
|     	  string: &	The string is a regular expression. The test is | ||||
|     	    	    	true if the regexp matches the record.\\ | ||||
|     	  \var{sre}: &	A regular expression, in SRE notation  | ||||
|                         (see chapter~\ref{chapt:sre}), can be used as | ||||
| 			a test. The test is successful if the pattern | ||||
|                         matches the record. | ||||
|                         In particular, note that any string is an SRE. \\ | ||||
|      | ||||
| 	  expression &  If not an integer or a string, the test form is | ||||
|     	    	    	a Scheme expression that is evaluated. | ||||
| 	  \ex{(when \var{expr})}: & | ||||
| 			The body of a \ex{when} test is evaluated as a | ||||
| 			Scheme boolean expression in the inner scope of the | ||||
| 			\ex{awk} form. \\ | ||||
| 
 | ||||
| 	  \var{expr}: & If the form is none of the above, it is treated as | ||||
|     	    	    	a Scheme expression---in practice, the \ex{when} | ||||
| 			keyword is only needed in cases where SRE/Scheme | ||||
| 			expression ambiguity might occur. | ||||
| 	\end{tabular} | ||||
| 	\end{inset} | ||||
| 
 | ||||
|  | @ -526,7 +554,7 @@ it checks them all. | |||
|     \itum{\ex{(\var{test} => \var{exp})}} | ||||
| 	If evaluating \ex{test} produces a true value,  | ||||
|         apply \var{exp} to that value. | ||||
| 	If \var{test} is a regular-expression string, then \var{exp} is applied | ||||
| 	If \var{test} is a regular expression, then \var{exp} is applied | ||||
|     	to the match data structure returned by the regexp match routine. | ||||
| 
 | ||||
|     \itum{\ex{(after \vari{body}1 \ldots)}} | ||||
|  | @ -562,9 +590,10 @@ of input stream. | |||
|   (call-with-input-file "/etc/passwd" | ||||
|     (lambda (port) | ||||
|       (awk (read-passwd port) (record fields) () | ||||
|         ("^S" (format #t "~a's home directory is ~a~%" | ||||
|                       ($ fields 0) | ||||
|                       ($ fields 5)))))))\end{code} | ||||
|         ((: bos "S")  | ||||
|          (format #t "~a's home directory is ~a~%" | ||||
|                     ($ fields 0) | ||||
|                     ($ fields 5)))))))\end{code} | ||||
| 
 | ||||
| \begin{code} | ||||
| ;;; Read a series of integers from stdin. This expression evaluates | ||||
|  | @ -581,8 +610,8 @@ of input stream. | |||
| \begin{code} | ||||
| ;;; Count the number of non-comment lines of code in my Scheme source. | ||||
| (awk (read-line) (line) ((nlines 0)) | ||||
|   ("^[ \\t]*;" nlines)               ; A comment line. | ||||
|   (else       (+ nlines 1)))        ; Not a comment line.\end{code} | ||||
|   ((: bos (* white) ";")  nlines)         ; A comment line. | ||||
|   (else                   (+ nlines 1)))  ; Not a comment line.\end{code} | ||||
| 
 | ||||
| \begin{code} | ||||
| ;;; Read numbers, counting the evens and odds. | ||||
|  | @ -600,10 +629,10 @@ of input stream. | |||
|   (#t (max max-len (string-length line))))\end{code} | ||||
| 
 | ||||
| \begin{code} | ||||
| ;;; (This could also be done with REDUCE-PORT:) | ||||
| (reduce-port (current-input-port) read-line | ||||
|              (lambda (line maxlen) (max (string-length line) maxlen)) | ||||
|              0)\end{code} | ||||
| ;;; (This could also be done with PORT-FOLDL:) | ||||
| (port-foldl (current-input-port) read-line | ||||
|             (lambda (line maxlen) (max (string-length line) maxlen)) | ||||
|             0)\end{code} | ||||
| 
 | ||||
| \begin{code} | ||||
| ;;; Print every line longer than 80 chars. | ||||
|  | @ -615,7 +644,7 @@ of input stream. | |||
| \begin{code} | ||||
| ;;; Strip blank lines from input. | ||||
| (awk (read-line) (line) () | ||||
|   ("." (display line) (newline)))\end{code} | ||||
|   ((~ white)   (display line) (newline)))\end{code} | ||||
| 
 | ||||
| \begin{code} | ||||
| ;;; Sort the entries in /etc/passwd by login name. | ||||
|  | @ -629,3 +658,14 @@ of input stream. | |||
| ;;; Prefix line numbers to the input stream. | ||||
| (awk (read-line) (line) lineno () | ||||
|   (#t (format #t "~d:\\t~a~%" lineno line)))\end{code} | ||||
| 
 | ||||
| 
 | ||||
| \section{Backwards compatibility} | ||||
| 
 | ||||
| Previous scsh releases provided an \ex{awk} form with a different syntax, | ||||
| designed around regular expressions written in Posix notation as strings, | ||||
| rather than SREs. | ||||
| 
 | ||||
| This form is still available in a separate module for old code. | ||||
| It'll be documented in the next release of this manual. Dig around | ||||
| in the sources for it. | ||||
|  |  | |||
|  | @ -38,7 +38,7 @@ | |||
| 
 | ||||
| % For multiletter vars in math mode: | ||||
| \newcommand{\var}[1]{\mbox{\frenchspacing\it{#1}}} | ||||
| \newcommand{\vari}[2]{${\mbox{\it{#1}}}_{#2}$} | ||||
| \newcommand{\vari}[2]{\ensuremath{\mbox{\it{#1}}_{#2}}} | ||||
| 
 | ||||
| %% What you frequently want when you say \tt: | ||||
| \def\ttchars{\catcode``=13\@noligs\frenchspacing} | ||||
|  |  | |||
|  | @ -1,9 +1,9 @@ | |||
| %&latex -*- latex -*- | ||||
| 
 | ||||
| \title{Scsh Reference Manual} | ||||
| \subtitle{For scsh release 0.5} | ||||
| \subtitle{For scsh release 0.5.2} | ||||
| \author{Olin Shivers and Brian D.~Carlstrom} | ||||
| \date{April 11, 1997} | ||||
| \date{September 1999} | ||||
| 
 | ||||
| \maketitle | ||||
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||||
|  |  | |||
|  | @ -27,6 +27,7 @@ | |||
| \include{syscalls} | ||||
| \include{network} | ||||
| \include{strings} | ||||
| \include{sre} | ||||
| \include{rdelim} | ||||
| \include{awk} | ||||
| \include{miscprocs} | ||||
|  |  | |||
|  | @ -324,12 +324,12 @@ run/sexp*     $\equiv$  read              $\circ$ run/port* | |||
| run/sexps*    $\equiv$  port->sexp-list   $\circ$ run/port*\end{code} | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{reduce-port}{port reader op . seeds} {\object\star} | ||||
| \defun{port-foldl}{port reader op . seeds} {\object\star} | ||||
| \begin{desc} | ||||
| This procedure can be used to perform a variety of iterative operations | ||||
| over an input stream. | ||||
| It repeatedly uses \var{reader} to read an object from \var{port}. | ||||
| If the first read returns eof, then the entire \ex{reduce-port} | ||||
| If the first read returns eof, then the entire \ex{port-foldl} | ||||
| operation returns the seeds as multiple values. | ||||
| If the first read operation returns some other value $v$, then | ||||
| \var{op} is applied to $v$ and the seeds: | ||||
|  | @ -340,11 +340,15 @@ reading a new value from the port, and so forth. | |||
| 
 | ||||
| For example, \ex{(port->list \var{reader} \var{port})} | ||||
| could be defined as | ||||
|         \codex{(reverse (reduce-port \var{port} \var{reader} cons '()))} | ||||
|         \codex{(reverse (port-foldl \var{port} \var{reader} cons '()))} | ||||
| 
 | ||||
| An imperative way to look at \ex{reduce-port} is to say that it | ||||
| An imperative way to look at \ex{port-foldl} is to say that it | ||||
| abstracts the idea of a loop over a stream of values read from | ||||
| some port, where the seed values express the loop state. | ||||
| 
 | ||||
| \remark{This procedure was formerly named \texttt{\indx{reduce-port}}. | ||||
|         The old binding is still provided, but is deprecated and will | ||||
| 	probably vanish in a future release.} | ||||
| \end{desc} | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -5,10 +5,11 @@ Scsh provides a set of procedures for processing strings and characters. | |||
| The procedures provided match regular expressions, search strings, | ||||
| parse file-names, and manipulate sets of characters. | ||||
| 
 | ||||
| Also see chapters \ref{chapt:rdelim} and \ref{chapt:fr-awk} | ||||
| on record I/O, field parsing, and the awk loop. | ||||
| The procedures documented there allow you to read character-delimited | ||||
| records from ports, use regular expressions to split the records into fields  | ||||
| Also see chapters \ref{chapt:sre}, \ref{chapt:rdelim}, and \ref{chapt:fr-awk} | ||||
| on regular expressions, record I/O, field parsing, and the awk loop. | ||||
| The procedures documented there allow you to search and pattern-match strings, | ||||
| read character-delimited records from ports,  | ||||
| use regular expressions to split the records into fields  | ||||
| (for example, splitting a string at every occurrence of colon or white-space), | ||||
| and loop over streams of these records in a convenient way. | ||||
| 
 | ||||
|  | @ -19,213 +20,7 @@ and loop over streams of these records in a convenient way. | |||
| Strings are the basic communication medium for {\Unix} processes, so a | ||||
| shell language must have reasonable facilities for manipulating them. | ||||
| 
 | ||||
| \subsection{Regular expressions} | ||||
| \label{sec:regexps} | ||||
| 
 | ||||
| The following functions perform regular expression matching. | ||||
| The code uses Henry Spencer's regular expression package. | ||||
| 
 | ||||
| \begin{defundesc}{string-match} {regexp string [start]} {match or false} | ||||
|     Search \var{string} starting at position \var{start}, looking for a match | ||||
|     for \var{regexp}. If a match is found, return a match structure describing | ||||
|     the match, otherwise {\sharpf}. \var{Start} defaults to 0. | ||||
| 
 | ||||
|     \var{regexp} may be a compiled regexp structure or a string defining | ||||
|     a regular expression, which will be compiled to a regexp structure. | ||||
| \end{defundesc} | ||||
| 
 | ||||
| \begin{defundesc} {regexp-match?} {obj} \boolean | ||||
|     Is the object a regular expression match? | ||||
| \end{defundesc} | ||||
| 
 | ||||
| \begin{defundesc} {match:start} {match [match-number]} {{\fixnum} or false} | ||||
|     Returns the start position of the match denoted by \var{match-number}. | ||||
|     The whole regexp is 0.  Each further number represents positions | ||||
|     enclosed by \ex{(\ldots)} sections. \var{Match-number} defaults to 0. | ||||
| 
 | ||||
|     If the regular expression matches as a whole,  | ||||
|     but a particular parenthesized sub-expression does not match, then | ||||
|     \ex{match:start} returns {\sharpf}. | ||||
| \end{defundesc} | ||||
| 
 | ||||
| \begin{defundesc} {match:end} {match [match-number]} \fixnum | ||||
|     Returns the end position of the match denoted by \var{match-number}. | ||||
|     \var{Match-number} defaults to 0 (the whole match). | ||||
| 
 | ||||
|     If the regular expression matches as a whole,  | ||||
|     but a particular parenthesized sub-expression does not match, then | ||||
|     \ex{match:end} returns {\sharpf}. | ||||
| \end{defundesc} | ||||
| 
 | ||||
| \begin{defundesc} {match:substring} {match [match-number]} {{\str} or false} | ||||
|     Returns the substring matched by match \var{match-number}. | ||||
|     \var{Match-number} defaults to 0 (the whole match). | ||||
|     If there was no match, returns false. | ||||
| \end{defundesc} | ||||
| 
 | ||||
| Regular expression matching compiles patterns into special data | ||||
| structures which can be efficiently used to match against strings. | ||||
| The overhead of compiling patterns that will be used for multiple | ||||
| searches can be avoided by these lower-level routines: | ||||
| % | ||||
| \begin{defundesc} {make-regexp} {str} {re} | ||||
|   Generate a compiled regular expression from the given string. | ||||
| \end{defundesc} | ||||
| 
 | ||||
| \begin{defundesc} {regexp?} {obj} \boolean | ||||
|   Is the object a regular expression? | ||||
| \end{defundesc} | ||||
| 
 | ||||
| \begin{defundesc} {regexp-exec} {regexp str [start]} {match or false} | ||||
|   Apply the regular expression \var{regexp} to the string \var{str} starting | ||||
|   at position \var{start}.  If the match succeeds it returns a regexp-match, | ||||
|   otherwise {\sharpf}. \var{Start} defaults to 0. | ||||
| \end{defundesc} | ||||
| 
 | ||||
| \begin{defundesc} {->regexp} {regexp-or-string} {regexp} | ||||
|   Coerce the input value into a compiled regular expression: | ||||
|   strings are compiled; regexp structures are passed through unchanged. | ||||
| \end{defundesc} | ||||
| 
 | ||||
| \defun{regexp-quote}{str}{\str} | ||||
| \begin{desc} | ||||
| Returns a regular expression that matches the string \var{str} exactly. | ||||
| In other words, it quotes the regular expression, prepending backslashes | ||||
| to all the special regexp characters in \var{str}. | ||||
| \begin{code} | ||||
| (regexp-quote "*Hello* world.")  | ||||
|     {\evalto}"\\\\*Hello\\\\* world\\\\."\end{code} | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{regexp-substitute}{port match . items}{{\str} or \undefined} | ||||
| \begin{desc} | ||||
| This procedure can be used to perform string substitutions based on | ||||
| regular expression matches. | ||||
| The results of the substitution can be either output to a port or | ||||
| returned as a string. | ||||
| 
 | ||||
| The \var{match} argument is a regular expression match structure | ||||
| that controls the substitution. | ||||
| If \var{port} is an output port, the \var{items} are written out to | ||||
| the port: | ||||
| \begin{itemize} | ||||
|     \item If an item is a string, it is copied directly to the port. | ||||
|     \item If an item is an integer, the corresponding submatch from \var{match} | ||||
|       is written to the port. | ||||
|     \item If an item is \ex{'pre},  | ||||
|           the prefix of the matched string (the text preceding the match)  | ||||
| 	  is written to the port. | ||||
|     \item If an item is \ex{'post},  | ||||
|           the suffix of the matched string is written. | ||||
| \end{itemize} | ||||
| 
 | ||||
| If \var{port} is {\sharpf}, nothing is written, and a string is constructed | ||||
| and returned instead. | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{regexp-substitute/global}{port regexp string . items} | ||||
|                                 {{\str} or \undefined} | ||||
| \begin{desc} | ||||
| This procedure is similar to \ex{regexp-substitute},  | ||||
| but can be used to perform repeated match/substitute operations over | ||||
| a string. | ||||
| It has the following differences with \ex{regexp-substitute}: | ||||
| \begin{itemize} | ||||
|   \item It takes a regular expression and string to be matched as | ||||
|         parameters, instead of a completed match structure. | ||||
|   \item If the regular expression doesn't match the string, this | ||||
| 	procedure is the identity transform---it returns or outputs the | ||||
| 	string. | ||||
|   \item If an item is \ex{'post}, the procedure recurses on the suffix string | ||||
|         (the text from \var{string} following the match).  | ||||
| 	Including a \ex{'post} in the list of items is how one gets multiple  | ||||
| 	match/substitution operations. | ||||
|   \item If an item is a procedure, it is applied to the match structure for | ||||
| 	a given match. | ||||
| 	The procedure returns a string to be used in the result. | ||||
|   \end{itemize} | ||||
| The \var{regexp} parameter can be either a compiled regular expression or | ||||
| a string specifying a regular expression. | ||||
| 
 | ||||
| Some examples: | ||||
| {\small | ||||
| \begin{widecode} | ||||
| ;;; Replace occurrences of "Cotton" with "Jin". | ||||
| (regexp-substitute/global #f "Cotton" s | ||||
|                           'pre "Jin" 'post) | ||||
| 
 | ||||
| ;;; mm/dd/yy -> dd/mm/yy date conversion. | ||||
| (regexp-substitute/global #f "([0-9]+)/([0-9]+)/([0-9]+)" ; mm/dd/yy | ||||
|                           s ; Source string | ||||
|                           'pre 2 "/" 1 "/" 3 'post) | ||||
| 
 | ||||
| ;;; "9/29/61" -> "Sep 29, 1961" date conversion. | ||||
| (regexp-substitute/global #f "([0-9]+)/([0-9]+)/([0-9]+)" ; mm/dd/yy | ||||
|                           s ; Source string | ||||
| 
 | ||||
|       'pre  | ||||
|       ;; Sleazy converter -- ignores "year 2000" issue, and blows up if | ||||
|       ;; month is out of range. | ||||
|       (lambda (m) | ||||
|         (let ((mon (vector-ref '#("Jan" "Feb" "Mar" "Apr" "May" "Jun" | ||||
|                                   "Jul" "Aug" "Sep" "Oct" "Nov" "Dec") | ||||
|                                (- (string->number (match:substring m 1)) 1))) | ||||
|               (day (match:substring m 2)) | ||||
|               (year (match:substring m 3))) | ||||
|           (string-append mon " " day ", 19" year))) | ||||
|       'post) | ||||
| 
 | ||||
| ;;; Remove potentially offensive substrings from string S. | ||||
| (regexp-substitute/global #f "Windows|tcl|Intel" s | ||||
|                           'pre 'post)\end{widecode}} | ||||
| 
 | ||||
| \end{desc} | ||||
| 
 | ||||
| \subsection{Other string manipulation facilities} | ||||
| 
 | ||||
| \defun  {index}  {string char [start]} {{\fixnum} or false} | ||||
| \defunx {rindex} {string char [start]} {{\fixnum} or false} | ||||
| \begin{desc} | ||||
|     These procedures search through \var{string} looking for an occurrence | ||||
|     of character \var{char}. \ex{index} searches left-to-right; \ex{rindex} | ||||
|     searches right-to-left. | ||||
| 
 | ||||
|     \ex{index} returns the smallest index $i$ of \var{string} greater | ||||
|     than or equal to \var{start} such that $\var{string}[i] = \var{char}$. | ||||
|     The default for \var{start} is zero. If there is no such match, | ||||
|     \ex{index} returns false. | ||||
| 
 | ||||
|     \ex{rindex} returns the largest index $i$ of \var{string} less than | ||||
|     \var{start} such that $\var{string}[i] = \var{char}$. | ||||
|     The default for \var{start} is \ex{(string-length \var{string})}. | ||||
|     If there is no such match, \ex{rindex} returns false. | ||||
| \end{desc} | ||||
| 
 | ||||
| I should probably snarf all the MIT Scheme string functions, and stick them | ||||
| in a package.  {\Unix} programs need to mung character strings a lot. | ||||
| 
 | ||||
| MIT string match commands:  | ||||
| \begin{tightcode} | ||||
| [sub]string-match-{forward,backward}[-ci] | ||||
| [sub]string-{prefix,suffix}[-ci]? | ||||
| [sub]string-find-{next,previous}-char[-ci] | ||||
| [sub]string-find-{next,previous}-char-in-set | ||||
| [sub]string-replace[!] | ||||
| \ldots\etc\end{tightcode} | ||||
| These are not currently provided. | ||||
| 
 | ||||
| \begin{defundesc} {substitute-env-vars} {fname} \str | ||||
|     Replace occurrences of environment variables with their values. | ||||
|     An environment variable is denoted by a dollar sign followed by | ||||
|     alphanumeric chars and underscores, or is surrounded by braces. | ||||
| 
 | ||||
|     \begin{exampletable} | ||||
|     \splitline{\ex{(substitute-env-vars "\$USER/.login")}}  | ||||
|               {\ex{"shivers/.login"}}  \\ | ||||
|     \cd{(substitute-env-vars "$\{USER\}_log")}  & \cd{"shivers_log"} | ||||
|     \end{exampletable} | ||||
| \end{defundesc} | ||||
| 
 | ||||
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||||
| \subsection{Manipulating file-names} | ||||
| \label{sec:filenames} | ||||
| 
 | ||||
|  | @ -559,6 +354,53 @@ defined in the previous section, | |||
| is also frequently useful for expanding file-names. | ||||
| 
 | ||||
| 
 | ||||
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||||
| \subsection{Other string manipulation facilities} | ||||
| 
 | ||||
| \defun  {index}  {string char [start]} {{\fixnum} or false} | ||||
| \defunx {rindex} {string char [start]} {{\fixnum} or false} | ||||
| \begin{desc} | ||||
|     These procedures search through \var{string} looking for an occurrence | ||||
|     of character \var{char}. \ex{index} searches left-to-right; \ex{rindex} | ||||
|     searches right-to-left. | ||||
| 
 | ||||
|     \ex{index} returns the smallest index $i$ of \var{string} greater | ||||
|     than or equal to \var{start} such that $\var{string}[i] = \var{char}$. | ||||
|     The default for \var{start} is zero. If there is no such match, | ||||
|     \ex{index} returns false. | ||||
| 
 | ||||
|     \ex{rindex} returns the largest index $i$ of \var{string} less than | ||||
|     \var{start} such that $\var{string}[i] = \var{char}$. | ||||
|     The default for \var{start} is \ex{(string-length \var{string})}. | ||||
|     If there is no such match, \ex{rindex} returns false. | ||||
| \end{desc} | ||||
| 
 | ||||
| I should probably snarf all the MIT Scheme string functions, and stick them | ||||
| in a package.  {\Unix} programs need to mung character strings a lot. | ||||
| 
 | ||||
| MIT string match commands:  | ||||
| \begin{tightcode} | ||||
| [sub]string-match-{forward,backward}[-ci] | ||||
| [sub]string-{prefix,suffix}[-ci]? | ||||
| [sub]string-find-{next,previous}-char[-ci] | ||||
| [sub]string-find-{next,previous}-char-in-set | ||||
| [sub]string-replace[!] | ||||
| \ldots\etc\end{tightcode} | ||||
| These are not currently provided. | ||||
| 
 | ||||
| \begin{defundesc} {substitute-env-vars} {fname} \str | ||||
|     Replace occurrences of environment variables with their values. | ||||
|     An environment variable is denoted by a dollar sign followed by | ||||
|     alphanumeric chars and underscores, or is surrounded by braces. | ||||
| 
 | ||||
|     \begin{exampletable} | ||||
|     \splitline{\ex{(substitute-env-vars "\$USER/.login")}}  | ||||
|               {\ex{"shivers/.login"}}  \\ | ||||
|     \cd{(substitute-env-vars "$\{USER\}_log")}  & \cd{"shivers_log"} | ||||
|     \end{exampletable} | ||||
| \end{defundesc} | ||||
| 
 | ||||
| 
 | ||||
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||||
| \section{ASCII encoding} | ||||
| 
 | ||||
|  | @ -596,47 +438,36 @@ assumption into your code if you can help it.\footnote{ | |||
| Is the object \var{x} a character set? | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{char-set=}{cs1 cs2}\boolean | ||||
| \defun{char-set=}{\vari{cs}1 \vari{cs}2\ldots}\boolean | ||||
| \begin{desc} | ||||
| Are the character sets \var{cs1} and \var{cs2} equal? | ||||
| Are the character sets equal? | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{char-set<=}{cs1 cs2}\boolean | ||||
| \defun{char-set<=}{\vari{cs}1 \vari{cs}2\ldots}\boolean | ||||
| \begin{desc} | ||||
| Returns true if character set \var{cs1} is a subset of character set \var{cs2}. | ||||
| Returns true if every character set \vari{cs}{i} is  | ||||
| a subset of character set \vari{cs}{i+1}. | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{reduce-char-set}{kons knil cs}\object | ||||
| \defun{char-set-fold}{kons knil cs}\object | ||||
| \begin{desc} | ||||
| This is the fundamental iterator for character sets. | ||||
| Reduces the function \var{kons} across the character set \var{cs} using | ||||
| Applies the function \var{kons} across the character set \var{cs} using | ||||
| initial state value \var{knil}.  | ||||
| That is, if \var{cs} is the empty set, the procedure returns \var{knil}. | ||||
| Otherwise, some element \var{c} of \var{cs} is chosen; let \var{cs'} be | ||||
| the remaining, unchosen characters. | ||||
| The procedure returns  | ||||
| \begin{tightcode} | ||||
| (reduce-char-set \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode} | ||||
| (char-set-fold \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode} | ||||
| For example, we could define \ex{char-set-members} (see below) | ||||
| as | ||||
| \begin{tightcode} | ||||
| (lambda (cs) (reduce-char-set cons '() cs))\end{tightcode} | ||||
| \end{desc} | ||||
| (lambda (cs) (char-set-fold cons '() cs))\end{tightcode} | ||||
| 
 | ||||
| \subsection{Side effects} | ||||
| \defun{set-char-set!}{cs char in?}{\undefined} | ||||
| \begin{desc} | ||||
| This side-effects character set \var{cs}.  | ||||
| If \var{in?} is true, character \var{char} is added to the set. | ||||
| Otherwise, it is deleted from the set. | ||||
| 
 | ||||
| Use of this procedure is deprecated, since it could damage other procedures | ||||
| that retain pointers to existing character sets. | ||||
| You should use \ex{set-char-set!} in contexts where it is guaranteed that | ||||
| there are no other pointers to the character set being modified. | ||||
| (For example, functions that create character sets can use this function | ||||
| to efficiently construct the character set, after which time the set is | ||||
| used in a pure-functional, shared manner.) | ||||
| \remark{This procedure was formerly named \texttt{\indx{reduce-char-set}}. | ||||
|         The old binding is still provided, but is deprecated and will | ||||
| 	probably vanish in a future release.} | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{char-set-for-each}{p cs}{\undefined} | ||||
|  | @ -646,11 +477,7 @@ Note that the order in which \var{p} is applied to the characters in the | |||
| set is not specified, and may even change from application to application. | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{copy-char-set}{cs}{char-set} | ||||
| \begin{desc} | ||||
| Returns a copy of the character set \var{cs}. | ||||
| \end{desc} | ||||
| 
 | ||||
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||||
| \subsection{Creating character sets} | ||||
| 
 | ||||
| \defun{char-set}{\vari{char}1\ldots}{char-set} | ||||
|  | @ -680,6 +507,7 @@ Returns a character set containing every character whose {\Ascii} | |||
| code lies in the half-open range $[\var{lower},\var{upper})$. | ||||
| \end{desc} | ||||
| 
 | ||||
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||||
| \subsection{Querying character sets} | ||||
| \defun {char-set-members}{char-set}{character-list} | ||||
| \begin{desc} | ||||
|  | @ -706,7 +534,23 @@ We sought to escape the dilemma by shifting to a new name.} | |||
| Returns the number of elements in character set \var{cs}. | ||||
| \end{desc} | ||||
| 
 | ||||
| \subsection{Character set algebra} | ||||
| \defun{char-set-every?}{pred cs}\boolean | ||||
| \defunx{char-set-any?}{pred cs}\object | ||||
| \begin{desc} | ||||
| The \ex{char-set-every?} procedure returns true if predicate \var{pred} | ||||
| returns true of every character in the character set \var{cs}. | ||||
| 
 | ||||
| Likewise, \ex{char-set-any?} applies \var{pred} to every character in | ||||
| character set \var{cs}, and returns the first true value it finds. | ||||
| If no character produces a true value, it returns false. | ||||
| 
 | ||||
| The order in which these procedures sequence through the elements of | ||||
| \var{cs} is not specified. | ||||
| \end{desc} | ||||
| 
 | ||||
| 
 | ||||
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||||
| \subsection{Character-set algebra} | ||||
| \defun {char-set-invert}{char-set}{char-set} | ||||
| \defunx{char-set-union}{\vari{char-set}1\ldots}{char-set} | ||||
| \defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set} | ||||
|  | @ -719,6 +563,14 @@ to the left; the difference function requires at least one argument, while | |||
| union and intersection may be applied to zero arguments. | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun {char-set-adjoin}{cs \vari{char}1\ldots}{char-set} | ||||
| \defunx{char-set-delete}{cs \vari{char}1\ldots}{char-set} | ||||
| \begin{desc} | ||||
| Add/delete the \vari{char}i characters to/from character set \var{cs}. | ||||
| \end{desc} | ||||
| 
 | ||||
| 
 | ||||
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||||
| \subsection{Standard character sets} | ||||
| \label{sec:std-csets} | ||||
| Several character sets are predefined for convenience: | ||||
|  | @ -727,25 +579,25 @@ Several character sets are predefined for convenience: | |||
| \newcommand{\entry}[1]{\ex{#1}\index{#1}} | ||||
| \begin{tabular}{|ll|} | ||||
| \hline | ||||
| \entry{char-set:alphabetic}	&	Alphabetic chars \\ | ||||
| \entry{char-set:lower-case}	&	Lower-case alphabetic chars \\ | ||||
| \entry{char-set:upper-case}	&	Upper-case alphabetic chars \\ | ||||
| \entry{char-set:alphabetic}	&	Alphabetic chars \\ | ||||
| \entry{char-set:numeric}	&	Decimal digits: 0--9 \\ | ||||
| \entry{char-set:alphanumeric}	&	Alphabetic or numeric \\ | ||||
| \entry{char-set:graphic}	&	Printing characters except space \\ | ||||
| \entry{char-set:printing}	&	Printing characters including space \\ | ||||
| \entry{char-set:whitespace}	&	Whitespace characters \\ | ||||
| \entry{char-set:blank}		&	Blank characters \\ | ||||
| \entry{char-set:control}	&	Control characters \\ | ||||
| \entry{char-set:punctuation}	&	Punctuation characters \\ | ||||
| \entry{char-set:hex-digit}	&       A hexadecimal digit: 0--9, A--F, a--f \\ | ||||
| \entry{char-set:blank}		&	Blank characters \\ | ||||
| \entry{char-set:ascii}		&	A character in the ASCII set. \\ | ||||
| \entry{char-set:empty}		&	Empty set \\ | ||||
| \entry{char-set:full}		&	All characters \\ | ||||
| \hline | ||||
| \end{tabular} | ||||
| \end{center} | ||||
| The first twelve of these correspond to the character classes defined in | ||||
| The first eleven of these correspond to the character classes defined in | ||||
| Posix. | ||||
| Note that there may be characters in \ex{char-set:alphabetic} that are | ||||
| neither upper nor lower case---this might occur in implementations that | ||||
|  | @ -788,3 +640,87 @@ char-set:punctuation	&	\verb|!"#$%&'()*+,-./:;<=>|\verb#?@[\]^_`{|}~# \\ | |||
| \begin{desc} | ||||
| These predicates are defined in terms of the above character sets. | ||||
| \end{desc} | ||||
| 
 | ||||
| %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | ||||
| \subsection{Linear-update character-set operations} | ||||
| These procedures have a hybrid pure-functional/side-effecting semantics: | ||||
| they are allowed, but not required, to side-effect one of their parameters | ||||
| in order to construct their result. | ||||
| An implementation may legally implement these procedures as pure, | ||||
| side-effect-free functions, or it may implement them using side effects, | ||||
| depending upon the details of what is the most efficient or simple to  | ||||
| implement in terms of the underlying representation. | ||||
| 
 | ||||
| What this means is that clients of these procedures \emph{may not} rely | ||||
| upon these procedures working by side effect. | ||||
| For example, this is not guaranteed to work: | ||||
| \begin{verbatim} | ||||
| (let ((cs (char-set #\a #\b #\c))) | ||||
|   (char-set-adjoin! cs #\d) | ||||
|   cs) ; Could be either {a,b,c} or {a,b,c,d}. | ||||
| \end{verbatim} | ||||
| However, this is well-defined: | ||||
| \begin{verbatim} | ||||
| (let ((cs (char-set #\a #\b #\c))) | ||||
|   (char-set-adjoin! cs #\d)) ; {a,b,c,d} | ||||
| \end{verbatim} | ||||
| So clients of these procedures write in a functional style, but must | ||||
| additionally be sure that, when the procedure is called, there are no | ||||
| other live pointers to the potentially-modified character set (hence the term | ||||
| ``linear update''). | ||||
| 
 | ||||
| There are two benefits to this convention: | ||||
| \begin{itemize} | ||||
| \item Implementations are free to provide the most efficient possible | ||||
|       implementation, either functional or side-effecting. | ||||
| \item Programmers may nonetheless continue to assume that character sets | ||||
|       are purely functional data structures: they may be reliably shared | ||||
|       without needing to be copied, uniquified, and so forth. | ||||
| \end{itemize} | ||||
| 
 | ||||
| In practice, these procedures are most useful for efficiently constructing | ||||
| character sets in a side-effecting manner, in some limited local context,  | ||||
| before passing the character set outside the local construction scope to be | ||||
| used in a functional manner. | ||||
| 
 | ||||
| Scsh provides no assistance in checking the linearity of the potentially | ||||
| side-effected parameters passed to these functions --- there's no linear | ||||
| type checker or run-time mechanism for detecting violations. | ||||
| 
 | ||||
| \defun{char-set-copy}{cs}{char-set} | ||||
| \begin{desc} | ||||
| Returns a copy of the character set \var{cs}. | ||||
| ``Copy'' means that if either the input parameter or the | ||||
| result value of this procedure is passed to one of the linear-update | ||||
| procedures described below, the other character set is guaranteed | ||||
| not to be altered. | ||||
| (A system that provides pure-functional implementations of the rest of | ||||
| the linear-update suite could implement this procedure as the  | ||||
| identity function.) | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{char-set-adjoin!}{cs \vari{char}1\ldots}{char-set} | ||||
| \begin{desc} | ||||
| Add the \vari{char}i characters to character set \var{cs}, and | ||||
| return the result.  | ||||
| This procedure is allowed, but not required, to side-effect \var{cs}. | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun{char-set-delete!}{cs \vari{char}1\ldots}{char-set} | ||||
| \begin{desc} | ||||
| Remove the \vari{char}i characters from character set \var{cs}, and | ||||
| return the result.  | ||||
| This procedure is allowed, but not required, to side-effect \var{cs}. | ||||
| \end{desc} | ||||
| 
 | ||||
| \defun {char-set-invert!}{char-set}{char-set} | ||||
| \defunx{char-set-union!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set} | ||||
| \defunx{char-set-intersection!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set} | ||||
| \defunx{char-set-difference!}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set} | ||||
| \begin{desc} | ||||
| These procedures implement set complement, union, intersection, and difference | ||||
| for character sets. | ||||
| They are allowed, but not required, to side-effect their first parameter. | ||||
| The union, intersection, and difference operations are n-ary, associating | ||||
| to the left. | ||||
| \end{desc} | ||||
|  |  | |||
|  | @ -1464,6 +1464,12 @@ All wild-card characters in \var{str} are quoted with a backslash. | |||
| 
 | ||||
| 
 | ||||
| \begin{defundesc}{file-match}{root dot-files? \vari{pat}1 \vari{pat}2 {\ldots} \vari{pat}n}{string list} | ||||
|    \note{This procedure is deprecated, and will probably either go away or | ||||
| 	 be substantially altered in a future release. New code should not | ||||
| 	 call this procedure. The problem is that it relies upon | ||||
| 	 Posix-notation regular expressions; the rest of scsh has been  | ||||
|  	 converted over to the new SRE notation.} | ||||
| 
 | ||||
|     \ex{file-match} provides a more powerful file-matching service, at the | ||||
|     expense of a less convenient notation. It is intermediate in | ||||
|     power between most shell matching machinery and recursive \ex{find(1)}. | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 shivers
						shivers