Remove documentation for BOW, EOW, WORD, and WORD+, as they have no

POSIX counterparts, and their meaning is locale-dependent.
This commit is contained in:
sperber 2002-02-16 16:52:27 +00:00
parent 00bba17d56
commit 357afa99ae
1 changed files with 81 additions and 127 deletions

View File

@ -72,8 +72,6 @@ providing:
\item repetition (\ex{*}, \ex{+}, \ex{?}, \ex{\{$m$,$n$\}}) \item repetition (\ex{*}, \ex{+}, \ex{?}, \ex{\{$m$,$n$\}})
\item character classes (\eg, \ex{[aeiou]}) and wildcard (\ex{.}) \item character classes (\eg, \ex{[aeiou]}) and wildcard (\ex{.})
\item beginning/end of string anchors (\verb|^|, \verb|$|) \item beginning/end of string anchors (\verb|^|, \verb|$|)
\item beginning/end of line anchors
\item beginning/end of word anchors
\item case-sensitivity control \item case-sensitivity control
\item submatch-marking \item submatch-marking
\end{itemize} \end{itemize}
@ -100,63 +98,56 @@ the next section is a friendlier tutorial introduction.
case-sensitivity lexical context. \\ case-sensitivity lexical context. \\
\\ \\
\ex{(* \var{sre} {\ldots})} & 0 or more matches \\ \ex{(* \var{sre} {\ldots})} & 0 or more matches \\
\ex{(+ \var{sre} {\ldots})} & 1 or more matches \\ \ex{(+ \var{sre} {\ldots})} & 1 or more matches \\
\ex{(? \var{sre} {\ldots})} & 0 or 1 matches \\ \ex{(? \var{sre} {\ldots})} & 0 or 1 matches \\
\ex{(= \var{n} \var{sre} {\ldots})} & \var{n} matches \\ \ex{(= \var{n} \var{sre} {\ldots})} & \var{n} matches \\
\ex{(>= \var{n} \var{sre} {\ldots})} & \var{n} or more matches \\ \ex{(>= \var{n} \var{sre} {\ldots})} & \var{n} or more matches \\
\ex{(** \var{n} \var{m} \var{sre} {\ldots})} & \var{n} to \var{m} matches \\ \ex{(** \var{n} \var{m} \var{sre} {\ldots})} & \var{n} to \var{m} matches \\
\srecomment{ \srecomment{
\var{N} and \var{m} are Scheme expressions producing non-negative \var{N} and \var{m} are Scheme expressions producing non-negative
integers. \\ integers. \\
\var{M} may also be \ex{\#f}, meaning ``infinity.''} \\ \var{M} may also be \ex{\#f}, meaning ``infinity.''} \\
\\ \\
\ex{(| \var{sre} {\ldots})} & Choice (\ex{or} is R5RS symbol; \\ \ex{(| \var{sre} {\ldots})} & Choice (\ex{or} is R5RS symbol; \\
\ex{(or \var{sre} {\ldots})} & \ex{|} is not specified by R5RS.) \\ \ex{(or \var{sre} {\ldots})} & \ex{|} is not specified by R5RS.) \\
\\ \\
\ex{(: \var{sre} {\ldots})} & Sequence (\ex{seq} is legal \\ \ex{(: \var{sre} {\ldots})} & Sequence (\ex{seq} is legal \\
\ex{(seq \var{sre} {\ldots})} & Common Lisp symbol) \\ \ex{(seq \var{sre} {\ldots})} & Common Lisp symbol) \\
\\ \\
\ex{(submatch \var{sre} {\ldots})} & Numbered submatch \\ \ex{(submatch \var{sre} {\ldots})} & Numbered submatch \\
\\ \\
\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})} & Deleted submatches \\ \ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})} & Deleted submatches \\
\srecomment{\var{Pre} and \var{post} are numerals.} \\ \srecomment{\var{Pre} and \var{post} are numerals.} \\
\\ \\
\ex{(uncase \var{sre} {\ldots})} & Case-folded match \\ \ex{(uncase \var{sre} {\ldots})} & Case-folded match \\
\ex{(w/case \var{sre} {\ldots})} & Introduce a lexical case-sensitivity \\ \ex{(w/case \var{sre} {\ldots})} & Introduce a lexical case-sensitivity \\
\ex{(w/nocase \var{sre} {\ldots})} & context. \\ \ex{(w/nocase \var{sre} {\ldots})} & context. \\
\\ \\
\ex{,@\var{exp}} & Dynamically computed regexp \\ \ex{,@\var{exp}} & Dynamically computed regexp \\
\ex{,\var{exp}} & Same as ,@\var{exp}, but no submatch info \\ \ex{,\var{exp}} & Same as ,@\var{exp}, but no submatch info \\
\srecomment{\var{Exp} must produce a character, string, \srecomment{\var{Exp} must produce a character, string,
char-set, or regexp.} \\ char-set, or regexp.} \\
\\ \\
\ex{bos eos} & Beginning/end of string \\ \ex{bos eos} & Beginning/end of string \\
\ex{bol eol} & Beginning/end of line \\ \ex{bol eol} & Beginning/end of line \\
\ex{bow eow} & Beginning/end of word \\
\end{tabular} \end{tabular}
\caption{SRE syntax summary (part 1)} \caption{SRE syntax summary (part 1)}
\end{boxedfigure} \end{boxedfigure}
\begin{boxedfigure}{tbhp} \begin{boxedfigure}{tbhp}
\begin{tabular}{lp{3in}} \begin{tabular}{lp{3in}}
\ex{(word \var{sre} {\ldots})} & (: bow \var{sre} {\ldots} eow) \\ \ex{(posix-string \var{string})} & Escape for Posix string notation \\
\ex{(word+ \var{cset-sre} {\ldots})}
& \cd{(word (+ (& (| alphanumeric "_")} \\
& \cd{ (| \var{cset-sre} {\ldots}))))} \\
\ex{word} & \ex{(word+ any)} \\
\\ \\
\ex{(posix-string \var{string})} & Escape for Posix string notation \\ \ex{\var{char}} & Singleton char set \\
\\ \ex{\var{class-name}} & alphanumeric, whitespace, \etc \\
\ex{\var{char}} & Singleton char set \\
\ex{\var{class-name}} & alphanumeric, whitespace, \etc \\
\srecomment{These two forms are interpreted subject to \srecomment{These two forms are interpreted subject to
the lexical case-sensitivity context.} \\ the lexical case-sensitivity context.} \\
\\ \\
\cd{(~ \var{cset-sre} {\ldots})} & Complement-of-union (\cd{[^{\ldots}]}) \\ \cd{(~ \var{cset-sre} {\ldots})} & Complement-of-union (\cd{[^{\ldots}]}) \\
\ex{(- \var{cset-sre} {\ldots})} & Difference \\ \ex{(- \var{cset-sre} {\ldots})} & Difference \\
\cd{(& \var{cset-sre} {\ldots})} & Intersection \\ \cd{(& \var{cset-sre} {\ldots})} & Intersection \\
\\ \\
\ex{(/ \var{range-spec} {\ldots})} & Character range---interpreted \ex{(/ \var{range-spec} {\ldots})} & Character range---interpreted
subject to subject to
the lexical case-sensitivity context \\ the lexical case-sensitivity context \\
\end{tabular} \end{tabular}
@ -167,19 +158,19 @@ the next section is a friendlier tutorial introduction.
{\tt {\tt
\begin{tabular}{l@{\quad\texttt{|}\quad}ll} \begin{tabular}{l@{\quad\texttt{|}\quad}ll}
\multicolumn{1}{l}{\var{class-name}\quad ::=\quad} & any \\ \multicolumn{1}{l}{\var{class-name}\quad ::=\quad} & any \\
& nonl \\ & nonl \\
& lower-case & | lower \\ & lower-case & | lower \\
& upper-case & | upper \\ & upper-case & | upper \\
& alphabetic & | alpha \\ & alphabetic & | alpha \\
& numeric & | digit | num \\ & numeric & | digit | num \\
& alphanumeric & | alnum \\ & alphanumeric & | alnum \\
& punctuation & | punct \\ & punctuation & | punct \\
& graphic & | graph \\ & graphic & | graph \\
& whitespace & | space | white \\ & whitespace & | space | white \\
& printing & | print \\ & printing & | print \\
& control & | cntrl \\ & control & | cntrl \\
& hex-digit & | xdigit | hex \\ & hex-digit & | xdigit | hex \\
& ascii & ascii
\end{tabular} \end{tabular}
\\[2ex] \\[2ex]
\ex{\var{range-spec} ::= \var{string} | \var{char}} \\ \ex{\var{range-spec} ::= \var{string} | \var{char}} \\
@ -197,22 +188,22 @@ The chars are taken in pairs to form inclusive ranges.
| (& <cset-sre> ...) Intersection | (& <cset-sre> ...) Intersection
| (| <cset-sre> ...) Set union | (| <cset-sre> ...) Set union
| (/ <range-spec> ...) Range | (/ <range-spec> ...) Range
| (<string>) Constant set | (<string>) Constant set
| <char> Singleton constant set | <char> Singleton constant set
| <string> For 1-char string "c" | <string> For 1-char string "c"
| <class-name> Constant set | <class-name> Constant set
| ,<exp> <exp> evals to a char-set, | ,<exp> <exp> evals to a char-set,
| ,@<exp> char, single-char string, | ,@<exp> char, single-char string,
or re-char-set regexp. or re-char-set regexp.
| (uncase <cset-sre>) Case-folding | (uncase <cset-sre>) Case-folding
| (w/case <cset-sre>) | (w/case <cset-sre>)
| (w/nocase <cset-sre>) | (w/nocase <cset-sre>)
\end{verbatim} \end{verbatim}
\caption{%The \cd{~}, \cd{-}, \cd{&}, and \cd{word+} operators may only be \caption{%The \cd{~}, \cd{-}, and \cd{&} operators may only be
applied to SRE's that specify character sets. applied to SRE's that specify character sets.
These are the ``type-checking'' rules for character-set SRE's.} These are the ``type-checking'' rules for character-set SRE's.}
\end{boxedfigure} \end{boxedfigure}
@ -352,12 +343,12 @@ of SRE repetition forms:
\begin{inset} \begin{inset}
\begin{tabular}{llrr} \begin{tabular}{llrr}
SRE & means & at least & no more than \\ \hline SRE & means & at least & no more than \\ \hline
\ex{(* \var{sre} \ldots)} &zero-or-more &0 &infinity \\ \ex{(* \var{sre} \ldots)} &zero-or-more &0 &infinity \\
\ex{(+ \var{sre} \ldots)} &one-or-more &1 &infinity \\ \ex{(+ \var{sre} \ldots)} &one-or-more &1 &infinity \\
\ex{(? \var{sre} \ldots)} &zero-or-one &0 &1 \\ \ex{(? \var{sre} \ldots)} &zero-or-one &0 &1 \\
\ex{(= \var{from} \var{sre} \ldots)} &exactly-n &\var{from} &\var{from} \\ \ex{(= \var{from} \var{sre} \ldots)} &exactly-n &\var{from} &\var{from} \\
\ex{(>= \var{from} \var{sre} \ldots)} &n-or-more &\var{from} &infinity \\ \ex{(>= \var{from} \var{sre} \ldots)} &n-or-more &\var{from} &infinity \\
\ex{(** \var{from} \var{to} \var{sre} \ldots)} &n-to-m &\var{from} &\var{to} \ex{(** \var{from} \var{to} \var{sre} \ldots)} &n-to-m &\var{from} &\var{to}
\end{tabular} \end{tabular}
\end{inset} \end{inset}
@ -381,8 +372,8 @@ We can limit the a/d chains to 4 characters or less with the SRE
Some boundary cases: Some boundary cases:
\begin{code} \begin{code}
(** 5 2 "foo") ; Will never match (** 5 2 "foo") ; Will never match
(** 0 0 "foo") ; Matches the empty string\end{code} (** 0 0 "foo") ; Matches the empty string\end{code}
\paragraph{Character classes} \paragraph{Character classes}
@ -450,20 +441,20 @@ There are also predefined named char classes for the standard Posix and Gnu
character classes: character classes:
\begin{inset} \begin{inset}
\begin{tabular}{llll} \begin{tabular}{llll}
scsh name & Posix/ctype & Alternate name & Comment \\ \hline scsh name & Posix/ctype & Alternate name & Comment \\ \hline
\ex{lower-case} & \ex{lower} \\ \ex{lower-case} & \ex{lower} \\
\ex{upper-case} & \ex{upper} \\ \ex{upper-case} & \ex{upper} \\
\ex{alphabetic} & \ex{alpha} \\ \ex{alphabetic} & \ex{alpha} \\
\ex{numeric} & \ex{digit} & \ex{num} \\ \ex{numeric} & \ex{digit} & \ex{num} \\
\ex{alphanumeric} & \ex{alnum} & \ex{alphanum} \\ \ex{alphanumeric} & \ex{alnum} & \ex{alphanum} \\
\ex{punctuation} & \ex{punct} \\ \ex{punctuation} & \ex{punct} \\
\ex{graphic} & \ex{graph} \\ \ex{graphic} & \ex{graph} \\
\ex{blank} & (Gnu extension) \\ \ex{blank} & (Gnu extension) \\
\ex{whitespace} & \ex{space} & \ex{white} & {``\ex{space}'' is deprecated.}\\ \ex{whitespace} & \ex{space} & \ex{white} & {``\ex{space}'' is deprecated.}\\
\ex{printing} & \ex{print} \\ \ex{printing} & \ex{print} \\
\ex{control} & \ex{cntrl} \\ \ex{control} & \ex{cntrl} \\
\ex{hex-digit} & \ex{xdigit} & \ex{hex} \\ \ex{hex-digit} & \ex{xdigit} & \ex{hex} \\
\ex{ascii} & (Gnu extension) \\ \ex{ascii} & (Gnu extension) \\
\end{tabular} \end{tabular}
\end{inset} \end{inset}
See the scsh character-set documentation or the Posix isalpha(3) man page See the scsh character-set documentation or the Posix isalpha(3) man page
@ -705,10 +696,10 @@ to produce a certain number of submatches---if that is part of \var{exp}'s
``contract.'' ``contract.''
\paragraph{String, line, and word units} \paragraph{String and line units}
The regexps \ex{bos} and \ex{eos} match the empty string at the beginning and The regexps \ex{bos} and \ex{eos} match the empty string at the
end of the string, respectively. beginning and end of the string, respectively.
The regexps \ex{bol} and \ex{eol} match the empty string at the beginning and The regexps \ex{bol} and \ex{eol} match the empty string at the beginning and
end of a line, respectively. A line begins at the beginning of the string, and end of a line, respectively. A line begins at the beginning of the string, and
@ -717,32 +708,6 @@ just before every newline character. The char class \ex{nonl} matches any
character except newline, and is useful in conjunction with line-based pattern character except newline, and is useful in conjunction with line-based pattern
matching. matching.
The regexps \ex{bow} and \ex{eow} match the empty string at the beginning and
end of a word, respectively. A word is a contiguous sequence of characters
that are either alphanumeric or the underscore character.
The regexp \ex{(word \var{sre} \ldots)} surrounds the sequence
\ex{(: \var{sre} \ldots)} with bow/eow delimiters. It is equivalent to
\begin{code}
(: bow \var{sre} \ldots eow)\end{code}%
%
The regexp \ex{(word+ \var{cset-sre} \ldots)} matches a word whose body is
one or more word characters matched by the char-set sre \var{cset-sre}.
It is equivalent to
\begin{code}
(word (+ (& (| alphanumeric "_")
(| \var{cset-sre} \ldots))))\end{code}%
%
For example, a word not containing x, y, or z is
\begin{code}
(word+ (~ ("xyz")))\end{code}%
%
The regexp \ex{word} matches one word; it is equivalent to
\begin{code}
(word+ any)
\end{code}%
\note{\ex{bol} and \ex{eol} are not supported by scsh's current \note{\ex{bol} and \ex{eol} are not supported by scsh's current
regexp search engine, which is Spencer's Posix matcher. This is the only regexp search engine, which is Spencer's Posix matcher. This is the only
element of the notation that is not supported by the current scsh element of the notation that is not supported by the current scsh
@ -829,7 +794,7 @@ submatches before the body, and \var{post} deleted submatches after the
body. body.
If the body \var{(: \var{sre} \ldots)} itself has \var{body-sm} submatches, If the body \var{(: \var{sre} \ldots)} itself has \var{body-sm} submatches,
then the total number of submatches for the DSM form is then the total number of submatches for the DSM form is
$$\var{pre} + \var{body-sm} + \var{post}.$$ $$\var{pre} + \var{body-sm} + \var{post}.$$
These extra, deleted submatches are never assigned string indices in any These extra, deleted submatches are never assigned string indices in any
match values produced when matching the regexp against a string. match values produced when matching the regexp against a string.
@ -870,7 +835,7 @@ There are two places where one can
embed run-time computations in an SRE: embed run-time computations in an SRE:
\begin{itemize} \begin{itemize}
\item The \var{from} or \var{to} repetition counts of \item The \var{from} or \var{to} repetition counts of
\ex{**}, \ex{=}, and \ex{>=} forms; \ex{**}, \ex{=}, and \ex{>=} forms;
\item \ex{,\var{exp}} and \ex{,@\var{exp}} forms. \item \ex{,\var{exp}} and \ex{,@\var{exp}} forms.
\end{itemize} \end{itemize}
@ -933,8 +898,8 @@ should not be used in new code.
\ex{regexp/bos-not-bol} means the beginning of the string isn't a \ex{regexp/bos-not-bol} means the beginning of the string isn't a
line-begin. \ex{regexp/eos-not-eol} is analogous. line-begin. \ex{regexp/eos-not-eol} is analogous.
\note{They're currently ignored because \note{They're currently ignored because
beginning/end-of-line anchors aren't supported by the current beginning/end-of-line anchors aren't supported by the current
implementation.} implementation.}
Use \ex{regexp-search?} when you don't need submatch information, as Use \ex{regexp-search?} when you don't need submatch information, as
it has the potential to be \emph{significantly} faster on it has the potential to be \emph{significantly} faster on
@ -983,7 +948,7 @@ the port:
is written to the port. is written to the port.
\item If an item is \ex{'pre}, \item If an item is \ex{'pre},
the prefix of the matched string (the text preceding the match) the prefix of the matched string (the text preceding the match)
is written to the port. is written to the port.
\item If an item is \ex{'post}, \item If an item is \ex{'post},
the suffix of the matched string is written. the suffix of the matched string is written.
\end{itemize} \end{itemize}
@ -1009,15 +974,15 @@ It has the following differences with \ex{regexp-substitute}:
\item It takes a regular expression and string to be matched as \item It takes a regular expression and string to be matched as
parameters, instead of a completed match structure. parameters, instead of a completed match structure.
\item If the regular expression doesn't match the string, this \item If the regular expression doesn't match the string, this
procedure is the identity transform---it returns or outputs the procedure is the identity transform---it returns or outputs the
string. string.
\item If an item is \ex{'post}, the procedure recurses on the suffix string \item If an item is \ex{'post}, the procedure recurses on the suffix string
(the text from \var{string} following the match). (the text from \var{string} following the match).
Including a \ex{'post} in the list of items is how one gets multiple Including a \ex{'post} in the list of items is how one gets multiple
match/substitution operations. match/substitution operations.
\item If an item is a procedure, it is applied to the match structure for \item If an item is a procedure, it is applied to the match structure for
a given match. a given match.
The procedure returns a string to be used in the result. The procedure returns a string to be used in the result.
\end{itemize} \end{itemize}
The \var{regexp} parameter can be either a compiled regular expression or The \var{regexp} parameter can be either a compiled regular expression or
a string specifying a regular expression. a string specifying a regular expression.
@ -1262,9 +1227,6 @@ Note:\begin{itemize}
\item The string parser doesn't handle the exotica of character class \item The string parser doesn't handle the exotica of character class
names such as \verb|[[:alnum:]]|; the current implementation was written names such as \verb|[[:alnum:]]|; the current implementation was written
in three hours. in three hours.
\item The unparser produces Spencer-specific strings for bow/eow
elements; otherwise, it's Posix all the way.
\end{itemize} \end{itemize}
\end{desc} \end{desc}
@ -1327,18 +1289,14 @@ contained in the regular expression.
\defvarx{re-eos}{regexp} \defvarx{re-eos}{regexp}
\defvarx{re-bol}{regexp} \defvarx{re-bol}{regexp}
\defvarx{re-eol}{regexp} \defvarx{re-eol}{regexp}
\defvarx{re-bow}{regexp}
\defvarx{re-eow}{regexp}
\begin{desc} \begin{desc}
These variables are bound to the primitive anchor regexps. These variables are bound to the primitive anchor regexps.
\end{desc} \end{desc}
\defun {re-bos?}{\object}{\boolean} \defun {re-bos?}{\object}{\boolean}
\defunx{re-eos?}{\object}{\boolean} \defunx{re-eos?}{\object}{\boolean}
\defunx{re-bol?}{\object}{\boolean} \defunx{re-bol?}{\object}{\boolean}
\defunx{re-eol?}{\object}{\boolean} \defunx{re-eol?}{\object}{\boolean}
\defunx{re-bow?}{\object}{\boolean}
\defunx{re-eow?}{\object}{\boolean}
\begin{desc} \begin{desc}
These predicates recognise the associated primitive anchor regexp. These predicates recognise the associated primitive anchor regexp.
\end{desc} \end{desc}
@ -1378,15 +1336,11 @@ regexps built using other constructors may or may not produce a true value.
% These are non-primitive predefined regexps of general utility. % These are non-primitive predefined regexps of general utility.
\defvar {re-nonl}{regexp} \defvarx {re-nonl}{regexp}
\defvarx{re-word}{regexp}
\begin{desc} \begin{desc}
The variable \ex{re-nonl} is bound to a regular expression The variable \ex{re-nonl} is bound to a regular expression
that matches any non-newline character that matches any non-newline character
(corresponding to the SRE \verb|(~ #\newline)|). (corresponding to the SRE \verb|(~ #\newline)|).
Similarly, \ex{re-word} is bound to a regular expression
that matches any word (corresponding to the SRE \ex{word}).
\end{desc} \end{desc}
\defun{regexp?}{\object}{\boolean} \defun{regexp?}{\object}{\boolean}