Remove documentation for BOW, EOW, WORD, and WORD+, as they have no

POSIX counterparts, and their meaning is locale-dependent.
This commit is contained in:
sperber 2002-02-16 16:52:27 +00:00
parent 00bba17d56
commit 357afa99ae
1 changed files with 81 additions and 127 deletions

View File

@ -72,8 +72,6 @@ providing:
\item repetition (\ex{*}, \ex{+}, \ex{?}, \ex{\{$m$,$n$\}})
\item character classes (\eg, \ex{[aeiou]}) and wildcard (\ex{.})
\item beginning/end of string anchors (\verb|^|, \verb|$|)
\item beginning/end of line anchors
\item beginning/end of word anchors
\item case-sensitivity control
\item submatch-marking
\end{itemize}
@ -100,63 +98,56 @@ the next section is a friendlier tutorial introduction.
case-sensitivity lexical context. \\
\\
\ex{(* \var{sre} {\ldots})} & 0 or more matches \\
\ex{(+ \var{sre} {\ldots})} & 1 or more matches \\
\ex{(? \var{sre} {\ldots})} & 0 or 1 matches \\
\ex{(= \var{n} \var{sre} {\ldots})} & \var{n} matches \\
\ex{(>= \var{n} \var{sre} {\ldots})} & \var{n} or more matches \\
\ex{(** \var{n} \var{m} \var{sre} {\ldots})} & \var{n} to \var{m} matches \\
\ex{(+ \var{sre} {\ldots})} & 1 or more matches \\
\ex{(? \var{sre} {\ldots})} & 0 or 1 matches \\
\ex{(= \var{n} \var{sre} {\ldots})} & \var{n} matches \\
\ex{(>= \var{n} \var{sre} {\ldots})} & \var{n} or more matches \\
\ex{(** \var{n} \var{m} \var{sre} {\ldots})} & \var{n} to \var{m} matches \\
\srecomment{
\var{N} and \var{m} are Scheme expressions producing non-negative
integers. \\
\var{M} may also be \ex{\#f}, meaning ``infinity.''} \\
\\
\ex{(| \var{sre} {\ldots})} & Choice (\ex{or} is R5RS symbol; \\
\ex{(or \var{sre} {\ldots})} & \ex{|} is not specified by R5RS.) \\
\ex{(| \var{sre} {\ldots})} & Choice (\ex{or} is R5RS symbol; \\
\ex{(or \var{sre} {\ldots})} & \ex{|} is not specified by R5RS.) \\
\\
\ex{(: \var{sre} {\ldots})} & Sequence (\ex{seq} is legal \\
\ex{(seq \var{sre} {\ldots})} & Common Lisp symbol) \\
\ex{(: \var{sre} {\ldots})} & Sequence (\ex{seq} is legal \\
\ex{(seq \var{sre} {\ldots})} & Common Lisp symbol) \\
\\
\ex{(submatch \var{sre} {\ldots})} & Numbered submatch \\
\ex{(submatch \var{sre} {\ldots})} & Numbered submatch \\
\\
\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})} & Deleted submatches \\
\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})} & Deleted submatches \\
\srecomment{\var{Pre} and \var{post} are numerals.} \\
\\
\ex{(uncase \var{sre} {\ldots})} & Case-folded match \\
\ex{(w/case \var{sre} {\ldots})} & Introduce a lexical case-sensitivity \\
\ex{(w/nocase \var{sre} {\ldots})} & context. \\
\ex{(uncase \var{sre} {\ldots})} & Case-folded match \\
\ex{(w/case \var{sre} {\ldots})} & Introduce a lexical case-sensitivity \\
\ex{(w/nocase \var{sre} {\ldots})} & context. \\
\\
\ex{,@\var{exp}} & Dynamically computed regexp \\
\ex{,\var{exp}} & Same as ,@\var{exp}, but no submatch info \\
\ex{,@\var{exp}} & Dynamically computed regexp \\
\ex{,\var{exp}} & Same as ,@\var{exp}, but no submatch info \\
\srecomment{\var{Exp} must produce a character, string,
char-set, or regexp.} \\
char-set, or regexp.} \\
\\
\ex{bos eos} & Beginning/end of string \\
\ex{bos eos} & Beginning/end of string \\
\ex{bol eol} & Beginning/end of line \\
\ex{bow eow} & Beginning/end of word \\
\end{tabular}
\caption{SRE syntax summary (part 1)}
\end{boxedfigure}
\begin{boxedfigure}{tbhp}
\begin{tabular}{lp{3in}}
\ex{(word \var{sre} {\ldots})} & (: bow \var{sre} {\ldots} eow) \\
\ex{(word+ \var{cset-sre} {\ldots})}
& \cd{(word (+ (& (| alphanumeric "_")} \\
& \cd{ (| \var{cset-sre} {\ldots}))))} \\
\ex{word} & \ex{(word+ any)} \\
\ex{(posix-string \var{string})} & Escape for Posix string notation \\
\\
\ex{(posix-string \var{string})} & Escape for Posix string notation \\
\\
\ex{\var{char}} & Singleton char set \\
\ex{\var{class-name}} & alphanumeric, whitespace, \etc \\
\ex{\var{char}} & Singleton char set \\
\ex{\var{class-name}} & alphanumeric, whitespace, \etc \\
\srecomment{These two forms are interpreted subject to
the lexical case-sensitivity context.} \\
\\
\cd{(~ \var{cset-sre} {\ldots})} & Complement-of-union (\cd{[^{\ldots}]}) \\
\ex{(- \var{cset-sre} {\ldots})} & Difference \\
\cd{(& \var{cset-sre} {\ldots})} & Intersection \\
\cd{(~ \var{cset-sre} {\ldots})} & Complement-of-union (\cd{[^{\ldots}]}) \\
\ex{(- \var{cset-sre} {\ldots})} & Difference \\
\cd{(& \var{cset-sre} {\ldots})} & Intersection \\
\\
\ex{(/ \var{range-spec} {\ldots})} & Character range---interpreted
\ex{(/ \var{range-spec} {\ldots})} & Character range---interpreted
subject to
the lexical case-sensitivity context \\
\end{tabular}
@ -167,19 +158,19 @@ the next section is a friendlier tutorial introduction.
{\tt
\begin{tabular}{l@{\quad\texttt{|}\quad}ll}
\multicolumn{1}{l}{\var{class-name}\quad ::=\quad} & any \\
& nonl \\
& lower-case & | lower \\
& upper-case & | upper \\
& alphabetic & | alpha \\
& numeric & | digit | num \\
& alphanumeric & | alnum \\
& punctuation & | punct \\
& graphic & | graph \\
& whitespace & | space | white \\
& printing & | print \\
& control & | cntrl \\
& hex-digit & | xdigit | hex \\
& ascii
& nonl \\
& lower-case & | lower \\
& upper-case & | upper \\
& alphabetic & | alpha \\
& numeric & | digit | num \\
& alphanumeric & | alnum \\
& punctuation & | punct \\
& graphic & | graph \\
& whitespace & | space | white \\
& printing & | print \\
& control & | cntrl \\
& hex-digit & | xdigit | hex \\
& ascii
\end{tabular}
\\[2ex]
\ex{\var{range-spec} ::= \var{string} | \var{char}} \\
@ -197,22 +188,22 @@ The chars are taken in pairs to form inclusive ranges.
| (& <cset-sre> ...) Intersection
| (| <cset-sre> ...) Set union
| (/ <range-spec> ...) Range
| (<string>) Constant set
| <char> Singleton constant set
| <string> For 1-char string "c"
| <class-name> Constant set
| ,<exp> <exp> evals to a char-set,
| ,@<exp> char, single-char string,
or re-char-set regexp.
| (uncase <cset-sre>) Case-folding
| (w/case <cset-sre>)
| (w/nocase <cset-sre>)
\end{verbatim}
\caption{%The \cd{~}, \cd{-}, \cd{&}, and \cd{word+} operators may only be
\caption{%The \cd{~}, \cd{-}, and \cd{&} operators may only be
applied to SRE's that specify character sets.
These are the ``type-checking'' rules for character-set SRE's.}
\end{boxedfigure}
@ -352,12 +343,12 @@ of SRE repetition forms:
\begin{inset}
\begin{tabular}{llrr}
SRE & means & at least & no more than \\ \hline
\ex{(* \var{sre} \ldots)} &zero-or-more &0 &infinity \\
\ex{(+ \var{sre} \ldots)} &one-or-more &1 &infinity \\
\ex{(? \var{sre} \ldots)} &zero-or-one &0 &1 \\
\ex{(= \var{from} \var{sre} \ldots)} &exactly-n &\var{from} &\var{from} \\
\ex{(>= \var{from} \var{sre} \ldots)} &n-or-more &\var{from} &infinity \\
\ex{(** \var{from} \var{to} \var{sre} \ldots)} &n-to-m &\var{from} &\var{to}
\ex{(* \var{sre} \ldots)} &zero-or-more &0 &infinity \\
\ex{(+ \var{sre} \ldots)} &one-or-more &1 &infinity \\
\ex{(? \var{sre} \ldots)} &zero-or-one &0 &1 \\
\ex{(= \var{from} \var{sre} \ldots)} &exactly-n &\var{from} &\var{from} \\
\ex{(>= \var{from} \var{sre} \ldots)} &n-or-more &\var{from} &infinity \\
\ex{(** \var{from} \var{to} \var{sre} \ldots)} &n-to-m &\var{from} &\var{to}
\end{tabular}
\end{inset}
@ -381,8 +372,8 @@ We can limit the a/d chains to 4 characters or less with the SRE
Some boundary cases:
\begin{code}
(** 5 2 "foo") ; Will never match
(** 0 0 "foo") ; Matches the empty string\end{code}
(** 5 2 "foo") ; Will never match
(** 0 0 "foo") ; Matches the empty string\end{code}
\paragraph{Character classes}
@ -450,20 +441,20 @@ There are also predefined named char classes for the standard Posix and Gnu
character classes:
\begin{inset}
\begin{tabular}{llll}
scsh name & Posix/ctype & Alternate name & Comment \\ \hline
\ex{lower-case} & \ex{lower} \\
\ex{upper-case} & \ex{upper} \\
\ex{alphabetic} & \ex{alpha} \\
\ex{numeric} & \ex{digit} & \ex{num} \\
\ex{alphanumeric} & \ex{alnum} & \ex{alphanum} \\
\ex{punctuation} & \ex{punct} \\
\ex{graphic} & \ex{graph} \\
\ex{blank} & (Gnu extension) \\
\ex{whitespace} & \ex{space} & \ex{white} & {``\ex{space}'' is deprecated.}\\
\ex{printing} & \ex{print} \\
\ex{control} & \ex{cntrl} \\
\ex{hex-digit} & \ex{xdigit} & \ex{hex} \\
\ex{ascii} & (Gnu extension) \\
scsh name & Posix/ctype & Alternate name & Comment \\ \hline
\ex{lower-case} & \ex{lower} \\
\ex{upper-case} & \ex{upper} \\
\ex{alphabetic} & \ex{alpha} \\
\ex{numeric} & \ex{digit} & \ex{num} \\
\ex{alphanumeric} & \ex{alnum} & \ex{alphanum} \\
\ex{punctuation} & \ex{punct} \\
\ex{graphic} & \ex{graph} \\
\ex{blank} & (Gnu extension) \\
\ex{whitespace} & \ex{space} & \ex{white} & {``\ex{space}'' is deprecated.}\\
\ex{printing} & \ex{print} \\
\ex{control} & \ex{cntrl} \\
\ex{hex-digit} & \ex{xdigit} & \ex{hex} \\
\ex{ascii} & (Gnu extension) \\
\end{tabular}
\end{inset}
See the scsh character-set documentation or the Posix isalpha(3) man page
@ -705,10 +696,10 @@ to produce a certain number of submatches---if that is part of \var{exp}'s
``contract.''
\paragraph{String, line, and word units}
\paragraph{String and line units}
The regexps \ex{bos} and \ex{eos} match the empty string at the beginning and
end of the string, respectively.
The regexps \ex{bos} and \ex{eos} match the empty string at the
beginning and end of the string, respectively.
The regexps \ex{bol} and \ex{eol} match the empty string at the beginning and
end of a line, respectively. A line begins at the beginning of the string, and
@ -717,32 +708,6 @@ just before every newline character. The char class \ex{nonl} matches any
character except newline, and is useful in conjunction with line-based pattern
matching.
The regexps \ex{bow} and \ex{eow} match the empty string at the beginning and
end of a word, respectively. A word is a contiguous sequence of characters
that are either alphanumeric or the underscore character.
The regexp \ex{(word \var{sre} \ldots)} surrounds the sequence
\ex{(: \var{sre} \ldots)} with bow/eow delimiters. It is equivalent to
\begin{code}
(: bow \var{sre} \ldots eow)\end{code}%
%
The regexp \ex{(word+ \var{cset-sre} \ldots)} matches a word whose body is
one or more word characters matched by the char-set sre \var{cset-sre}.
It is equivalent to
\begin{code}
(word (+ (& (| alphanumeric "_")
(| \var{cset-sre} \ldots))))\end{code}%
%
For example, a word not containing x, y, or z is
\begin{code}
(word+ (~ ("xyz")))\end{code}%
%
The regexp \ex{word} matches one word; it is equivalent to
\begin{code}
(word+ any)
\end{code}%
\note{\ex{bol} and \ex{eol} are not supported by scsh's current
regexp search engine, which is Spencer's Posix matcher. This is the only
element of the notation that is not supported by the current scsh
@ -829,7 +794,7 @@ submatches before the body, and \var{post} deleted submatches after the
body.
If the body \var{(: \var{sre} \ldots)} itself has \var{body-sm} submatches,
then the total number of submatches for the DSM form is
$$\var{pre} + \var{body-sm} + \var{post}.$$
$$\var{pre} + \var{body-sm} + \var{post}.$$
These extra, deleted submatches are never assigned string indices in any
match values produced when matching the regexp against a string.
@ -870,7 +835,7 @@ There are two places where one can
embed run-time computations in an SRE:
\begin{itemize}
\item The \var{from} or \var{to} repetition counts of
\ex{**}, \ex{=}, and \ex{>=} forms;
\ex{**}, \ex{=}, and \ex{>=} forms;
\item \ex{,\var{exp}} and \ex{,@\var{exp}} forms.
\end{itemize}
@ -933,8 +898,8 @@ should not be used in new code.
\ex{regexp/bos-not-bol} means the beginning of the string isn't a
line-begin. \ex{regexp/eos-not-eol} is analogous.
\note{They're currently ignored because
beginning/end-of-line anchors aren't supported by the current
implementation.}
beginning/end-of-line anchors aren't supported by the current
implementation.}
Use \ex{regexp-search?} when you don't need submatch information, as
it has the potential to be \emph{significantly} faster on
@ -983,7 +948,7 @@ the port:
is written to the port.
\item If an item is \ex{'pre},
the prefix of the matched string (the text preceding the match)
is written to the port.
is written to the port.
\item If an item is \ex{'post},
the suffix of the matched string is written.
\end{itemize}
@ -1009,15 +974,15 @@ It has the following differences with \ex{regexp-substitute}:
\item It takes a regular expression and string to be matched as
parameters, instead of a completed match structure.
\item If the regular expression doesn't match the string, this
procedure is the identity transform---it returns or outputs the
string.
procedure is the identity transform---it returns or outputs the
string.
\item If an item is \ex{'post}, the procedure recurses on the suffix string
(the text from \var{string} following the match).
Including a \ex{'post} in the list of items is how one gets multiple
match/substitution operations.
Including a \ex{'post} in the list of items is how one gets multiple
match/substitution operations.
\item If an item is a procedure, it is applied to the match structure for
a given match.
The procedure returns a string to be used in the result.
a given match.
The procedure returns a string to be used in the result.
\end{itemize}
The \var{regexp} parameter can be either a compiled regular expression or
a string specifying a regular expression.
@ -1262,9 +1227,6 @@ Note:\begin{itemize}
\item The string parser doesn't handle the exotica of character class
names such as \verb|[[:alnum:]]|; the current implementation was written
in three hours.
\item The unparser produces Spencer-specific strings for bow/eow
elements; otherwise, it's Posix all the way.
\end{itemize}
\end{desc}
@ -1327,18 +1289,14 @@ contained in the regular expression.
\defvarx{re-eos}{regexp}
\defvarx{re-bol}{regexp}
\defvarx{re-eol}{regexp}
\defvarx{re-bow}{regexp}
\defvarx{re-eow}{regexp}
\begin{desc}
These variables are bound to the primitive anchor regexps.
\end{desc}
\defun {re-bos?}{\object}{\boolean}
\defunx{re-eos?}{\object}{\boolean}
\defunx{re-eos?}{\object}{\boolean}
\defunx{re-bol?}{\object}{\boolean}
\defunx{re-eol?}{\object}{\boolean}
\defunx{re-bow?}{\object}{\boolean}
\defunx{re-eow?}{\object}{\boolean}
\begin{desc}
These predicates recognise the associated primitive anchor regexp.
\end{desc}
@ -1378,15 +1336,11 @@ regexps built using other constructors may or may not produce a true value.
% These are non-primitive predefined regexps of general utility.
\defvar {re-nonl}{regexp}
\defvarx{re-word}{regexp}
\defvarx {re-nonl}{regexp}
\begin{desc}
The variable \ex{re-nonl} is bound to a regular expression
that matches any non-newline character
(corresponding to the SRE \verb|(~ #\newline)|).
Similarly, \ex{re-word} is bound to a regular expression
that matches any word (corresponding to the SRE \ex{word}).
\end{desc}
\defun{regexp?}{\object}{\boolean}