Remove documentation for BOW, EOW, WORD, and WORD+, as they have no
POSIX counterparts, and their meaning is locale-dependent.
This commit is contained in:
parent
00bba17d56
commit
357afa99ae
|
@ -72,8 +72,6 @@ providing:
|
|||
\item repetition (\ex{*}, \ex{+}, \ex{?}, \ex{\{$m$,$n$\}})
|
||||
\item character classes (\eg, \ex{[aeiou]}) and wildcard (\ex{.})
|
||||
\item beginning/end of string anchors (\verb|^|, \verb|$|)
|
||||
\item beginning/end of line anchors
|
||||
\item beginning/end of word anchors
|
||||
\item case-sensitivity control
|
||||
\item submatch-marking
|
||||
\end{itemize}
|
||||
|
@ -100,63 +98,56 @@ the next section is a friendlier tutorial introduction.
|
|||
case-sensitivity lexical context. \\
|
||||
\\
|
||||
\ex{(* \var{sre} {\ldots})} & 0 or more matches \\
|
||||
\ex{(+ \var{sre} {\ldots})} & 1 or more matches \\
|
||||
\ex{(? \var{sre} {\ldots})} & 0 or 1 matches \\
|
||||
\ex{(= \var{n} \var{sre} {\ldots})} & \var{n} matches \\
|
||||
\ex{(>= \var{n} \var{sre} {\ldots})} & \var{n} or more matches \\
|
||||
\ex{(** \var{n} \var{m} \var{sre} {\ldots})} & \var{n} to \var{m} matches \\
|
||||
\ex{(+ \var{sre} {\ldots})} & 1 or more matches \\
|
||||
\ex{(? \var{sre} {\ldots})} & 0 or 1 matches \\
|
||||
\ex{(= \var{n} \var{sre} {\ldots})} & \var{n} matches \\
|
||||
\ex{(>= \var{n} \var{sre} {\ldots})} & \var{n} or more matches \\
|
||||
\ex{(** \var{n} \var{m} \var{sre} {\ldots})} & \var{n} to \var{m} matches \\
|
||||
\srecomment{
|
||||
\var{N} and \var{m} are Scheme expressions producing non-negative
|
||||
integers. \\
|
||||
\var{M} may also be \ex{\#f}, meaning ``infinity.''} \\
|
||||
\\
|
||||
\ex{(| \var{sre} {\ldots})} & Choice (\ex{or} is R5RS symbol; \\
|
||||
\ex{(or \var{sre} {\ldots})} & \ex{|} is not specified by R5RS.) \\
|
||||
\ex{(| \var{sre} {\ldots})} & Choice (\ex{or} is R5RS symbol; \\
|
||||
\ex{(or \var{sre} {\ldots})} & \ex{|} is not specified by R5RS.) \\
|
||||
\\
|
||||
\ex{(: \var{sre} {\ldots})} & Sequence (\ex{seq} is legal \\
|
||||
\ex{(seq \var{sre} {\ldots})} & Common Lisp symbol) \\
|
||||
\ex{(: \var{sre} {\ldots})} & Sequence (\ex{seq} is legal \\
|
||||
\ex{(seq \var{sre} {\ldots})} & Common Lisp symbol) \\
|
||||
\\
|
||||
\ex{(submatch \var{sre} {\ldots})} & Numbered submatch \\
|
||||
\ex{(submatch \var{sre} {\ldots})} & Numbered submatch \\
|
||||
\\
|
||||
\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})} & Deleted submatches \\
|
||||
\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})} & Deleted submatches \\
|
||||
\srecomment{\var{Pre} and \var{post} are numerals.} \\
|
||||
\\
|
||||
\ex{(uncase \var{sre} {\ldots})} & Case-folded match \\
|
||||
\ex{(w/case \var{sre} {\ldots})} & Introduce a lexical case-sensitivity \\
|
||||
\ex{(w/nocase \var{sre} {\ldots})} & context. \\
|
||||
\ex{(uncase \var{sre} {\ldots})} & Case-folded match \\
|
||||
\ex{(w/case \var{sre} {\ldots})} & Introduce a lexical case-sensitivity \\
|
||||
\ex{(w/nocase \var{sre} {\ldots})} & context. \\
|
||||
\\
|
||||
\ex{,@\var{exp}} & Dynamically computed regexp \\
|
||||
\ex{,\var{exp}} & Same as ,@\var{exp}, but no submatch info \\
|
||||
\ex{,@\var{exp}} & Dynamically computed regexp \\
|
||||
\ex{,\var{exp}} & Same as ,@\var{exp}, but no submatch info \\
|
||||
\srecomment{\var{Exp} must produce a character, string,
|
||||
char-set, or regexp.} \\
|
||||
char-set, or regexp.} \\
|
||||
\\
|
||||
\ex{bos eos} & Beginning/end of string \\
|
||||
\ex{bos eos} & Beginning/end of string \\
|
||||
\ex{bol eol} & Beginning/end of line \\
|
||||
\ex{bow eow} & Beginning/end of word \\
|
||||
\end{tabular}
|
||||
\caption{SRE syntax summary (part 1)}
|
||||
\end{boxedfigure}
|
||||
|
||||
\begin{boxedfigure}{tbhp}
|
||||
\begin{tabular}{lp{3in}}
|
||||
\ex{(word \var{sre} {\ldots})} & (: bow \var{sre} {\ldots} eow) \\
|
||||
\ex{(word+ \var{cset-sre} {\ldots})}
|
||||
& \cd{(word (+ (& (| alphanumeric "_")} \\
|
||||
& \cd{ (| \var{cset-sre} {\ldots}))))} \\
|
||||
\ex{word} & \ex{(word+ any)} \\
|
||||
\ex{(posix-string \var{string})} & Escape for Posix string notation \\
|
||||
\\
|
||||
\ex{(posix-string \var{string})} & Escape for Posix string notation \\
|
||||
\\
|
||||
\ex{\var{char}} & Singleton char set \\
|
||||
\ex{\var{class-name}} & alphanumeric, whitespace, \etc \\
|
||||
\ex{\var{char}} & Singleton char set \\
|
||||
\ex{\var{class-name}} & alphanumeric, whitespace, \etc \\
|
||||
\srecomment{These two forms are interpreted subject to
|
||||
the lexical case-sensitivity context.} \\
|
||||
\\
|
||||
\cd{(~ \var{cset-sre} {\ldots})} & Complement-of-union (\cd{[^{\ldots}]}) \\
|
||||
\ex{(- \var{cset-sre} {\ldots})} & Difference \\
|
||||
\cd{(& \var{cset-sre} {\ldots})} & Intersection \\
|
||||
\cd{(~ \var{cset-sre} {\ldots})} & Complement-of-union (\cd{[^{\ldots}]}) \\
|
||||
\ex{(- \var{cset-sre} {\ldots})} & Difference \\
|
||||
\cd{(& \var{cset-sre} {\ldots})} & Intersection \\
|
||||
\\
|
||||
\ex{(/ \var{range-spec} {\ldots})} & Character range---interpreted
|
||||
\ex{(/ \var{range-spec} {\ldots})} & Character range---interpreted
|
||||
subject to
|
||||
the lexical case-sensitivity context \\
|
||||
\end{tabular}
|
||||
|
@ -167,19 +158,19 @@ the next section is a friendlier tutorial introduction.
|
|||
{\tt
|
||||
\begin{tabular}{l@{\quad\texttt{|}\quad}ll}
|
||||
\multicolumn{1}{l}{\var{class-name}\quad ::=\quad} & any \\
|
||||
& nonl \\
|
||||
& lower-case & | lower \\
|
||||
& upper-case & | upper \\
|
||||
& alphabetic & | alpha \\
|
||||
& numeric & | digit | num \\
|
||||
& alphanumeric & | alnum \\
|
||||
& punctuation & | punct \\
|
||||
& graphic & | graph \\
|
||||
& whitespace & | space | white \\
|
||||
& printing & | print \\
|
||||
& control & | cntrl \\
|
||||
& hex-digit & | xdigit | hex \\
|
||||
& ascii
|
||||
& nonl \\
|
||||
& lower-case & | lower \\
|
||||
& upper-case & | upper \\
|
||||
& alphabetic & | alpha \\
|
||||
& numeric & | digit | num \\
|
||||
& alphanumeric & | alnum \\
|
||||
& punctuation & | punct \\
|
||||
& graphic & | graph \\
|
||||
& whitespace & | space | white \\
|
||||
& printing & | print \\
|
||||
& control & | cntrl \\
|
||||
& hex-digit & | xdigit | hex \\
|
||||
& ascii
|
||||
\end{tabular}
|
||||
\\[2ex]
|
||||
\ex{\var{range-spec} ::= \var{string} | \var{char}} \\
|
||||
|
@ -212,7 +203,7 @@ The chars are taken in pairs to form inclusive ranges.
|
|||
| (w/case <cset-sre>)
|
||||
| (w/nocase <cset-sre>)
|
||||
\end{verbatim}
|
||||
\caption{%The \cd{~}, \cd{-}, \cd{&}, and \cd{word+} operators may only be
|
||||
\caption{%The \cd{~}, \cd{-}, and \cd{&} operators may only be
|
||||
applied to SRE's that specify character sets.
|
||||
These are the ``type-checking'' rules for character-set SRE's.}
|
||||
\end{boxedfigure}
|
||||
|
@ -352,12 +343,12 @@ of SRE repetition forms:
|
|||
\begin{inset}
|
||||
\begin{tabular}{llrr}
|
||||
SRE & means & at least & no more than \\ \hline
|
||||
\ex{(* \var{sre} \ldots)} &zero-or-more &0 &infinity \\
|
||||
\ex{(+ \var{sre} \ldots)} &one-or-more &1 &infinity \\
|
||||
\ex{(? \var{sre} \ldots)} &zero-or-one &0 &1 \\
|
||||
\ex{(= \var{from} \var{sre} \ldots)} &exactly-n &\var{from} &\var{from} \\
|
||||
\ex{(>= \var{from} \var{sre} \ldots)} &n-or-more &\var{from} &infinity \\
|
||||
\ex{(** \var{from} \var{to} \var{sre} \ldots)} &n-to-m &\var{from} &\var{to}
|
||||
\ex{(* \var{sre} \ldots)} &zero-or-more &0 &infinity \\
|
||||
\ex{(+ \var{sre} \ldots)} &one-or-more &1 &infinity \\
|
||||
\ex{(? \var{sre} \ldots)} &zero-or-one &0 &1 \\
|
||||
\ex{(= \var{from} \var{sre} \ldots)} &exactly-n &\var{from} &\var{from} \\
|
||||
\ex{(>= \var{from} \var{sre} \ldots)} &n-or-more &\var{from} &infinity \\
|
||||
\ex{(** \var{from} \var{to} \var{sre} \ldots)} &n-to-m &\var{from} &\var{to}
|
||||
\end{tabular}
|
||||
\end{inset}
|
||||
|
||||
|
@ -381,8 +372,8 @@ We can limit the a/d chains to 4 characters or less with the SRE
|
|||
|
||||
Some boundary cases:
|
||||
\begin{code}
|
||||
(** 5 2 "foo") ; Will never match
|
||||
(** 0 0 "foo") ; Matches the empty string\end{code}
|
||||
(** 5 2 "foo") ; Will never match
|
||||
(** 0 0 "foo") ; Matches the empty string\end{code}
|
||||
|
||||
\paragraph{Character classes}
|
||||
|
||||
|
@ -450,20 +441,20 @@ There are also predefined named char classes for the standard Posix and Gnu
|
|||
character classes:
|
||||
\begin{inset}
|
||||
\begin{tabular}{llll}
|
||||
scsh name & Posix/ctype & Alternate name & Comment \\ \hline
|
||||
\ex{lower-case} & \ex{lower} \\
|
||||
\ex{upper-case} & \ex{upper} \\
|
||||
\ex{alphabetic} & \ex{alpha} \\
|
||||
\ex{numeric} & \ex{digit} & \ex{num} \\
|
||||
\ex{alphanumeric} & \ex{alnum} & \ex{alphanum} \\
|
||||
\ex{punctuation} & \ex{punct} \\
|
||||
\ex{graphic} & \ex{graph} \\
|
||||
\ex{blank} & (Gnu extension) \\
|
||||
\ex{whitespace} & \ex{space} & \ex{white} & {``\ex{space}'' is deprecated.}\\
|
||||
\ex{printing} & \ex{print} \\
|
||||
\ex{control} & \ex{cntrl} \\
|
||||
\ex{hex-digit} & \ex{xdigit} & \ex{hex} \\
|
||||
\ex{ascii} & (Gnu extension) \\
|
||||
scsh name & Posix/ctype & Alternate name & Comment \\ \hline
|
||||
\ex{lower-case} & \ex{lower} \\
|
||||
\ex{upper-case} & \ex{upper} \\
|
||||
\ex{alphabetic} & \ex{alpha} \\
|
||||
\ex{numeric} & \ex{digit} & \ex{num} \\
|
||||
\ex{alphanumeric} & \ex{alnum} & \ex{alphanum} \\
|
||||
\ex{punctuation} & \ex{punct} \\
|
||||
\ex{graphic} & \ex{graph} \\
|
||||
\ex{blank} & (Gnu extension) \\
|
||||
\ex{whitespace} & \ex{space} & \ex{white} & {``\ex{space}'' is deprecated.}\\
|
||||
\ex{printing} & \ex{print} \\
|
||||
\ex{control} & \ex{cntrl} \\
|
||||
\ex{hex-digit} & \ex{xdigit} & \ex{hex} \\
|
||||
\ex{ascii} & (Gnu extension) \\
|
||||
\end{tabular}
|
||||
\end{inset}
|
||||
See the scsh character-set documentation or the Posix isalpha(3) man page
|
||||
|
@ -705,10 +696,10 @@ to produce a certain number of submatches---if that is part of \var{exp}'s
|
|||
``contract.''
|
||||
|
||||
|
||||
\paragraph{String, line, and word units}
|
||||
\paragraph{String and line units}
|
||||
|
||||
The regexps \ex{bos} and \ex{eos} match the empty string at the beginning and
|
||||
end of the string, respectively.
|
||||
The regexps \ex{bos} and \ex{eos} match the empty string at the
|
||||
beginning and end of the string, respectively.
|
||||
|
||||
The regexps \ex{bol} and \ex{eol} match the empty string at the beginning and
|
||||
end of a line, respectively. A line begins at the beginning of the string, and
|
||||
|
@ -717,32 +708,6 @@ just before every newline character. The char class \ex{nonl} matches any
|
|||
character except newline, and is useful in conjunction with line-based pattern
|
||||
matching.
|
||||
|
||||
The regexps \ex{bow} and \ex{eow} match the empty string at the beginning and
|
||||
end of a word, respectively. A word is a contiguous sequence of characters
|
||||
that are either alphanumeric or the underscore character.
|
||||
|
||||
The regexp \ex{(word \var{sre} \ldots)} surrounds the sequence
|
||||
\ex{(: \var{sre} \ldots)} with bow/eow delimiters. It is equivalent to
|
||||
\begin{code}
|
||||
(: bow \var{sre} \ldots eow)\end{code}%
|
||||
%
|
||||
|
||||
The regexp \ex{(word+ \var{cset-sre} \ldots)} matches a word whose body is
|
||||
one or more word characters matched by the char-set sre \var{cset-sre}.
|
||||
It is equivalent to
|
||||
\begin{code}
|
||||
(word (+ (& (| alphanumeric "_")
|
||||
(| \var{cset-sre} \ldots))))\end{code}%
|
||||
%
|
||||
For example, a word not containing x, y, or z is
|
||||
\begin{code}
|
||||
(word+ (~ ("xyz")))\end{code}%
|
||||
%
|
||||
The regexp \ex{word} matches one word; it is equivalent to
|
||||
\begin{code}
|
||||
(word+ any)
|
||||
\end{code}%
|
||||
|
||||
\note{\ex{bol} and \ex{eol} are not supported by scsh's current
|
||||
regexp search engine, which is Spencer's Posix matcher. This is the only
|
||||
element of the notation that is not supported by the current scsh
|
||||
|
@ -829,7 +794,7 @@ submatches before the body, and \var{post} deleted submatches after the
|
|||
body.
|
||||
If the body \ex{(: \var{sre} \ldots)} itself has \var{body-sm} submatches,
|
||||
then the total number of submatches for the DSM form is
|
||||
$$\var{pre} + \var{body-sm} + \var{post}.$$
|
||||
$$\var{pre} + \var{body-sm} + \var{post}.$$
|
||||
These extra, deleted submatches are never assigned string indices in any
|
||||
match values produced when matching the regexp against a string.
|
||||
|
||||
|
@ -870,7 +835,7 @@ There are two places where one can
|
|||
embed run-time computations in an SRE:
|
||||
\begin{itemize}
|
||||
\item The \var{from} or \var{to} repetition counts of
|
||||
\ex{**}, \ex{=}, and \ex{>=} forms;
|
||||
\ex{**}, \ex{=}, and \ex{>=} forms;
|
||||
\item \ex{,\var{exp}} and \ex{,@\var{exp}} forms.
|
||||
\end{itemize}
|
||||
|
||||
|
@ -933,8 +898,8 @@ should not be used in new code.
|
|||
\ex{regexp/bos-not-bol} means the beginning of the string isn't a
|
||||
line-begin. \ex{regexp/eos-not-eol} is analogous.
|
||||
\note{They're currently ignored because
|
||||
beginning/end-of-line anchors aren't supported by the current
|
||||
implementation.}
|
||||
beginning/end-of-line anchors aren't supported by the current
|
||||
implementation.}
|
||||
|
||||
Use \ex{regexp-search?} when you don't need submatch information, as
|
||||
it has the potential to be \emph{significantly} faster on
|
||||
|
@ -983,7 +948,7 @@ the port:
|
|||
is written to the port.
|
||||
\item If an item is \ex{'pre},
|
||||
the prefix of the matched string (the text preceding the match)
|
||||
is written to the port.
|
||||
is written to the port.
|
||||
\item If an item is \ex{'post},
|
||||
the suffix of the matched string is written.
|
||||
\end{itemize}
|
||||
|
@ -1009,15 +974,15 @@ It has the following differences with \ex{regexp-substitute}:
|
|||
\item It takes a regular expression and string to be matched as
|
||||
parameters, instead of a completed match structure.
|
||||
\item If the regular expression doesn't match the string, this
|
||||
procedure is the identity transform---it returns or outputs the
|
||||
string.
|
||||
procedure is the identity transform---it returns or outputs the
|
||||
string.
|
||||
\item If an item is \ex{'post}, the procedure recurses on the suffix string
|
||||
(the text from \var{string} following the match).
|
||||
Including a \ex{'post} in the list of items is how one gets multiple
|
||||
match/substitution operations.
|
||||
Including a \ex{'post} in the list of items is how one gets multiple
|
||||
match/substitution operations.
|
||||
\item If an item is a procedure, it is applied to the match structure for
|
||||
a given match.
|
||||
The procedure returns a string to be used in the result.
|
||||
a given match.
|
||||
The procedure returns a string to be used in the result.
|
||||
\end{itemize}
|
||||
The \var{regexp} parameter can be either a compiled regular expression or
|
||||
a string specifying a regular expression.
|
||||
|
@ -1262,9 +1227,6 @@ Note:\begin{itemize}
|
|||
\item The string parser doesn't handle the exotica of character class
|
||||
names such as \verb|[[:alnum:]]|; the current implementation was written
|
||||
in three hours.
|
||||
|
||||
\item The unparser produces Spencer-specific strings for bow/eow
|
||||
elements; otherwise, it's Posix all the way.
|
||||
\end{itemize}
|
||||
\end{desc}
|
||||
|
||||
|
@ -1327,8 +1289,6 @@ contained in the regular expression.
|
|||
\defvarx{re-eos}{regexp}
|
||||
\defvarx{re-bol}{regexp}
|
||||
\defvarx{re-eol}{regexp}
|
||||
\defvarx{re-bow}{regexp}
|
||||
\defvarx{re-eow}{regexp}
|
||||
\begin{desc}
|
||||
These variables are bound to the primitive anchor regexps.
|
||||
\end{desc}
|
||||
|
@ -1337,8 +1297,6 @@ These variables are bound to the primitive anchor regexps.
|
|||
\defunx{re-eos?}{\object}{\boolean}
|
||||
\defunx{re-bol?}{\object}{\boolean}
|
||||
\defunx{re-eol?}{\object}{\boolean}
|
||||
\defunx{re-bow?}{\object}{\boolean}
|
||||
\defunx{re-eow?}{\object}{\boolean}
|
||||
\begin{desc}
|
||||
These predicates recognise the associated primitive anchor regexp.
|
||||
\end{desc}
|
||||
|
@ -1378,15 +1336,11 @@ regexps built using other constructors may or may not produce a true value.
|
|||
|
||||
% These are non-primitive predefined regexps of general utility.
|
||||
|
||||
\defvar {re-nonl}{regexp}
|
||||
\defvarx{re-word}{regexp}
|
||||
\defvarx {re-nonl}{regexp}
|
||||
\begin{desc}
|
||||
The variable \ex{re-nonl} is bound to a regular expression
|
||||
that matches any non-newline character
|
||||
(corresponding to the SRE \verb|(~ #\newline)|).
|
||||
|
||||
Similarly, \ex{re-word} is bound to a regular expression
|
||||
that matches any word (corresponding to the SRE \ex{word}).
|
||||
\end{desc}
|
||||
|
||||
\defun{regexp?}{\object}{\boolean}
|
||||
|
|
Loading…
Reference in New Issue