From 357afa99ae958adab6dc68bfaddf235834239e0c Mon Sep 17 00:00:00 2001 From: sperber Date: Sat, 16 Feb 2002 16:52:27 +0000 Subject: [PATCH] Remove documentation for BOW, EOW, WORD, and WORD+, as they have no POSIX counterparts, and their meaning is locale-dependent. --- doc/scsh-manual/sre.tex | 208 ++++++++++++++++------------------------ 1 file changed, 81 insertions(+), 127 deletions(-) diff --git a/doc/scsh-manual/sre.tex b/doc/scsh-manual/sre.tex index 15b0c74..8de3ab9 100644 --- a/doc/scsh-manual/sre.tex +++ b/doc/scsh-manual/sre.tex @@ -72,8 +72,6 @@ providing: \item repetition (\ex{*}, \ex{+}, \ex{?}, \ex{\{$m$,$n$\}}) \item character classes (\eg, \ex{[aeiou]}) and wildcard (\ex{.}) \item beginning/end of string anchors (\verb|^|, \verb|$|) -\item beginning/end of line anchors -\item beginning/end of word anchors \item case-sensitivity control \item submatch-marking \end{itemize} @@ -100,63 +98,56 @@ the next section is a friendlier tutorial introduction. case-sensitivity lexical context. \\ \\ \ex{(* \var{sre} {\ldots})} & 0 or more matches \\ -\ex{(+ \var{sre} {\ldots})} & 1 or more matches \\ -\ex{(? \var{sre} {\ldots})} & 0 or 1 matches \\ -\ex{(= \var{n} \var{sre} {\ldots})} & \var{n} matches \\ -\ex{(>= \var{n} \var{sre} {\ldots})} & \var{n} or more matches \\ -\ex{(** \var{n} \var{m} \var{sre} {\ldots})} & \var{n} to \var{m} matches \\ +\ex{(+ \var{sre} {\ldots})} & 1 or more matches \\ +\ex{(? \var{sre} {\ldots})} & 0 or 1 matches \\ +\ex{(= \var{n} \var{sre} {\ldots})} & \var{n} matches \\ +\ex{(>= \var{n} \var{sre} {\ldots})} & \var{n} or more matches \\ +\ex{(** \var{n} \var{m} \var{sre} {\ldots})} & \var{n} to \var{m} matches \\ \srecomment{ \var{N} and \var{m} are Scheme expressions producing non-negative integers. \\ \var{M} may also be \ex{\#f}, meaning ``infinity.''} \\ \\ -\ex{(| \var{sre} {\ldots})} & Choice (\ex{or} is R5RS symbol; \\ -\ex{(or \var{sre} {\ldots})} & \ex{|} is not specified by R5RS.) 
\\ +\ex{(| \var{sre} {\ldots})} & Choice (\ex{or} is R5RS symbol; \\ +\ex{(or \var{sre} {\ldots})} & \ex{|} is not specified by R5RS.) \\ \\ -\ex{(: \var{sre} {\ldots})} & Sequence (\ex{seq} is legal \\ -\ex{(seq \var{sre} {\ldots})} & Common Lisp symbol) \\ +\ex{(: \var{sre} {\ldots})} & Sequence (\ex{seq} is legal \\ +\ex{(seq \var{sre} {\ldots})} & Common Lisp symbol) \\ \\ -\ex{(submatch \var{sre} {\ldots})} & Numbered submatch \\ +\ex{(submatch \var{sre} {\ldots})} & Numbered submatch \\ \\ -\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})} & Deleted submatches \\ +\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})} & Deleted submatches \\ \srecomment{\var{Pre} and \var{post} are numerals.} \\ \\ -\ex{(uncase \var{sre} {\ldots})} & Case-folded match \\ -\ex{(w/case \var{sre} {\ldots})} & Introduce a lexical case-sensitivity \\ -\ex{(w/nocase \var{sre} {\ldots})} & context. \\ +\ex{(uncase \var{sre} {\ldots})} & Case-folded match \\ +\ex{(w/case \var{sre} {\ldots})} & Introduce a lexical case-sensitivity \\ +\ex{(w/nocase \var{sre} {\ldots})} & context. 
\\ \\ -\ex{,@\var{exp}} & Dynamically computed regexp \\ -\ex{,\var{exp}} & Same as ,@\var{exp}, but no submatch info \\ +\ex{,@\var{exp}} & Dynamically computed regexp \\ +\ex{,\var{exp}} & Same as ,@\var{exp}, but no submatch info \\ \srecomment{\var{Exp} must produce a character, string, - char-set, or regexp.} \\ + char-set, or regexp.} \\ \\ -\ex{bos eos} & Beginning/end of string \\ +\ex{bos eos} & Beginning/end of string \\ \ex{bol eol} & Beginning/end of line \\ -\ex{bow eow} & Beginning/end of word \\ \end{tabular} \caption{SRE syntax summary (part 1)} \end{boxedfigure} \begin{boxedfigure}{tbhp} \begin{tabular}{lp{3in}} -\ex{(word \var{sre} {\ldots})} & (: bow \var{sre} {\ldots} eow) \\ -\ex{(word+ \var{cset-sre} {\ldots})} - & \cd{(word (+ (& (| alphanumeric "_")} \\ - & \cd{ (| \var{cset-sre} {\ldots}))))} \\ -\ex{word} & \ex{(word+ any)} \\ +\ex{(posix-string \var{string})} & Escape for Posix string notation \\ \\ -\ex{(posix-string \var{string})} & Escape for Posix string notation \\ -\\ -\ex{\var{char}} & Singleton char set \\ -\ex{\var{class-name}} & alphanumeric, whitespace, \etc \\ +\ex{\var{char}} & Singleton char set \\ +\ex{\var{class-name}} & alphanumeric, whitespace, \etc \\ \srecomment{These two forms are interpreted subject to the lexical case-sensitivity context.} \\ \\ -\cd{(~ \var{cset-sre} {\ldots})} & Complement-of-union (\cd{[^{\ldots}]}) \\ -\ex{(- \var{cset-sre} {\ldots})} & Difference \\ -\cd{(& \var{cset-sre} {\ldots})} & Intersection \\ +\cd{(~ \var{cset-sre} {\ldots})} & Complement-of-union (\cd{[^{\ldots}]}) \\ +\ex{(- \var{cset-sre} {\ldots})} & Difference \\ +\cd{(& \var{cset-sre} {\ldots})} & Intersection \\ \\ -\ex{(/ \var{range-spec} {\ldots})} & Character range---interpreted +\ex{(/ \var{range-spec} {\ldots})} & Character range---interpreted subject to the lexical case-sensitivy context \\ \end{tabular} @@ -167,19 +158,19 @@ the next section is a friendlier tutorial introduction. 
{\tt \begin{tabular}{l@{\quad\texttt{|}\quad}ll} \multicolumn{1}{l}{\var{class-name}\quad ::=\quad} & any \\ - & nonl \\ - & lower-case & | lower \\ - & upper-case & | upper \\ - & alphabetic & | alpha \\ - & numeric & | digit | num \\ - & alphanumeric & | alnum \\ - & punctuation & | punct \\ - & graphic & | graph \\ - & whitespace & | space | white \\ - & printing & | print \\ - & control & | cntrl \\ - & hex-digit & | xdigit | hex \\ - & ascii + & nonl \\ + & lower-case & | lower \\ + & upper-case & | upper \\ + & alphabetic & | alpha \\ + & numeric & | digit | num \\ + & alphanumeric & | alnum \\ + & punctuation & | punct \\ + & graphic & | graph \\ + & whitespace & | space | white \\ + & printing & | print \\ + & control & | cntrl \\ + & hex-digit & | xdigit | hex \\ + & ascii \end{tabular} \\[2ex] \ex{\var{range-spec} ::= \var{string} | \var{char}} \\ @@ -197,22 +188,22 @@ The chars are taken in pairs to form inclusive ranges. | (& ...) Intersection | (| ...) Set union | (/ ...) Range - + | () Constant set | Singleton constant set | For 1-char string "c" - + | Constant set - + | , evals to a char-set, | ,@ char, single-char string, or re-char-set regexp. - + | (uncase ) Case-folding | (w/case ) | (w/nocase ) \end{verbatim} -\caption{%The \cd{~}, \cd{-}, \cd{&}, and \cd{word+} operators may only be +\caption{%The \cd{~}, \cd{-}, and \cd{&} operators may only be applied to SRE's that specify character sets. These are the ``type-checking'' rules for character-set SRE's.} \end{boxedfigure} @@ -352,12 +343,12 @@ of SRE repetition forms: \begin{inset} \begin{tabular}{llrr} SRE & means & at least & no more than \\ \hline -\ex{(* \var{sre} \ldots)} &zero-or-more &0 &infinity \\ -\ex{(+ \var{sre} \ldots)} &one-or-more &1 &infinity \\ -\ex{(? 
\var{sre} \ldots)} &zero-or-one &0 &1 \\ -\ex{(= \var{from} \var{sre} \ldots)} &exactly-n &\var{from} &\var{from} \\ -\ex{(>= \var{from} \var{sre} \ldots)} &n-or-more &\var{from} &infinity \\ -\ex{(** \var{from} \var{to} \var{sre} \ldots)} &n-to-m &\var{from} &\var{to} +\ex{(* \var{sre} \ldots)} &zero-or-more &0 &infinity \\ +\ex{(+ \var{sre} \ldots)} &one-or-more &1 &infinity \\ +\ex{(? \var{sre} \ldots)} &zero-or-one &0 &1 \\ +\ex{(= \var{from} \var{sre} \ldots)} &exactly-n &\var{from} &\var{from} \\ +\ex{(>= \var{from} \var{sre} \ldots)} &n-or-more &\var{from} &infinity \\ +\ex{(** \var{from} \var{to} \var{sre} \ldots)} &n-to-m &\var{from} &\var{to} \end{tabular} \end{inset} @@ -381,8 +372,8 @@ We can limit the a/d chains to 4 characters or less with the SRE Some boundary cases: \begin{code} - (** 5 2 "foo") ; Will never match - (** 0 0 "foo") ; Matches the empty string\end{code} + (** 5 2 "foo") ; Will never match + (** 0 0 "foo") ; Matches the empty string\end{code} \paragraph{Character classes} @@ -450,20 +441,20 @@ There are also predefined named char classes for the standard Posix and Gnu character classes: \begin{inset} \begin{tabular}{llll} -scsh name & Posix/ctype & Alternate name & Comment \\ \hline -\ex{lower-case} & \ex{lower} \\ -\ex{upper-case} & \ex{upper} \\ -\ex{alphabetic} & \ex{alpha} \\ -\ex{numeric} & \ex{digit} & \ex{num} \\ -\ex{alphanumeric} & \ex{alnum} & \ex{alphanum} \\ -\ex{punctuation} & \ex{punct} \\ -\ex{graphic} & \ex{graph} \\ -\ex{blank} & (Gnu extension) \\ -\ex{whitespace} & \ex{space} & \ex{white} & {``\ex{space}'' is deprecated.}\\ -\ex{printing} & \ex{print} \\ -\ex{control} & \ex{cntrl} \\ -\ex{hex-digit} & \ex{xdigit} & \ex{hex} \\ -\ex{ascii} & (Gnu extension) \\ +scsh name & Posix/ctype & Alternate name & Comment \\ \hline +\ex{lower-case} & \ex{lower} \\ +\ex{upper-case} & \ex{upper} \\ +\ex{alphabetic} & \ex{alpha} \\ +\ex{numeric} & \ex{digit} & \ex{num} \\ +\ex{alphanumeric} & \ex{alnum} & \ex{alphanum} \\ 
+\ex{punctuation} & \ex{punct} \\ +\ex{graphic} & \ex{graph} \\ +\ex{blank} & (Gnu extension) \\ +\ex{whitespace} & \ex{space} & \ex{white} & {``\ex{space}'' is deprecated.}\\ +\ex{printing} & \ex{print} \\ +\ex{control} & \ex{cntrl} \\ +\ex{hex-digit} & \ex{xdigit} & \ex{hex} \\ +\ex{ascii} & (Gnu extension) \\ \end{tabular} \end{inset} See the scsh character-set documentation or the Posix isalpha(3) man page @@ -705,10 +696,10 @@ to produce a certain number of submatches---if that is part of \var{exp}'s ``contract.'' -\paragraph{String, line, and word units} +\paragraph{String and line units} -The regexps \ex{bos} and \ex{eos} match the empty string at the beginning and -end of the string, respectively. +The regexps \ex{bos} and \ex{eos} match the empty string at the +beginning and end of the string, respectively. The regexps \ex{bol} and \ex{eol} match the empty string at the beginning and end of a line, respectively. A line begins at the beginning of the string, and @@ -717,32 +708,6 @@ just before every newline character. The char class \ex{nonl} matches any character except newline, and is useful in conjunction with line-based pattern matching. -The regexps \ex{bow} and \ex{eow} match the empty string at the beginning and -end of a word, respectively. A word is a contiguous sequence of characters -that are either alphanumeric or the underscore character. - -The regexp \ex{(word \var{sre} \ldots)} surrounds the sequence -\ex{(: \var{sre} \ldots)}with bow/eow delimiters. It is equivalent to -\begin{code} -(: bow \var{sre} \ldots eow)\end{code}% -% - -The regexp \ex{(word+ \var{cset-sre} \ldots)} matches a word whose body is -one or more word characters matched by the char-set sre \var{cset-sre}. 
-It is equivalent to -\begin{code} -(word (+ (& (| alphanumeric "_") - (| \var{cset-sre} \ldots))))\end{code}% -% -For example, a word not containing x, y, or z is -\begin{code} -(word+ (~ ("xyz")))\end{code}% -% -The regexp \ex{word} matches one word; it is equivalent to -\begin{code} -(word+ any) -\end{code}% - \note{\ex{bol} and \ex{eol} are not supported by scsh's current regexp search engine, which is Spencer's Posix matcher. This is the only element of the notation that is not supported by the current scsh @@ -829,7 +794,7 @@ submatches before the body, and \var{post} deleted submatches after the body. If the body \var{(: \var{sre} \ldots)} itself has \var{body-sm} submatches, then the total number of submatches for the DSM form is - $$\var{pre} + \var{body-sm} + \var{post}.$$ + $$\var{pre} + \var{body-sm} + \var{post}.$$ These extra, deleted submatches are never assigned string indices in any match values produced when matching the regexp against a string. @@ -870,7 +835,7 @@ There are two places where one can embed run-time computations in an SRE: \begin{itemize} \item The \var{from} or \var{to} repetition counts of - \ex{**}, \ex{=}, and \ex{>=} forms; + \ex{**}, \ex{=}, and \ex{>=} forms; \item \ex{,\var{exp}} and \ex{,@\var{exp}} forms. \end{itemize} @@ -933,8 +898,8 @@ should not be used in new code. \ex{regexp/bos-not-bol} means the beginning of the string isn't a line-begin. \ex{regexp/eos-not-eol} is analogous. \note{They're currently ignored because - begining/end-of-line anchors aren't supported by the current - implementation.} + beginning/end-of-line anchors aren't supported by the current + implementation.} Use \ex{regexp-search?} when you don't need submatch information, as it has the potential to be \emph{significantly} faster on @@ -983,7 +948,7 @@ the port: is written to the port. \item If an item is \ex{'pre}, the prefix of the matched string (the text preceding the match) - is written to the port. + is written to the port. 
\item If an item is \ex{'post}, the suffix of the matched string is written. \end{itemize} @@ -1009,15 +974,15 @@ It has the following differences with \ex{regexp-substitute}: \item It takes a regular expression and string to be matched as parameters, instead of a completed match structure. \item If the regular expression doesn't match the string, this - procedure is the identity transform---it returns or outputs the - string. + procedure is the identity transform---it returns or outputs the + string. \item If an item is \ex{'post}, the procedure recurses on the suffix string (the text from \var{string} following the match). - Including a \ex{'post} in the list of items is how one gets multiple - match/substitution operations. + Including a \ex{'post} in the list of items is how one gets multiple + match/substitution operations. \item If an item is a procedure, it is applied to the match structure for - a given match. - The procedure returns a string to be used in the result. + a given match. + The procedure returns a string to be used in the result. \end{itemize} The \var{regexp} parameter can be either a compiled regular expression or a string specifying a regular expression. @@ -1262,9 +1227,6 @@ Note:\begin{itemize} \item The string parser doesn't handle the exotica of character class names such as \verb|[[:alnum:]]|; the current implementation was written in in three hours. - -\item The unparser produces Spencer-specific strings for bow/eow - elements; otherwise, it's Posix all the way. \end{itemize} \end{desc} @@ -1327,18 +1289,14 @@ contained in the regular expression. \defvarx{re-eos}{regexp} \defvarx{re-bol}{regexp} \defvarx{re-eol}{regexp} -\defvarx{re-bow}{regexp} -\defvarx{re-eow}{regexp} \begin{desc} These variables are bound to the primitive anchor regexps. 
\end{desc} \defun {re-bos?}{\object}{\boolean} -\defunx{re-eos?}{\object}{\boolean} +\defunx{re-eos?}{\object}{\boolean} \defunx{re-bol?}{\object}{\boolean} \defunx{re-eol?}{\object}{\boolean} -\defunx{re-bow?}{\object}{\boolean} -\defunx{re-eow?}{\object}{\boolean} \begin{desc} These predicates recognise the associated primitive anchor regexp. \end{desc} @@ -1378,15 +1336,11 @@ regexps built using other constructors may or may not produce a true value. % These are non-primitive predefined regexps of general utility. -\defvar {re-nonl}{regexp} -\defvarx{re-word}{regexp} +\defvar {re-nonl}{regexp} \begin{desc} The variable \ex{re-nonl} is bound to a regular expression that matches any non-newline character (corresponding to the SRE \verb|(~ #\newline)|). - -Similarly, \ex{re-word} is bound to a regular expression -that matches any word (corresponding to the SRE \ex{word}). \end{desc} \defun{regexp?}{\object}{\boolean}