Remove documentation for BOW, EOW, WORD, and WORD+, as they have no

POSIX counterparts, and their meaning is locale-dependent.
2002-02-16 16:52:27 +00:00 · 2002-02-16 16:52:27 +00:00 · 357afa99ae
parent 00bba17d56
commit 357afa99ae
1 changed files with 81 additions and 127 deletions
--- a/doc/scsh-manual/sre.tex
+++ b/doc/scsh-manual/sre.tex
@ -72,8 +72,6 @@ providing:
 \item repetition (\ex{*}, \ex{+}, \ex{?}, \ex{\{$m$,$n$\}})
 \item character classes (\eg, \ex{[aeiou]}) and wildcard (\ex{.})
 \item beginning/end of string anchors (\verb|^|, \verb|$|)
-\item beginning/end of line anchors
-\item beginning/end of word anchors
 \item case-sensitivity control
 \item submatch-marking
 \end{itemize}
@ -100,63 +98,56 @@ the next section is a friendlier tutorial introduction.
    case-sensitivity lexical context. \\
 \\
 \ex{(* \var{sre} {\ldots})} & 0 or more matches \\
-\ex{(+ \var{sre} {\ldots})}	& 1 or more matches \\
-\ex{(? \var{sre} {\ldots})}	& 0 or 1 matches \\
-\ex{(= \var{n} \var{sre} {\ldots})}	& \var{n} matches \\
-\ex{(>= \var{n} \var{sre} {\ldots})}	& \var{n} or more matches \\
-\ex{(** \var{n} \var{m} \var{sre} {\ldots})}	& \var{n} to \var{m} matches \\
+\ex{(+ \var{sre} {\ldots})}     & 1 or more matches \\
+\ex{(? \var{sre} {\ldots})}     & 0 or 1 matches \\
+\ex{(= \var{n} \var{sre} {\ldots})}     & \var{n} matches \\
+\ex{(>= \var{n} \var{sre} {\ldots})}    & \var{n} or more matches \\
+\ex{(** \var{n} \var{m} \var{sre} {\ldots})}    & \var{n} to \var{m} matches \\
 \srecomment{
    \var{N} and \var{m} are Scheme expressions producing non-negative
    integers. \\
    \var{M} may also be \ex{\#f}, meaning ``infinity.''} \\
 \\
-\ex{(| \var{sre} {\ldots})} 	& Choice (\ex{or} is R5RS symbol; \\
-\ex{(or \var{sre} {\ldots})}	& \ex{|} is not specified by R5RS.) \\
+\ex{(| \var{sre} {\ldots})}     & Choice (\ex{or} is R5RS symbol; \\
+\ex{(or \var{sre} {\ldots})}    & \ex{|} is not specified by R5RS.) \\
 \\
-\ex{(:   \var{sre} {\ldots})}	& Sequence (\ex{seq} is legal \\
-\ex{(seq \var{sre} {\ldots})}	& Common Lisp symbol) \\
+\ex{(:   \var{sre} {\ldots})}   & Sequence (\ex{seq} is legal \\
+\ex{(seq \var{sre} {\ldots})}   & Common Lisp symbol) \\
 \\
-\ex{(submatch \var{sre} {\ldots})}	& Numbered submatch \\
+\ex{(submatch \var{sre} {\ldots})}      & Numbered submatch \\
 \\
-\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})}	& Deleted submatches \\
+\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})}      & Deleted submatches \\
    \srecomment{\var{Pre} and \var{post} are numerals.} \\
 \\
-\ex{(uncase \var{sre} {\ldots})}	& Case-folded match \\
-\ex{(w/case   \var{sre} {\ldots})}	& Introduce a lexical case-sensitivity \\
-\ex{(w/nocase \var{sre} {\ldots})}	& context. \\
+\ex{(uncase \var{sre} {\ldots})}        & Case-folded match \\
+\ex{(w/case   \var{sre} {\ldots})}      & Introduce a lexical case-sensitivity \\
+\ex{(w/nocase \var{sre} {\ldots})}      & context. \\
 \\
-\ex{,@\var{exp}}	& Dynamically computed regexp \\
-\ex{,\var{exp}}		& Same as ,@\var{exp}, but no submatch info \\
+\ex{,@\var{exp}}        & Dynamically computed regexp \\
+\ex{,\var{exp}}         & Same as ,@\var{exp}, but no submatch info \\
    \srecomment{\var{Exp} must produce a character, string, 
-		char-set, or regexp.} \\
+                char-set, or regexp.} \\
 \\
-\ex{bos eos}	& Beginning/end of string \\
+\ex{bos eos}    & Beginning/end of string \\
 \ex{bol eol}    & Beginning/end of line \\
-\ex{bow eow}	& Beginning/end of word \\
 \end{tabular}
 \caption{SRE syntax summary (part 1)}
 \end{boxedfigure}

 \begin{boxedfigure}{tbhp}
 \begin{tabular}{lp{3in}}
-\ex{(word  \var{sre} {\ldots})}	& (: bow \var{sre} {\ldots} eow) \\
-\ex{(word+ \var{cset-sre} {\ldots})}
-               & \cd{(word (+ (& (| alphanumeric "_")} \\
-               & \cd{            (| \var{cset-sre} {\ldots}))))} \\
-\ex{word}	& \ex{(word+ any)} \\
+\ex{(posix-string \var{string})}        & Escape for Posix string notation \\
 \\
-\ex{(posix-string \var{string})}	& Escape for Posix string notation \\
-\\
-\ex{\var{char}}		& Singleton char set \\
-\ex{\var{class-name}}	& alphanumeric, whitespace, \etc \\
+\ex{\var{char}}         & Singleton char set \\
+\ex{\var{class-name}}   & alphanumeric, whitespace, \etc \\
    \srecomment{These two forms are interpreted subject to
                the lexical case-sensitivity context.} \\
 \\
-\cd{(~ \var{cset-sre} {\ldots})}	& Complement-of-union (\cd{[^{\ldots}]}) \\
-\ex{(- \var{cset-sre} {\ldots})}	& Difference \\
-\cd{(& \var{cset-sre} {\ldots})}	& Intersection \\
+\cd{(~ \var{cset-sre} {\ldots})}        & Complement-of-union (\cd{[^{\ldots}]}) \\
+\ex{(- \var{cset-sre} {\ldots})}        & Difference \\
+\cd{(& \var{cset-sre} {\ldots})}        & Intersection \\
 \\
-\ex{(/ \var{range-spec} {\ldots})}	& Character range---interpreted
+\ex{(/ \var{range-spec} {\ldots})}      & Character range---interpreted
                                subject to 
                                 the lexical case-sensitivity context \\
 \end{tabular}
@ -167,19 +158,19 @@ the next section is a friendlier tutorial introduction.
 {\tt
 \begin{tabular}{l@{\quad\texttt{|}\quad}ll}
 \multicolumn{1}{l}{\var{class-name}\quad ::=\quad} & any \\
-                     & nonl		\\
-		     & lower-case	& | lower \\
-		     & upper-case	& | upper \\
-		     & alphabetic	& | alpha \\
-		     & numeric		& | digit | num \\
-		     & alphanumeric	& | alnum \\
-		     & punctuation	& | punct \\
-		     & graphic		& | graph \\
-		     & whitespace	& | space | white \\
-		     & printing		& | print \\
-		     & control		& | cntrl \\
-		     & hex-digit	& | xdigit | hex \\
-		     & ascii
+                     & nonl             \\
+                     & lower-case       & | lower \\
+                     & upper-case       & | upper \\
+                     & alphabetic       & | alpha \\
+                     & numeric          & | digit | num \\
+                     & alphanumeric     & | alnum \\
+                     & punctuation      & | punct \\
+                     & graphic          & | graph \\
+                     & whitespace       & | space | white \\
+                     & printing         & | print \\
+                     & control          & | cntrl \\
+                     & hex-digit        & | xdigit | hex \\
+                     & ascii
 \end{tabular}
 \\[2ex]
 \ex{\var{range-spec} ::= \var{string} | \var{char}} \\
@ -197,22 +188,22 @@ The chars are taken in pairs to form inclusive ranges.
             | (& <cset-sre> ...)    Intersection
             | (| <cset-sre> ...)    Set union
             | (/ <range-spec> ...)  Range
-				     
+                                     
             | (<string>)            Constant set
             | <char>                Singleton constant set
             | <string>              For 1-char string "c"
-				     
+                                     
             | <class-name>          Constant set
-				     
+                                     
             | ,<exp>                <exp> evals to a char-set,
             | ,@<exp>               char, single-char string,
                                     or re-char-set regexp.
-				     
+                                     
             | (uncase <cset-sre>)   Case-folding
             | (w/case <cset-sre>)              
             | (w/nocase <cset-sre>)            
 \end{verbatim}
-\caption{%The \cd{~}, \cd{-}, \cd{&}, and \cd{word+} operators may only be
+\caption{%The \cd{~}, \cd{-}, and \cd{&} operators may only be
         applied to SRE's that specify character sets. 
         These are the ``type-checking'' rules for character-set SRE's.}
 \end{boxedfigure}
@ -352,12 +343,12 @@ of SRE repetition forms:
 \begin{inset}
 \begin{tabular}{llrr}
 SRE & means & at least & no more than \\ \hline
-\ex{(* \var{sre} \ldots)}			&zero-or-more	&0	&infinity \\
-\ex{(+ \var{sre} \ldots)}			&one-or-more	&1	&infinity \\
-\ex{(? \var{sre} \ldots)}			&zero-or-one	&0	&1 \\
-\ex{(= \var{from} \var{sre} \ldots)}		&exactly-n	&\var{from}  &\var{from} \\
-\ex{(>= \var{from} \var{sre} \ldots)}		&n-or-more	&\var{from}  &infinity \\
-\ex{(** \var{from} \var{to} \var{sre} \ldots)}	&n-to-m		&\var{from}  &\var{to}
+\ex{(* \var{sre} \ldots)}                       &zero-or-more   &0      &infinity \\
+\ex{(+ \var{sre} \ldots)}                       &one-or-more    &1      &infinity \\
+\ex{(? \var{sre} \ldots)}                       &zero-or-one    &0      &1 \\
+\ex{(= \var{from} \var{sre} \ldots)}            &exactly-n      &\var{from}  &\var{from} \\
+\ex{(>= \var{from} \var{sre} \ldots)}           &n-or-more      &\var{from}  &infinity \\
+\ex{(** \var{from} \var{to} \var{sre} \ldots)}  &n-to-m         &\var{from}  &\var{to}
 \end{tabular}
 \end{inset}

@ -381,8 +372,8 @@ We can limit the a/d chains to 4 characters or less with the SRE

 Some boundary cases:
 \begin{code}
-    (** 5 2 "foo")	; Will never match
-    (** 0 0 "foo")	; Matches the empty string\end{code}
+    (** 5 2 "foo")      ; Will never match
+    (** 0 0 "foo")      ; Matches the empty string\end{code}

 \paragraph{Character classes}

@ -450,20 +441,20 @@ There are also predefined named char classes for the standard Posix and Gnu
 character classes:
 \begin{inset}
 \begin{tabular}{llll}
-scsh name	& Posix/ctype	& Alternate name 	& Comment \\ \hline
-\ex{lower-case} & \ex{lower}	\\
-\ex{upper-case} & \ex{upper}	\\
-\ex{alphabetic} & \ex{alpha}	\\
-\ex{numeric}	& \ex{digit}	& \ex{num}	\\
-\ex{alphanumeric} & \ex{alnum}	& \ex{alphanum}	\\
-\ex{punctuation} & \ex{punct}	\\
-\ex{graphic} 	& \ex{graph}	\\
-\ex{blank}	& (Gnu extension)	\\
-\ex{whitespace}	& \ex{space}	& \ex{white} & {``\ex{space}'' is deprecated.}\\
-\ex{printing}	& \ex{print}	\\
-\ex{control}	& \ex{cntrl}	\\
-\ex{hex-digit}	& \ex{xdigit}	& \ex{hex}	\\
-\ex{ascii} 	& (Gnu extension)	\\
+scsh name       & Posix/ctype   & Alternate name        & Comment \\ \hline
+\ex{lower-case} & \ex{lower}    \\
+\ex{upper-case} & \ex{upper}    \\
+\ex{alphabetic} & \ex{alpha}    \\
+\ex{numeric}    & \ex{digit}    & \ex{num}      \\
+\ex{alphanumeric} & \ex{alnum}  & \ex{alphanum} \\
+\ex{punctuation} & \ex{punct}   \\
+\ex{graphic}    & \ex{graph}    \\
+\ex{blank}      & (Gnu extension)       \\
+\ex{whitespace} & \ex{space}    & \ex{white} & {``\ex{space}'' is deprecated.}\\
+\ex{printing}   & \ex{print}    \\
+\ex{control}    & \ex{cntrl}    \\
+\ex{hex-digit}  & \ex{xdigit}   & \ex{hex}      \\
+\ex{ascii}      & (Gnu extension)       \\
 \end{tabular}
 \end{inset}
 See the scsh character-set documentation or the Posix isalpha(3) man page
@ -705,10 +696,10 @@ to produce a certain number of submatches---if that is part of \var{exp}'s
 ``contract.''


-\paragraph{String, line, and word units}
+\paragraph{String and line units}

-The regexps \ex{bos} and \ex{eos} match the empty string at the beginning and
-end of the string, respectively.
+The regexps \ex{bos} and \ex{eos} match the empty string at the
+beginning and end of the string, respectively.

 The regexps \ex{bol} and \ex{eol} match the empty string at the beginning and
 end of a line, respectively. A line begins at the beginning of the string, and
@ -717,32 +708,6 @@ just before every newline character. The char class \ex{nonl} matches any
 character except newline, and is useful in conjunction with line-based pattern
 matching.

-The regexps \ex{bow} and \ex{eow} match the empty string at the beginning and
-end of a word, respectively. A word is a contiguous sequence of characters
-that are either alphanumeric or the underscore character.
-
-The regexp \ex{(word \var{sre} \ldots)} surrounds the sequence 
-\ex{(: \var{sre} \ldots)}with bow/eow delimiters. It is equivalent to
-\begin{code}
-(: bow \var{sre} \ldots eow)\end{code}%
-%
-
-The regexp \ex{(word+ \var{cset-sre} \ldots)} matches a word whose body is 
-one or more word characters matched by the char-set sre \var{cset-sre}.
-It is equivalent to
-\begin{code}
-(word (+ (& (| alphanumeric "_")
-            (| \var{cset-sre} \ldots))))\end{code}%
-%
-For example, a word not containing x, y, or z is
-\begin{code}
-(word+ (~ ("xyz")))\end{code}%
-%
-The regexp \ex{word} matches one word; it is equivalent to 
-\begin{code}
-(word+ any)
-\end{code}%
-
 \note{\ex{bol} and \ex{eol} are not supported by scsh's current 
      regexp search engine, which is Spencer's Posix matcher. This is the only
      element of the notation that is not supported by the current scsh
@ -829,7 +794,7 @@ submatches before the body, and \var{post} deleted submatches after the
 body. 
 If the body \ex{(: \var{sre} \ldots)} itself has \var{body-sm} submatches,
 then the total number of submatches for the DSM form is
-		  $$\var{pre} + \var{body-sm} + \var{post}.$$
+                  $$\var{pre} + \var{body-sm} + \var{post}.$$
 These extra, deleted submatches are never assigned string indices in any
 match values produced when matching the regexp against a string.

@ -870,7 +835,7 @@ There are two places where one can
 embed run-time computations in an SRE:
 \begin{itemize}
    \item The \var{from} or \var{to} repetition counts of 
-	\ex{**}, \ex{=}, and \ex{>=} forms;
+        \ex{**}, \ex{=}, and \ex{>=} forms;
    \item \ex{,\var{exp}} and \ex{,@\var{exp}} forms.
 \end{itemize}

@ -933,8 +898,8 @@ should not be used in new code.
    \ex{regexp/bos-not-bol} means the beginning of the string isn't a
    line-begin. \ex{regexp/eos-not-eol} is analogous. 
    \note{They're currently ignored because
-    	  begining/end-of-line anchors aren't supported by the current
-	  implementation.}
+          beginning/end-of-line anchors aren't supported by the current
+          implementation.}

    Use \ex{regexp-search?} when you don't need submatch information, as
    it has the potential to be \emph{significantly} faster on 
@ -983,7 +948,7 @@ the port:
      is written to the port.
    \item If an item is \ex{'pre}, 
          the prefix of the matched string (the text preceding the match) 
-	  is written to the port.
+          is written to the port.
    \item If an item is \ex{'post}, 
          the suffix of the matched string is written.
 \end{itemize}
@ -1009,15 +974,15 @@ It has the following differences with \ex{regexp-substitute}:
  \item It takes a regular expression and string to be matched as
        parameters, instead of a completed match structure.
  \item If the regular expression doesn't match the string, this
-	procedure is the identity transform---it returns or outputs the
-	string.
+        procedure is the identity transform---it returns or outputs the
+        string.
  \item If an item is \ex{'post}, the procedure recurses on the suffix string
        (the text from \var{string} following the match). 
-	Including a \ex{'post} in the list of items is how one gets multiple 
-	match/substitution operations.
+        Including a \ex{'post} in the list of items is how one gets multiple 
+        match/substitution operations.
  \item If an item is a procedure, it is applied to the match structure for
-	a given match.
-	The procedure returns a string to be used in the result.
+        a given match.
+        The procedure returns a string to be used in the result.
  \end{itemize}
 The \var{regexp} parameter can be either a compiled regular expression or
 a string specifying a regular expression.
@ -1262,9 +1227,6 @@ Note:\begin{itemize}
 \item The string parser doesn't handle the exotica of character class
      names such as \verb|[[:alnum:]]|; the current implementation was written
      in three hours.
-
-\item The unparser produces Spencer-specific strings for bow/eow
-      elements; otherwise, it's Posix all the way.
 \end{itemize}
 \end{desc}

@ -1327,18 +1289,14 @@ contained in the regular expression.
 \defvarx{re-eos}{regexp}
 \defvarx{re-bol}{regexp}
 \defvarx{re-eol}{regexp}
-\defvarx{re-bow}{regexp}
-\defvarx{re-eow}{regexp}
 \begin{desc}
 These variables are bound to the primitive anchor regexps.
 \end{desc}

 \defun {re-bos?}{\object}{\boolean}
-\defunx{re-eos?}{\object}{\boolean}				
+\defunx{re-eos?}{\object}{\boolean}                             
 \defunx{re-bol?}{\object}{\boolean}
 \defunx{re-eol?}{\object}{\boolean}
-\defunx{re-bow?}{\object}{\boolean}
-\defunx{re-eow?}{\object}{\boolean}
 \begin{desc}
 These predicates recognise the associated primitive anchor regexp.
 \end{desc}
@ -1378,15 +1336,11 @@ regexps built using other constructors may or may not produce a true value.

 %    These are non-primitive predefined regexps of general utility.

-\defvar {re-nonl}{regexp}
-\defvarx{re-word}{regexp}
+\defvar {re-nonl}{regexp}
 \begin{desc}
 The variable \ex{re-nonl} is bound to a regular expression
 that matches any non-newline character
 (corresponding to the SRE \verb|(~ #\newline)|).
-
-Similarly, \ex{re-word} is bound to a regular expression
-that matches any word (corresponding to the SRE \ex{word}).
 \end{desc}

 \defun{regexp?}{\object}{\boolean}