Added a bunch of new char-set stuff.

1998-06-16 21:19:32 +00:00 · 1998-06-16 21:19:32 +00:00 · 54ca902c67
parent ec63c53ee6
commit 54ca902c67
1 changed files with 130 additions and 19 deletions
--- a/doc/scsh-manual/strings.tex
+++ b/doc/scsh-manual/strings.tex
@@ -576,12 +576,79 @@ is also frequently useful for expanding file-names.
 Scsh provides a \ex{char-set} type for expressing sets of characters.
 These sets are used by some of the delimited-input procedures 
 (section~\ref{sec:field-reader}).
-The character set package that scsh uses was taken from Project Mac's 
-MIT Scheme.
+Scsh's character set package was adapted and extended from
+Project Mac's MIT Scheme package.
+Note that the character type used in the current implementation corresponds
+to the ASCII character set---but you would be wise not to build this
+assumption into your code if you can help it.\footnote{
+	Actually, it's slightly uglier than that, albeit somewhat more
+	useful. The current character type corresponds to an eight-bit
+	superset of ASCII. The \ex{ascii->char} and \ex{char->ascii}
+ 	functions will preserve this eighth bit. However, none of the
+	high 128 characters appear in any of the standard character
+	sets defined in section~\ref{sec:std-csets}, except for 
+	\ex{char-set:full}. If someone would email the authors a listing
+	of the full Latin-1 definition,	we'll be happy to upgrade these
+	sets' definitions to make them Latin-1 compliant.}

 \defun{char-set?}{x}\boolean
 \begin{desc}
-Returns true if the object \var{x} is a character set.
+Is the object \var{x} a character set?
+\end{desc}
+
+\defun{char-set=}{cs1 cs2}\boolean
+\begin{desc}
+Are the character sets \var{cs1} and \var{cs2} equal?
+\end{desc}
+
+\defun{char-set<=}{cs1 cs2}\boolean
+\begin{desc}
+Returns true if character set \var{cs1} is a subset of character set \var{cs2}.
+\end{desc}
+
+\defun{reduce-char-set}{kons knil cs}\object
+\begin{desc}
+This is the fundamental iterator for character sets.
+Reduces the function \var{kons} across the character set \var{cs} using
+initial state value \var{knil}. 
+That is, if \var{cs} is the empty set, the procedure returns \var{knil}.
+Otherwise, some element \var{c} of \var{cs} is chosen; let \var{cs'} be
+the remaining, unchosen characters.
+The procedure returns 
+\begin{tightcode}
+(reduce-char-set \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode}
+For example, we could define \ex{char-set-members} (see below)
+as
+\begin{tightcode}
+(lambda (cs) (reduce-char-set cons '() cs))\end{tightcode}
+\end{desc}
+
+\subsection{Side effects}
+\defun{set-char-set!}{cs char in?}{\undefined}
+\begin{desc}
+This side-effects character set \var{cs}. 
+If \var{in?} is true, character \var{char} is added to the set.
+Otherwise, it is deleted from the set.
+
+Use of this procedure is deprecated, since it could damage other procedures
+that retain pointers to existing character sets.
+You should use \ex{set-char-set!} only in contexts where it is guaranteed that
+there are no other pointers to the character set being modified.
+(For example, functions that create character sets can use this function
+to efficiently construct the character set, after which time the set is
+used in a pure-functional, shared manner.)
+\end{desc}
+
+\defun{char-set-for-each}{p cs}{\undefined}
+\begin{desc}
+Apply procedure \var{p} to each character in the character set \var{cs}.
+Note that the order in which \var{p} is applied to the characters in the
+set is not specified, and may even change from application to application.
+\end{desc}
+
+\defun{copy-char-set}{cs}{char-set}
+\begin{desc}
+Returns a copy of the character set \var{cs}.
 \end{desc}

 \subsection{Creating character sets}
@@ -634,46 +701,90 @@ of R4RS procedures.
 We sought to escape the dilemma by shifting to a new name.}
 \end{desc}

+\defun{char-set-size}{cs}\integer
+\begin{desc}
+Returns the number of elements in character set \var{cs}.
+\end{desc}
+
 \subsection{Character set algebra}
 \defun {char-set-invert}{char-set}{char-set}
-\defunx{char-set-union}{\vari{char-set}1 \vari{char-set}2}{char-set}
-\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2}{char-set}
-\defunx{char-set-difference}{\vari{char-set}1 \vari{char-set}2}{char-set}
+\defunx{char-set-union}{\vari{char-set}1\ldots}{char-set}
+\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
+\defunx{char-set-difference}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
 \begin{desc}
 These procedures implement set complement, union, intersection, and difference
 for character sets.
+The union, intersection, and difference operations are n-ary, associating
+to the left; the difference function requires at least one argument, while
+union and intersection may be applied to zero arguments.
 \end{desc}

 \subsection{Standard character sets}
+\label{sec:std-csets}
 Several character sets are predefined for convenience:

 \begin{center}
 \newcommand{\entry}[1]{\ex{#1}\index{#1}}
 \begin{tabular}{|ll|}
 \hline
-\entry{char-set:upper-case}	&	A--Z \\
-\entry{char-set:lower-case}	&	a--z \\
-\entry{char-set:numeric}	&	0--9 \\
-\entry{char-set:whitespace}	&	space, newline, tab, linefeed, page,
-					return \\
-\entry{char-set:not-whitespace}	&	Complement of \ex{char-set:whitespace} \\
-\entry{char-set:alphabetic}	&	A--Z and a--z \\
+\entry{char-set:alphabetic}	&	Alphabetic chars \\
+\entry{char-set:lower-case}	&	Lower-case alphabetic chars \\
+\entry{char-set:upper-case}	&	Upper-case alphabetic chars \\
+\entry{char-set:numeric}	&	Decimal digits: 0--9 \\
 \entry{char-set:alphanumeric}	&	Alphabetic or numeric \\
-\entry{char-set:graphic}	&	Printing characters and space \\
+\entry{char-set:graphic}	&	Printing characters except space \\
+\entry{char-set:printing}	&	Printing characters including space \\
+\entry{char-set:whitespace}	&	Whitespace characters \\
+\entry{char-set:blank}		&	Blank characters \\
+\entry{char-set:control}	&	Control characters \\
+\entry{char-set:punctuation}	&	Punctuation characters \\
+\entry{char-set:hex-digit}	&	Hexadecimal digits: 0--9, A--F, a--f \\
+\entry{char-set:ascii}		&	Characters in the ASCII set \\
+\entry{char-set:empty}		&	Empty set \\
+\entry{char-set:full}		&	All characters \\
+\hline
+\end{tabular}
+\end{center}
+The first twelve of these correspond to the character classes defined in
+Posix.
+Note that there may be characters in \ex{char-set:alphabetic} that are
+neither upper nor lower case---this might occur in implementations that
+use a character type richer than ASCII, such as Unicode.
+A ``graphic character'' is one that would put ink on your page.
+While the exact composition of these sets may vary depending upon the
+character type provided by the Scheme system upon which scsh is running,
+here are the definitions for some of the sets in an ASCII character set:
+\begin{center}
+\newcommand{\entry}[1]{\ex{#1}\index{#1}}
+\begin{tabular}{|ll|}
+\hline
+\ex{char-set:alphabetic}	&	A--Z and a--z \\
+\ex{char-set:lower-case}	&	a--z \\
+\ex{char-set:upper-case}	&	A--Z \\
+\ex{char-set:graphic}	&	Alphanumeric + punctuation \\
+\ex{char-set:whitespace}	&	Space, newline, tab, page, 
+					vertical tab, carriage return \\
+\ex{char-set:blank}		&	Space and tab \\
+\ex{char-set:control}	&	ASCII 0--31 and 127 \\
+\ex{char-set:punctuation}	&	\verb|!"#$%&'()*+,-./:;<=>|\verb#?@[\]^_`{|}~# \\
 \hline
 \end{tabular}
 \end{center}


-\defun {char-upper-case?}\character\boolean
+\defun {char-alphabetic?}\character\boolean
 \defunx{char-lower-case?}\character\boolean
+\defunx{char-upper-case?}\character\boolean
 \defunx{char-numeric?	}\character\boolean
-\defunx{char-whitespace?}\character\boolean
-\defunx{char-alphabetic?}\character\boolean
 \defunx{char-alphanumeric?}\character\boolean
 \defunx{char-graphic?}\character\boolean
+\defunx{char-printing?}\character\boolean
+\defunx{char-whitespace?}\character\boolean
+\defunx{char-blank?}\character\boolean
+\defunx{char-control?}\character\boolean
+\defunx{char-punctuation?}\character\boolean
+\defunx{char-hex-digit?}\character\boolean
+\defunx{char-ascii?}\character\boolean
 \begin{desc}
 These predicates are defined in terms of the above character sets.
 \end{desc}
-
-