From 54ca902c672e2b66d5c4f9084688f9edff55c1f6 Mon Sep 17 00:00:00 2001 From: shivers Date: Tue, 16 Jun 1998 21:19:32 +0000 Subject: [PATCH] Added a bunch of new char-set stuff. --- doc/scsh-manual/strings.tex | 149 +++++++++++++++++++++++++++++++----- 1 file changed, 130 insertions(+), 19 deletions(-) diff --git a/doc/scsh-manual/strings.tex b/doc/scsh-manual/strings.tex index aca24db..5d0e303 100644 --- a/doc/scsh-manual/strings.tex +++ b/doc/scsh-manual/strings.tex @@ -576,12 +576,79 @@ is also frequently useful for expanding file-names. Scsh provides a \ex{char-set} type for expressing sets of characters. These sets are used by some of the delimited-input procedures (section~\ref{sec:field-reader}). -The character set package that scsh uses was taken from Project Mac's -MIT Scheme. +Scsh's character set package was adapted and extended from +Project Mac's MIT Scheme package. +Note that the character type used in the current implementation corresponds +to the ASCII character set---but you would be wise not to build this +assumption into your code if you can help it.\footnote{ + Actually, it's slightly uglier than that, albeit somewhat more + useful. The current character type corresponds to an eight-bit + superset of ASCII. The \ex{ascii->char} and \ex{char->ascii} + functions will preserve this eighth bit. However, none of + the high 128 characters appear in any of the standard character + sets defined in section~\ref{sec:std-csets}, except for + \ex{char-set:full}. If someone would email the authors a listing + of the full Latin-1 definition, we'll be happy to upgrade these + sets' definitions to make them Latin-1 compliant.} \defun{char-set?}{x}\boolean \begin{desc} -Returns true if the object \var{x} is a character set. +Is the object \var{x} a character set? +\end{desc} + +\defun{char-set=}{cs1 cs2}\boolean +\begin{desc} +Are the character sets \var{cs1} and \var{cs2} equal? 
+\end{desc} + +\defun{char-set<=}{cs1 cs2}\boolean +\begin{desc} +Returns true if character set \var{cs1} is a subset of character set \var{cs2}. +\end{desc} + +\defun{reduce-char-set}{kons knil cs}\object +\begin{desc} +This is the fundamental iterator for character sets. +Reduces the function \var{kons} across the character set \var{cs} using +initial state value \var{knil}. +That is, if \var{cs} is the empty set, the procedure returns \var{knil}. +Otherwise, some element \var{c} of \var{cs} is chosen; let \var{cs'} be +the remaining, unchosen characters. +The procedure returns +\begin{tightcode} +(reduce-char-set \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode} +For example, we could define \ex{char-set-members} (see below) +as +\begin{tightcode} +(lambda (cs) (reduce-char-set cons '() cs))\end{tightcode} +\end{desc} + +\subsection{Side effects} +\defun{set-char-set!}{cs char in?}{\undefined} +\begin{desc} +This side-effects character set \var{cs}. +If \var{in?} is true, character \var{char} is added to the set. +Otherwise, it is deleted from the set. + +Use of this procedure is deprecated, since it could damage other procedures +that retain pointers to existing character sets. +You should use \ex{set-char-set!} only in contexts where it is guaranteed that +there are no other pointers to the character set being modified. +(For example, functions that create character sets can use this function +to efficiently construct the character set, after which time the set is +used in a pure-functional, shared manner.) +\end{desc} + +\defun{char-set-for-each}{p cs}{\undefined} +\begin{desc} +Apply procedure \var{p} to each character in the character set \var{cs}. +Note that the order in which \var{p} is applied to the characters in the +set is not specified, and may even change from application to application. +\end{desc} + +\defun{copy-char-set}{cs}{char-set} +\begin{desc} +Returns a copy of the character set \var{cs}. 
\end{desc} \subsection{Creating character sets} @@ -634,46 +701,90 @@ of R4RS procedures. We sought to escape the dilemma by shifting to a new name.} \end{desc} +\defun{char-set-size}{cs}\integer +\begin{desc} +Returns the number of elements in character set \var{cs}. +\end{desc} + \subsection{Character set algebra} \defun {char-set-invert}{char-set}{char-set} -\defunx{char-set-union}{\vari{char-set}1 \vari{char-set}2}{char-set} -\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2}{char-set} -\defunx{char-set-difference}{\vari{char-set}1 \vari{char-set}2}{char-set} +\defunx{char-set-union}{\vari{char-set}1\ldots}{char-set} +\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set} +\defunx{char-set-difference}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set} \begin{desc} These procedures implement set complement, union, intersection, and difference for character sets. +The union, intersection, and difference operations are n-ary, associating +to the left; the difference function requires at least one argument, while +union and intersection may be applied to zero arguments. 
+\end{desc} \subsection{Standard character sets} +\label{sec:std-csets} Several character sets are predefined for convenience: \begin{center} \newcommand{\entry}[1]{\ex{#1}\index{#1}} \begin{tabular}{|ll|} \hline -\entry{char-set:upper-case} & A--Z \\ -\entry{char-set:lower-case} & a--z \\ -\entry{char-set:numeric} & 0--9 \\ -\entry{char-set:whitespace} & space, newline, tab, linefeed, page, - return \\ -\entry{char-set:not-whitespace} & Complement of \ex{char-set:whitespace} \\ -\entry{char-set:alphabetic} & A--Z and a--z \\ +\entry{char-set:alphabetic} & Alphabetic chars \\ +\entry{char-set:lower-case} & Lower-case alphabetic chars \\ +\entry{char-set:upper-case} & Upper-case alphabetic chars \\ +\entry{char-set:numeric} & Decimal digits: 0--9 \\ \entry{char-set:alphanumeric} & Alphabetic or numeric \\ -\entry{char-set:graphic} & Printing characters and space \\ +\entry{char-set:graphic} & Printing characters except space \\ +\entry{char-set:printing} & Printing characters including space \\ +\entry{char-set:whitespace} & Whitespace characters \\ +\entry{char-set:blank} & Blank characters \\ +\entry{char-set:control} & Control characters \\ +\entry{char-set:punctuation} & Punctuation characters \\ +\entry{char-set:hex-digit} & A hexadecimal digit: 0--9, A--F, a--f \\ +\entry{char-set:ascii} & A character in the ASCII set. \\ +\entry{char-set:empty} & Empty set \\ +\entry{char-set:full} & All characters \\ +\hline +\end{tabular} +\end{center} +The first twelve of these correspond to the character classes defined in +Posix. +Note that there may be characters in \ex{char-set:alphabetic} that are +neither upper nor lower case---this might occur in implementations that +use a character type richer than ASCII, such as Unicode. +A ``graphic character'' is one that would put ink on your page. 
+While the exact composition of these sets may vary depending upon the +character type provided by the Scheme system upon which scsh is running, +here are the definitions for some of the sets in an ASCII character set: +\begin{center} +\newcommand{\entry}[1]{\ex{#1}\index{#1}} +\begin{tabular}{|ll|} +\hline +char-set:alphabetic & A--Z and a--z \\ +char-set:lower-case & a--z \\ +char-set:upper-case & A--Z \\ +char-set:graphic & Alphanumeric + punctuation \\ +char-set:whitespace & Space, newline, tab, page, + vertical tab, carriage return \\ +char-set:blank & Space and tab \\ +char-set:control & ASCII 0--31 and 127 \\ +char-set:punctuation & \verb|!"#$%&'()*+,-./:;<=>|\verb#?@[\]^_`{|}~# \\ \hline \end{tabular} \end{center} -\defun {char-upper-case?}\character\boolean +\defun {char-alphabetic?}\character\boolean \defunx{char-lower-case?}\character\boolean +\defunx{char-upper-case?}\character\boolean \defunx{char-numeric? }\character\boolean -\defunx{char-whitespace?}\character\boolean -\defunx{char-alphabetic?}\character\boolean \defunx{char-alphanumeric?}\character\boolean \defunx{char-graphic?}\character\boolean +\defunx{char-printing?}\character\boolean +\defunx{char-whitespace?}\character\boolean +\defunx{char-blank?}\character\boolean +\defunx{char-control?}\character\boolean +\defunx{char-punctuation?}\character\boolean +\defunx{char-hex-digit?}\character\boolean +\defunx{char-ascii?}\character\boolean \begin{desc} These predicates are defined in terms of the above character sets. \end{desc} - -