Added a bunch of new char-set stuff.
This commit is contained in:
parent
ec63c53ee6
commit
54ca902c67
|
@ -576,12 +576,79 @@ is also frequently useful for expanding file-names.
|
|||
Scsh provides a \ex{char-set} type for expressing sets of characters.
|
||||
These sets are used by some of the delimited-input procedures
|
||||
(section~\ref{sec:field-reader}).
|
||||
The character set package that scsh uses was taken from Project Mac's
|
||||
MIT Scheme.
|
||||
Scsh's character set package was adapted and extended from
|
||||
Project Mac's MIT Scheme package.
|
||||
Note that the character type used in the current implementation corresponds
|
||||
to the ASCII character set---but you would be wise not to build this
|
||||
assumption into your code if you can help it.\footnote{
|
||||
Actually, it's slightly uglier than that, albeit somewhat more
|
||||
useful. The current character type corresponds to an eight-bit
|
||||
superset of ASCII. The \ex{ascii->char} and \ex{char->ascii}
|
||||
functions will preserve this eighth bit. However, none of the
|
||||
high 128 characters appear in any of the standard character
|
||||
sets defined in section~\ref{sec:std-csets}, except for
|
||||
\ex{char-set:full}. If someone would email the authors a listing
|
||||
of the full Latin-1 definition, we'll be happy to upgrade these
|
||||
sets' definitions to make them Latin-1 compliant.}
|
||||
|
||||
\defun{char-set?}{x}\boolean
|
||||
\begin{desc}
|
||||
Returns true if the object \var{x} is a character set.
|
||||
Is the object \var{x} a character set?
|
||||
\end{desc}
|
||||
|
||||
\defun{char-set=}{cs1 cs2}\boolean
|
||||
\begin{desc}
|
||||
Are the character sets \var{cs1} and \var{cs2} equal?
|
||||
\end{desc}
|
||||
|
||||
\defun{char-set<=}{cs1 cs2}\boolean
|
||||
\begin{desc}
|
||||
Returns true if character set \var{cs1} is a subset of character set \var{cs2}.
|
||||
\end{desc}
|
||||
|
||||
\defun{reduce-char-set}{kons knil cs}\object
|
||||
\begin{desc}
|
||||
This is the fundamental iterator for character sets.
|
||||
Reduces the function \var{kons} across the character set \var{cs} using
|
||||
initial state value \var{knil}.
|
||||
That is, if \var{cs} is the empty set, the procedure returns \var{knil}.
|
||||
Otherwise, some element \var{c} of \var{cs} is chosen; let \var{cs'} be
|
||||
the remaining, unchosen characters.
|
||||
The procedure returns
|
||||
\begin{tightcode}
|
||||
(reduce-char-set \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode}
|
||||
For example, we could define \ex{char-set-members} (see below)
|
||||
as
|
||||
\begin{tightcode}
|
||||
(lambda (cs) (reduce-char-set cons '() cs))\end{tightcode}
|
||||
\end{desc}
|
||||
|
||||
\subsection{Side effects}
|
||||
\defun{set-char-set!}{cs char in?}{\undefined}
|
||||
\begin{desc}
|
||||
This side-effects character set \var{cs}.
|
||||
If \var{in?} is true, character \var{char} is added to the set.
|
||||
Otherwise, it is deleted from the set.
|
||||
|
||||
Use of this procedure is deprecated, since it could damage other procedures
|
||||
that retain pointers to existing character sets.
|
||||
You should use \ex{set-char-set!} only in contexts where it is guaranteed that
|
||||
there are no other pointers to the character set being modified.
|
||||
(For example, functions that create character sets can use this function
|
||||
to efficiently construct the character set, after which time the set is
|
||||
used in a pure-functional, shared manner.)
|
||||
\end{desc}
|
||||
|
||||
\defun{char-set-for-each}{p cs}{\undefined}
|
||||
\begin{desc}
|
||||
Apply procedure \var{p} to each character in the character set \var{cs}.
|
||||
Note that the order in which \var{p} is applied to the characters in the
|
||||
set is not specified, and may even change from application to application.
|
||||
\end{desc}
|
||||
|
||||
\defun{copy-char-set}{cs}{char-set}
|
||||
\begin{desc}
|
||||
Returns a copy of the character set \var{cs}.
|
||||
\end{desc}
|
||||
|
||||
\subsection{Creating character sets}
|
||||
|
@ -634,46 +701,90 @@ of R4RS procedures.
|
|||
We sought to escape the dilemma by shifting to a new name.}
|
||||
\end{desc}
|
||||
|
||||
\defun{char-set-size}{cs}\integer
|
||||
\begin{desc}
|
||||
Returns the number of elements in character set \var{cs}.
|
||||
\end{desc}
|
||||
|
||||
\subsection{Character set algebra}
|
||||
\defun {char-set-invert}{char-set}{char-set}
|
||||
\defunx{char-set-union}{\vari{char-set}1 \vari{char-set}2}{char-set}
|
||||
\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2}{char-set}
|
||||
\defunx{char-set-difference}{\vari{char-set}1 \vari{char-set}2}{char-set}
|
||||
\defunx{char-set-union}{\vari{char-set}1\ldots}{char-set}
|
||||
\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
|
||||
\defunx{char-set-difference}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
|
||||
\begin{desc}
|
||||
These procedures implement set complement, union, intersection, and difference
|
||||
for character sets.
|
||||
The union, intersection, and difference operations are n-ary, associating
|
||||
to the left; the difference function requires at least one argument, while
|
||||
union and intersection may be applied to zero arguments.
|
||||
\end{desc}
|
||||
|
||||
\subsection{Standard character sets}
|
||||
\label{sec:std-csets}
|
||||
Several character sets are predefined for convenience:
|
||||
|
||||
\begin{center}
|
||||
\newcommand{\entry}[1]{\ex{#1}\index{#1}}
|
||||
\begin{tabular}{|ll|}
|
||||
\hline
|
||||
\entry{char-set:upper-case} & A--Z \\
|
||||
\entry{char-set:lower-case} & a--z \\
|
||||
\entry{char-set:numeric} & 0--9 \\
|
||||
\entry{char-set:whitespace} & space, newline, tab, linefeed, page,
|
||||
return \\
|
||||
\entry{char-set:not-whitespace} & Complement of \ex{char-set:whitespace} \\
|
||||
\entry{char-set:alphabetic} & A--Z and a--z \\
|
||||
\entry{char-set:alphabetic} & Alphabetic chars \\
|
||||
\entry{char-set:lower-case} & Lower-case alphabetic chars \\
|
||||
\entry{char-set:upper-case} & Upper-case alphabetic chars \\
|
||||
\entry{char-set:numeric} & Decimal digits: 0--9 \\
|
||||
\entry{char-set:alphanumeric} & Alphabetic or numeric \\
|
||||
\entry{char-set:graphic} & Printing characters and space \\
|
||||
\entry{char-set:graphic} & Printing characters except space \\
|
||||
\entry{char-set:printing} & Printing characters including space \\
|
||||
\entry{char-set:whitespace} & Whitespace characters \\
|
||||
\entry{char-set:blank} & Blank characters \\
|
||||
\entry{char-set:control} & Control characters \\
|
||||
\entry{char-set:punctuation} & Punctuation characters \\
|
||||
\entry{char-set:hex-digit} & A hexadecimal digit: 0--9, A--F, a--f \\
|
||||
\entry{char-set:ascii} & A character in the ASCII set. \\
|
||||
\entry{char-set:empty} & Empty set \\
|
||||
\entry{char-set:full} & All characters \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
The first twelve of these correspond to the character classes defined in
|
||||
Posix.
|
||||
Note that there may be characters in \ex{char-set:alphabetic} that are
|
||||
neither upper nor lower case---this might occur in implementations that
|
||||
use a character type richer than ASCII, such as Unicode.
|
||||
A ``graphic character'' is one that would put ink on your page.
|
||||
While the exact composition of these sets may vary depending upon the
|
||||
character type provided by the Scheme system upon which scsh is running,
|
||||
here are the definitions for some of the sets in an ASCII character set:
|
||||
\begin{center}
|
||||
\newcommand{\entry}[1]{\ex{#1}\index{#1}}
|
||||
\begin{tabular}{|ll|}
|
||||
\hline
|
||||
char-set:alphabetic & A--Z and a--z \\
|
||||
char-set:lower-case & a--z \\
|
||||
char-set:upper-case & A--Z \\
|
||||
char-set:graphic & Alphanumeric + punctuation \\
|
||||
char-set:whitespace & Space, newline, tab, page,
|
||||
vertical tab, carriage return \\
|
||||
char-set:blank & Space and tab \\
|
||||
char-set:control & ASCII 0--31 and 127 \\
|
||||
char-set:punctuation & \verb|!"#$%&'()*+,-./:;<=>|\verb#?@[\]^_`{|}~# \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
|
||||
|
||||
\defun {char-upper-case?}\character\boolean
|
||||
\defun {char-alphabetic?}\character\boolean
|
||||
\defunx{char-lower-case?}\character\boolean
|
||||
\defunx{char-upper-case?}\character\boolean
|
||||
\defunx{char-numeric? }\character\boolean
|
||||
\defunx{char-whitespace?}\character\boolean
|
||||
\defunx{char-alphabetic?}\character\boolean
|
||||
\defunx{char-alphanumeric?}\character\boolean
|
||||
\defunx{char-graphic?}\character\boolean
|
||||
\defunx{char-printing?}\character\boolean
|
||||
\defunx{char-whitespace?}\character\boolean
|
||||
\defunx{char-blank?}\character\boolean
|
||||
\defunx{char-control?}\character\boolean
|
||||
\defunx{char-punctuation?}\character\boolean
|
||||
\defunx{char-hex-digit?}\character\boolean
|
||||
\defunx{char-ascii?}\character\boolean
|
||||
\begin{desc}
|
||||
These predicates are defined in terms of the above character sets.
|
||||
\end{desc}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue