Added a bunch of new char-set stuff.
This commit is contained in:
parent
ec63c53ee6
commit
54ca902c67
|
@ -576,12 +576,79 @@ is also frequently useful for expanding file-names.
|
||||||
Scsh provides a \ex{char-set} type for expressing sets of characters.
|
Scsh provides a \ex{char-set} type for expressing sets of characters.
|
||||||
These sets are used by some of the delimited-input procedures
|
These sets are used by some of the delimited-input procedures
|
||||||
(section~\ref{sec:field-reader}).
|
(section~\ref{sec:field-reader}).
|
||||||
The character set package that scsh uses was taken from Project Mac's
|
Scsh's character set package was adapted and extended from
|
||||||
MIT Scheme.
|
Project MAC's MIT Scheme package.
|
||||||
|
Note that the character type used in the current implementation corresponds
|
||||||
|
to the ASCII character set---but you would be wise not to build this
|
||||||
|
assumption into your code if you can help it.\footnote{
|
||||||
|
Actually, it's slightly uglier than that, albeit somewhat more
|
||||||
|
useful. The current character type corresponds to an eight-bit
|
||||||
|
superset of ASCII. The \ex{ascii->char} and \ex{char->ascii}
|
||||||
|
functions will preserve this eighth bit. However, none of the
|
||||||
|
high 128 characters appear in any of the standard character
|
||||||
|
sets defined in section~\ref{sec:std-csets}, except for
|
||||||
|
\ex{char-set:full}. If someone would email the authors a listing
|
||||||
|
of the full Latin-1 definition, we'll be happy to upgrade these
|
||||||
|
sets' definitions to make them Latin-1 compliant.}
|
||||||
|
|
||||||
\defun{char-set?}{x}\boolean
|
\defun{char-set?}{x}\boolean
|
||||||
\begin{desc}
|
\begin{desc}
|
||||||
Returns true if the object \var{x} is a character set.
|
Is the object \var{x} a character set?
|
||||||
|
\end{desc}
|
||||||
|
|
||||||
|
\defun{char-set=}{cs1 cs2}\boolean
|
||||||
|
\begin{desc}
|
||||||
|
Are the character sets \var{cs1} and \var{cs2} equal?
|
||||||
|
\end{desc}
|
||||||
|
|
||||||
|
\defun{char-set<=}{cs1 cs2}\boolean
|
||||||
|
\begin{desc}
|
||||||
|
Returns true if character set \var{cs1} is a subset of character set \var{cs2}.
|
||||||
|
\end{desc}
|
||||||
|
|
||||||
|
\defun{reduce-char-set}{kons knil cs}\object
|
||||||
|
\begin{desc}
|
||||||
|
This is the fundamental iterator for character sets.
|
||||||
|
Reduces the function \var{kons} across the character set \var{cs} using
|
||||||
|
initial state value \var{knil}.
|
||||||
|
That is, if \var{cs} is the empty set, the procedure returns \var{knil}.
|
||||||
|
Otherwise, some element \var{c} of \var{cs} is chosen; let \var{cs'} be
|
||||||
|
the remaining, unchosen characters.
|
||||||
|
The procedure returns
|
||||||
|
\begin{tightcode}
|
||||||
|
(reduce-char-set \var{kons} (\var{kons} \var{c} \var{knil}) \var{cs'})\end{tightcode}
|
||||||
|
For example, we could define \ex{char-set-members} (see below)
|
||||||
|
as
|
||||||
|
\begin{tightcode}
|
||||||
|
(lambda (cs) (reduce-char-set cons '() cs))\end{tightcode}
|
||||||
|
\end{desc}
|
||||||
|
|
||||||
|
\subsection{Side effects}
|
||||||
|
\defun{set-char-set!}{cs char in?}{\undefined}
|
||||||
|
\begin{desc}
|
||||||
|
This side-effects character set \var{cs}.
|
||||||
|
If \var{in?} is true, character \var{char} is added to the set.
|
||||||
|
Otherwise, it is deleted from the set.
|
||||||
|
|
||||||
|
Use of this procedure is deprecated, since it could damage other procedures
|
||||||
|
that retain pointers to existing character sets.
|
||||||
|
You should use \ex{set-char-set!} only in contexts where it is guaranteed that
|
||||||
|
there are no other pointers to the character set being modified.
|
||||||
|
(For example, functions that create character sets can use this function
|
||||||
|
to efficiently construct the character set, after which time the set is
|
||||||
|
used in a pure-functional, shared manner.)
|
||||||
|
\end{desc}
|
||||||
|
|
||||||
|
\defun{char-set-for-each}{p cs}{\undefined}
|
||||||
|
\begin{desc}
|
||||||
|
Apply procedure \var{p} to each character in the character set \var{cs}.
|
||||||
|
Note that the order in which \var{p} is applied to the characters in the
|
||||||
|
set is not specified, and may even change from application to application.
|
||||||
|
\end{desc}
|
||||||
|
|
||||||
|
\defun{copy-char-set}{cs}{char-set}
|
||||||
|
\begin{desc}
|
||||||
|
Returns a copy of the character set \var{cs}.
|
||||||
\end{desc}
|
\end{desc}
|
||||||
|
|
||||||
\subsection{Creating character sets}
|
\subsection{Creating character sets}
|
||||||
|
@ -634,46 +701,90 @@ of R4RS procedures.
|
||||||
We sought to escape the dilemma by shifting to a new name.}
|
We sought to escape the dilemma by shifting to a new name.}
|
||||||
\end{desc}
|
\end{desc}
|
||||||
|
|
||||||
|
\defun{char-set-size}{cs}\integer
|
||||||
|
\begin{desc}
|
||||||
|
Returns the number of elements in character set \var{cs}.
|
||||||
|
\end{desc}
|
||||||
|
|
||||||
\subsection{Character set algebra}
|
\subsection{Character set algebra}
|
||||||
\defun {char-set-invert}{char-set}{char-set}
|
\defun {char-set-invert}{char-set}{char-set}
|
||||||
\defunx{char-set-union}{\vari{char-set}1 \vari{char-set}2}{char-set}
|
\defunx{char-set-union}{\vari{char-set}1\ldots}{char-set}
|
||||||
\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2}{char-set}
|
\defunx{char-set-intersection}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
|
||||||
\defunx{char-set-difference}{\vari{char-set}1 \vari{char-set}2}{char-set}
|
\defunx{char-set-difference}{\vari{char-set}1 \vari{char-set}2\ldots}{char-set}
|
||||||
\begin{desc}
|
\begin{desc}
|
||||||
These procedures implement set complement, union, intersection, and difference
|
These procedures implement set complement, union, intersection, and difference
|
||||||
for character sets.
|
for character sets.
|
||||||
|
The union, intersection, and difference operations are n-ary, associating
|
||||||
|
to the left; the difference function requires at least one argument, while
|
||||||
|
union and intersection may be applied to zero arguments.
|
||||||
\end{desc}
|
\end{desc}
|
||||||
|
|
||||||
\subsection{Standard character sets}
|
\subsection{Standard character sets}
|
||||||
|
\label{sec:std-csets}
|
||||||
Several character sets are predefined for convenience:
|
Several character sets are predefined for convenience:
|
||||||
|
|
||||||
\begin{center}
|
\begin{center}
|
||||||
\newcommand{\entry}[1]{\ex{#1}\index{#1}}
|
\newcommand{\entry}[1]{\ex{#1}\index{#1}}
|
||||||
\begin{tabular}{|ll|}
|
\begin{tabular}{|ll|}
|
||||||
\hline
|
\hline
|
||||||
\entry{char-set:upper-case} & A--Z \\
|
\entry{char-set:alphabetic} & Alphabetic chars \\
|
||||||
\entry{char-set:lower-case} & a--z \\
|
\entry{char-set:lower-case} & Lower-case alphabetic chars \\
|
||||||
\entry{char-set:numeric} & 0--9 \\
|
\entry{char-set:upper-case} & Upper-case alphabetic chars \\
|
||||||
\entry{char-set:whitespace} & space, newline, tab, linefeed, page,
|
\entry{char-set:numeric} & Decimal digits: 0--9 \\
|
||||||
return \\
|
|
||||||
\entry{char-set:not-whitespace} & Complement of \ex{char-set:whitespace} \\
|
|
||||||
\entry{char-set:alphabetic} & A--Z and a--z \\
|
|
||||||
\entry{char-set:alphanumeric} & Alphabetic or numeric \\
|
\entry{char-set:alphanumeric} & Alphabetic or numeric \\
|
||||||
\entry{char-set:graphic} & Printing characters and space \\
|
\entry{char-set:graphic} & Printing characters except space \\
|
||||||
|
\entry{char-set:printing} & Printing characters including space \\
|
||||||
|
\entry{char-set:whitespace} & Whitespace characters \\
|
||||||
|
\entry{char-set:blank} & Blank characters \\
|
||||||
|
\entry{char-set:control} & Control characters \\
|
||||||
|
\entry{char-set:punctuation} & Punctuation characters \\
|
||||||
|
\entry{char-set:hex-digit} & A hexadecimal digit: 0--9, A--F, a--f \\
|
||||||
|
\entry{char-set:ascii} & Characters in the ASCII set \\
|
||||||
|
\entry{char-set:empty} & Empty set \\
|
||||||
|
\entry{char-set:full} & All characters \\
|
||||||
|
\hline
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
The first twelve of these correspond to the character classes defined in
|
||||||
|
POSIX.
|
||||||
|
Note that there may be characters in \ex{char-set:alphabetic} that are
|
||||||
|
neither upper nor lower case---this might occur in implementations that
|
||||||
|
use a character type richer than ASCII, such as Unicode.
|
||||||
|
A ``graphic character'' is one that would put ink on your page.
|
||||||
|
While the exact composition of these sets may vary depending upon the
|
||||||
|
character type provided by the Scheme system upon which scsh is running,
|
||||||
|
here are the definitions for some of the sets in an ASCII character set:
|
||||||
|
\begin{center}
|
||||||
|
\newcommand{\entry}[1]{\ex{#1}\index{#1}}
|
||||||
|
\begin{tabular}{|ll|}
|
||||||
|
\hline
|
||||||
|
char-set:alphabetic & A--Z and a--z \\
|
||||||
|
char-set:lower-case & a--z \\
|
||||||
|
char-set:upper-case & A--Z \\
|
||||||
|
char-set:graphic & Alphanumeric + punctuation \\
|
||||||
|
char-set:whitespace & Space, newline, tab, page,
|
||||||
|
vertical tab, carriage return \\
|
||||||
|
char-set:blank & Space and tab \\
|
||||||
|
char-set:control & ASCII 0--31 and 127 \\
|
||||||
|
char-set:punctuation & \verb|!"#$%&'()*+,-./:;<=>|\verb#?@[\]^_`{|}~# \\
|
||||||
\hline
|
\hline
|
||||||
\end{tabular}
|
\end{tabular}
|
||||||
\end{center}
|
\end{center}
|
||||||
|
|
||||||
|
|
||||||
\defun {char-upper-case?}\character\boolean
|
\defun {char-alphabetic?}\character\boolean
|
||||||
\defunx{char-lower-case?}\character\boolean
|
\defunx{char-lower-case?}\character\boolean
|
||||||
|
\defunx{char-upper-case?}\character\boolean
|
||||||
\defunx{char-numeric? }\character\boolean
|
\defunx{char-numeric? }\character\boolean
|
||||||
\defunx{char-whitespace?}\character\boolean
|
|
||||||
\defunx{char-alphabetic?}\character\boolean
|
|
||||||
\defunx{char-alphanumeric?}\character\boolean
|
\defunx{char-alphanumeric?}\character\boolean
|
||||||
\defunx{char-graphic?}\character\boolean
|
\defunx{char-graphic?}\character\boolean
|
||||||
|
\defunx{char-printing?}\character\boolean
|
||||||
|
\defunx{char-whitespace?}\character\boolean
|
||||||
|
\defunx{char-blank?}\character\boolean
|
||||||
|
\defunx{char-control?}\character\boolean
|
||||||
|
\defunx{char-punctuation?}\character\boolean
|
||||||
|
\defunx{char-hex-digit?}\character\boolean
|
||||||
|
\defunx{char-ascii?}\character\boolean
|
||||||
\begin{desc}
|
\begin{desc}
|
||||||
These predicates are defined in terms of the above character sets.
|
These predicates are defined in terms of the above character sets.
|
||||||
\end{desc}
|
\end{desc}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue