diff --git a/Makefile.in b/Makefile.in index 0732fc1..bea6dd8 100644 --- a/Makefile.in +++ b/Makefile.in @@ -81,9 +81,9 @@ BIG_HEAP = -h 5000000 # LINKER_VM = ./$(VM) $(BIG_HEAP) # LINKER_RUNNABLE = $(LINKER_VM) -i $(IMAGE) -# therefor according to 2. -LINKER_VM = $(RUNNABLE) $(BIG_HEAP) -LINKER_RUNNABLE = $(RUNNABLE) +# therefor according to 2 but canot use scsh since -i is not understood +LINKER_VM = scheme48 $(BIG_HEAP) +LINKER_RUNNABLE = scheme48 LINKER_IMAGE = build/linker.image LINKER = $(LINKER_VM) -i $(LINKER_IMAGE) @@ -382,7 +382,7 @@ go: echo '#!/bin/sh' >$@ && \ echo >>$@ && \ echo "lib=`pwd`" >>$@ && \ - echo 'exec $$lib/$(VM) -o $$lib/$(VM) -i $$lib/scsh/$(IMAGE) "$$@"' \ + echo 'exec $$lib/$(VM) -o $$lib/$(VM) -i $$lib/scsh/scsh.image "$$@"' \ >>$@ && \ chmod +x $@ @@ -766,6 +766,7 @@ SCHEME =scsh/awk.scm \ scsh/scsh-version.scm \ scsh/scsh.scm \ scsh/select.scm \ + scsh/sighandlers.scm \ scsh/startup.scm \ scsh/stringcoll.scm \ scsh/syntax-helpers.scm \ @@ -798,16 +799,6 @@ scsh/scsh: scsh/scsh-tramp.c -DIMAGE=\"$(LIB)/scsh.image\" \ scsh/scsh-tramp.c -loads = $(srcdir)/scsh/let-opt.scm $(srcdir)/scsh/scsh-interfaces.scm \ - $(srcdir)/scsh/machine/packages.scm \ - $(srcdir)/scsh/rx/packages.scm \ - $(srcdir)/scsh/rx/cond-package.scm \ - $(srcdir)/scsh/scsh-package.scm \ - $(srcdir)/scsh/lib/string-pack.scm \ - $(srcdir)/scsh/lib/list-pack.scm \ - $(srcdir)/scsh/lib/ccp-pack.scm - - bs: build/build-scsh-image sh $(srcdir)/build/build-scsh-image "$(srcdir)" "$(LIB)" "$(IMAGE)" \ "$(VM)" cig/cig.image @@ -817,9 +808,13 @@ loads = $(srcdir)/scsh/let-opt.scm $(srcdir)/scsh/scsh-interfaces.scm \ $(srcdir)/scsh/rx/packages.scm \ $(srcdir)/scsh/rx/cond-package.scm \ $(srcdir)/scsh/scsh-package.scm \ - $(srcdir)/scsh/lib/string-pack.scm \ + $(srcdir)/scsh/lib/cset-package.scm \ + $(srcdir)/scsh/lib/string-package.scm \ $(srcdir)/scsh/lib/list-pack.scm \ - $(srcdir)/scsh/lib/ccp-pack.scm + $(srcdir)/scsh/lib/ccp-pack.scm \ + $(srcdir)/scsh/lib/char-package.scm 
\ + $(srcdir)/scsh/lib/cset-obsolete.scm + scsh/scsh.image: $(VM) $(SCHEME) $(CIG).image (echo ",translate =scheme48/ $(srcdir)/scheme/"; \ @@ -828,7 +823,6 @@ scsh/scsh.image: $(VM) $(SCHEME) $(CIG).image echo ",config"; \ echo ",load $(loads)"; \ echo ",load-package scsh"; \ - echo ",load-package events"; \ echo ",load-package scsh-here-string-hax"; \ echo ",translate =scheme48/ $(LIB)/"; \ echo ",load-package list-lib"; \ diff --git a/scsh/fr.scm b/scsh/fr.scm index d98ceaf..a50782f 100644 --- a/scsh/fr.scm +++ b/scsh/fr.scm @@ -318,7 +318,7 @@ s))) ((concat) ; CONCAT-delimiter reader. - (let ((not-delims (char-set-invert delims))) + (let ((not-delims (char-set-complement delims))) (lambda maybe-port (let* ((p (:optional maybe-port (current-input-port))) (s (read-delimited delims p 'concat))) @@ -328,7 +328,7 @@ (string-append s extra-delims)))))))) ((split) ; SPLIT-delimiter reader. - (let ((not-delims (char-set-invert delims))) + (let ((not-delims (char-set-complement delims))) (lambda maybe-port (let ((p (:optional maybe-port (current-input-port)))) (receive (s delim) (read-delimited delims p 'split) diff --git a/scsh/glob.scm b/scsh/glob.scm index 271b59c..bb6253d 100644 --- a/scsh/glob.scm +++ b/scsh/glob.scm @@ -150,16 +150,15 @@ (case c ((#\]) (let ((cset (fold (lambda (elt cset) - (char-set-union - cset - (if (char? elt) - (char-set elt) - (ascii-range->char-set (char->ascii (car elt)) - (+ 1 (char->ascii (cdr elt))))))) - char-set:empty + (if (char? elt) + (char-set-adjoin! cset elt) + (ucs-range->char-set! (char->ascii (car elt)) + (+ 1 (char->ascii (cdr elt))) + #f cset))) + (char-set-copy char-set:empty) elts))) (values (re-char-set (if negate? - (char-set-invert cset) + (char-set-complement! 
cset) cset)) i))) diff --git a/scsh/lib/ccp-pack.scm b/scsh/lib/ccp-pack.scm index b5095a1..50f4d1c 100644 --- a/scsh/lib/ccp-pack.scm +++ b/scsh/lib/ccp-pack.scm @@ -93,7 +93,7 @@ )) (define-structure ccp-lib ccp-lib-interface - (open char-set-package + (open char-set-lib ascii defrec-package string-lib diff --git a/scsh/lib/ccp.scm b/scsh/lib/ccp.scm index 16dcece..9f17cab 100644 --- a/scsh/lib/ccp.scm +++ b/scsh/lib/ccp.scm @@ -95,11 +95,11 @@ (every (lambda (ccp2) (and (char-set= domain (ccp:domain ccp2)) (let ((cmap2 (ccp:map ccp2))) - (char-set-every? (lambda (c) - (let ((i (char->ascii c))) - (char=? (string-ref cmap i) - (string-ref cmap2 i)))) - domain)))) + (char-set-every (lambda (c) + (let ((i (char->ascii c))) + (char=? (string-ref cmap i) + (string-ref cmap2 i)))) + domain)))) rest))) @@ -116,11 +116,11 @@ (rest (cdr rest))) (and (char-set<= domain1 domain2) (let ((cmap2 (ccp:map ccp2))) - (char-set-every? (lambda (c) - (let ((i (char->ascii c))) - (char=? (string-ref cmap1 i) - (string-ref cmap2 i)))) - domain1)) + (char-set-every (lambda (c) + (let ((i (char->ascii c))) + (char=? (string-ref cmap1 i) + (string-ref cmap2 i)))) + domain1)) (lp domain2 cmap2 rest)))))) diff --git a/scsh/lib/char-package.scm b/scsh/lib/char-package.scm new file mode 100644 index 0000000..1d535c1 --- /dev/null +++ b/scsh/lib/char-package.scm @@ -0,0 +1,59 @@ +;;; These defs are things for characters *not* in SRFIs 13 & 14. +;;; It includes some R5RS defs that are not correct in S48 in a Latin-1 world. + +(define-interface char-set-predicates-interface + (export + ((char-lower-case? ; R5RS + char-upper-case? ; R5RS + char-alphabetic? ; R5RS + char-numeric? ; R5RS + char-whitespace? ; R5RS + + char-alphanumeric? ; For compatibility w/old code + + char-letter? ; Scsh + char-digit? + char-letter+digit? + char-graphic? + char-printing? + char-blank? + char-iso-control? + char-punctuation? + char-symbol? + char-hex-digit? + char-ascii?) 
(proc (:char) :boolean)))) + + +(define-structure char-set-predicates-lib char-set-predicates-interface + (open error-package ; ERROR + scsh-utilities ; DEPRECATED-PROC + char-set-lib + scheme) + + (begin + ;; These are R5RS. We can't use the native S48 ones, because they + ;; don't handle full Latin-1. + (define (char-lower-case? c) (char-set-contains? char-set:lower-case c)) + (define (char-upper-case? c) (char-set-contains? char-set:upper-case c)) + (define (char-alphabetic? c) (char-set-contains? char-set:letter c)) + (define (char-numeric? c) (char-set-contains? char-set:digit c)) + (define (char-whitespace? c) (char-set-contains? char-set:whitespace c)) + + ;; These are scsh extensions to R5RS. + (define (char-letter? c) (char-set-contains? char-set:letter c)) + (define (char-digit? c) (char-set-contains? char-set:digit c)) + (define (char-letter+digit? c) (char-set-contains? char-set:letter+digit c)) + (define (char-graphic? c) (char-set-contains? char-set:graphic c)) + (define (char-printing? c) (char-set-contains? char-set:printing c)) + (define (char-blank? c) (char-set-contains? char-set:blank c)) + (define (char-iso-control? c) (char-set-contains? char-set:iso-control c)) + (define (char-punctuation? c) (char-set-contains? char-set:punctuation c)) + (define (char-symbol? c) (char-set-contains? char-set:symbol c)) + (define (char-hex-digit? c) (char-set-contains? char-set:hex-digit c)) + (define (char-ascii? c) (char-set-contains? char-set:ascii c)) + + ;; Obsolete scsh. + (define char-alphanumeric? + (deprecated-proc char-letter+digit? 'char-alphanumeric? + "Use CHAR-LETTER+DIGIT? instead."))) + (optimize auto-integrate)) diff --git a/scsh/lib/cset-lib.html b/scsh/lib/cset-lib.html new file mode 100644 index 0000000..b7eb80f --- /dev/null +++ b/scsh/lib/cset-lib.html @@ -0,0 +1,2016 @@ + + + + + +
+ + ++ +The ability to efficiently represent and manipulate sets of characters is an +unglamorous but very useful capability for text-processing code -- one that +tends to pop up in the definitions of other libraries. Hence it is useful to +specify a general substrate for this functionality early. This SRFI defines a +general library that provides this functionality. + +It is accompanied by a reference implementation for the spec. The reference +implementation is fairly efficient, straightforwardly portable, and has a +"free software" copyright. The implementation is tuned for "small" 7 or 8 +bit character types, such as ASCII or Latin-1; the data structures and +algorithms would have to be altered for larger 16 or 32 bit character types +such as Unicode -- however, the specs have been carefully designed with these +larger character types in mind. + +Several forthcoming SRFIs can be defined in terms of this one: +
read-line
)
+ +Here is the complete set of bindings -- procedural and otherwise -- +exported by this library. In a Scheme system that has a module or package +system, these procedures should be contained in a module named "char-set-lib". + +
+char-set? char-set= char-set<= char-set-hash ++ +
+char-set-cursor char-set-ref char-set-cursor-next end-of-char-set? +char-set-fold char-set-unfold char-set-unfold! +char-set-for-each char-set-map ++ +
+char-set-copy char-set + +list->char-set string->char-set +list->char-set! string->char-set! + +char-set-filter ucs-range->char-set +char-set-filter! ucs-range->char-set! + +->char-set ++ +
+char-set->list char-set->string +char-set-size char-set-count char-set-contains? +char-set-every char-set-any ++ +
+char-set-adjoin char-set-delete +char-set-adjoin! char-set-delete! + +char-set-complement char-set-union char-set-intersection +char-set-complement! char-set-union! char-set-intersection! + +char-set-difference char-set-xor char-set-diff+intersection +char-set-difference! char-set-xor! char-set-diff+intersection! ++ +
+char-set:lower-case char-set:upper-case char-set:title-case +char-set:letter char-set:digit char-set:letter+digit +char-set:graphic char-set:printing char-set:whitespace +char-set:iso-control char-set:punctuation char-set:symbol +char-set:hex-digit char-set:blank char-set:ascii +char-set:empty char-set:full ++ +
+The ability to efficiently manipulate sets of characters is quite +useful for text-processing code. Encapsulating this functionality in +a general, efficiently implemented library can assist all such code. +This library defines a new data structure to represent these sets, called +a "char-set." The char-set type is distinct from all other types. + +
+This library is designed to be portable across implementations that use
+different character types and representations, especially ASCII, Latin-1
+and Unicode. Some effort has been made to preserve compatibility with Java
+in the Unicode case (see the definition of char-set:whitespace
for the
+single real deviation).
+
+
+
+The procedures of this SRFI, by default, are "pure functional" -- they do not +alter their parameters. However, this SRFI defines a set of "linear-update" +procedures which have a hybrid pure-functional/side-effecting semantics: they +are allowed, but not required, to side-effect one of their parameters in order +to construct their result. An implementation may legally implement these +procedures as pure, side-effect-free functions, or it may implement them using +side effects, depending upon the details of what is the most efficient or +simple to implement in terms of the underlying representation. + +
+The linear-update routines all have names ending with "!". + +
+Clients of these procedures may not rely upon these procedures working by +side effect. For example, this is not guaranteed to work: +
+(let* ((cs1 (char-set #\a #\b #\c)) ; cs1 = {a,b,c}. + (cs2 (char-set-adjoin! cs1 #\d))) ; Add d to {a,b,c}. + cs1) ; Could be either {a,b,c} or {a,b,c,d}. ++
+However, this is well-defined: +
+(let ((cs (char-set #\a #\b #\c))) + (char-set-adjoin! cs #\d)) ; Add d to {a,b,c}. ++ +
+So clients of these procedures write in a functional style, but must +additionally be sure that, when the procedure is called, there are no other +live pointers to the potentially-modified character set (hence the term +"linear update"). + +
+There are two benefits to this convention: +
+Note that pure functional representations are the right thing for +ASCII- or Latin-1-based Scheme implementations, since a char-set can +be represented in an ASCII Scheme with 4 32-bit words. Pure set-algebra +operations on such a representation are very fast and efficient. Programmers +who code using linear-update operations are guaranteed the system will +provide the best implementation across multiple platforms. + +
+In practice, these procedures are most useful for efficiently constructing +character sets in a side-effecting manner, in some limited local context, +before passing the character set outside the local construction scope to be +used in a functional manner. + +
+Scheme provides no assistance in checking the linearity of the potentially +side-effected parameters passed to these functions --- there's no linear +type checker or run-time mechanism for detecting violations. (But +sophisticated programming environments, such as DrScheme, might help.) + + +
+Users are cautioned that the R5RS predicates +
+char-alphabetic?
+char-numeric?
+char-whitespace?
+char-upper-case?
+char-lower-case?
+
++may or may not be in agreement with the SRFI 14 base character sets +
+char-set:letter
+char-set:digit
+char-set:whitespace
+char-set:upper-case
+char-set:lower-case
+
++Implementors are strongly encouraged to bring these predicates into +agreement with the base character sets of this SRFI; not to do so risks +major confusion. + + + +
+In the following procedure specifications: +
+Passing values to procedures with these parameters that do not satisfy these +types is an error. + +
+Unless otherwise noted in the specification of a procedure, procedures
+always return character sets that are distinct (from the point of view
+of the linear-update operations) from the parameter character sets. For
+example, char-set-adjoin
is guaranteed to provide a fresh character set,
+even if it is not given any character parameters.
+
+
+Parameters given in square brackets are optional. Unless otherwise noted in the +text describing the procedure, any prefix of these optional parameters may +be supplied, from zero arguments to the full list. When a procedure returns +multiple values, this is shown by listing the return values in square +brackets, as well. So, for example, the procedure with signature +
+halts? f [x init-store] -> [boolean integer] ++would take one (f), two (f, x) +or three (f, x, init-store) input parameters, +and return two values, a boolean and an integer. + +
+A parameter followed by "...
" means zero-or-more elements.
+So the procedure with the signature
+
+sum-squares x ... -> number ++takes zero or more arguments (x ...), +while the procedure with signature +
+spell-check doc dict1 dict2 ... -> string-list ++takes two required parameters +(doc and dict1) +and zero or more optional parameters (dict2 ...). + + + +
char-set?
obj -> boolean
+char-set=
cs1 ... -> boolean
++ Boundary cases: +
+(char-set=) => true +(char-set= cs) => true ++ +
+ Rationale: transitive binary relations are generally extended to n-ary + relations in Scheme, which enables clearer, more concise code to be + written. While the zero-argument and one-argument cases will almost + certainly not arise in first-order uses of such relations, they may well + arise in higher-order cases or macro-generated code. + E.g., consider +
+(apply char-set= cset-list) ++
+ This is well-defined if the list is empty or a singleton list. Hence + we extend these relations to any number of arguments. Implementors + have reported actual uses of n-ary relations in higher-order cases + allowing for fewer than two arguments. The way of Scheme is to handle the + general case; we provide the fully general extension. +
+ A counter-argument to this extension is that
+ R5RS's
+ transitive binary arithmetic relations
+ (=
, <
, etc.)
+ require at least two arguments, hence
+ this decision is a break with the prior convention -- although it is
+ at least one that is backwards-compatible.
+
+
+
char-set<=
cs1 ... -> boolean
++Boundary cases: +
+(char-set<=) => true +(char-set<= cs) => true ++
+Rationale: See char-set=
for discussion of zero- and one-argument
+applications. Consider testing a list of char-sets for monotonicity
+with
+
+(apply char-set<= cset-list) ++ + +
char-set-hash
cs [bound] -> integer
++ If bound is either zero or not given, the implementation may use + an implementation-specific default value, chosen to be as large as + is efficiently practical. For instance, the default range might be chosen + for a given implementation to map all character sets into the range of + integers that can be represented with a single machine word. + + +
+ Invariant: +
+(char-set= cs1 cs2) => (= (char-set-hash cs1 b) (char-set-hash cs2 b)) ++ +
+ A legal but nonetheless discouraged implementation: +
+(define (char-set-hash cs . maybe-bound) 1) ++ +
+ Rationale: allowing the user to specify an explicit bound simplifies user + code by removing the mod operation that typically accompanies every hash + computation, and also may allow the implementation of the hash function to + exploit a reduced range to efficiently compute the hash value. + E.g., for + small bounds, the hash function may be computed in a fashion such that + intermediate values never overflow into bignum integers, allowing the + implementor to provide a fixnum-specific "fast path" for computing the + common cases very rapidly. + +
char-set-cursor
cset -> cursor
+char-set-ref
cset cursor -> char
+char-set-cursor-next
cset cursor -> cursor
+end-of-char-set?
cursor -> boolean
+char-set-cursor
produces a new cursor for a given char set.
+ The set element indexed by the cursor is fetched with
+ char-set-ref
.
+ A cursor index is incremented with char-set-cursor-next
;
+ in this way, code can step through every character in a char set.
+ Stepping a cursor "past the end" of a char set produces a cursor that
+ answers true to end-of-char-set?
.
+ It is an error to pass such a cursor to char-set-ref
or to
+ char-set-cursor-next
.
+
+
+ A cursor value may not be used in conjunction with a different character
+ set; if it is passed to char-set-ref
or
+ char-set-cursor-next
with
+ a character set other than the one used to create it, the results and
+ effects are undefined.
+
+
+ Cursor values are not necessarily distinct from other types. + They may be + integers, linked lists, records, procedures or other values. This license + is granted to allow cursors to be very "lightweight" values suitable for + tight iteration, even in fairly simple implementations. + +
+ Note that these primitives are necessary to export an iteration facility + for char sets to loop macros. + +
+ Example: +
+(define cs (char-set #\G #\a #\T #\e #\c #\h)) + +;; Collect elts of CS into a list. +(let lp ((cur (char-set-cursor cs)) (ans '())) + (if (end-of-char-set? cur) ans + (lp (char-set-cursor-next cs cur) + (cons (char-set-ref cs cur) ans)))) + => (#\G #\T #\a #\c #\e #\h) + +;; Equivalently, using a list unfold (from SRFI 1): +(unfold-right end-of-char-set? + (curry char-set-ref cs) + (curry char-set-cursor-next cs) + (char-set-cursor cs)) + => (#\G #\T #\a #\c #\e #\h) ++ +
+ Rationale: Note that the cursor API's four functions "fit" the functional
+ protocol used by the unfolders provided by the list, string and char-set
+ SRFIs (see the example above). By way of contrast, here is a simpler,
+ two-function API that was rejected for failing this criterion. Besides
+ char-set-cursor
, it provided a single
+ function that mapped a cursor and a character set to two values, the
+ indexed character and the next cursor. If the cursor had exhausted the
+ character set, then this function returned false instead of the character
+ value, and another end-of-char-set cursor. In this way, the other three
+ functions of the current API were combined together.
+
+
+
char-set-fold
kons knil cs -> object
++(char-set-fold kons (kons c knil) cs') ++
+ Examples: +
+;; CHAR-SET-MEMBERS +(lambda (cs) (char-set-fold cons '() cs)) + +;; CHAR-SET-SIZE +(lambda (cs) (char-set-fold (lambda (c i) (+ i 1)) 0 cs)) + +;; How many vowels in the char set? +(lambda (cs) + (char-set-fold (lambda (c i) (if (vowel? c) (+ i 1) i)) + 0 cs)) ++ + +
char-set-unfold
 p f g seed [base-cs] -> char-set
+char-set-unfold!
 p f g seed base-cs -> char-set
+char-set-unfold!
adds the characters to base-cs in a
+ linear-update -- it is allowed, but not required, to side-effect
+ and use base-cs's storage to construct the result.
++ More precisely, the following definitions hold, ignoring the + optional-argument issues: + +
+(define (char-set-unfold p f g seed base-cs) + (char-set-unfold! p f g seed (char-set-copy base-cs))) + +(define (char-set-unfold! p f g seed base-cs) + (let lp ((seed seed) (cs base-cs)) + (if (p seed) cs ; P says we are done. + (lp (g seed) ; Loop on (G SEED). + (char-set-adjoin! cs (f seed)))))) ; Add (F SEED) to set. ++ + (Note that the actual implementation may be more efficient.) + +
+ Examples: +
+(port->char-set p) = (char-set-unfold eof-object? values + (lambda (x) (read-char p)) + (read-char p)) + +(list->char-set lis) = (char-set-unfold null? car cdr lis) ++ +
char-set-for-each
proc cs -> unspecified
+
+ Nothing at all is specified about the value returned by this procedure; it
+ is not even required to be consistent from call to call. It is simply
+ required to be a value (or values) that may be passed to a command
+ continuation, e.g. as the value of an expression appearing as a
+ non-terminal subform of a begin
expression.
+ Note that in
+ R5RS,
+ this restricts the procedure to returning a single value;
+ non-R5RS systems may not even provide this restriction.
+
+
+
char-set-map
proc cs -> char-set
++ Essentially lifts proc from a char->char procedure to a char-set -> + char-set procedure. + +
+ Example: +
+(char-set-map char-downcase cset) ++
char-set-copy
cs -> char-set
+
+ A system that provides pure-functional implementations of the
+ linear-operator suite could implement this procedure as the identity
+ function -- so copies are not guaranteed to be distinct by eq?
.
+
+
+
char-set
char1 ... -> char-set
+list->char-set
char-list [base-cs] -> char-set
+list->char-set!
char-list base-cs -> char-set
+
+ If character set base-cs is provided, the characters from char-list
+ are added to it. list->char-set!
is allowed, but not required,
+ to side-effect and reuse the storage in base-cs;
+ list->char-set
produces a fresh character set.
+
+
+
string->char-set
s [base-cs] -> char-set
+string->char-set!
s base-cs -> char-set
+
+ If character set base-cs is provided, the characters from s are added to
+ it. string->char-set!
is allowed, but not required, to side-effect and
+ reuse the storage in base-cs; string->char-set
produces a fresh character
+ set.
+
+
+
char-set-filter
pred cs [base-cs] -> char-set
+char-set-filter!
pred cs base-cs -> char-set
+(pred c)
+ returns true.
+
+
+ If character set base-cs is provided, the characters specified
+ by pred are added to it.
+ char-set-filter!
is allowed, but not required,
+ to side-effect and reuse the storage in base-cs;
+ char-set-filter
produces a fresh character set.
+
+
+ An implementation may not save away a reference to pred and
+ invoke it after char-set-filter
or
+ char-set-filter!
returns -- that is, "lazy,"
+ on-demand implementations are not allowed, as pred may have
+ external dependencies on mutable data or have other side-effects.
+
+
+ Rationale: This procedure provides a means of converting a character
+ predicate into its equivalent character set; the cs parameter
+ allows the programmer to bound the predicate's domain. Programmers should
+ be aware that filtering a character set such as char-set:full
+ could be a very expensive operation in an implementation that provided an
+ extremely large character type, such as 32-bit Unicode. An earlier draft
+ of this library provided a simple predicate->char-set
+ procedure, which was rejected in favor of char-set-filter
for
+ this reason.
+
+
+
+
ucs-range->char-set
lower upper [error? base-cs] -> char-set
+ucs-range->char-set!
lower upper error? base-cs -> char-set
++ Returns a character set containing every character whose ISO/IEC 10646 + UCS-4 code lies in the half-open range [lower,upper). + +
+ If character set base-cs is provided, the characters specified by the
+ range are added to it. ucs-range->char-set!
is allowed, but not required,
+ to side-effect and reuse the storage in base-cs;
+ ucs-range->char-set
produces a fresh character set.
+
+
+ Note that ASCII codes are a subset of the Latin-1 codes, which are in turn + a subset of the 16-bit Unicode codes, which are themselves a subset of the + 32-bit UCS-4 codes. We commit to a specific encoding in this routine, + regardless of the underlying representation of characters, so that client + code using this library will be portable. I.e., a conformant Scheme + implementation may use EBCDIC or SHIFT-JIS to encode characters; it must + simply map the UCS characters from the given range into the native + representation when possible, and report errors when not possible. + + +
->char-set
x -> char-set
+char-set-size
cs -> integer
+char-set-count
pred cs -> integer
+char-set->list
cs -> character-list
+char-set->string
cs -> string
+char-set-contains?
cs char -> boolean
++ The MIT Scheme character-set package called this procedure + char-set-member?, but the argument order isn't consistent with the name. + + +
char-set-every
pred cs -> boolean
+char-set-any
pred cs -> boolean
+char-set-every
procedure returns true if predicate pred
+ returns true of every character in the character set cs.
+ Likewise, char-set-any
applies pred to every character in
+ character set cs, and returns the first true value it finds.
+ If no character produces a true value, it returns false.
+ The order in which these procedures sequence through the elements of
+ cs is not specified.
+
+
+ Note that if you need to determine the actual character on which a
+ predicate returns true, use char-set-any
and arrange for the predicate
+ to return the character parameter as its true value, e.g.
+
+(char-set-any (lambda (c) (and (char-upper-case? c) c)) + cs) ++
char-set-adjoin
cs char1 ... -> char-set
+char-set-delete
cs char1 ... -> char-set
+char-set-adjoin!
cs char1 ... -> char-set
+char-set-delete!
cs char1 ... -> char-set
+char-set-complement
cs -> char-set
+char-set-union
cs1 ... -> char-set
+char-set-intersection
cs1 ... -> char-set
+char-set-difference
cs1 cs2 ... -> char-set
+char-set-xor
cs1 ... -> char-set
+char-set-diff+intersection
cs1 cs2 ... -> [char-set char-set]
++ Boundary cases: +
+(char-set-union) => char-set:empty +(char-set-intersection) => char-set:full +(char-set-xor) => char-set:empty +(char-set-difference cs) => cs ++ +
+ char-set-diff+intersection
returns both the difference and the
+ intersection of the arguments -- it partitions its first parameter.
+ It is equivalent to
+
+(values (char-set-difference cs1 cs2 ...) + (char-set-intersection cs1 (char-set-union cs2 ...))) ++ but can be implemented more efficiently. + +
+ Programmers should be aware that char-set-complement
could potentially
+ be a very expensive operation in Scheme implementations that provide
+ a very large character type, such as 32-bit Unicode. If this is a
+    possibility, sets can be complemented with respect to a smaller
+ universe using char-set-difference
.
+
+
+
+
char-set-complement!
cs -> char-set
+char-set-union!
cs1 cs2 ... -> char-set
+char-set-intersection!
cs1 cs2 ... -> char-set
+char-set-difference!
cs1 cs2 ... -> char-set
+char-set-xor!
cs1 cs2 ... -> char-set
+char-set-diff+intersection!
cs1 cs2 cs3 ... -> [char-set char-set]
+
+ char-set-diff+intersection!
is allowed to side-effect both
+ of its two required parameters, cs1
+ and cs2.
+
+Several character sets are predefined for convenience: + + + + + + + + + + + + + + + + + + +
char-set:lower-case | Lower-case letters |
char-set:upper-case | Upper-case letters |
char-set:title-case | Title-case letters |
char-set:letter | Letters |
char-set:digit | Digits |
char-set:letter+digit | Letters and digits |
char-set:graphic | Printing characters except spaces |
char-set:printing | Printing characters including spaces |
char-set:whitespace | Whitespace characters |
char-set:iso-control | The ISO control characters |
char-set:punctuation | Punctuation characters |
char-set:symbol | Symbol characters |
char-set:hex-digit | A hexadecimal digit: 0-9, A-F, a-f |
char-set:blank | Blank characters -- horizontal whitespace |
char-set:ascii | All characters in the ASCII set. |
char-set:empty | Empty set |
char-set:full | All characters |
+Note that there may be characters in char-set:letter
 that are neither upper nor
+lower case---this might occur in implementations that use a character type
+richer than ASCII, such as Unicode. A "graphic character" is one that would
+put ink on your page. While the exact composition of these sets may vary
+depending upon the character type provided by the underlying Scheme system,
+here are the definitions for some of the sets in an ASCII implementation:
+
char-set:lower-case | a-z |
char-set:upper-case | A-Z |
char-set:letter | A-Z and a-z |
char-set:digit | 0123456789 |
char-set:punctuation | !"#%&'()*,-./:;?@[\]_{} |
char-set:symbol | $+<=>^`|~ |
char-set:whitespace | Space, newline, tab, form feed, |
vertical tab, carriage return | |
char-set:blank | Space and tab |
char-set:graphic | letter + digit + punctuation + symbol |
char-set:printing | graphic + whitespace |
char-set:iso-control | ASCII 0-31 and 127 |
+Note that the existence of the char-set:ascii
set implies that the underlying
+character set is required to be at least as rich as ASCII (including
+ASCII's control characters).
+
+
+Rationale: The name choices reflect a shift from the older "alphabetic/numeric" +terms found in +R5RS +and Posix to newer, Unicode-influenced "letter/digit" lexemes. + + +
+In Unicode Scheme implementations, the base character sets are compatible with +Java's Unicode specifications. For ASCII or Latin-1, we simply restrict the +Unicode set specifications to their first 128 or 256 codes, respectively. +Scheme implementations that are not based on ASCII, Latin-1 or Unicode should +attempt to preserve the sense or spirit of these definitions. + +
+The following descriptions frequently make reference to the "Unicode character +database." This is a file, available at URL +
++Each line contains a description of a Unicode character. The first +semicolon-delimited field of the line gives the hex value of the character's +code; the second field gives the name of the character, and the third field +gives a two-letter category. Other fields give simple 1-1 case-mappings for +the character and other information; see +
++for further description of the file's format. Note in particular the +two-letter category specified in the third field, which is referenced +frequently in the descriptions below. + + +
+For Unicode, we follow Java's specification: a character is lowercase if +
+The lower-case ASCII characters are +
+Latin-1 adds another 33 lower-case characters to the ASCII set: +
00B5 | MICRO SIGN |
00DF | LATIN SMALL LETTER SHARP S |
00E0 | LATIN SMALL LETTER A WITH GRAVE |
00E1 | LATIN SMALL LETTER A WITH ACUTE |
00E2 | LATIN SMALL LETTER A WITH CIRCUMFLEX |
00E3 | LATIN SMALL LETTER A WITH TILDE |
00E4 | LATIN SMALL LETTER A WITH DIAERESIS |
00E5 | LATIN SMALL LETTER A WITH RING ABOVE |
00E6 | LATIN SMALL LETTER AE |
00E7 | LATIN SMALL LETTER C WITH CEDILLA |
00E8 | LATIN SMALL LETTER E WITH GRAVE |
00E9 | LATIN SMALL LETTER E WITH ACUTE |
00EA | LATIN SMALL LETTER E WITH CIRCUMFLEX |
00EB | LATIN SMALL LETTER E WITH DIAERESIS |
00EC | LATIN SMALL LETTER I WITH GRAVE |
00ED | LATIN SMALL LETTER I WITH ACUTE |
00EE | LATIN SMALL LETTER I WITH CIRCUMFLEX |
00EF | LATIN SMALL LETTER I WITH DIAERESIS |
00F0 | LATIN SMALL LETTER ETH |
00F1 | LATIN SMALL LETTER N WITH TILDE |
00F2 | LATIN SMALL LETTER O WITH GRAVE |
00F3 | LATIN SMALL LETTER O WITH ACUTE |
00F4 | LATIN SMALL LETTER O WITH CIRCUMFLEX |
00F5 | LATIN SMALL LETTER O WITH TILDE |
00F6 | LATIN SMALL LETTER O WITH DIAERESIS |
00F8 | LATIN SMALL LETTER O WITH STROKE |
00F9 | LATIN SMALL LETTER U WITH GRAVE |
00FA | LATIN SMALL LETTER U WITH ACUTE |
00FB | LATIN SMALL LETTER U WITH CIRCUMFLEX |
00FC | LATIN SMALL LETTER U WITH DIAERESIS |
00FD | LATIN SMALL LETTER Y WITH ACUTE |
00FE | LATIN SMALL LETTER THORN |
00FF | LATIN SMALL LETTER Y WITH DIAERESIS |
+Note that three of these have no corresponding Latin-1 upper-case character: +
00B5 | MICRO SIGN |
00DF | LATIN SMALL LETTER SHARP S |
00FF | LATIN SMALL LETTER Y WITH DIAERESIS |
+(The compatibility micro character uppercases to the non-Latin-1 Greek capital +mu; the German sharp s character uppercases to the pair of characters "SS," +and the capital y-with-diaeresis is non-Latin-1.) + +
+(Note that the Java spec for lowercase characters given at +
++is inconsistent. U+00B5 MICRO SIGN fulfills the requirements for a lower-case +character (as of Unicode 3.0), but is not given in the numeric list of +lower-case character codes.) + +
+(Note that the Java spec for isLowerCase()
given at
+
+gives three mutually inconsistent definitions of "lower case." The first is +the definition used in this SRFI. Following text says "A character is +considered to be lowercase if and only if it is specified to be lowercase by +the Unicode 2.0 standard (category Ll in the Unicode specification data +file)." The former spec excludes U+00AA FEMININE ORDINAL INDICATOR and +U+00BA MASCULINE ORDINAL INDICATOR; the latter spec includes them. Finally, +the spec enumerates a list of characters in the Latin-1 subset; this list +excludes U+00B5 MICRO SIGN, which is included in both of the previous specs.) + + +
+For Unicode, we follow Java's specification: a character is uppercase if +
+The upper-case ASCII characters are +
+Latin-1 adds another 30 upper-case characters to the ASCII set: +
00C0 | LATIN CAPITAL LETTER A WITH GRAVE |
00C1 | LATIN CAPITAL LETTER A WITH ACUTE |
00C2 | LATIN CAPITAL LETTER A WITH CIRCUMFLEX |
00C3 | LATIN CAPITAL LETTER A WITH TILDE |
00C4 | LATIN CAPITAL LETTER A WITH DIAERESIS |
00C5 | LATIN CAPITAL LETTER A WITH RING ABOVE |
00C6 | LATIN CAPITAL LETTER AE |
00C7 | LATIN CAPITAL LETTER C WITH CEDILLA |
00C8 | LATIN CAPITAL LETTER E WITH GRAVE |
00C9 | LATIN CAPITAL LETTER E WITH ACUTE |
00CA | LATIN CAPITAL LETTER E WITH CIRCUMFLEX |
00CB | LATIN CAPITAL LETTER E WITH DIAERESIS |
00CC | LATIN CAPITAL LETTER I WITH GRAVE |
00CD | LATIN CAPITAL LETTER I WITH ACUTE |
00CE | LATIN CAPITAL LETTER I WITH CIRCUMFLEX |
00CF | LATIN CAPITAL LETTER I WITH DIAERESIS |
00D0 | LATIN CAPITAL LETTER ETH |
00D1 | LATIN CAPITAL LETTER N WITH TILDE |
00D2 | LATIN CAPITAL LETTER O WITH GRAVE |
00D3 | LATIN CAPITAL LETTER O WITH ACUTE |
00D4 | LATIN CAPITAL LETTER O WITH CIRCUMFLEX |
00D5 | LATIN CAPITAL LETTER O WITH TILDE |
00D6 | LATIN CAPITAL LETTER O WITH DIAERESIS |
00D8 | LATIN CAPITAL LETTER O WITH STROKE |
00D9 | LATIN CAPITAL LETTER U WITH GRAVE |
00DA | LATIN CAPITAL LETTER U WITH ACUTE |
00DB | LATIN CAPITAL LETTER U WITH CIRCUMFLEX |
00DC | LATIN CAPITAL LETTER U WITH DIAERESIS |
00DD | LATIN CAPITAL LETTER Y WITH ACUTE |
00DE | LATIN CAPITAL LETTER THORN |
+In Unicode, a character is titlecase if it has the category Lt in +the character attribute database. There are very few of these characters; +here is the entire 31-character list as of Unicode 3.0: +
01C5 | LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON + |
01C8 | LATIN CAPITAL LETTER L WITH SMALL LETTER J + |
01CB | LATIN CAPITAL LETTER N WITH SMALL LETTER J + |
01F2 | LATIN CAPITAL LETTER D WITH SMALL LETTER Z + |
1F88 | GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI + |
1F89 | GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI + |
1F8A | GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI + |
1F8B | GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI + |
1F8C | GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI + |
1F8D | GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI + |
1F8E | GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + |
1F8F | GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + |
1F98 | GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI + |
1F99 | GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI + |
1F9A | GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI + |
1F9B | GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI + |
1F9C | GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI + |
1F9D | GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI + |
1F9E | GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + |
1F9F | GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + |
1FA8 | GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI + |
1FA9 | GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI + |
1FAA | GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI + |
1FAB | GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI + |
1FAC | GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI + |
1FAD | GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI + |
1FAE | GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + |
1FAF | GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + |
1FBC | GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + |
1FCC | GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + |
1FFC | GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + |
+There are no ASCII or Latin-1 titlecase characters. + + + +
+In Unicode, a letter is any character with one of the letter categories +(Lu, Ll, Lt, Lm, Lo) in the Unicode character database. + +
+There are 52 ASCII letters +
+There are 117 Latin-1 letters. These are the 115 characters that are
+members of the Latin-1 char-set:lower-case
and char-set:upper-case
sets,
+plus
+
00AA | FEMININE ORDINAL INDICATOR |
00BA | MASCULINE ORDINAL INDICATOR |
+(These two letters are considered lower-case by Unicode, but not by +Java or SRFI 14.) + + +
+In Unicode, a character is a digit if it has the category Nd in +the character attribute database. In Latin-1 and ASCII, the only +such characters are 0123456789. In Unicode, there are other digit +characters in other code blocks, such as Gujarati digits and Tibetan +digits. + + + +
+The only hex digits are 0123456789abcdefABCDEF. + + + +
+The union of char-set:letter
and char-set:digit.
+
+
+
+A graphic character is one that would put ink on paper. The ASCII and Latin-1 +graphic characters are the members of +
char-set:letter |
char-set:digit |
char-set:punctuation |
char-set:symbol |
+A printing character is one that would occupy space when printed, i.e.,
+a graphic character or a space character. char-set:printing
is the union
+of char-set:whitespace
and char-set:graphic.
+
+
+
+In Unicode, a whitespace character is either +
+There are 24 whitespace characters in Unicode 3.0: +
0009 | HORIZONTAL TABULATION | \t control-I |
000A | LINE FEED | \n control-J |
000B | VERTICAL TABULATION | \v control-K |
000C | FORM FEED | \f control-L |
000D | CARRIAGE RETURN | \r control-M |
0020 | SPACE | Zs |
00A0 | NO-BREAK SPACE | Zs |
1680 | OGHAM SPACE MARK | Zs |
2000 | EN QUAD | Zs |
2001 | EM QUAD | Zs |
2002 | EN SPACE | Zs |
2003 | EM SPACE | Zs |
2004 | THREE-PER-EM SPACE | Zs |
2005 | FOUR-PER-EM SPACE | Zs |
2006 | SIX-PER-EM SPACE | Zs |
2007 | FIGURE SPACE | Zs |
2008 | PUNCTUATION SPACE | Zs |
2009 | THIN SPACE | Zs |
200A | HAIR SPACE | Zs |
200B | ZERO WIDTH SPACE | Zs |
2028 | LINE SEPARATOR | Zl |
2029 | PARAGRAPH SEPARATOR | Zp |
202F | NARROW NO-BREAK SPACE | Zs |
3000 | IDEOGRAPHIC SPACE | Zs |
+The ASCII whitespace characters are the first six characters in the above list
+-- line feed, horizontal tabulation, vertical tabulation, form feed, carriage
+return, and space. These are also exactly the characters recognised by the
+Posix isspace()
procedure. Latin-1 adds the no-break space.
+
+
+Note: Java's isWhitespace()
method is incompatible, including
+
0009 | HORIZONTAL TABULATION | (\t control-I) |
001C | FILE SEPARATOR | (control-\) |
001D | GROUP SEPARATOR | (control-]) |
001E | RECORD SEPARATOR | (control-^) |
001F | UNIT SEPARATOR | (control-_) |
+and excluding +
00A0 | NO-BREAK SPACE |
+Java's excluding the no-break space means that tokenizers can simply break
+character streams at "whitespace" boundaries. However, the exclusion introduces
+exceptions in other places, e.g. char-set:printing
is no longer simply the
+union of char-set:graphic
and char-set:whitespace.
+
+
+
+
+The ISO control characters are the Unicode/Latin-1 characters in the ranges +[U+0000,U+001F] and [U+007F,U+009F]. + +
+ASCII restricts this set to the characters in the range [U+0000,U+001F] +plus the character U+007F. + +
+Note that Unicode defines other control characters which do not belong to this
+set (hence the qualifying prefix "iso-" in the name). This restriction is
+compatible with the Java isISOControl()
method.
+
+
+
+
+In Unicode, a punctuation character is any character that has one of the +punctuation categories in the Unicode character database (Pc, Pd, Ps, +Pe, Pi, Pf, or Po.) + +
+ASCII has 23 punctuation characters: +
+!"#%&'()*,-./:;?@[\]_{} ++
+Latin-1 adds six more: +
00A1 | INVERTED EXCLAMATION MARK + |
00AB | LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + |
00AD | SOFT HYPHEN + |
00B7 | MIDDLE DOT + |
00BB | RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + |
00BF | INVERTED QUESTION MARK + |
+Note that the nine ASCII characters $+<=>^`|~
are not
+punctuation. They are "symbols."
+
+
+
+
+In Unicode, a symbol is any character that has one of the symbol categories +in the Unicode character database (Sm, Sc, Sk, or So). There +are nine ASCII symbol characters: +
+$+<=>^`|~ ++
+Latin-1 adds 18 more: +
00A2 | CENT SIGN |
00A3 | POUND SIGN |
00A4 | CURRENCY SIGN |
00A5 | YEN SIGN |
00A6 | BROKEN BAR |
00A7 | SECTION SIGN |
00A8 | DIAERESIS |
00A9 | COPYRIGHT SIGN |
00AC | NOT SIGN |
00AE | REGISTERED SIGN |
00AF | MACRON |
00B0 | DEGREE SIGN |
00B1 | PLUS-MINUS SIGN |
00B4 | ACUTE ACCENT |
00B6 | PILCROW SIGN |
00B8 | CEDILLA |
00D7 | MULTIPLICATION SIGN |
00F7 | DIVISION SIGN |
+Blank chars are horizontal whitespace. In Unicode, a blank character is either +
+There are eighteen blank characters in Unicode 3.0: +
0009 | HORIZONTAL TABULATION | \t control-I |
0020 | SPACE | Zs |
00A0 | NO-BREAK SPACE | Zs |
1680 | OGHAM SPACE MARK | Zs |
2000 | EN QUAD | Zs |
2001 | EM QUAD | Zs |
2002 | EN SPACE | Zs |
2003 | EM SPACE | Zs |
2004 | THREE-PER-EM SPACE | Zs |
2005 | FOUR-PER-EM SPACE | Zs |
2006 | SIX-PER-EM SPACE | Zs |
2007 | FIGURE SPACE | Zs |
2008 | PUNCTUATION SPACE | Zs |
2009 | THIN SPACE | Zs |
200A | HAIR SPACE | Zs |
200B | ZERO WIDTH SPACE | Zs |
202F | NARROW NO-BREAK SPACE | Zs |
3000 | IDEOGRAPHIC SPACE | Zs |
+The ASCII blank characters are the first two characters above -- +horizontal tab and space. Latin-1 adds the no-break space. + +
+Java doesn't have the concept of "blank" characters, so there are no +compatibility issues. + + + +
+This SRFI comes with a reference implementation. It resides at: +
++I have placed this source on the Net with an unencumbered, "open" copyright. +Some of the code in the reference implementation bears a distant family +relation to the MIT Scheme implementation, and being derived from that code, +is covered by the MIT Scheme copyright (which is a generic BSD-style +open-source copyright -- see the source file for details). The remainder of +the code was written by myself for scsh or for this SRFI; I have placed this +code under the scsh copyright, which is also a generic BSD-style open-source +copyright. + +
+The code is written for portability and should be simple to port to +any Scheme. It has only the following deviations from R4RS, clearly +discussed in the comments: +
error
procedure;
+ values
procedure for producing multiple return values;
+ check-arg
procedure for argument checking;
+ let-optionals*
and :optional
macros for parsing, checking and defaulting
+ optional arguments from rest lists;
+ define-record-type
form;
+ bitwise-and
for the hash function;
+ %latin1->char
and %char->latin1
.
++The library is written for clarity and well-commented; the current source is +about 375 lines of source code and 375 lines of comments and white space. +It is also written for efficiency. Fast paths are provided for common cases. + +
+This is not to say that the implementation can't be tuned up for +a specific Scheme implementation. There are notes in comments addressing +ways implementors can tune the reference implementation for performance. + +
+In short, I've written the reference implementation to make it as painless +as possible for an implementor -- or a regular programmer -- to adopt this +library and get good results with it. + +
+The code uses a rather simple-minded, inefficient representation for +ASCII/Latin-1 char-sets -- a 256-character string. The character whose code is +i is in the set if s[i] = ASCII 1 (soh, or ^a); +not in the set if s[i] = ASCII 0 (nul). +A much faster and denser representation would be 16 or 32 bytes worth +of bit string. A portable implementation using bit sets awaits standards for +bitwise logical-ops and byte vectors. + +
+"Large" character types, such as Unicode, should use a sparse representation, +taking care that the Latin-1 subset continues to be represented with a +dense 32-byte bit set. + + + +
+The design of this library benefited greatly from the feedback provided during +the SRFI discussion phase. Among those contributing thoughtful commentary and +suggestions, both on the mailing list and by private discussion, were Paolo +Amoroso, Lars Arvestad, Alan Bawden, Jim Bender, Dan Bornstein, Per Bothner, +Will Clinger, Brian Denheyer, Kent Dybvig, Sergei Egorov, Marc Feeley, +Matthias Felleisen, Will Fitzgerald, Matthew Flatt, Arthur A. Gleckler, Ben +Goetter, Sven Hartrumpf, Erik Hilsdale, Shiro Kawai, Richard Kelsey, Oleg +Kiselyov, Bengt Kleberg, Donovan Kolbly, Bruce Korb, Shriram Krishnamurthi, +Bruce Lewis, Tom Lord, Brad Lucier, Dave Mason, David Rush, Klaus Schilling, +Jonathan Sobel, Mike Sperber, Mikael Staldal, Vladimir Tsyshevsky, Donald +Welsh, and Mike Wilson. I am grateful to them for their assistance. + +
+I am also grateful to the authors, implementors and documentors of all the +systems mentioned in the introduction. Aubrey Jaffer should be noted for his +work in producing Web-accessible versions of the R5RS spec, which was a +tremendous aid. + +
+This is not to imply that these individuals necessarily endorse the final +results, of course. + +
+During this document's long development period, great patience was exhibited +by Mike Sperber, who is the editor for the SRFI, and by Hillary Sullivan, +who is not. + + +
+Certain portions of this document -- the specific, marked segments of text +describing the R5RS procedures -- were adapted with permission from the R5RS +report. + +
+All other text is copyright (C) Olin Shivers (1998, 1999, 2000). +All Rights Reserved. + +
+This document and translations of it may be copied and furnished to others, +and derivative works that comment on or otherwise explain it or assist in its +implementation may be prepared, copied, published and distributed, in whole or +in part, without restriction of any kind, provided that the above copyright +notice and this paragraph are included on all such copies and derivative +works. However, this document itself may not be modified in any way, such as +by removing the copyright notice or references to the Scheme Request For +Implementation process or editors, except as needed for the purpose of +developing SRFIs in which case the procedures for copyrights defined in the +SRFI process must be followed, or as required to translate it into languages +other than English. + +
+The limited permissions granted above are perpetual and will not be revoked by +the authors or their successors or assigns. + +
+This document and the information contained herein is provided on an +"as is" basis and the authors and the SRFI editors +disclaim all warranties, express or implied, including but not limited to any +warranty that the use of the information herein will not infringe any rights +or any implied warranties of merchantability or fitness for a particular +purpose. + + + + diff --git a/scsh/lib/cset-lib.scm b/scsh/lib/cset-lib.scm new file mode 100644 index 0000000..2effd4b --- /dev/null +++ b/scsh/lib/cset-lib.scm @@ -0,0 +1,804 @@ +;;; SRFI-14 character-sets library -*- Scheme -*- +;;; +;;; - Ported from MIT Scheme runtime by Brian D. Carlstrom. +;;; - Massively rehacked & extended by Olin Shivers 6/98. +;;; - Massively redesigned and rehacked 5/2000 during SRFI process. +;;; At this point, the code bears the following relationship to the +;;; MIT Scheme code: "This is my grandfather's axe. My father replaced +;;; the head, and I have replaced the handle." Nonetheless, we preserve +;;; the MIT Scheme copyright: +;;; Copyright (c) 1988-1995 Massachusetts Institute of Technology +;;; The MIT Scheme license is a "free software" license. See the end of +;;; this file for the tedious details. + +;;; Exports: +;;; char-set? char-set= char-set<= +;;; char-set-hash +;;; char-set-cursor char-set-ref char-set-cursor-next end-of-char-set? +;;; char-set-fold char-set-unfold char-set-unfold! +;;; char-set-for-each char-set-map +;;; char-set-copy char-set +;;; +;;; list->char-set string->char-set +;;; list->char-set! string->char-set! +;;; +;;; filterchar-set ucs-range->char-set ->char-set +;;; filterchar-set! ucs-range->char-set! +;;; +;;; char-set->list char-set->string +;;; +;;; char-set-size char-set-count char-set-contains? +;;; char-set-every char-set-any +;;; +;;; char-set-adjoin char-set-delete +;;; char-set-adjoin! char-set-delete! +;;; + +;;; char-set-complement char-set-union char-set-intersection +;;; char-set-complement! char-set-union! char-set-intersection! 
+;;; +;;; char-set-difference char-set-xor char-set-diff+intersection +;;; char-set-difference! char-set-xor! char-set-diff+intersection! +;;; +;;; char-set:lower-case char-set:upper-case char-set:title-case +;;; char-set:letter char-set:digit char-set:letter+digit +;;; char-set:graphic char-set:printing char-set:whitespace +;;; char-set:iso-control char-set:punctuation char-set:symbol +;;; char-set:hex-digit char-set:blank char-set:ascii +;;; char-set:empty char-set:full + +;;; Imports +;;; This code has the following non-R5RS dependencies: +;;; - ERROR +;;; - %LATIN1->CHAR %CHAR->LATIN1 +;;; - LET-OPTIONALS* and :OPTIONAL macros for parsing, checking & defaulting +;;; optional arguments from rest lists. +;;; - BITWISE-AND for CHAR-SET-HASH +;;; - The SRFI-9 DEFINE-RECORD-TYPE record macro +;;; - A simple CHECK-ARG procedure: +;;; (lambda (pred val caller) (if (not (pred val)) (error val caller))) + +;;; This is simple code, not great code. Char sets are represented as 256-char +;;; strings. If char I is ASCII/Latin-1 0, then it isn't in the set; if char I +;;; is ASCII/Latin-1 1, then it is in the set. +;;; - Should be rewritten to use bit strings or byte vecs. +;;; - Is Latin-1 specific. Would certainly have to be rewritten for Unicode. + +;;; See the end of the file for porting and performance-tuning notes. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define-record-type :char-set + (make-char-set s) + char-set? + (s char-set:s)) + + +(define (%string-copy s) (substring s 0 (string-length s))) + +;;; Parse, type-check & default a final optional BASE-CS parameter from +;;; a rest argument. Return a *fresh copy* of the underlying string. +;;; The default is the empty set. The PROC argument is to help us +;;; generate informative error exceptions. + +(define (%default-base maybe-base proc) + (if (pair? maybe-base) + (let ((bcs (car maybe-base)) + (tail (cdr maybe-base))) + (if (null? tail) + (if (char-set? 
bcs) (%string-copy (char-set:s bcs)) + (error "BASE-CS parameter not a char-set" proc bcs)) + (error "Expected final base char set -- too many parameters" + proc maybe-base))) + (make-string 256 (%latin1->char 0)))) + +;;; If CS is really a char-set, do CHAR-SET:S, otw report an error msg on +;;; behalf of our caller, PROC. This procedure exists basically to provide +;;; explicit error-checking & reporting. + +(define (%char-set:s/check cs proc) + (let lp ((cs cs)) + (if (char-set? cs) (char-set:s cs) + (lp (error "Not a char-set" cs proc))))) + + + +;;; These internal functions hide a lot of the dependency on the +;;; underlying string representation of char sets. They should be +;;; inlined if possible. + +(define (si=0? s i) (zero? (%char->latin1 (string-ref s i)))) +(define (si=1? s i) (not (si=0? s i))) +(define c0 (%latin1->char 0)) +(define c1 (%latin1->char 1)) +(define (si s i) (%char->latin1 (string-ref s i))) +(define (%set0! s i) (string-set! s i c0)) +(define (%set1! s i) (string-set! s i c1)) + +;;; These do various "s[i] := s[i] op val" operations -- see +;;; %CHAR-SET-ALGEBRA. They are used to implement the various +;;; set-algebra procedures. +(define (setv! s i v) (string-set! s i (%latin1->char v))) ; SET to a Value. +(define (%not! s i v) (setv! s i (- 1 v))) +(define (%and! s i v) (if (zero? v) (%set0! s i))) +(define (%or! s i v) (if (not (zero? v)) (%set1! s i))) +(define (%minus! s i v) (if (not (zero? v)) (%set0! s i))) +(define (%xor! s i v) (if (not (zero? v)) (setv! s i (- 1 (si s i))))) + + +(define (char-set-copy cs) + (make-char-set (%string-copy (%char-set:s/check cs char-set-copy)))) + +(define (char-set= . rest) + (or (null? rest) + (let* ((cs1 (car rest)) + (rest (cdr rest)) + (s1 (%char-set:s/check cs1 char-set=))) + (let lp ((rest rest)) + (or (not (pair? rest)) + (and (string=? s1 (%char-set:s/check (car rest) char-set=)) + (lp (cdr rest)))))))) + +(define (char-set<= . rest) + (or (null? 
rest) + (let ((cs1 (car rest)) + (rest (cdr rest))) + (let lp ((s1 (%char-set:s/check cs1 char-set<=)) (rest rest)) + (or (not (pair? rest)) + (let ((s2 (%char-set:s/check (car rest) char-set<=)) + (rest (cdr rest))) + (if (eq? s1 s2) (lp s2 rest) ; Fast path + (let lp2 ((i 255)) ; Real test + (if (< i 0) (lp s2 rest) + (and (<= (si s1 i) (si s2 i)) + (lp2 (- i 1)))))))))))) + +;;; Hash +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Compute (c + 37 c + 37^2 c + ...) modulo BOUND, with sleaze thrown in +;;; to keep the intermediate values small. (We do the calculation with just +;;; enough bits to represent BOUND, masking off high bits at each step in +;;; calculation. If this screws up any important properties of the hash +;;; function I'd like to hear about it. -Olin) +;;; +;;; If you keep BOUND small enough, the intermediate calculations will +;;; always be fixnums. How small is dependent on the underlying Scheme system; +;;; we use a default BOUND of 2^22 = 4194304, which should hack it in +;;; Schemes that give you at least 29 signed bits for fixnums. The core +;;; calculation that you don't want to overflow is, worst case, +;;; (+ 65535 (* 37 (- bound 1))) +;;; where 65535 is the max character code. Choose the default BOUND to be the +;;; biggest power of two that won't cause this expression to fixnum overflow, +;;; and everything will be copacetic. + +(define (char-set-hash cs . maybe-bound) + (let* ((bound (:optional maybe-bound 4194304 (lambda (n) (and (integer? n) + (exact? n) + (<= 0 n))))) + (bound (if (zero? bound) 4194304 bound)) ; 0 means default. + (s (%char-set:s/check cs char-set-hash)) + ;; Compute a 111...1 mask that will cover BOUND-1: + (mask (let lp ((i #x10000)) ; Let's skip first 16 iterations, eh? + (if (>= i bound) (- i 1) (lp (+ i i)))))) + + (let lp ((i 255) (ans 0)) + (if (< i 0) (modulo ans bound) + (lp (- i 1) + (if (si=0? 
s i) ans + (bitwise-and mask (+ (* 37 ans) i)))))))) + + +(define (char-set-contains? cs char) + (si=1? (%char-set:s/check cs char-set-contains?) + (%char->latin1 (check-arg char? char char-set-contains?)))) + +(define (char-set-size cs) + (let ((s (%char-set:s/check cs char-set-size))) + (let lp ((i 255) (size 0)) + (if (< i 0) size + (lp (- i 1) (+ size (si s i))))))) + +(define (char-set-count pred cset) + (check-arg procedure? pred char-set-count) + (let ((s (%char-set:s/check cset char-set-count))) + (let lp ((i 255) (count 0)) + (if (< i 0) count + (lp (- i 1) + (if (and (si=1? s i) (pred (%latin1->char i))) + (+ count 1) + count)))))) + + +;;; -- Adjoin & delete + +(define (%set-char-set set proc cs chars) + (let ((s (%string-copy (%char-set:s/check cs proc)))) + (for-each (lambda (c) (set s (%char->latin1 c))) + chars) + (make-char-set s))) + +(define (%set-char-set! set proc cs chars) + (let ((s (%char-set:s/check cs proc))) + (for-each (lambda (c) (set s (%char->latin1 c))) + chars)) + cs) + +(define (char-set-adjoin cs . chars) + (%set-char-set %set1! char-set-adjoin cs chars)) +(define (char-set-adjoin! cs . chars) + (%set-char-set! %set1! char-set-adjoin! cs chars)) +(define (char-set-delete cs . chars) + (%set-char-set %set0! char-set-delete cs chars)) +(define (char-set-delete! cs . chars) + (%set-char-set! %set0! char-set-delete! cs chars)) + + +;;; Cursors +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Simple implementation. A cursors is an integer index into the +;;; mark vector, and -1 for the end-of-char-set cursor. +;;; +;;; If we represented char sets as a bit set, we could do the following +;;; trick to pick the lowest bit out of the set: +;;; (count-bits (xor (- cset 1) cset)) +;;; (But first mask out the bits already scanned by the cursor first.) + +(define (char-set-cursor cset) + (%char-set-cursor-next cset 256 char-set-cursor)) + +(define (end-of-char-set? 
cursor) (< cursor 0)) + +(define (char-set-ref cset cursor) (%latin1->char cursor)) + +(define (char-set-cursor-next cset cursor) + (check-arg (lambda (i) (and (integer? i) (exact? i) (<= 0 i 255))) cursor + char-set-cursor-next) + (%char-set-cursor-next cset cursor char-set-cursor-next)) + +(define (%char-set-cursor-next cset cursor proc) ; Internal + (let ((s (%char-set:s/check cset proc))) + (let lp ((cur cursor)) + (let ((cur (- cur 1))) + (if (or (< cur 0) (si=1? s cur)) cur + (lp cur)))))) + + +;;; -- for-each map fold unfold every any + +(define (char-set-for-each proc cs) + (check-arg procedure? proc char-set-for-each) + (let ((s (%char-set:s/check cs char-set-for-each))) + (let lp ((i 255)) + (cond ((>= i 0) + (if (si=1? s i) (proc (%latin1->char i))) + (lp (- i 1))))))) + +(define (char-set-map proc cs) + (check-arg procedure? proc char-set-map) + (let ((s (%char-set:s/check cs char-set-map)) + (ans (make-string 256 c0))) + (let lp ((i 255)) + (cond ((>= i 0) + (if (si=1? s i) + (%set1! ans (%char->latin1 (proc (%latin1->char i))))) + (lp (- i 1))))) + (make-char-set ans))) + +(define (char-set-fold kons knil cs) + (check-arg procedure? kons char-set-fold) + (let ((s (%char-set:s/check cs char-set-fold))) + (let lp ((i 255) (ans knil)) + (if (< i 0) ans + (lp (- i 1) + (if (si=0? s i) ans + (kons (%latin1->char i) ans))))))) + +(define (char-set-every pred cs) + (check-arg procedure? pred char-set-every) + (let ((s (%char-set:s/check cs char-set-every))) + (let lp ((i 255)) + (or (< i 0) + (and (or (si=0? s i) (pred (%latin1->char i))) + (lp (- i 1))))))) + +(define (char-set-any pred cs) + (check-arg procedure? pred char-set-any) + (let ((s (%char-set:s/check cs char-set-any))) + (let lp ((i 255)) + (and (>= i 0) + (or (and (si=1? s i) (pred (%latin1->char i))) + (lp (- i 1))))))) + + +(define (%char-set-unfold! proc p f g s seed) + (check-arg procedure? p proc) + (check-arg procedure? f proc) + (check-arg procedure? 
g proc) + (let lp ((seed seed)) + (cond ((not (p seed)) ; P says we are done. + (%set1! s (%char->latin1 (f seed))) ; Add (F SEED) to set. + (lp (g seed)))))) ; Loop on (G SEED). + +(define (char-set-unfold p f g seed . maybe-base) + (let ((bs (%default-base maybe-base char-set-unfold))) + (%char-set-unfold! char-set-unfold p f g bs seed) + (make-char-set bs))) + +(define (char-set-unfold! p f g seed base-cset) + (%char-set-unfold! char-set-unfold! p f g + (%char-set:s/check base-cset char-set-unfold!) + seed) + base-cset) + + + +;;; list <--> char-set + +(define (%list->char-set! chars s) + (for-each (lambda (char) (%set1! s (%char->latin1 char))) + chars)) + +(define (char-set . chars) + (let ((s (make-string 256 c0))) + (%list->char-set! chars s) + (make-char-set s))) + +(define (list->char-set chars . maybe-base) + (let ((bs (%default-base maybe-base list->char-set))) + (%list->char-set! chars bs) + (make-char-set bs))) + +(define (list->char-set! chars base-cs) + (%list->char-set! chars (%char-set:s/check base-cs list->char-set!)) + base-cs) + + +(define (char-set->list cs) + (let ((s (%char-set:s/check cs char-set->list))) + (let lp ((i 255) (ans '())) + (if (< i 0) ans + (lp (- i 1) + (if (si=0? s i) ans + (cons (%latin1->char i) ans))))))) + + + +;;; string <--> char-set + +(define (%string->char-set! str bs proc) + (check-arg string? str proc) + (do ((i (- (string-length str) 1) (- i 1))) + ((< i 0)) + (%set1! bs (%char->latin1 (string-ref str i))))) + +(define (string->char-set str . maybe-base) + (let ((bs (%default-base maybe-base string->char-set))) + (%string->char-set! str bs string->char-set) + (make-char-set bs))) + +(define (string->char-set! str base-cs) + (%string->char-set! str (%char-set:s/check base-cs string->char-set!) + string->char-set!) 
+  base-cs)
+
+
+;;; Return a fresh string holding one character for every code I in
+;;; 0..255 for which (SI=0? S I) is false, where S is the string obtained
+;;; from CS by %CHAR-SET:S/CHECK.  Codes are visited from 255 down to 0,
+;;; so the characters appear in descending code order.
+
+(define (char-set->string cs)
+  (let* ((s (%char-set:s/check cs char-set->string))
+         (ans (make-string (char-set-size cs))))
+    (let lp ((i 255) (j 0))
+      (if (< i 0) ans
+          (let ((j (if (si=0? s i) j
+                       (begin (string-set! ans j (%latin1->char i))
+                              (+ j 1)))))
+            (lp (- i 1) j))))))
+
+
+;;; -- UCS-range -> char-set
+
+;;; Internal worker.  Applies %SET1! to BS at every code I with
+;;; LOWER <= I < (min UPPER 256), i.e. the range is half-open and is
+;;; clipped to the 256 codes handled here.  LOWER must be an exact
+;;; non-negative integer and UPPER an exact integer >= LOWER.  PROC is the
+;;; exported procedure on whose behalf we run; it is handed to CHECK-ARG
+;;; and ERROR so that reports name the caller.  When ERROR? is true and a
+;;; non-empty range reaches past 256, an error is signalled before BS is
+;;; touched.
+
+(define (%ucs-range->char-set! lower upper error? bs proc)
+  (check-arg (lambda (x) (and (integer? x) (exact? x) (<= 0 x))) lower proc)
+  (check-arg (lambda (x) (and (integer? x) (exact? x) (<= lower x))) upper proc)
+
+  (if (and (< lower upper) (< 256 upper) error?)
+      (error "Requested UCS range contains unavailable characters -- this implementation only supports Latin-1"
+             proc lower upper))
+
+  (let lp ((i (- (min upper 256) 1)))
+    (cond ((<= lower i) (%set1! bs i) (lp (- i 1))))))
+
+;;; Build a new char-set from the code range [LOWER,UPPER).  The first
+;;; optional argument is ERROR? (default #f); whatever remains is passed
+;;; to %DEFAULT-BASE to obtain the initial string (presumably an optional
+;;; base char-set -- %DEFAULT-BASE is defined elsewhere in this file).
+
+(define (ucs-range->char-set lower upper . rest)
+  (let-optionals* rest ((error? #f) rest)
+    (let ((bs (%default-base rest ucs-range->char-set)))
+      (%ucs-range->char-set! lower upper error? bs ucs-range->char-set)
+      (make-char-set bs))))
+
+;;; Linear-update variant: adds the range [LOWER,UPPER) to BASE-CS's own
+;;; string and returns BASE-CS.  Errors are reported under this
+;;; procedure's own name.
+
+(define (ucs-range->char-set! lower upper error? base-cs)
+  (%ucs-range->char-set! lower upper error?
+                         (%char-set:s/check base-cs ucs-range->char-set!)
+                         ucs-range->char-set!)
+  base-cs)
+
+
+;;; -- predicate -> char-set
+
+;;; Internal worker.  For every code I from 255 down to 0: when
+;;; (SI=1? DS I) holds and PRED accepts the character for I, apply %SET1!
+;;; to BS at I.  PRED must be a procedure; PROC names the exported caller
+;;; for error reports.
+
+(define (%char-set-filter! pred ds bs proc)
+  (check-arg procedure? pred proc)
+  (let lp ((i 255))
+    (cond ((>= i 0)
+           (if (and (si=1? ds i) (pred (%latin1->char i)))
+               (%set1! bs i))
+           (lp (- i 1))))))
+
+;;; Return a new char-set built from the string %DEFAULT-BASE yields for
+;;; MAYBE-BASE, augmented with the members of DOMAIN that satisfy
+;;; PREDICATE.
+
+(define (char-set-filter predicate domain . maybe-base)
+  (let ((bs (%default-base maybe-base char-set-filter)))
+    (%char-set-filter! predicate
+                       (%char-set:s/check domain char-set-filter)
+                       bs
+                       char-set-filter)
+    (make-char-set bs)))
+
+;;; Linear-update variant: adds the members of DOMAIN that satisfy
+;;; PREDICATE to BASE-CS's own string and returns BASE-CS.
+
+(define (char-set-filter! predicate domain base-cs)
+  (%char-set-filter! predicate
+                     (%char-set:s/check domain char-set-filter!)
+                     (%char-set:s/check base-cs char-set-filter!)
+                     char-set-filter!)
+  base-cs)
+
+
+;;; {string, char, char-set} -> char-set
+;;; A char-set is returned as is (not copied); a string or a char is
+;;; converted.  Any other value signals an error.
+
+(define (->char-set x)
+  (cond ((char-set? x) x)
+        ((string? x) (string->char-set x))
+        ((char? x) (char-set x))
+        (else (error "->char-set: Not a charset, string or char." x))))
+
+
+
+;;; Set algebra
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; The exported ! procs are "linear update" -- allowed, but not required, to
+;;; side-effect their first argument when computing their result. In other
+;;; words, you must use them as if they were completely functional, just like
+;;; their non-! counterparts, and you must additionally ensure that their
+;;; first arguments are "dead" at the point of call. In return, we promise a
+;;; more efficient result, plus allowing you to always assume char-sets are
+;;; unchangeable values.
+
+;;; Apply P to each index and its char code in S: (P I VAL).
+;;; Used by the set-algebra ops.
+;;; Indices are visited from the last one down to 0.
+
+(define (%string-iter p s)
+  (let lp ((i (- (string-length s) 1)))
+    (cond ((>= i 0)
+           (p i (%char->latin1 (string-ref s i)))
+           (lp (- i 1))))))
+
+;;; String S represents some initial char-set. (OP s i val) does some
+;;; kind of s[i] := s[i] op val update. Do
+;;;     S := S OP CSETi
+;;; for all the char-sets in the list CSETS. The n-ary set-algebra ops
+;;; all use this internal proc.
+;;; Each element of CSETS is checked with %CHAR-SET:S/CHECK on behalf of
+;;; PROC before it is used.
+
+(define (%char-set-algebra s csets op proc)
+  (for-each (lambda (cset)
+              (let ((s2 (%char-set:s/check cset proc)))
+                (let lp ((i 255))
+                  (cond ((>= i 0)
+                         (op s i (si s2 i))
+                         (lp (- i 1)))))))
+            csets))
+
+
+;;; -- Complement
+
+;;; Return a new char-set whose 256-entry string is filled by applying
+;;; %NOT! at every index with CS's value there.  CS is left untouched.
+
+(define (char-set-complement cs)
+  (let ((s (%char-set:s/check cs char-set-complement))
+        (ans (make-string 256)))
+    (%string-iter (lambda (i v) (%not! ans i v)) s)
+    (make-char-set ans)))
+
+;;; Linear-update variant: applies %NOT! to CSET's own string in place and
+;;; returns CSET.
+
+(define (char-set-complement! cset)
+  (let ((s (%char-set:s/check cset char-set-complement!)))
+    (%string-iter (lambda (i v) (%not! s i v)) s))
+  cset)
+
+
+;;; -- Union
+
+;;; Fold each of CSETS into CSET1's own string with %OR!; return CSET1.
+
+(define (char-set-union! cset1 . csets)
+  (%char-set-algebra (%char-set:s/check cset1 char-set-union!)
+                     csets %or! char-set-union!)
+  cset1)
+
+;;; Fold the remaining arguments into a copy of the first one's string
+;;; with %OR!.  With no arguments, return a copy of CHAR-SET:EMPTY.
+
+(define (char-set-union . csets)
+  (if (pair? csets)
+      (let ((s (%string-copy (%char-set:s/check (car csets) char-set-union))))
+        (%char-set-algebra s (cdr csets) %or! char-set-union)
+        (make-char-set s))
+      (char-set-copy char-set:empty)))
+
+
+;;; -- Intersection
+
+;;; Fold each of CSETS into CSET1's own string with %AND!; return CSET1.
+
+(define (char-set-intersection! cset1 . csets)
+  (%char-set-algebra (%char-set:s/check cset1 char-set-intersection!)
+                     csets %and! char-set-intersection!)
+  cset1)
+
+;;; Fold the remaining arguments into a copy of the first one's string
+;;; with %AND!.  With no arguments, return a copy of CHAR-SET:FULL.
+
+(define (char-set-intersection . csets)
+  (if (pair? csets)
+      (let ((s (%string-copy (%char-set:s/check (car csets) char-set-intersection))))
+        (%char-set-algebra s (cdr csets) %and! char-set-intersection)
+        (make-char-set s))
+      (char-set-copy char-set:full)))
+
+
+;;; -- Difference
+
+;;; Fold each of CSETS into CSET1's own string with %MINUS!; return CSET1.
+
+(define (char-set-difference! cset1 . csets)
+  (%char-set-algebra (%char-set:s/check cset1 char-set-difference!)
+                     csets %minus! char-set-difference!)
+  cset1)
+
+;;; Fold CSETS into a copy of CS1's string with %MINUS!.  With no CSETS,
+;;; return (CHAR-SET-COPY CS1).
+
+(define (char-set-difference cs1 . csets)
+  (if (pair? csets)
+      (let ((s (%string-copy (%char-set:s/check cs1 char-set-difference))))
+        (%char-set-algebra s csets %minus! char-set-difference)
+        (make-char-set s))
+      (char-set-copy cs1)))
+
+
+;;; -- Xor
+
+;;; Fold each of CSETS into CSET1's own string with %XOR!; return CSET1.
+
+(define (char-set-xor! cset1 . csets)
+  (%char-set-algebra (%char-set:s/check cset1 char-set-xor!)
+                     csets %xor! char-set-xor!)
+  cset1)
+
+;;; Fold the remaining arguments into a copy of the first one's string
+;;; with %XOR!.  With no arguments, return a copy of CHAR-SET:EMPTY.
+
+(define (char-set-xor . csets)
+  (if (pair? csets)
+      (let ((s (%string-copy (%char-set:s/check (car csets) char-set-xor))))
+        (%char-set-algebra s (cdr csets) %xor! char-set-xor)
+        (make-char-set s))
+      (char-set-copy char-set:empty)))
+
+
+;;; -- Difference & intersection
+
+;;; Internal worker.  DIFF and INT are strings that are updated in place.
+;;; For every char-set in CSETS and every index I whose value in that set
+;;; is non-zero: if (SI=1? DIFF I) holds, I is cleared in DIFF with %SET0!
+;;; and marked in INT with %SET1!.  PROC names the exported caller for
+;;; error reports.
+
+(define (%char-set-diff+intersection! diff int csets proc)
+  (for-each (lambda (cs)
+              (%string-iter (lambda (i v)
+                              (if (not (zero? v))
+                                  (cond ((si=1? diff i)
+                                         (%set0! diff i)
+                                         (%set1! int i)))))
+                            (%char-set:s/check cs proc)))
+            csets))
+
+;;; Linear-update variant working on the strings of CS1 and CS2 in place;
+;;; returns (VALUES CS1 CS2).  A first pass over CS1's string clears, at
+;;; every index, CS2's entry where CS1's value is zero, and otherwise
+;;; clears CS1's entry where (SI=1? S2 I) holds.  The remaining CSETS are
+;;; then processed by %CHAR-SET-DIFF+INTERSECTION!.
+
+(define (char-set-diff+intersection! cs1 cs2 . csets)
+  (let ((s1 (%char-set:s/check cs1 char-set-diff+intersection!))
+        (s2 (%char-set:s/check cs2 char-set-diff+intersection!)))
+    (%string-iter (lambda (i v) (if (zero? v)
+                                    (%set0! s2 i)
+                                    (if (si=1? s2 i) (%set0! s1 i))))
+                  s1)
+    (%char-set-diff+intersection! s1 s2 csets char-set-diff+intersection!))
+  (values cs1 cs2))
+
+;;; Return two new char-sets as (VALUES diff int).  DIFF starts out as a
+;;; copy of CS1's string and INT as a 256-entry string filled with C0
+;;; (defined elsewhere in this file; presumably the "absent" marker);
+;;; both are then updated by %CHAR-SET-DIFF+INTERSECTION! over CSETS.
+;;; The copy is made with %STRING-COPY, like the other set-algebra ops.
+
+(define (char-set-diff+intersection cs1 . csets)
+  (let ((diff (%string-copy (%char-set:s/check cs1 char-set-diff+intersection)))
+        (int (make-string 256 c0)))
+    (%char-set-diff+intersection! diff int csets char-set-diff+intersection)
+    (values (make-char-set diff) (make-char-set int))))
+
+
+;;;; System character sets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; These definitions are for Latin-1.
+;;;
+;;; If your Scheme implementation allows you to mark the underlying strings
+;;; as immutable, you should do so -- it would be very, very bad if a client's
+;;; buggy code corrupted these constants.
+
+(define char-set:empty (char-set))
+(define char-set:full (char-set-complement char-set:empty))
+
+;;; UCS ranges are half-open, so the upper bounds below are one past the
+;;; last code included.
+
+(define char-set:lower-case
+  (let* ((a-z (ucs-range->char-set #x61 #x7B))                ; #x61..#x7A
+         (latin1 (ucs-range->char-set! #xdf #xf7 #t a-z))     ; #xDF..#xF6
+         (latin2 (ucs-range->char-set! #xf8 #x100 #t latin1))) ; #xF8..#xFF
+    (char-set-adjoin! latin2 (%latin1->char #xb5))))
+
+(define char-set:upper-case
+  (let ((A-Z (ucs-range->char-set #x41 #x5B)))
+    ;; Add in the Latin-1 upper-case chars.
+    (ucs-range->char-set! #xd8 #xdf #t
+                          (ucs-range->char-set! #xc0 #xd7 #t A-Z))))
+
+;;; NOTE: this is the very same object as CHAR-SET:EMPTY, not a copy.
+(define char-set:title-case char-set:empty)
+
+(define char-set:letter
+  (let ((u/l (char-set-union char-set:upper-case char-set:lower-case)))
+    (char-set-adjoin! u/l
+                      (%latin1->char #xaa)      ; FEMININE ORDINAL INDICATOR
+                      (%latin1->char #xba))))   ; MASCULINE ORDINAL INDICATOR
+
+(define char-set:digit (string->char-set "0123456789"))
+(define char-set:hex-digit (string->char-set "0123456789abcdefABCDEF"))
+
+(define char-set:letter+digit
+  (char-set-union char-set:letter char-set:digit))
+
+(define char-set:punctuation
+  (let ((ascii (string->char-set "!\"#%&'()*,-./:;?@[\\]_{}"))
+        (latin-1-chars (map %latin1->char '(#xA1 ; INVERTED EXCLAMATION MARK
+                                            #xAB ; LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+                                            #xAD ; SOFT HYPHEN
+                                            #xB7 ; MIDDLE DOT
+                                            #xBB ; RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+                                            #xBF)))) ; INVERTED QUESTION MARK
+    (list->char-set! latin-1-chars ascii)))
+
+(define char-set:symbol
+  (let ((ascii (string->char-set "$+<=>^`|~"))
+        (latin-1-chars (map %latin1->char '(#x00A2 ; CENT SIGN
+                                            #x00A3 ; POUND SIGN
+                                            #x00A4 ; CURRENCY SIGN
+                                            #x00A5 ; YEN SIGN
+                                            #x00A6 ; BROKEN BAR
+                                            #x00A7 ; SECTION SIGN
+                                            #x00A8 ; DIAERESIS
+                                            #x00A9 ; COPYRIGHT SIGN
+                                            #x00AC ; NOT SIGN
+                                            #x00AE ; REGISTERED SIGN
+                                            #x00AF ; MACRON
+                                            #x00B0 ; DEGREE SIGN
+                                            #x00B1 ; PLUS-MINUS SIGN
+                                            #x00B4 ; ACUTE ACCENT
+                                            #x00B6 ; PILCROW SIGN
+                                            #x00B8 ; CEDILLA
+                                            #x00D7 ; MULTIPLICATION SIGN
+                                            #x00F7)))) ; DIVISION SIGN
+    (list->char-set! latin-1-chars ascii)))
+
+
+(define char-set:graphic
+  (char-set-union char-set:letter+digit char-set:punctuation char-set:symbol))
+
+(define char-set:whitespace
+  (list->char-set (map %latin1->char '(#x09 ; HORIZONTAL TABULATION
+                                       #x0A ; LINE FEED
+                                       #x0B ; VERTICAL TABULATION
+                                       #x0C ; FORM FEED
+                                       #x0D ; CARRIAGE RETURN
+                                       #x20 ; SPACE
+                                       #xA0)))) ; NO-BREAK SPACE
+
+(define char-set:printing (char-set-union char-set:whitespace char-set:graphic))
+
+(define char-set:blank
+  (list->char-set (map %latin1->char '(#x09 ; HORIZONTAL TABULATION
+                                       #x20 ; SPACE
+                                       #xA0)))) ; NO-BREAK SPACE
+
+
+;;; Codes #x00..#x1F together with #x7F..#x9F.
+(define char-set:iso-control
+  (ucs-range->char-set! #x7F #xA0 #t (ucs-range->char-set 0 32)))
+
+;;; Codes #x00..#x7F.
+(define char-set:ascii (ucs-range->char-set 0 128))
+
+
+;;; Porting & performance-tuning notes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; See the section at the beginning of this file on external dependencies.
+;;;
+;;; First and foremost, rewrite this code to use bit vectors of some sort.
+;;; This will give big speedup and memory savings.
+;;;
+;;; - LET-OPTIONALS* macro.
+;;;   This is only used once. You can rewrite the use, port the hairy macro
+;;;   definition (which is implemented using a Clinger-Rees low-level
+;;;   explicit-renaming macro system), or port the simple, high-level
+;;;   definition, which is less efficient.
+;;;
+;;; - :OPTIONAL macro
+;;;   Very simply defined using an R5RS high-level macro.
+;;;
+;;; Implementations that can arrange for the base char sets to be immutable
+;;; should do so. (E.g., Scheme 48 allows one to mark a string as immutable,
+;;; which can be used to protect the underlying strings.) It would be very,
+;;; very bad if a client's buggy code corrupted these constants.
+;;;
+;;; There is a fair amount of argument checking. This is, strictly speaking,
+;;; unnecessary -- the actual body of the procedures will blow up if an
+;;; illegal value is passed in. However, the error message will not be as good
+;;; as if the error were caught at the "higher level." Also, a very, very
+;;; smart Scheme compiler may be able to exploit having the type checks done
+;;; early, so that the actual body of the procedures can assume proper values.
+;;; This isn't likely; this kind of compiler technology isn't common any
+;;; longer.
+;;;
+;;; The overhead of optional-argument parsing is irritating. The optional
+;;; arguments must be consed into a rest list on entry, and then parsed out.
+;;; Function call should be a matter of a few register moves and a jump; it
+;;; should not involve heap allocation! Your Scheme system may have a superior
+;;; non-R5RS optional-argument system that can eliminate this overhead. If so,
+;;; then this is a prime candidate for optimising these procedures,
+;;; *especially* the many optional BASE-CS parameters.
+;;;
+;;; Note that optional arguments are also a barrier to procedure integration.
+;;; If your Scheme system permits you to specify alternate entry points
+;;; for a call when the number of optional arguments is known in a manner
+;;; that enables inlining/integration, this can provide performance
+;;; improvements.
+;;;
+;;; There is enough *explicit* error checking that *all* internal operations
+;;; should *never* produce a type or index-range error. Period. Feel like
+;;; living dangerously? *Big* performance win to be had by replacing string
+;;; and record-field accessors and setters with unsafe equivalents in the
+;;; code. Similarly, fixnum-specific operators can speed up the arithmetic
+;;; done on the index values in the inner loops. The only arguments that are
+;;; not completely error checked are
+;;;   - string lists (complete checking requires time proportional to the
+;;;     length of the list)
+;;;   - procedure arguments, such as char->char maps & predicates.
+;;;     There is no way to check the range & domain of procedures in Scheme.
+;;; Procedures that take these parameters cannot fully check their
+;;; arguments. But all other types to all other procedures are fully
+;;; checked.
+;;;
+;;; This does open up the alternate possibility of simply *removing* these
+;;; checks, and letting the safe primitives raise the errors. On a dumb
+;;; Scheme system, this would provide speed (by eliminating the redundant
+;;; error checks) at the cost of error-message clarity.
+;;;
+;;; In an interpreted Scheme, some of these procedures, or the internal
+;;; routines with % prefixes, are excellent candidates for being rewritten
+;;; in C.
+;;; +;;; It would also be nice to have the ability to mark some of these +;;; routines as candidates for inlining/integration. +;;; +;;; See the comments preceding the hash function code for notes on tuning +;;; the default bound so that the code never overflows your implementation's +;;; fixnum size into bignum calculation. +;;; +;;; All the %-prefixed routines in this source code are written +;;; to be called internally to this library. They do *not* perform +;;; friendly error checks on the inputs; they assume everything is +;;; proper. They also do not take optional arguments. These two properties +;;; save calling overhead and enable procedure integration -- but they +;;; are not appropriate for exported routines. + +;;; Copyright notice +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Copyright (c) 1988-1995 Massachusetts Institute of Technology +;;; +;;; This material was developed by the Scheme project at the Massachusetts +;;; Institute of Technology, Department of Electrical Engineering and +;;; Computer Science. Permission to copy and modify this software, to +;;; redistribute either the original software or a modified version, and +;;; to use this software for any purpose is granted, subject to the +;;; following restrictions and understandings. +;;; +;;; 1. Any copy made of this software must include this copyright notice +;;; in full. +;;; +;;; 2. Users of this software agree to make their best efforts (a) to +;;; return to the MIT Scheme project any improvements or extensions that +;;; they make, so that these may be included in future releases; and (b) +;;; to inform MIT of noteworthy uses of this software. +;;; +;;; 3. All materials developed as a consequence of the use of this +;;; software shall duly acknowledge such use, in accordance with the usual +;;; standards of acknowledging credit in academic research. +;;; +;;; 4. 
MIT has made no warrantee or representation that the operation of +;;; this software will be error-free, and MIT is under no obligation to +;;; provide any services, by way of maintenance, update, or otherwise. +;;; +;;; 5. In conjunction with products arising from the use of this material, +;;; there shall be no use of the name of the Massachusetts Institute of +;;; Technology nor of any adaptation thereof in any advertising, +;;; promotional, or sales literature without prior written consent from +;;; MIT in each case. diff --git a/scsh/lib/cset-lib.txt b/scsh/lib/cset-lib.txt new file mode 100644 index 0000000..75a77d5 --- /dev/null +++ b/scsh/lib/cset-lib.txt @@ -0,0 +1,1271 @@ +The SRFI 14 character-set library -*- outline -*- +Olin Shivers +98/11/8 +Last Update: 2000/7/4 + +Emacs should display this document in outline mode. Say c-h m for +instructions on how to move through it by sections (e.g., c-c c-n, c-c c-p). + +* Table of contents +------------------- +Abstract +Variable index +Rationale + Linear-update operations + Extra-SRFI recommendations +Specification + General procedures + Iterating over character sets + Creating character sets + Querying character sets + Character-set algebra + Standard character sets +Unicode, Latin-1 and ASCII definitions of the standard character sets +Reference implementation +Acknowledgements +References & links +Copyright + + +------------------------------------------------------------------------------- +* Abstract +---------- + +The ability to efficiently represent and manipulate sets of characters is an +unglamorous but very useful capability for text-processing code -- one that +tends to pop up in the definitions of other libraries. Hence it is useful to +specify a general substrate for this functionality early. This SRFI defines a +general library that provides this functionality. + +It is accompanied by a reference implementation for the spec. 
The reference +implementation is fairly efficient, straightforwardly portable, and has a +"free software" copyright. The implementation is tuned for "small" 7 or 8 +bit character types, such as ASCII or Latin-1; the data structures and +algorithms would have to be altered for larger 16 or 32 bit character types +such as Unicode -- however, the specs have been carefully designed with these +larger character types in mind. + +Several forthcoming SRFIs can be defined in terms of this one: + - string library + - delimited input procedures (e.g., READ-LINE) + - regular expressions + + +------------------------------------------------------------------------------- +* Variable index +----------------- +Here is the complete set of bindings -- procedural and otherwise -- +exported by this library. In a Scheme system that has a module or package +system, these procedures should be contained in a module named "char-set-lib". + +char-set? char-set= char-set<= + +char-set-hash + +char-set-cursor char-set-ref char-set-cursor-next end-of-char-set? +char-set-fold char-set-unfold char-set-unfold! +char-set-for-each char-set-map + +char-set-copy char-set + +list->char-set string->char-set +list->char-set! string->char-set! + +char-set-filter ucs-range->char-set +char-set-filter! ucs-range->char-set! + +->char-set + +char-set->list char-set->string + +char-set-size char-set-count char-set-contains? + +char-set-every char-set-any + +char-set-adjoin char-set-delete +char-set-adjoin! char-set-delete! + +char-set-complement char-set-union char-set-intersection +char-set-complement! char-set-union! char-set-intersection! + +char-set-difference char-set-xor char-set-diff+intersection +char-set-difference! char-set-xor! char-set-diff+intersection! 
+ +char-set:lower-case char-set:upper-case char-set:title-case +char-set:letter char-set:digit char-set:letter+digit +char-set:graphic char-set:printing char-set:whitespace +char-set:iso-control char-set:punctuation char-set:symbol +char-set:hex-digit char-set:blank char-set:ascii +char-set:empty char-set:full + + +------------------------------------------------------------------------------- +* Rationale +----------- + +The ability to efficiently manipulate sets of characters is quite +useful for text-processing code. Encapsulating this functionality in +a general, efficiently implemented library can assist all such code. +This library defines a new data structure to represent these sets, called +a "char-set." The char-set type is distinct from all other types. + +This library is designed to be portable across implementations that use +different character types and representations, especially ASCII, Latin-1 +and Unicode. Some effort has been made to preserve compatibility with Java +in the Unicode case (see the definition of CHAR-SET:WHITESPACE for the +single real deviation). + + +** Linear-update operations +=========================== +The procedures of this SRFI, by default, are "pure functional" -- they do not +alter their parameters. However, this SRFI defines a set of "linear-update" +procedures which have a hybrid pure-functional/side-effecting semantics: they +are allowed, but not required, to side-effect one of their parameters in order +to construct their result. An implementation may legally implement these +procedures as pure, side-effect-free functions, or it may implement them using +side effects, depending upon the details of what is the most efficient or +simple to implement in terms of the underlying representation. + +The linear-update routines all have names ending with "!". + +Clients of these procedures *may not* rely upon these procedures working by +side effect. 
For example, this is not guaranteed to work: + + (let* ((cs1 (char-set #\a #\b #\c)) ; cs1 = {a,b,c}. + (cs2 (char-set-adjoin! cs1 #\d))) ; Add d to {a,b,c}. + cs1) ; Could be either {a,b,c} or {a,b,c,d}. + +However, this is well-defined: + + (let ((cs (char-set #\a #\b #\c))) + (char-set-adjoin! cs #\d)) ; Add d to {a,b,c}. + +So clients of these procedures write in a functional style, but must +additionally be sure that, when the procedure is called, there are no other +live pointers to the potentially-modified character set (hence the term +"linear update"). + +There are two benefits to this convention: + - Implementations are free to provide the most efficient possible + implementation, either functional or side-effecting. + - Programmers may nonetheless continue to assume that character sets + are purely functional data structures: they may be reliably shared + without needing to be copied, uniquified, and so forth. + +Note that pure functional representations are the right thing for +ASCII- or Latin-1-based Scheme implementations, since a char-set can +be represented in an ASCII Scheme with 4 32-bit words. Pure set-algebra +operations on such a representation are very fast and efficient. Programmers +who code using linear-update operations are guaranteed the system will +provide the best implementation across multiple platforms. + +In practice, these procedures are most useful for efficiently constructing +character sets in a side-effecting manner, in some limited local context, +before passing the character set outside the local construction scope to be +used in a functional manner. + +Scheme provides no assistance in checking the linearity of the potentially +side-effected parameters passed to these functions --- there's no linear +type checker or run-time mechanism for detecting violations. (But +sophisticated programming environments, such as DrScheme, might help.) 
+ +** Extra-SRFI recommendations +============================= +Users are cautioned that the R5RS predicates + CHAR-ALPHABETIC? + CHAR-NUMERIC? + CHAR-WHITESPACE? + CHAR-UPPER-CASE? + CHAR-LOWER-CASE? +may or may not be in agreement with the SRFI 14 base character sets + CHAR-SET:LETTER + CHAR-SET:DIGIT + CHAR-SET:WHITESPACE + CHAR-SET:UPPER-CASE + CHAR-SET:LOWER-CASE +Implementors are strongly encouraged to bring these predicates into +agreement with the base character sets of this SRFI; not to do so risks +major confusion. + + +------------------------------------------------------------------------------- +* Specification +--------------- + +In the following procedure specifications: + - A CS parameter is a character set. + + - An S parameter is a string. + + - A CHAR parameter is a character. + + - A CHAR-LIST parameter is a list of characters. + + - A PRED parameter is a unary character predicate procedure, returning + a true/false value when applied to a character. + + - An OBJ parameter may be any value at all. + +Passing values to procedures with these parameters that do not satisfy these +types is an error. + +Unless otherwise noted in the specification of a procedure, procedures +always return character sets that are distinct (from the point of view +of the linear-update operations) from the parameter character sets. For +example, CHAR-SET-ADJOIN is guaranteed to provide a fresh character set, +even if it is not given any character parameters. + +Parameters given in square brackets are optional. Unless otherwise noted in +the text describing the procedure, any prefix of these optional parameters may +be supplied, from zero arguments to the full list. When a procedure returns +multiple values, this is shown by listing the return values in square +brackets, as well. So, for example, the procedure with signature + + halts? 
f [x init-store] -> [boolean integer] + +would take one (F), two (F, X) or three (F, X, INIT-STORE) input parameters, +and return two values, a boolean and an integer. + +A parameter followed by "..." means zero-or-more elements. So the procedure +with the signature + sum-squares x ... -> number +takes zero or more arguments (X ...), while the procedure with signature + spell-check doc dict1 dict2 ... -> string-list +takes two required parameters (DOC and DICT1) and zero or more +optional parameters (DICT2 ...). + + +** General procedures +===================== +char-set? obj -> boolean + Is the object OBJ a character set? + +char-set= cs1 ... -> boolean + Are the character sets equal? + + Boundary cases: + (char-set=) => true + (char-set= cs) => true + + Rationale: transitive binary relations are generally extended to n-ary + relations in Scheme, which enables clearer, more concise code to be + written. While the zero-argument and one-argument cases will almost + certainly not arise in first-order uses of such relations, they may well + arise in higher-order cases or macro-generated code. E.g., consider + (apply char-set= cset-list) + This is well-defined if the list is empty or a singleton list. Hence + we extend these relations to any number of arguments. Implementors + have reported actual uses of n-ary relations in higher-order cases + allowing for fewer than two arguments. The way of Scheme is to handle the + general case; we provide the fully general extension. + + A counter-argument to this extension is that R5RS's transitive binary + arithmetic relations (=, <, etc.) require at least two arguments, hence + this decision is a break with the prior convention -- although it is + at least one that is backwards-compatible. + +char-set<= cs1 ... -> boolean + Returns true if every character set CSi is a subset of character set CSi+1.
+ + Boundary cases: + (char-set<=) => true + (char-set<= cs) => true + + Rationale: See CHAR-SET= for discussion of zero- and one-argument + applications. Consider testing a list of char-sets for monotonicity + with (APPLY CHAR-SET<= CSET-LIST). + +char-set-hash cs [bound] -> integer + Compute a hash value for the character set CS. BOUND is a non-negative + exact integer specifying the range of the hash function. A positive + value restricts the return value to the range [0,BOUND). + + If BOUND is either zero or not given, the implementation may use + an implementation-specific default value, chosen to be as large as + is efficiently practical. For instance, the default range might be chosen + for a given implementation to map all character sets into the range of + integers that can be represented with a single machine word. + + Invariant: + (char-set= cs1 cs2) => (= (char-set-hash cs1 b) (char-set-hash cs2 b)) + + A legal but nonetheless discouraged implementation: + (define (char-set-hash cs . maybe-bound) 1) + + Rationale: allowing the user to specify an explicit bound simplifies user + code by removing the mod operation that typically accompanies every hash + computation, and also may allow the implementation of the hash function to + exploit a reduced range to efficiently compute the hash value. E.g., for + small bounds, the hash function may be computed in a fashion such that + intermediate values never overflow into bignum integers, allowing the + implementor to provide a fixnum-specific "fast path" for computing the + common cases very rapidly. + +** Iterating over character sets +=================================== + +char-set-cursor cset -> cursor +char-set-ref cset cursor -> char +char-set-cursor-next cset cursor -> cursor +end-of-char-set? cursor -> boolean + Cursors are a low-level facility for iterating over the characters in a + set. A cursor is a value that indexes a character in a char set. 
+ CHAR-SET-CURSOR produces a new cursor for a given char set. The set + element indexed by the cursor is fetched with CHAR-SET-REF. A cursor index + is incremented with CHAR-SET-CURSOR-NEXT; in this way, code can step + through every character in a char set. Stepping a cursor "past the end" of + a char set produces a cursor that answers true to END-OF-CHAR-SET?. It is + an error to pass such a cursor to CHAR-SET-REF or to CHAR-SET-CURSOR-NEXT. + + A cursor value may not be used in conjunction with a different character + set; if it is passed to CHAR-SET-REF or CHAR-SET-CURSOR-NEXT with + a character set other than the one used to create it, the results and + effects are undefined. + + Cursor values are *not* necessarily distinct from other types. They may be + integers, linked lists, records, procedures or other values. This license + is granted to allow cursors to be very "lightweight" values suitable for + tight iteration, even in fairly simple implementations. + + Note that these primitives are necessary to export an iteration facility + for char sets to loop macros. + + Example: + + (define cs (char-set #\G #\a #\T #\e #\c #\h)) + + ;; Collect elts of CS into a list. + (let lp ((cur (char-set-cursor cs)) (ans '())) + (if (end-of-char-set? cur) ans + (lp (char-set-cursor-next cs cur) + (cons (char-set-ref cs cur) ans)))) + => (#\G #\T #\a #\c #\e #\h) + + ;; Equivalently, using a list unfold (from SRFI 1): + (unfold-right end-of-char-set? + (curry char-set-ref cs) + (curry char-set-cursor-next cs) + (char-set-cursor cs)) + => (#\G #\T #\a #\c #\e #\h) + + Rationale: Note that the cursor API's four functions "fit" the functional + protocol used by the unfolders provided by the list, string and char-set + SRFIs (see the example above). By way of contrast, here is a simpler, + two-function API that was rejected for failing this criterion. 
Besides + CHAR-SET-CURSOR, it provided a single function that mapped a cursor and a + character set to two values, the indexed character and the next cursor. If + the cursor had exhausted the character set, then this function returned + false instead of the character value, and another end-of-char-set cursor. + In this way, the other three functions of the current API were combined + together. + +char-set-fold kons knil cs -> object + This is the fundamental iterator for character sets. Applies the function + KONS across the character set CS using initial state value KNIL. That is, + if CS is the empty set, the procedure returns KNIL. Otherwise, some + element c of CS is chosen; let cs' be the remaining, unchosen characters. + The procedure returns + (char-set-fold KONS (KONS c KNIL) cs') + + Examples: + ;; CHAR-SET-MEMBERS + (lambda (cs) (char-set-fold cons '() cs)) + + ;; CHAR-SET-SIZE + (lambda (cs) (char-set-fold (lambda (c i) (+ i 1)) 0 cs)) + + ;; How many vowels in the char set? + (lambda (cs) + (char-set-fold (lambda (c i) (if (vowel? c) (+ i 1) i)) + 0 cs)) + +char-set-unfold p f g seed [base-cs] -> char-set +char-set-unfold! p f g seed base-cs -> char-set + This is a fundamental constructor for char-sets. + - G is used to generate a series of "seed" values from the initial seed: + SEED, (G SEED), (G^2 SEED), (G^3 SEED), ... + - P tells us when to stop -- when it returns true when applied to one + of these seed values. + - F maps each seed value to a character. These characters are added + to the base character set BASE-CS to form the result; BASE-CS defaults to + the empty set. CHAR-SET-UNFOLD! adds the characters to BASE-CS in a + linear-update -- it is allowed, but not required, to side-effect + and use BASE-CS's storage to construct the result. + + More precisely, the following definitions hold, ignoring the + optional-argument issues: + + (define (char-set-unfold p f g seed base-cs) + (char-set-unfold!
p f g seed (char-set-copy base-cs))) + + (define (char-set-unfold! p f g seed base-cs) + (let lp ((seed seed) (cs base-cs)) + (if (p seed) cs ; P says we are done. + (lp (g seed) ; Loop on (G SEED). + (char-set-adjoin! cs (f seed)))))) ; Add (F SEED) to set. + + (Note that the actual implementation may be more efficient.) + + Examples: + + (port->char-set p) = (char-set-unfold eof-object? values + (lambda (x) (read-char p)) + (read-char p)) + + (list->char-set lis) = (char-set-unfold null? car cdr lis) + +char-set-for-each proc cs -> unspecified + Apply procedure PROC to each character in the character set CS. + Note that the order in which PROC is applied to the characters in the + set is not specified, and may even change from one procedure application + to another. + + Nothing at all is specified about the value returned by this procedure; it + is not even required to be consistent from call to call. It is simply + required to be a value (or values) that may be passed to a command + continuation, e.g. as the value of an expression appearing as a + non-terminal subform of a BEGIN expression. Note that in R5RS, this + restricts the procedure to returning a single value; non-R5RS systems may + not even provide this restriction. + +char-set-map proc cs -> char-set + PROC is a char->char procedure. Apply it to all the characters in + the char-set CS, and collect the results into a new character set. + + Essentially lifts PROC from a char->char procedure to a char-set -> + char-set procedure. + + Example: + (char-set-map char-downcase cset) + + +** Creating character sets +========================== +char-set-copy cs -> char-set + Returns a copy of the character set CS. "Copy" means that if either the + input parameter or the result value of this procedure is passed to one of + the linear-update procedures described below, the other character set is + guaranteed not to be altered. 
+ + A system that provides pure-functional implementations of the + linear-operator suite could implement this procedure as the identity + function -- so copies are *not* guaranteed to be distinct by EQ?. + +char-set char1 ... -> char-set + Return a character set containing the given characters. + +list->char-set char-list [base-cs] -> char-set +list->char-set! char-list base-cs -> char-set + Return a character set containing the characters in the list of + characters CHAR-LIST. + + If character set BASE-CS is provided, the characters from CHAR-LIST + are added to it. LIST->CHAR-SET! is allowed, but not required, + to side-effect and reuse the storage in BASE-CS; LIST->CHAR-SET + produces a fresh character set. + +string->char-set s [base-cs] -> char-set +string->char-set! s base-cs -> char-set + Return a character set containing the characters in the string S. + + If character set BASE-CS is provided, the characters from S are added to + it. STRING->CHAR-SET! is allowed, but not required, to side-effect and + reuse the storage in BASE-CS; STRING->CHAR-SET produces a fresh character + set. + +char-set-filter pred cs [base-cs] -> char-set +char-set-filter! pred cs base-cs -> char-set + Returns a character set containing every character c in CS + such that (PRED c) returns true. + + If character set BASE-CS is provided, the characters specified by PRED + are added to it. CHAR-SET-FILTER! is allowed, but not required, + to side-effect and reuse the storage in BASE-CS; CHAR-SET-FILTER + produces a fresh character set. + + An implementation may not save away a reference to PRED and invoke it + after CHAR-SET-FILTER or CHAR-SET-FILTER! returns -- that is, "lazy," + on-demand implementations are not allowed, as PRED may have external + dependencies on mutable data or have other side-effects. 
+ + Rationale: This procedure provides a means of converting a character + predicate into its equivalent character set; the CS parameter allows the + programmer to bound the predicate's domain. Programmers should be aware + that filtering a character set such as CHAR-SET:FULL could be a very + expensive operation in an implementation that provided an extremely large + character type, such as 32-bit Unicode. An earlier draft of this library + provided a simple PREDICATE->CHAR-SET procedure, which was rejected in + favor of CHAR-SET-FILTER for this reason. + +ucs-range->char-set lower upper [error? base-cs] -> char-set +ucs-range->char-set! lower upper error? base-cs -> char-set + LOWER and UPPER are exact non-negative integers; LOWER <= UPPER. + + Returns a character set containing every character whose ISO/IEC 10646 + UCS-4 code lies in the half-open range [LOWER,UPPER). + + - If the requested range includes unassigned UCS values, these are + silently ignored (the current UCS specification has "holes" in the + space of assigned codes). + + - If the requested range includes "private" or "user space" codes, these + are handled in an implementation-specific manner; however, a UCS- or + Unicode-based Scheme implementation should pass them through + transparently. + + - If any code from the requested range specifies a valid, assigned + UCS character that has no corresponding representative in the + implementation's character type, then (1) an error is raised if ERROR? + is true, and (2) the code is ignored if ERROR? is false (the default). + This might happen, for example, if the implementation uses ASCII + characters, and the requested range includes non-ASCII characters. + + If character set BASE-CS is provided, the characters specified by the + range are added to it. UCS-RANGE->CHAR-SET! is allowed, but not required, + to side-effect and reuse the storage in BASE-CS; UCS-RANGE->CHAR-SET + produces a fresh character set. 
+ + Note that ASCII codes are a subset of the Latin-1 codes, which are in turn + a subset of the 16-bit Unicode codes, which are themselves a subset of the + 32-bit UCS-4 codes. We commit to a specific encoding in this routine, + regardless of the underlying representation of characters, so that client + code using this library will be portable. I.e., a conformant Scheme + implementation may use EBCDIC or SHIFT-JIS to encode characters; it must + simply map the UCS characters from the given range into the native + representation when possible, and report errors when not possible. + +->char-set x -> char-set + Coerces X into a char-set. X may be a string, character or char-set. A + string is converted to the set of its constituent characters; a character + is converted to a singleton set; a char-set is returned as-is. This + procedure is intended for use by other procedures that want to provide + "user-friendly," wide-spectrum interfaces to their clients. + + +** Querying character sets +========================== +char-set-size cs -> integer + Returns the number of elements in character set CS. + +char-set-count pred cs -> integer + Apply PRED to the chars of character set CS, and return the number + of chars that caused the predicate to return true. + +char-set->list cs -> character-list + This procedure returns a list of the members of character set CS. + The order in which CS's characters appear in the list is not defined, + and may be different from one call to another. + +char-set->string cs -> string + This procedure returns a string containing the members of character set CS. + The order in which CS's characters appear in the string is not defined, + and may be different from one call to another. + +char-set-contains? cs char -> boolean + This procedure tests CHAR for membership in character set CS. + + The MIT Scheme character-set package called this procedure + CHAR-SET-MEMBER?, but the argument order isn't consistent with the name. 
+ +char-set-every pred cs -> boolean +char-set-any pred cs -> object + The CHAR-SET-EVERY procedure returns true if predicate PRED + returns true of every character in the character set CS. + + Likewise, CHAR-SET-ANY applies PRED to every character in + character set CS, and returns the first true value it finds. + If no character produces a true value, it returns false. + + The order in which these procedures sequence through the elements of + CS is not specified. + + Note that if you need to determine the actual character on which a + predicate returns true, use CHAR-SET-ANY and arrange for the predicate + to return the character parameter as its true value, e.g. + (char-set-any (lambda (c) (and (char-upper-case? c) c)) + cs) + + +** Character-set algebra +======================== +char-set-adjoin cs char1 ... -> char-set +char-set-delete cs char1 ... -> char-set + Add/delete the CHARi characters to/from character set CS. + +char-set-adjoin! cs char1 ... -> char-set +char-set-delete! cs char1 ... -> char-set + Linear-update variants. These procedures are allowed, but not + required, to side-effect their first parameter. + +char-set-complement cs -> char-set +char-set-union cs1 ... -> char-set +char-set-intersection cs1 ... -> char-set +char-set-difference cs1 cs2 ... -> char-set +char-set-xor cs1 ... -> char-set +char-set-diff+intersection cs1 cs2 ... -> [char-set char-set] + These procedures implement set complement, union, intersection, + difference, and exclusive-or for character sets. The union, intersection + and xor operations are n-ary. The difference function is also n-ary, + associates to the left (that is, it computes the difference between + its first argument and the union of all the other arguments), + and requires at least one argument. 
+ + Boundary cases: + (char-set-union) => char-set:empty + (char-set-intersection) => char-set:full + (char-set-xor) => char-set:empty + (char-set-difference cs) => cs + + CHAR-SET-DIFF+INTERSECTION returns both the difference and the + intersection of the arguments -- it partitions its first parameter. + It is equivalent to + (values (char-set-difference cs1 cs2 ...) + (char-set-intersection cs1 (char-set-union cs2 ...))) + but can be implemented more efficiently. + + Programmers should be aware that CHAR-SET-COMPLEMENT could potentially + be a very expensive operation in Scheme implementations that provide + a very large character type, such as 32-bit Unicode. If this is a + possibility, sets can be complemented with respect to a smaller + universe using CHAR-SET-DIFFERENCE. + +char-set-complement! cs -> char-set +char-set-union! cs1 cs2 ... -> char-set +char-set-intersection! cs1 cs2 ... -> char-set +char-set-difference! cs1 cs2 ... -> char-set +char-set-xor! cs1 cs2 ... -> char-set +char-set-diff+intersection! cs1 cs2 cs3 ... -> [char-set char-set] + These are linear-update variants of the set-algebra functions. + They are allowed, but not required, to side-effect their first + (required) parameter. + + + CHAR-SET-DIFF+INTERSECTION! is allowed to side-effect both of + its two required parameters, CS1 and CS2. 
+ +** Standard character sets +========================== +Several character sets are predefined for convenience: + char-set:lower-case Lower-case letters + char-set:upper-case Upper-case letters + char-set:title-case Title-case letters + char-set:letter Letters + char-set:digit Digits + char-set:letter+digit Letters and digits + char-set:graphic Printing characters except spaces + char-set:printing Printing characters including spaces + char-set:whitespace Whitespace characters + char-set:iso-control The ISO control characters + char-set:punctuation Punctuation characters + char-set:symbol Symbol characters + char-set:hex-digit A hexadecimal digit: 0-9, A-F, a-f + char-set:blank Blank characters -- horizontal whitespace + char-set:ascii All characters in the ASCII set. + char-set:empty Empty set + char-set:full All characters + +Note that there may be characters in CHAR-SET:LETTER that are neither upper nor +lower case---this might occur in implementations that use a character type +richer than ASCII, such as Unicode. A "graphic character" is one that would +put ink on your page. While the exact composition of these sets may vary +depending upon the character type provided by the underlying Scheme system, +here are the definitions for some of the sets in an ASCII implementation: + + char-set:lower-case a-z + char-set:upper-case A-Z + char-set:letter A-Z and a-z + char-set:digit 0123456789 + char-set:punctuation !"#%&'()*,-./:;?@[\]_{} + char-set:symbol $+<=>^`|~ + char-set:whitespace Space, newline, tab, form feed, + vertical tab, carriage return + char-set:blank Space and tab + char-set:graphic letter + digit + punctuation + symbol + char-set:printing graphic + whitespace + char-set:iso-control ASCII 0-31 and 127 + +Note that the existence of the CHAR-SET:ASCII set implies that the underlying +character set is required to be at least as rich as ASCII (including +ASCII's control characters). 
+ +Rationale: The name choices reflect a shift from the older +"alphabetic/numeric" terms found in R5RS and Posix to newer, +Unicode-influenced "letter/digit" lexemes. + +------------------------------------------------------------------------------- +* Unicode, Latin-1 and ASCII definitions of the standard character sets +----------------------------------------------------------------------- + +In Unicode Scheme implementations, the base character sets are compatible with +Java's Unicode specifications. For ASCII or Latin-1, we simply restrict the +Unicode set specifications to their first 128 or 256 codes, respectively. +Scheme implementations that are not based on ASCII, Latin-1 or Unicode should +attempt to preserve the sense or spirit of these definitions. + +The following descriptions frequently make reference to the "Unicode character +database." This is a file, available at URL + ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt +Each line contains a description of a Unicode character. The first +semicolon-delimited field of the line gives the hex value of the character's +code; the second field gives the name of the character, and the third field +gives a two-letter category. Other fields give simple 1-1 case-mappings for +the character and other information; see + ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.html +for further description of the file's format. Note in particular the +two-letter category specified in the third field, which is referenced +frequently in the descriptions below. 
+ +** char-set:lower-case +====================== +For Unicode, we follow Java's specification: a character is lowercase if + + it is not in the range [U+2000,U+2FFF], and + + the Unicode attribute table does not give a lowercase mapping for it, and + + at least one of the following is true: + - the Unicode attribute table gives a mapping to uppercase + for the character, or + - the name for the character in the Unicode attribute table contains + the words "SMALL LETTER" or "SMALL LIGATURE". + +The lower-case ASCII characters are + abcdefghijklmnopqrstuvwxyz +Latin-1 adds another 33 lower-case characters to the ASCII set: + 00B5 MICRO SIGN + 00DF LATIN SMALL LETTER SHARP S + 00E0 LATIN SMALL LETTER A WITH GRAVE + 00E1 LATIN SMALL LETTER A WITH ACUTE + 00E2 LATIN SMALL LETTER A WITH CIRCUMFLEX + 00E3 LATIN SMALL LETTER A WITH TILDE + 00E4 LATIN SMALL LETTER A WITH DIAERESIS + 00E5 LATIN SMALL LETTER A WITH RING ABOVE + 00E6 LATIN SMALL LETTER AE + 00E7 LATIN SMALL LETTER C WITH CEDILLA + 00E8 LATIN SMALL LETTER E WITH GRAVE + 00E9 LATIN SMALL LETTER E WITH ACUTE + 00EA LATIN SMALL LETTER E WITH CIRCUMFLEX + 00EB LATIN SMALL LETTER E WITH DIAERESIS + 00EC LATIN SMALL LETTER I WITH GRAVE + 00ED LATIN SMALL LETTER I WITH ACUTE + 00EE LATIN SMALL LETTER I WITH CIRCUMFLEX + 00EF LATIN SMALL LETTER I WITH DIAERESIS + 00F0 LATIN SMALL LETTER ETH + 00F1 LATIN SMALL LETTER N WITH TILDE + 00F2 LATIN SMALL LETTER O WITH GRAVE + 00F3 LATIN SMALL LETTER O WITH ACUTE + 00F4 LATIN SMALL LETTER O WITH CIRCUMFLEX + 00F5 LATIN SMALL LETTER O WITH TILDE + 00F6 LATIN SMALL LETTER O WITH DIAERESIS + 00F8 LATIN SMALL LETTER O WITH STROKE + 00F9 LATIN SMALL LETTER U WITH GRAVE + 00FA LATIN SMALL LETTER U WITH ACUTE + 00FB LATIN SMALL LETTER U WITH CIRCUMFLEX + 00FC LATIN SMALL LETTER U WITH DIAERESIS + 00FD LATIN SMALL LETTER Y WITH ACUTE + 00FE LATIN SMALL LETTER THORN + 00FF LATIN SMALL LETTER Y WITH DIAERESIS +Note that three of these have no corresponding Latin-1 upper-case character: 
+ 00B5 MICRO SIGN + 00DF LATIN SMALL LETTER SHARP S + 00FF LATIN SMALL LETTER Y WITH DIAERESIS +(The compatibility micro character uppercases to the non-Latin-1 Greek capital +mu; the German sharp s character uppercases to the pair of characters "SS," +and the capital y-with-diaeresis is non-Latin-1.) + +(Note that the Java spec for lowercase characters given at + http://java.sun.com/docs/books/jls/html/javalang.doc4.html#14345 +is inconsistent. U+00B5 MICRO SIGN fulfills the requirements for a lower-case +character (as of Unicode 3.0), but is not given in the numeric list of +lower-case character codes.) + +(Note that the Java spec for isLowerCase() given at + http://java.sun.com/products/jdk/1.2/docs/api/java/lang/Character.html#isLowerCase(char) +gives three mutually inconsistent definitions of "lower case." The first is +the definition used in this SRFI. Following text says "A character is +considered to be lowercase if and only if it is specified to be lowercase by +the Unicode 2.0 standard (category Ll in the Unicode specification data +file)." The former spec excludes U+00AA FEMININE ORDINAL INDICATOR and +U+00BA MASCULINE ORDINAL INDICATOR; the latter spec includes them. Finally, +the spec enumerates a list of characters in the Latin-1 subset; this list +excludes U+00B5 MICRO SIGN, which is included in both of the previous specs.) + + +** char-set:upper-case +====================== +For Unicode, we follow Java's specification: a character is uppercase if + + it is not in the range [U+2000,U+2FFF], and + + the Unicode attribute table does not give an uppercase mapping for it + (this excludes titlecase characters), and + + at least one of the following is true: + - the Unicode attribute table gives a mapping to lowercase + for the character, or + - the name for the character in the Unicode attribute table contains + the words "CAPITAL LETTER" or "CAPITAL LIGATURE". 
+ +The upper-case ASCII characters are + ABCDEFGHIJKLMNOPQRSTUVWXYZ +Latin-1 adds another 30 upper-case characters to the ASCII set: + 00C0 LATIN CAPITAL LETTER A WITH GRAVE + 00C1 LATIN CAPITAL LETTER A WITH ACUTE + 00C2 LATIN CAPITAL LETTER A WITH CIRCUMFLEX + 00C3 LATIN CAPITAL LETTER A WITH TILDE + 00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + 00C5 LATIN CAPITAL LETTER A WITH RING ABOVE + 00C6 LATIN CAPITAL LETTER AE + 00C7 LATIN CAPITAL LETTER C WITH CEDILLA + 00C8 LATIN CAPITAL LETTER E WITH GRAVE + 00C9 LATIN CAPITAL LETTER E WITH ACUTE + 00CA LATIN CAPITAL LETTER E WITH CIRCUMFLEX + 00CB LATIN CAPITAL LETTER E WITH DIAERESIS + 00CC LATIN CAPITAL LETTER I WITH GRAVE + 00CD LATIN CAPITAL LETTER I WITH ACUTE + 00CE LATIN CAPITAL LETTER I WITH CIRCUMFLEX + 00CF LATIN CAPITAL LETTER I WITH DIAERESIS + 00D0 LATIN CAPITAL LETTER ETH + 00D1 LATIN CAPITAL LETTER N WITH TILDE + 00D2 LATIN CAPITAL LETTER O WITH GRAVE + 00D3 LATIN CAPITAL LETTER O WITH ACUTE + 00D4 LATIN CAPITAL LETTER O WITH CIRCUMFLEX + 00D5 LATIN CAPITAL LETTER O WITH TILDE + 00D6 LATIN CAPITAL LETTER O WITH DIAERESIS + 00D8 LATIN CAPITAL LETTER O WITH STROKE + 00D9 LATIN CAPITAL LETTER U WITH GRAVE + 00DA LATIN CAPITAL LETTER U WITH ACUTE + 00DB LATIN CAPITAL LETTER U WITH CIRCUMFLEX + 00DC LATIN CAPITAL LETTER U WITH DIAERESIS + 00DD LATIN CAPITAL LETTER Y WITH ACUTE + 00DE LATIN CAPITAL LETTER THORN + + +** char-set:title-case +====================== +In Unicode, a character is titlecase if it has the category Lt in +the character attribute database. 
There are very few of these characters; +here is the entire 31-character list as of Unicode 3.0: + + 01C5 LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON + 01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J + 01CB LATIN CAPITAL LETTER N WITH SMALL LETTER J + 01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z + 1F88 GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI + 1F89 GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI + 1F8A GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI + 1F8B GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI + 1F8C GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI + 1F8D GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI + 1F8E GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + 1F8F GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + 1F98 GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI + 1F99 GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI + 1F9A GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI + 1F9B GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI + 1F9C GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI + 1F9D GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI + 1F9E GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + 1F9F GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI + 1FA8 GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI + 1FA9 GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI + 1FAA GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI + 1FAB GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI + 1FAC GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI + 1FAD GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI + 1FAE GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI + 1FAF GREEK CAPITAL LETTER OMEGA WITH 
DASIA AND PERISPOMENI AND PROSGEGRAMMENI + 1FBC GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI + 1FCC GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI + 1FFC GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI + +There are no ASCII or Latin-1 titlecase characters. + + +** char-set:letter +================== +In Unicode, a letter is any character with one of the letter categories +(Lu, Ll, Lt, Lm, Lo) in the Unicode character database. + +There are 52 ASCII letters + abcdefghijklmnopqrstuvwxyz + ABCDEFGHIJKLMNOPQRSTUVWXYZ + +There are 117 Latin-1 letters. These are the 115 characters that are +members of the Latin-1 CHAR-SET:LOWER-CASE and CHAR-SET:UPPER-CASE sets, +plus + 00AA FEMININE ORDINAL INDICATOR + 00BA MASCULINE ORDINAL INDICATOR +(These two letters are considered lower-case by Unicode, but not by +Java or SRFI 14.) + +** char-set:digit +================= +In Unicode, a character is a digit if it has the category Nd in +the character attribute database. In Latin-1 and ASCII, the only +such characters are 0123456789. In Unicode, there are other digit +characters in other code blocks, such as Gujarati digits and Tibetan +digits. + + +** char-set:hex-digit +===================== +The only hex digits are 0123456789abcdefABCDEF. + + +** char-set:letter+digit +======================== +The union of CHAR-SET:LETTER and CHAR-SET:DIGIT. + + +** char-set:graphic +=================== +A graphic character is one that would put ink on paper. The ASCII and Latin-1 +graphic characters are the members of + CHAR-SET:LETTER + CHAR-SET:DIGIT + CHAR-SET:PUNCTUATION + CHAR-SET:SYMBOL + + +** char-set:printing +==================== +A printing character is one that would occupy space when printed, i.e., +a graphic character or a space character. CHAR-SET:PRINTING is the union +of CHAR-SET:WHITESPACE and CHAR-SET:GRAPHIC. 
+ + +** char-set:whitespace +====================== +In Unicode, a whitespace character is either + - a character with one of the space, line, or paragraph separator categories + (Zs, Zl or Zp) of the Unicode character database. + - U+0009 Horizontal tabulation (\t control-I) + - U+000A Line feed (\n control-J) + - U+000B Vertical tabulation (\v control-K) + - U+000C Form feed (\f control-L) + - U+000D Carriage return (\r control-M) + +There are 24 whitespace characters in Unicode 3.0: + 0009 HORIZONTAL TABULATION \t control-I + 000A LINE FEED \n control-J + 000B VERTICAL TABULATION \v control-K + 000C FORM FEED \f control-L + 000D CARRIAGE RETURN \r control-M + 0020 SPACE Zs + 00A0 NO-BREAK SPACE Zs + 1680 OGHAM SPACE MARK Zs + 2000 EN QUAD Zs + 2001 EM QUAD Zs + 2002 EN SPACE Zs + 2003 EM SPACE Zs + 2004 THREE-PER-EM SPACE Zs + 2005 FOUR-PER-EM SPACE Zs + 2006 SIX-PER-EM SPACE Zs + 2007 FIGURE SPACE Zs + 2008 PUNCTUATION SPACE Zs + 2009 THIN SPACE Zs + 200A HAIR SPACE Zs + 200B ZERO WIDTH SPACE Zs + 2028 LINE SEPARATOR Zl + 2029 PARAGRAPH SEPARATOR Zp + 202F NARROW NO-BREAK SPACE Zs + 3000 IDEOGRAPHIC SPACE Zs + +The ASCII whitespace characters are the first six characters in the above list +-- line feed, horizontal tabulation, vertical tabulation, form feed, carriage +return, and space. These are also exactly the characters recognised by the +Posix isspace() procedure. Latin-1 adds the no-break space. + +Note: Java's isWhitespace() method is incompatible, including + 001C FILE SEPARATOR (control-\) + 001D GROUP SEPARATOR (control-]) + 001E RECORD SEPARATOR (control-^) + 001F UNIT SEPARATOR (control-_) +and excluding + 00A0 NO-BREAK SPACE + +Java's excluding the no-break space means that tokenizers can simply break +character streams at "whitespace" boundaries. However, the exclusion introduces +exceptions in other places, e.g. CHAR-SET:PRINTING is no longer simply the +union of CHAR-SET:GRAPHIC and CHAR-SET:WHITESPACE. 
+ + +** char-set:iso-control +======================= +The ISO control characters are the Unicode/Latin-1 characters in the ranges +[U+0000,U+001F] and [U+007F,U+009F]. + +ASCII restricts this set to the characters in the range [U+0000,U+001F] +plus the character U+007F. + +Note that Unicode defines other control characters which do not belong to this +set (hence the qualifying prefix "iso-" in the name). This restriction is +compatible with the Java IsISOControl() method. + + +** char-set:punctuation +======================= +In Unicode, a punctuation character is any character that has one of the +punctuation categories in the Unicode character database (Pc, Pd, Ps, +Pe, Pi, Pf, or Po.) + +ASCII has 23 punctuation characters: + !"#%&'()*,-./:;?@[\]_{} + +Latin-1 adds six more: + 00A1 INVERTED EXCLAMATION MARK + 00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + 00AD SOFT HYPHEN + 00B7 MIDDLE DOT + 00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + 00BF INVERTED QUESTION MARK + +Note that the nine ASCII characters $+<=>^`|~ are *not* punctuation. +They are "symbols." + + +** char-set:symbol +================== +In Unicode, a symbol is any character that has one of the symbol categories +in the Unicode character database (Sm, Sc, Sk, or So). There are nine ASCII +symbol characters: + $+<=>^`|~ + +Latin-1 adds 18 more: + 00A2 CENT SIGN + 00A3 POUND SIGN + 00A4 CURRENCY SIGN + 00A5 YEN SIGN + 00A6 BROKEN BAR + 00A7 SECTION SIGN + 00A8 DIAERESIS + 00A9 COPYRIGHT SIGN + 00AC NOT SIGN + 00AE REGISTERED SIGN + 00AF MACRON + 00B0 DEGREE SIGN + 00B1 PLUS-MINUS SIGN + 00B4 ACUTE ACCENT + 00B6 PILCROW SIGN + 00B8 CEDILLA + 00D7 MULTIPLICATION SIGN + 00F7 DIVISION SIGN + + +** char-set:blank +================= +Blank chars are horizontal whitespace. In Unicode, a blank character is either + - a character with the space separator category (Zs) in the Unicode + character database. 
+ - U+0009 Horizontal tabulation (\t control-I) + +There are eighteen blank characters in Unicode 3.0: + 0009 HORIZONTAL TABULATION \t control-I + 0020 SPACE Zs + 00A0 NO-BREAK SPACE Zs + 1680 OGHAM SPACE MARK Zs + 2000 EN QUAD Zs + 2001 EM QUAD Zs + 2002 EN SPACE Zs + 2003 EM SPACE Zs + 2004 THREE-PER-EM SPACE Zs + 2005 FOUR-PER-EM SPACE Zs + 2006 SIX-PER-EM SPACE Zs + 2007 FIGURE SPACE Zs + 2008 PUNCTUATION SPACE Zs + 2009 THIN SPACE Zs + 200A HAIR SPACE Zs + 200B ZERO WIDTH SPACE Zs + 202F NARROW NO-BREAK SPACE Zs + 3000 IDEOGRAPHIC SPACE Zs + +The ASCII blank characters are the first two characters above -- +horizontal tab and space. Latin-1 adds the no-break space. + +Java doesn't have the concept of "blank" characters, so there are no +compatibility issues. + + +------------------------------------------------------------------------------- +* Reference implementation +-------------------------- + +This SRFI comes with a reference implementation. It resides at: + http://srfi.schemers.org/srfi-14/srfi-14.scm +I have placed this source on the Net with an unencumbered, "open" copyright. +Some of the code in the reference implementation bears a distant family +relation to the MIT Scheme implementation, and being derived from that code, +is covered by the MIT Scheme copyright (which is a generic BSD-style +open-source copyright -- see the source file for details). The remainder of +the code was written by myself for scsh or for this SRFI; I have placed this +code under the scsh copyright, which is also a generic BSD-style open-source +copyright. + +The code is written for portability and should be simple to port to +any Scheme. 
It has only the following deviations from R4RS, clearly +discussed in the comments: + - an ERROR procedure; + - the R5RS VALUES procedure for producing multiple return values; + - a simple CHECK-ARG procedure for argument checking; + - LET-OPTIONALS* and :OPTIONAL macros for parsing, checking & defaulting + optional arguments from rest lists; + - The SRFI-9 DEFINE-RECORD-TYPE form; + - BITWISE-AND for the hash function; + - %LATIN1->CHAR & %CHAR->LATIN1. + +The library is written for clarity and well-commented; the current source is +about 375 lines of source code and 375 lines of comments and white space. +It is also written for efficiency. Fast paths are provided for common cases. + +This is not to say that the implementation can't be tuned up for +a specific Scheme implementation. There are notes in comments addressing +ways implementors can tune the reference implementation for performance. + +In short, I've written the reference implementation to make it as painless +as possible for an implementor -- or a regular programmer -- to adopt this +library and get good results with it. + +The code uses a rather simple-minded, inefficient representation for +ASCII/Latin-1 char-sets -- a 256-character string. The character whose code is +I is in the set if S[I] = ASCII 1 (soh, or ^a); not in the set if S[I] = ASCII +0 (nul). A much faster and denser representation would be 16 or 32 bytes worth +of bit string. A portable implementation using bit sets awaits standards for +bitwise logical-ops and byte vectors. + +"Large" character types, such as Unicode, should use a sparse representation, +taking care that the Latin-1 subset continues to be represented with a +dense 32-byte bit set. + + +------------------------------------------------------------------------------- +* Acknowledgements +------------------ + +The design of this library benefited greatly from the feedback provided during +the SRFI discussion phase. 
Among those contributing thoughtful commentary and +suggestions, both on the mailing list and by private discussion, were Paolo +Amoroso, Lars Arvestad, Alan Bawden, Jim Bender, Dan Bornstein, Per Bothner, +Will Clinger, Brian Denheyer, Kent Dybvig, Sergei Egorov, Marc Feeley, +Matthias Felleisen, Will Fitzgerald, Matthew Flatt, Arthur A. Gleckler, Ben +Goetter, Sven Hartrumpf, Erik Hilsdale, Shiro Kawai, Richard Kelsey, Oleg +Kiselyov, Bengt Kleberg, Donovan Kolbly, Bruce Korb, Shriram Krishnamurthi, +Bruce Lewis, Tom Lord, Brad Lucier, Dave Mason, David Rush, Klaus Schilling, +Jonathan Sobel, Mike Sperber, Mikael Staldal, Vladimir Tsyshevsky, Donald +Welsh, and Mike Wilson. I am grateful to them for their assistance. + +I am also grateful to the authors, implementors and documentors of all the systems +mentioned in the introduction. Aubrey Jaffer and Kent Pitman should be noted +for their work in producing Web-accessible versions of the R5RS and Common +Lisp spec, which was a tremendous aid. + +This is not to imply that these individuals necessarily endorse the final +results, of course. + +During this document's long development period, great patience was exhibited +by Mike Sperber, who is the editor for the SRFI, and by Hillary Sullivan, +who is not. + +------------------------------------------------------------------------------- +* References & links +-------------------- + +[Java] + The following URLs provide documentation on relevant Java classes. + + http://java.sun.com/products/jdk/1.2/docs/api/java/lang/Character.html + http://java.sun.com/products/jdk/1.2/docs/api/java/lang/String.html + http://java.sun.com/products/jdk/1.2/docs/api/java/lang/StringBuffer.html + http://java.sun.com/products/jdk/1.2/docs/api/java/text/Collator.html + http://java.sun.com/products/jdk/1.2/docs/api/java/text/package-summary.html + +[MIT-Scheme] + http://www.swiss.ai.mit.edu/projects/scheme/ + +[R5RS] + Revised^5 report on the algorithmic language Scheme. + R. Kelsey, W. 
Clinger, J. Rees (editors). + Higher-Order and Symbolic Computation, Vol. 11, No. 1, September, 1998. + and ACM SIGPLAN Notices, Vol. 33, No. 9, October, 1998. + + Available at http://www.schemers.org/Documents/Standards/ + +[SRFI] + The SRFI web site. + http://srfi.schemers.org/ + +[SRFI-14] + SRFI-14: Character-set library. + http://srfi.schemers.org/srfi-14/ + + This document, in HTML: + http://srfi.schemers.org/srfi-14/srfi-14.html + This document, in plain text format: + http://srfi.schemers.org/srfi-14/srfi-14.txt + Source code for the reference implementation: + http://srfi.schemers.org/srfi-14/srfi-14.scm + Scheme 48 module specification, with typings: + http://srfi.schemers.org/srfi-14/srfi-14-s48-module.scm + Regression-test suite: + http://srfi.schemers.org/srfi-14/srfi-14-tests.scm + +[Unicode] + http://www.unicode.org/ + +[UnicodeData] + The Unicode character database. + ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.html + ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt + + +------------------------------------------------------------------------------- +* Copyright +----------- + +Certain portions of this document -- the specific, marked segments of text +describing the R5RS procedures -- were adapted with permission from the R5RS +report. + +All other text is copyright (C) Olin Shivers (1998, 1999). +All Rights Reserved. + +This document and translations of it may be copied and furnished to others, +and derivative works that comment on or otherwise explain it or assist in its +implementation may be prepared, copied, published and distributed, in whole or +in part, without restriction of any kind, provided that the above copyright +notice and this paragraph are included on all such copies and derivative +works. 
However, this document itself may not be modified in any way, such as +by removing the copyright notice or references to the Scheme Request For +Implementation process or editors, except as needed for the purpose of +developing SRFIs in which case the procedures for copyrights defined in the +SRFI process must be followed, or as required to translate it into languages +other than English. + +The limited permissions granted above are perpetual and will not be revoked by +the authors or their successors or assigns. + +This document and the information contained herein is provided on an "AS IS" +basis and THE AUTHORS AND THE SRFI EDITORS DISCLAIM ALL WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE +INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF +MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + + + +------------------------------------------------------------------------------- +* Ispell "buffer local" dictionary +---------------------------------- + +Ispell dumps "buffer local" words here. Please ignore. 
+ + LocalWords: SRFI Unicode API RS lib ARG ascii xor diff defs Generalise cs CSi + LocalWords: kons knil proc upcase cset lp eof lis cdr pred ary CHARi Posix op + LocalWords: uniquified DrScheme soh nul HTML srfi html txt scm Clinger Rees + LocalWords: SIGPLAN refs ucs iso CS's downcase IEC conformant JIS ASCII URL + LocalWords: FFF abcdefghijklmnopqrstuvwxyz DF DIAERESIS AE EA EB EC EE EF ETH + LocalWords: FA FB FC FD FF SS diaeresis isLowerCase Ll AA BA titlecase CA CB + LocalWords: CC CD CE CF DA DC DD Lt CARON PSILI PROSGEGRAMMENI DASIA VARIA Lu + LocalWords: OXIA PERISPOMENI FAA FAB FAC FAE FAF FBC FFC Lm Lo abcdefABCDEF + LocalWords: Zs Zl Zp OGHAM IDEOGRAPHIC recognised isspace isWhitespace Pc Pd + LocalWords: tokenizers IsISOControl Ps Pe Pf AB BB BF Sm Sc Sk AC AF MACRON + LocalWords: PILCROW obj EQ scsh ops UnicodeData Paolo Amoroso Arvestad Bawden + LocalWords: Bornstein Bothner Denheyer Dybvig Egorov Feeley Matthias Flatt eq + LocalWords: Felleisen Gleckler Goetter Sven Hartrumpf Hilsdale Shiro Kawai + LocalWords: Kiselyov Bengt Kleberg Kolbly Korb Shriram Krishnamurthi Lucier + LocalWords: Schilling Sobel Mikael Staldal Tsyshevsky documentors Jaffer ans + LocalWords: Sperber bignum fixnum ref init doc dict subform diff --git a/scsh/lib/cset-obsolete.scm b/scsh/lib/cset-obsolete.scm new file mode 100644 index 0000000..c37e761 --- /dev/null +++ b/scsh/lib/cset-obsolete.scm @@ -0,0 +1,52 @@ +;;; Support for obsolete, deprecated 0.5.2 char-set procedures. +;;; Will go away in a future release. + +(define-interface obsolete-char-set-interface + (export char-set-members ; char-set->list + chars->char-set ; list->char-set + ascii-range->char-set ; ucs-range->char-set (not exact) + predicate->char-set ; char-set-filter (not exact) + ->char-set ; no longer handles a predicate + char-set-every? ; char-set-every + + char-set-invert ; char-set-complement + char-set-invert! ; char-set-complement! 
+ + char-set:alphabetic ; char-set:letter + char-set:numeric ; char-set:digit + char-set:alphanumeric ; char-set:letter+digit + char-set:control)) ; char-set:iso-control + + +(define-structure obsolete-char-set-lib obsolete-char-set-interface + (open scsh-utilities char-set-lib scheme) + (begin + ;; Each procedure alias hands DEPRECATED-PROC its replacement, the old name, and a migration hint. + (define char-set-members + (deprecated-proc char-set->list 'char-set-members + "Use CHAR-SET->LIST instead.")) + (define chars->char-set + (deprecated-proc list->char-set 'chars->char-set + "Use LIST->CHAR-SET instead.")) + (define ascii-range->char-set + (deprecated-proc (lambda (lower upper) (ucs-range->char-set lower upper #t)) + 'ascii-range->char-set + "Use UCS-RANGE->CHAR-SET instead.")) + (define predicate->char-set + (deprecated-proc (lambda (pred) (char-set-filter pred char-set:full)) + 'predicate->char-set + "Change code to use CHAR-SET-FILTER.")) + (define char-set-every? + (deprecated-proc char-set-every 'char-set-every? + "Use CHAR-SET-EVERY instead.")) + (define char-set-invert + (deprecated-proc char-set-complement 'char-set-invert + "Use CHAR-SET-COMPLEMENT instead.")) + (define char-set-invert! + (deprecated-proc char-set-complement! 'char-set-invert! + "Use CHAR-SET-COMPLEMENT! instead.")) + ;; The obsolete set constants are plain aliases for the renamed sets (no DEPRECATED-PROC wrapper). + (define char-set:alphabetic char-set:letter) + (define char-set:numeric char-set:digit) + (define char-set:alphanumeric char-set:letter+digit) + (define char-set:control char-set:iso-control))) diff --git a/scsh/lib/cset-package.scm b/scsh/lib/cset-package.scm new file mode 100644 index 0000000..28812c0 --- /dev/null +++ b/scsh/lib/cset-package.scm @@ -0,0 +1,151 @@ +;;; SRFI-14 interface for Scheme48 -*- Scheme -*- +;;; +;;; Complete interface spec for the SRFI-14 char-set-lib library in the +;;; Scheme48 interface and module language. The interface is fully typed, in +;;; the Scheme48 type notation. The structure definitions also provide a +;;; formal description of the external dependencies of the source code. 
+ +(define-interface char-set-interface + (export (char-set? (proc (:value) :boolean)) + ((char-set= char-set<=) (proc (&rest :value) :boolean)) + + (char-set-hash (proc (:value &opt :exact-integer) :exact-integer)) + + ;; Cursors are exact integers in the reference implementation. + ;; These typings would be different with a different cursor + ;; implementation. + ;; Too bad Scheme doesn't have abstract data types. + (char-set-cursor (proc (:value) :exact-integer)) + (char-set-ref (proc (:value :exact-integer) :char)) + (char-set-cursor-next (proc (:value :exact-integer) :exact-integer)) + (end-of-char-set? (proc (:value) :boolean)) + + (char-set-fold (proc ((proc (:char :value) :value) :value :value) + :value)) + (char-set-unfold (proc ((proc (:value) :boolean) + (proc (:value) :value) + (proc (:value) :value) + :value + &opt :value) + :value)) + + (char-set-unfold! (proc ((proc (:value) :boolean) + (proc (:value) :value) + (proc (:value) :value) + :value :value) + :value)) + + (char-set-for-each (proc ((proc (:char) :values) :value) :unspecific)) + (char-set-map (proc ((proc (:char) :char) :value) :value)) + + (char-set-copy (proc (:value) :value)) + + (char-set (proc (&rest :char) :value)) + + (list->char-set (proc (:value &opt :value) :value)) + (list->char-set! (proc (:value :value) :value)) + + (string->char-set (proc (:value &opt :value) :value)) + (string->char-set! (proc (:value :value) :value)) + + (ucs-range->char-set (proc (:exact-integer :exact-integer &opt + :boolean :value) + :value)) + (ucs-range->char-set! (proc (:exact-integer :exact-integer + :boolean :value) + :value)) + + (char-set-filter (proc ((proc (:char) :boolean) :value &opt :value) :value)) + (char-set-filter! (proc ((proc (:char) :boolean) :value :value) :value)) + + (->char-set (proc (:value) :value)) + + (char-set-size (proc (:value) :exact-integer)) + (char-set-count (proc ((proc (:char) :boolean) :value) :exact-integer)) + (char-set-contains? 
(proc (:value :char) :boolean)) ; argument order is (cs char) + + (char-set-every (proc ((proc (:char) :boolean) :value) :boolean)) + (char-set-any (proc ((proc (:char) :boolean) :value) :value)) + + ((char-set-adjoin char-set-delete + char-set-adjoin! char-set-delete!) + (proc (:value &rest :char) :value)) + + (char-set->list (proc (:value) :value)) + (char-set->string (proc (:value) :string)) + + (char-set-complement (proc (:value) :value)) + ((char-set-union char-set-intersection char-set-xor) + (proc (&rest :value) :value)) + + (char-set-difference (proc (:value &rest :value) :value)) ; n-ary: cs1 cs2 ... + + (char-set-diff+intersection (proc (:value &rest :value) + (some-values :value :value))) + + (char-set-complement! (proc (:value) :value)) + + ((char-set-union! char-set-intersection! + char-set-xor! char-set-difference!) + (proc (:value &rest :value) :value)) ; n-ary: cs1 cs2 ... + + (char-set-diff+intersection! (proc (:value :value &rest :value) + (some-values :value :value))) + + char-set:lower-case + char-set:upper-case + char-set:letter + char-set:digit + char-set:letter+digit + char-set:graphic + char-set:printing + char-set:whitespace + char-set:blank + char-set:iso-control + char-set:punctuation + char-set:symbol + char-set:hex-digit + char-set:ascii + char-set:empty + char-set:full + )) + +; rdelim.scm gets into the innards of char-sets. +(define-interface scsh-char-set-low-level-interface + (export (char-set:s (proc (:value) :string)))) + +(define-structures ((char-set-lib char-set-interface) + (scsh-char-set-low-level-lib scsh-char-set-low-level-interface)) + (open error-package ; ERROR procedure + let-opt ; LET-OPTIONALS* and :OPTIONAL + ascii ; CHAR->ASCII ASCII->CHAR + bitwise ; BITWISE-AND + jar-d-r-t-package ; DEFINE-RECORD-TYPE/JAR macro. 
+ scheme) + + (begin (define (check-arg pred val caller) + (let lp ((val val)) + (if (pred val) val (lp (error "Bad argument" val pred caller))))) + + (define %latin1->char ascii->char) ; Works for S48 + (define %char->latin1 char->ascii) ; Works for S48 + + ;; Here's a SRFI-9 d-r-t defined in terms of jar's almost-identical + ;; d-r-t. + (define-syntax define-record-type + (syntax-rules () + ((define-record-type ?name ?stuff ...) + (define-record-type/jar ?name ?name ?stuff ...))))) + + (files cset-lib) + (optimize auto-integrate)) + +;;; Import jar's DEFINE-RECORD-TYPE macro, and export it under the +;;; name DEFINE-RECORD-TYPE/JAR. +(define-structure jar-d-r-t-package (export (define-record-type/jar :syntax)) + (open define-record-types ; JAR's record macro + scheme) + (begin (define-syntax define-record-type/jar + (syntax-rules () + ((define-record-type/jar ?stuff ...) + (define-record-type ?stuff ...)))))) diff --git a/scsh/lib/cset-tests.scm b/scsh/lib/cset-tests.scm new file mode 100644 index 0000000..0b96314 --- /dev/null +++ b/scsh/lib/cset-tests.scm @@ -0,0 +1,200 @@ +;;; This is a regression testing suite for the SRFI-14 char-set library. +;;; Olin Shivers + +(let-syntax ((test (syntax-rules () + ((test form ...) + (cond ((not form) (error "Test failed" 'form)) ... + (else 'OK)))))) + (let ((vowel (lambda (c) (member c '(#\a #\e #\i #\o #\u))))) + +(test + (not (char-set? 5)) + + (char-set? 
(char-set #\a #\e #\i #\o #\u)) + + (char-set=) + (char-set= (char-set)) + + (char-set= (char-set #\a #\e #\i #\o #\u) + (string->char-set "ioeauaiii")) + + (not (char-set= (char-set #\e #\i #\o #\u) + (string->char-set "ioeauaiii"))) + + (char-set<=) + (char-set<= (char-set)) + + (char-set<= (char-set #\a #\e #\i #\o #\u) + (string->char-set "ioeauaiii")) + + (char-set<= (char-set #\e #\i #\o #\u) + (string->char-set "ioeauaiii")) + + (<= 0 (char-set-hash char-set:graphic 100) 99) + + (= 4 (char-set-fold (lambda (c i) (+ i 1)) 0 + (char-set #\e #\i #\o #\u #\e #\e))) + + (char-set= (string->char-set "eiaou2468013579999") + (char-set-unfold null? car cdr '(#\a #\e #\i #\o #\u #\u #\u) + char-set:digit)) + + (char-set= (string->char-set "eiaou246801357999") + (char-set-unfold! null? car cdr '(#\a #\e #\i #\o #\u) + (string->char-set "0123456789"))) + + (not (char-set= (string->char-set "eiaou246801357") + (char-set-unfold! null? car cdr '(#\a #\e #\i #\o #\u) + (string->char-set "0123456789")))) + + (let ((cs (string->char-set "0123456789"))) + (char-set-for-each (lambda (c) (set! cs (char-set-delete cs c))) + (string->char-set "02468000")) + (char-set= cs (string->char-set "97531"))) + + (not (let ((cs (string->char-set "0123456789"))) + (char-set-for-each (lambda (c) (set! 
cs (char-set-delete cs c))) + (string->char-set "02468")) + (char-set= cs (string->char-set "7531")))) + + (char-set= (char-set-map char-upcase (string->char-set "aeiou")) + (string->char-set "IOUAEEEE")) + + (not (char-set= (char-set-map char-upcase (string->char-set "aeiou")) + (string->char-set "OUAEEEE"))) + + (char-set= (char-set-copy (string->char-set "aeiou")) + (string->char-set "aeiou")) + + (char-set= (char-set #\x #\y) (string->char-set "xy")) + (not (char-set= (char-set #\x #\y #\z) (string->char-set "xy"))) + + (char-set= (string->char-set "xy") (list->char-set '(#\x #\y))) + (not (char-set= (string->char-set "axy") (list->char-set '(#\x #\y)))) + + (char-set= (string->char-set "xy12345") + (list->char-set '(#\x #\y) (string->char-set "12345"))) + (not (char-set= (string->char-set "y12345") + (list->char-set '(#\x #\y) (string->char-set "12345")))) + + (char-set= (string->char-set "xy12345") + (list->char-set! '(#\x #\y) (string->char-set "12345"))) + (not (char-set= (string->char-set "y12345") + (list->char-set! '(#\x #\y) (string->char-set "12345")))) + + (char-set= (string->char-set "aeiou12345") + (char-set-filter vowel? char-set:ascii (string->char-set "12345"))) + (not (char-set= (string->char-set "aeou12345") + (char-set-filter vowel? char-set:ascii (string->char-set "12345")))) + + (char-set= (string->char-set "aeiou12345") + (char-set-filter! vowel? char-set:ascii (string->char-set "12345"))) + (not (char-set= (string->char-set "aeou12345") + (char-set-filter! vowel? char-set:ascii (string->char-set "12345")))) + + + (char-set= (string->char-set "abcdef12345") + (ucs-range->char-set 97 103 #t (string->char-set "12345"))) + (not (char-set= (string->char-set "abcef12345") + (ucs-range->char-set 97 103 #t (string->char-set "12345")))) + + (char-set= (string->char-set "abcdef12345") + (ucs-range->char-set! 97 103 #t (string->char-set "12345"))) + (not (char-set= (string->char-set "abcef12345") + (ucs-range->char-set! 
97 103 #t (string->char-set "12345")))) + + + (char-set= (->char-set #\x) + (->char-set "x") + (->char-set (char-set #\x))) + + (not (char-set= (->char-set #\x) + (->char-set "y") + (->char-set (char-set #\x)))) + + (= 10 (char-set-size (char-set-intersection char-set:ascii char-set:digit))) + + (= 5 (char-set-count vowel? char-set:ascii)) + + (equal? '(#\x) (char-set->list (char-set #\x))) + (not (equal? '(#\X) (char-set->list (char-set #\x)))) + + (equal? "x" (char-set->string (char-set #\x))) + (not (equal? "X" (char-set->string (char-set #\x)))) + + (char-set-contains? (->char-set "xyz") #\x) + (not (char-set-contains? (->char-set "xyz") #\a)) + + (char-set-every char-lower-case? (->char-set "abcd")) + (not (char-set-every char-lower-case? (->char-set "abcD"))) + (char-set-any char-lower-case? (->char-set "abcd")) + (not (char-set-any char-lower-case? (->char-set "ABCD"))) + + (char-set= (->char-set "ABCD") + (let ((cs (->char-set "abcd"))) + (let lp ((cur (char-set-cursor cs)) (ans '())) + (if (end-of-char-set? cur) (list->char-set ans) + (lp (char-set-cursor-next cs cur) + (cons (char-upcase (char-set-ref cs cur)) ans)))))) + + + (char-set= (char-set-adjoin (->char-set "123") #\x #\a) + (->char-set "123xa")) + (not (char-set= (char-set-adjoin (->char-set "123") #\x #\a) + (->char-set "123x"))) + (char-set= (char-set-adjoin! (->char-set "123") #\x #\a) + (->char-set "123xa")) + (not (char-set= (char-set-adjoin! (->char-set "123") #\x #\a) + (->char-set "123x"))) + + (char-set= (char-set-delete (->char-set "123") #\2 #\a #\2) + (->char-set "13")) + (not (char-set= (char-set-delete (->char-set "123") #\2 #\a #\2) + (->char-set "13a"))) + (char-set= (char-set-delete! (->char-set "123") #\2 #\a #\2) + (->char-set "13")) + (not (char-set= (char-set-delete! 
(->char-set "123") #\2 #\a #\2) + (->char-set "13a"))) + + (char-set= (char-set-intersection char-set:hex-digit (char-set-complement char-set:digit)) + (->char-set "abcdefABCDEF")) + (char-set= (char-set-intersection! (char-set-complement! (->char-set "0123456789")) + char-set:hex-digit) + (->char-set "abcdefABCDEF")) + + (char-set= (char-set-union char-set:hex-digit + (->char-set "abcdefghijkl")) + (->char-set "abcdefABCDEFghijkl0123456789")) + (char-set= (char-set-union! (->char-set "abcdefghijkl") + char-set:hex-digit) + (->char-set "abcdefABCDEFghijkl0123456789")) + + (char-set= (char-set-difference (->char-set "abcdefghijklmn") + char-set:hex-digit) + (->char-set "ghijklmn")) + (char-set= (char-set-difference! (->char-set "abcdefghijklmn") + char-set:hex-digit) + (->char-set "ghijklmn")) + + (char-set= (char-set-xor (->char-set "0123456789") + char-set:hex-digit) + (->char-set "abcdefABCDEF")) + (char-set= (char-set-xor! (->char-set "0123456789") + char-set:hex-digit) + (->char-set "abcdefABCDEF")) + + (call-with-values (lambda () + (char-set-diff+intersection char-set:hex-digit + char-set:letter)) + (lambda (d i) + (and (char-set= d (->char-set "0123456789")) + (char-set= i (->char-set "abcdefABCDEF"))))) + + (call-with-values (lambda () + (char-set-diff+intersection! (char-set-copy char-set:hex-digit) + (char-set-copy char-set:letter))) + (lambda (d i) + (and (char-set= d (->char-set "0123456789")) + (char-set= i (->char-set "abcdefABCDEF")))))) + +)) diff --git a/scsh/lib/list-lib.scm b/scsh/lib/list-lib.scm index 2491355..7386882 100644 --- a/scsh/lib/list-lib.scm +++ b/scsh/lib/list-lib.scm @@ -16,6 +16,11 @@ ;;; This implementation is intended as a portable reference implementation ;;; for SRFI-1. See the porting notes below for more information. +;;; Revision history +;;;;;;;;;;;;;;;;;;;; +;;; This is version 1.1. 12/18/2000 +;;; Fixes a small bug in DELETE-DUPLICATES!. 
+ ;;; Exported: ;;; xcons tree-copy make-list list-tabulate cons* list-copy ;;; proper-list? circular-list? dotted-list? not-pair? null-list? list= @@ -384,7 +389,7 @@ (define (null-list? l) (cond ((pair? l) #f) ((null? l) #t) - (else (error "null-pair?: argument out of domain" l)))) + (else (error "null-list?: argument out of domain" l)))) (define (list= = . lists) @@ -1239,7 +1244,7 @@ (new-tail (recur (delete x tail elt=)))) (if (eq? tail new-tail) lis (cons x new-tail))))))) -(define (delete-duplicates! lis maybe-=) +(define (delete-duplicates! lis . maybe-=) (let ((elt= (:optional maybe-= equal?))) (check-arg procedure? elt= delete-duplicates!) (let recur ((lis lis)) diff --git a/scsh/lib/srfi-1.html b/scsh/lib/srfi-1.html index 125ae20..18e6c39 100644 --- a/scsh/lib/srfi-1.html +++ b/scsh/lib/srfi-1.html @@ -1,4 +1,4 @@ - -
+Olin Shivers +
- Olin Shivers / - shivers@ai.mit.edu + http://www.ai.mit.edu/~shivers/ / + shivers@ai.mit.edu + ++This SRFI is currently in ``final'' status. To see an explanation of each status that a SRFI can hold, see here. +You can access the discussion via the archive of the mailing list. +
+
The set of basic list and pair operations provided by R4RS/R5RS Scheme is far from satisfactory. Because this set is so small and basic, most @@ -278,9 +294,9 @@ library and get good results with it.
Here is a short list of the procedures provided by the list-lib package. -R5RS procedures are shown in -bold; -extended R5RS +R5RS procedures are shown in +bold; +extended R5RS procedures, in bold italic.
car
pair -> value
+car
pair -> value
cdr
pair -> value
(eq? x y)
=> (= x y)
.
Note that this implies, in turn, that two lists that are eq?
are
also set-equal by any legal comparison procedure. This allows for
constant-time determination of set operations on eq?
lists.
@@ -3142,7 +3160,7 @@ John David Stone, and Joerg F. Wittenberger. I am grateful to them for their
assistance.
I am also grateful the authors, implementors and documentors of all the systems -mentioned in the introduction. Aubrey Jaffer and Kent Pitman should be noted +mentioned in the rationale. Aubrey Jaffer and Kent Pitman should be noted for their work in producing Web-accessible versions of the R5RS and Common Lisp spec, which was a tremendous aid.
@@ -3156,31 +3174,15 @@ results, of course.