From c266ffbf4607c2cedde40d218ee475b09b6c1d49 Mon Sep 17 00:00:00 2001 From: shivers Date: Tue, 16 Jun 1998 21:04:38 +0000 Subject: [PATCH] Extended char-set package. --- scsh/char-set.scm | 310 ++++++++++++++++++++++++--------------- scsh/rdelim.scm | 20 +-- scsh/scsh-interfaces.scm | 45 ++++-- scsh/scsh-package.scm | 5 +- 4 files changed, 244 insertions(+), 136 deletions(-) diff --git a/scsh/char-set.scm b/scsh/char-set.scm index c9178dd..d58a724 100644 --- a/scsh/char-set.scm +++ b/scsh/char-set.scm @@ -1,13 +1,42 @@ ;;; -*-Scheme-*- ;;; ;;; Character Sets package -;;; ported from MIT Scheme runtime -;;; by Brian D. Carlstrom -;;; Sleazy code. +;;; - ported from MIT Scheme runtime +;;; by Brian D. Carlstrom +;;; - Rehacked & extended by Olin Shivers 6/98. + +;;; This is not great code. Char sets are represented as 256-char +;;; strings. If char i is ASCII 0, then it isn't in the set; if char i +;;; is ASCII 1, then it is in the set. +;;; - Should be rewritten to use bit strings, or at least byte vecs. +;;; - Is ASCII/Latin-1 specific. Would certainly have to be rewritten +;;; for Unicode. +;;; - The standard character sets are not Latin-1 compliant, just ASCII. + +;;; This code uses jar's DEFINE-RECORD-TYPE macro to define the char-set +;;; record type, because the scsh-standard DEFINE-RECORD form automatically +;;; defines a COPY-FOO function, which is not the one we want, being a shallow +;;; copy of the record fields. + +;;; New dfns: +;;; (char-set= cs1 cs2) +;;; (char-set<= cs1 cs2) +;;; (reduce-char-set kons knil cs) +;;; (set-char-set! cs char in?) +;;; (char-set-for-each f cs) +;;; (copy-char-set cs) +;;; (char-set-size cs) +;;; char-set:printing (char-printing? c) +;;; char-set:blank (char-blank? c) +;;; char-set:control (char-control? c) +;;; char-set:hex-digit (char-hex-digit? c) +;;; char-set:ascii (char-ascii? c) +;;; char-set:empty +;;; char-set:full (define char:newline (ascii->char 13)) (define char:tab (ascii->char 9)) -(define char:linefeed (ascii->char 13)) +(define char:vtab (ascii->char 11)) (define char:page (ascii->char 12)) (define char:return (ascii->char 10)) (define char:space (ascii->char 32)) @@ -23,42 +52,95 @@ ;;;; Character Sets -(define (char-set? object) - (and (string? object) - (= (string-length object) 256))) +;(define-record char-set +; s) ; 256-char string; each char is either ASCII 0 or ASCII 1. + +;;; Use jar's record macro. +(define-record-type char-set :char-set + (make-char-set s) + char-set? + (s char-set:s)) + +(define (copy-char-set cs) (make-char-set (string-copy (char-set:s cs)))) + +(define (char-set= cs1 cs2) + (let ((s1 (char-set:s cs1)) + (s2 (char-set:s cs2))) + (let lp ((i 255)) + (or (< i 0) + (and (char=? (string-ref s1 i) (string-ref s2 i)) + (lp (- i 1))))))) + +(define (char-set<= cs1 cs2) + (let ((s1 (char-set:s cs1)) + (s2 (char-set:s cs2))) + (let lp ((i 255)) + (or (< i 0) + (and (<= (char->ascii (string-ref s1 i)) + (char->ascii (string-ref s2 i))) + (lp (- i 1))))))) + + +(define (char-set-size cs) + (let ((s (char-set:s cs))) + (let lp ((i 255) (size 0)) + (if (< i 0) size + (lp (- i 1) + (if (= 0 (char->ascii (string-ref s i))) size (+ size 1))))))) + +(define (set-char-set! cs char in?) + (string-set! (char-set:s cs) + (char->ascii char) + (ascii->char (if in? 1 0)))) + +(define (char-set-for-each f cs) + (let ((s (char-set:s cs))) + (let lp ((i 255)) + (cond ((>= i 0) + (if (not (= 0 (char->ascii (string-ref s i)))) + (f (ascii->char i))) + (lp (- i 1))))))) + +(define (reduce-char-set kons knil cs) + (let ((s (char-set:s cs))) + (let lp ((i 255) (ans knil)) + (if (< i 0) ans + (lp (- i 1) + (if (= 0 (char->ascii (string-ref s i))) + ans + (kons (ascii->char i) ans))))))) (define (char-set . chars) (chars->char-set chars)) (define (chars->char-set chars) - (let ((char-set (make-string 256 (ascii->char 0)))) + (let ((s (make-string 256 (ascii->char 0)))) (for-each (lambda (char) - (string-set! char-set (char->ascii char) (ascii->char 1))) + (string-set! s (char->ascii char) (ascii->char 1))) chars) - char-set)) + (make-char-set s))) (define (string->char-set str) - (let ((char-set (make-string 256 (ascii->char 0)))) + (let ((s (make-string 256 (ascii->char 0)))) (do ((i (- (string-length str) 1) (- i 1))) - ((< i 0) char-set) - (string-set! char-set (char->ascii (string-ref str i)) + ((< i 0) (make-char-set s)) + (string-set! s (char->ascii (string-ref str i)) (ascii->char 1))))) (define (ascii-range->char-set lower upper) - (let ((char-set (make-string 256 (ascii->char 0)))) - (string-fill-range! char-set lower upper (ascii->char 1)) - char-set)) + (let ((s (make-string 256 (ascii->char 0)))) + (string-fill-range! s lower upper (ascii->char 1)) + (make-char-set s))) (define (predicate->char-set predicate) - (let ((char-set (make-string 256))) - (let loop ((code 0)) - (if (< code 256) - (begin (string-set! char-set code - (if (predicate (ascii->char code)) - (ascii->char 1) - (ascii->char 0))) - (loop (+ 1 code))))) - char-set)) + (let ((s (make-string 256))) + (let lp ((i 255)) + (if (>= i 0) + (begin (string-set! s i (if (predicate (ascii->char i)) + (ascii->char 1) + (ascii->char 0))) + (lp (- i 1))))) + (make-char-set s))) ;;; {string, char, char-set, char predicate} -> char-set @@ -74,12 +156,13 @@ ;;;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -(define (char-set-members char-set) - (define (loop code) - (cond ((>= code 256) '()) - ((zero? (char->ascii (string-ref char-set code))) (loop (+ 1 code))) - (else (cons (ascii->char code) (loop (+ 1 code)))))) - (loop 0)) +(define (char-set-members cs) + (let ((s (char-set:s cs))) + (let lp ((i 255) (ans '())) + (if (< i 0) ans + (lp (- i 1) + (if (zero? (char->ascii (string-ref s i))) ans + (cons (ascii->char i) ans))))))) ;;; De-releasing CHAR-SET-MEMBER? ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -90,108 +173,101 @@ ;;; break code. I ended up just choosing a new proc name that consistent with ;;; its arg order -- (CHAR-SET-CONTAINS? cset char). -(define (char-set-contains? char-set char) - (not (zero? (char->ascii (string-ref char-set (char->ascii char)))))) +(define (char-set-contains? cs char) + (not (zero? (char->ascii (string-ref (char-set:s cs) + (char->ascii char)))))) ;;; This actually isn't exported. Just CYA. (define (char-set-member? . args) (error "CHAR-SET-MEMBER? is no longer provided. Use CHAR-SET-CONTAINS? instead.")) -(define (char-set-invert char-set) - (predicate->char-set - (lambda (char) (not (char-set-contains? char-set char))))) +(define (char-set-invert cs) + (predicate->char-set (lambda (char) + (not (char-set-contains? cs char))))) -(define (char-set-union char-set-1 char-set-2) - (predicate->char-set - (lambda (char) - (or (char-set-contains? char-set-1 char) - (char-set-contains? char-set-2 char))))) +;;; The union, intersection, and difference code is ugly, +;;; because the ops are n-ary. -(define (char-set-intersection char-set-1 char-set-2) - (predicate->char-set - (lambda (char) - (and (char-set-contains? char-set-1 char) - (char-set-contains? char-set-2 char))))) +;;; Apply P to each index and it's char in S: (P I C). +;;; Used by the intersection & difference. +(define (string-iter s p) + (let lp ((i (- (string-length s) 1))) + (cond ((>= i 0) + (p i (string-ref s i)) + (lp (- i 1)))))) -(define (char-set-difference char-set-1 char-set-2) - (predicate->char-set - (lambda (char) - (and (char-set-contains? char-set-1 char) - (not (char-set-contains? char-set-2 char)))))) +(define (char-set-union . csets) + (if (pair? csets) + (let ((cset (copy-char-set (car csets)))) + (for-each (lambda (cs) + (char-set-for-each (lambda (c) (set-char-set! cset c #t)) + cs)) + (cdr csets)) + cset) + char-set:empty)) + +(define (char-set-intersection . csets) + (if (pair? csets) + (let* ((cset (copy-char-set (car csets))) + (s (char-set:s cset))) + (for-each (lambda (cs) + (string-iter (char-set:s cs) + (lambda (i c) + (if (= 0 (char->ascii c)) + (string-set! s i (ascii->char 0)))))) + (cdr csets)) + cset) + char-set:full)) + +(define (char-set-difference cs1 . csets) + (if (pair? csets) + (let* ((cset (copy-char-set cs1)) + (s (char-set:s cset))) + (for-each (lambda (cs) + (string-iter (char-set:s cs) + (lambda (i c) + (if (= 1 (char->ascii c)) + (string-set! s i (ascii->char 0)))))) + csets) + cset) + cs1)) ;;;; System Character Sets -(define char-set:upper-case (ascii-range->char-set #x41 #x5B)) (define char-set:lower-case (ascii-range->char-set #x61 #x7B)) -(define char-set:numeric (ascii-range->char-set #x30 #x3A)) -(define char-set:graphic (ascii-range->char-set #x20 #x7F)) -(define char-set:not-graphic (char-set-invert char-set:graphic)) -(define char-set:whitespace - (char-set char:newline char:tab char:linefeed - char:page char:return char:space)) -(define char-set:not-whitespace (char-set-invert char-set:whitespace)) +(define char-set:upper-case (ascii-range->char-set #x41 #x5B)) (define char-set:alphabetic (char-set-union char-set:upper-case char-set:lower-case)) +(define char-set:numeric (ascii-range->char-set #x30 #x3A)) (define char-set:alphanumeric (char-set-union char-set:alphabetic char-set:numeric)) -(define char-set:standard - (char-set-union char-set:graphic (char-set char:newline))) +(define char-set:graphic (ascii-range->char-set #x21 #x7F)) +(define char-set:printing (ascii-range->char-set #x20 #x7F)) +(define char-set:whitespace (char-set char:tab char:newline char:vtab + char:page char:return char:space)) +(define char-set:blank (char-set char:space char:tab)) +(define char-set:control (char-set-union (ascii-range->char-set 0 32) + (char-set (ascii->char 127)))) +(define char-set:punctuation + (string->char-set "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")) +(define char-set:hex-digit (string->char-set "0123456789abcdefABCDEF")) +(define char-set:ascii (ascii-range->char-set 0 128)) +(define char-set:empty (char-set)) +(define char-set:full (char-set-invert char-set:empty)) -(define (char-upper-case? char) - (char-set-contains? char-set:upper-case char)) -(define (char-lower-case? char) - (char-set-contains? char-set:lower-case char)) - -(define (char-numeric? char) - (char-set-contains? char-set:numeric char)) - -(define (char-graphic? char) - (char-set-contains? char-set:graphic char)) - -(define (char-whitespace? char) - (char-set-contains? char-set:whitespace char)) - -(define (char-alphabetic? char) - (char-set-contains? char-set:alphabetic char)) - -(define (char-alphanumeric? char) - (char-set-contains? char-set:alphanumeric char)) - -(define (char-standard? char) - (char-set-contains? char-set:standard char)) - -;;; Bullshit legalese -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;$Header: /home/flat/Dropbox/Hacks/scsh/scsh-cvs/scsh-0.5/scsh/char-set.scm,v 1.2 1995/11/20 06:20:12 shivers Exp $ - -;Copyright (c) 1988 Massachusetts Institute of Technology - -;This material was developed by the Scheme project at the Massachusetts -;Institute of Technology, Department of Electrical Engineering and -;Computer Science. Permission to copy this software, to redistribute -;it, and to use it for any purpose is granted, subject to the following -;restrictions and understandings. - -;1. Any copy made of this software must include this copyright notice -;in full. - -;2. Users of this software agree to make their best efforts (a) to -;return to the MIT Scheme project any improvements or extensions that -;they make, so that these may be included in future releases; and (b) -;to inform MIT of noteworthy uses of this software. - -;3. All materials developed as a consequence of the use of this -;software shall duly acknowledge such use, in accordance with the usual -;standards of acknowledging credit in academic research. - -;4. MIT has made no warrantee or representation that the operation of -;this software will be error-free, and MIT is under no obligation to -;provide any services, by way of maintenance, update, or otherwise. - -;5. In conjunction with products arising from the use of this material, -;there shall be no use of the name of the Massachusetts Institute of -;Technology nor of any adaptation thereof in any advertising, -;promotional, or sales literature without prior written consent from -;MIT in each case. +(define (char-set->pred cs) (lambda (c) (char-set-contains? cs c))) +(define char-lower-case? (char-set->pred char-set:lower-case)) +(define char-upper-case? (char-set->pred char-set:upper-case)) +(define char-alphabetic? (char-set->pred char-set:alphabetic)) +(define char-numeric? (char-set->pred char-set:numeric)) +(define char-alphanumeric? (char-set->pred char-set:alphanumeric)) +(define char-graphic? (char-set->pred char-set:graphic)) +(define char-printing? (char-set->pred char-set:printing)) +(define char-whitespace? (char-set->pred char-set:whitespace)) +(define char-blank? (char-set->pred char-set:blank)) +(define char-control? (char-set->pred char-set:control)) +(define char-punctuation? (char-set->pred char-set:punctuation)) +(define char-hex-digit? (char-set->pred char-set:hex-digit)) +(define char-ascii? (char-set->pred char-set:ascii)) diff --git a/scsh/rdelim.scm b/scsh/rdelim.scm index 1827ac3..1888db1 100644 --- a/scsh/rdelim.scm +++ b/scsh/rdelim.scm @@ -4,10 +4,10 @@ ;;; should be quite fast. ;;; ;;; N.B.: -;;; The C primitive %READ-DELIMITED-FDPORT!/ERRNO relies on knowing the -;;; representation of character sets. If these are changed from their -;;; current representation as 256-element strings, this code must be changed -;;; as well. +;;; The C primitives %READ-DELIMITED-FDPORT!/ERRNO and +;;; %SKIP-CHAR-SET-FDPORT/ERRNO rely on knowing the representation of +;;; character sets. If these are changed from their current representation, +;;; this code must be changed as well. ;;; (read-delimited delims [port delim-action]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -165,14 +165,15 @@ (error "Illegal START/END substring indices" buf start end %read-delimited!)) - (let ((delims (->char-set delims))) + (let* ((delims (->char-set delims)) + (sdelims (char-set:s delims))) (if (fdport? port) ;; Direct C support for Unix file ports -- zippy quick. (let lp ((start start) (total 0)) (receive (terminator num-read) - (%read-delimited-fdport!/errno delims buf gobble? + (%read-delimited-fdport!/errno sdelims buf gobble? port start end) (let ((total (+ num-read total))) (cond ((not (integer? terminator)) (values terminator total)) @@ -222,8 +223,9 @@ (define (skip-char-set skip-chars . maybe-port) - (let ((port (:optional maybe-port (current-input-port))) - (cset (->char-set skip-chars))) + (let* ((port (:optional maybe-port (current-input-port))) + (cset (->char-set skip-chars)) + (scset (char-set:s cset))) (cond ((not (input-port? port)) (error "Illegal value -- not an input port." port)) @@ -231,7 +233,7 @@ ;; Direct C support for Unix file ports -- zippy quick. ((fdport? port) (let lp ((total 0)) - (receive (err num-read) (%skip-char-set-fdport/errno cset port) + (receive (err num-read) (%skip-char-set-fdport/errno scset port) (let ((total (+ total num-read))) (cond ((not err) total) ((= errno/intr err) (lp total)) diff --git a/scsh/scsh-interfaces.scm b/scsh/scsh-interfaces.scm index 24db2e7..bd34928 100644 --- a/scsh/scsh-interfaces.scm +++ b/scsh/scsh-interfaces.scm @@ -679,9 +679,18 @@ (define-interface char-set-interface - (export char:newline char:tab char:linefeed char:page char:return char:space + (export char:newline char:tab char:page char:return char:space char:vtab char-ascii? + char-set? + copy-char-set + char-set= + char-set<= + char-set-size + + set-char-set! + char-set-for-each + reduce-char-set char-set chars->char-set @@ -698,22 +707,40 @@ char-set-intersection char-set-difference - char-set:upper-case char-set:lower-case - char-set:numeric - char-set:whitespace - char-set:not-whitespace + char-set:upper-case char-set:alphabetic + char-set:numeric char-set:alphanumeric char-set:graphic + char-set:printing + char-set:whitespace + char-set:blank + char-set:control + char-set:punctuation + char-set:hex-digit + char-set:ascii + char-set:empty + char-set:full - char-upper-case? char-lower-case? - char-numeric? - char-whitespace? + char-upper-case? char-alphabetic? + char-numeric? char-alphanumeric? - char-graphic?)) + char-graphic? + char-printing? + char-whitespace? + char-blank? + char-control? + char-punctuation? + char-hex-digit? + char-ascii? + + ;; This is not properly part of the interface, + ;; and should be moved to an internals interface -- + ;; it is used by rdelim.scm code. + char-set:s)) (define-interface scsh-field-reader-interface diff --git a/scsh/scsh-package.scm b/scsh/scsh-package.scm index f1d0280..184074d 100644 --- a/scsh/scsh-package.scm +++ b/scsh/scsh-package.scm @@ -54,7 +54,10 @@ (define-structure char-set-package char-set-interface - (open error-package ascii scheme) + (open error-package + ascii + define-record-types ; JAR's record macro. + scheme) (files char-set))