Extended char-set package.
This commit is contained in:
parent
269b603081
commit
c266ffbf46
|
@ -1,13 +1,42 @@
|
|||
;;; -*-Scheme-*-
|
||||
;;;
|
||||
;;; Character Sets package
|
||||
;;; ported from MIT Scheme runtime
|
||||
;;; by Brian D. Carlstrom
|
||||
;;; Sleazy code.
|
||||
;;; - ported from MIT Scheme runtime
|
||||
;;; by Brian D. Carlstrom
|
||||
;;; - Rehacked & extended by Olin Shivers 6/98.
|
||||
|
||||
;;; This is not great code. Char sets are represented as 256-char
|
||||
;;; strings. If char i is ASCII 0, then it isn't in the set; if char i
|
||||
;;; is ASCII 1, then it is in the set.
|
||||
;;; - Should be rewritten to use bit strings, or at least byte vecs.
|
||||
;;; - Is ASCII/Latin-1 specific. Would certainly have to be rewritten
|
||||
;;; for Unicode.
|
||||
;;; - The standard character sets are not Latin-1 compliant, just ASCII.
|
||||
|
||||
;;; This code uses jar's DEFINE-RECORD-TYPE macro to define the char-set
|
||||
;;; record type, because the scsh-standard DEFINE-RECORD form automatically
|
||||
;;; defines a COPY-FOO function, which is not the one we want, being a shallow
|
||||
;;; copy of the record fields.
|
||||
|
||||
;;; New dfns:
|
||||
;;; (char-set= cs1 cs2)
|
||||
;;; (char-set<= cs1 cs2)
|
||||
;;; (reduce-char-set kons knil cs)
|
||||
;;; (set-char-set! cs char in?)
|
||||
;;; (char-set-for-each f cs)
|
||||
;;; (copy-char-set cs)
|
||||
;;; (char-set-size cs)
|
||||
;;; char-set:printing (char-printing? c)
|
||||
;;; char-set:blank (char-blank? c)
|
||||
;;; char-set:control (char-control? c)
|
||||
;;; char-set:hex-digit (char-hex-digit? c)
|
||||
;;; char-set:ascii (char-ascii? c)
|
||||
;;; char-set:empty
|
||||
;;; char-set:full
|
||||
|
||||
(define char:newline (ascii->char 13))
|
||||
(define char:tab (ascii->char 9))
|
||||
(define char:linefeed (ascii->char 13))
|
||||
(define char:vtab (ascii->char 11))
|
||||
(define char:page (ascii->char 12))
|
||||
(define char:return (ascii->char 10))
|
||||
(define char:space (ascii->char 32))
|
||||
|
@ -23,42 +52,95 @@
|
|||
|
||||
;;;; Character Sets
|
||||
|
||||
(define (char-set? object)
|
||||
(and (string? object)
|
||||
(= (string-length object) 256)))
|
||||
;(define-record char-set
|
||||
; s) ; 256-char string; each char is either ASCII 0 or ASCII 1.
|
||||
|
||||
;;; Use jar's record macro.
|
||||
(define-record-type char-set :char-set
|
||||
(make-char-set s)
|
||||
char-set?
|
||||
(s char-set:s))
|
||||
|
||||
(define (copy-char-set cs) (make-char-set (string-copy (char-set:s cs))))
|
||||
|
||||
(define (char-set= cs1 cs2)
|
||||
(let ((s1 (char-set:s cs1))
|
||||
(s2 (char-set:s cs2)))
|
||||
(let lp ((i 255))
|
||||
(or (< i 0)
|
||||
(and (char=? (string-ref s1 i) (string-ref s2 i))
|
||||
(lp (- i 1)))))))
|
||||
|
||||
(define (char-set<= cs1 cs2)
|
||||
(let ((s1 (char-set:s cs1))
|
||||
(s2 (char-set:s cs2)))
|
||||
(let lp ((i 255))
|
||||
(or (< i 0)
|
||||
(and (<= (char->ascii (string-ref s1 i))
|
||||
(char->ascii (string-ref s2 i)))
|
||||
(lp (- i 1)))))))
|
||||
|
||||
|
||||
(define (char-set-size cs)
|
||||
(let ((s (char-set:s cs)))
|
||||
(let lp ((i 255) (size 0))
|
||||
(if (< i 0) size
|
||||
(lp (- i 1)
|
||||
(if (= 0 (char->ascii (string-ref s i))) size (+ size 1)))))))
|
||||
|
||||
(define (set-char-set! cs char in?)
|
||||
(string-set! (char-set:s cs)
|
||||
(char->ascii char)
|
||||
(ascii->char (if in? 1 0))))
|
||||
|
||||
(define (char-set-for-each f cs)
|
||||
(let ((s (char-set:s cs)))
|
||||
(let lp ((i 255))
|
||||
(cond ((>= i 0)
|
||||
(if (not (= 0 (char->ascii (string-ref s i))))
|
||||
(f (ascii->char i)))
|
||||
(lp (- i 1)))))))
|
||||
|
||||
(define (reduce-char-set kons knil cs)
|
||||
(let ((s (char-set:s cs)))
|
||||
(let lp ((i 255) (ans knil))
|
||||
(if (< i 0) ans
|
||||
(lp (- i 1)
|
||||
(if (= 0 (char->ascii (string-ref s i)))
|
||||
ans
|
||||
(kons (ascii->char i) ans)))))))
|
||||
|
||||
(define (char-set . chars)
|
||||
(chars->char-set chars))
|
||||
|
||||
(define (chars->char-set chars)
|
||||
(let ((char-set (make-string 256 (ascii->char 0))))
|
||||
(let ((s (make-string 256 (ascii->char 0))))
|
||||
(for-each (lambda (char)
|
||||
(string-set! char-set (char->ascii char) (ascii->char 1)))
|
||||
(string-set! s (char->ascii char) (ascii->char 1)))
|
||||
chars)
|
||||
char-set))
|
||||
(make-char-set s)))
|
||||
|
||||
(define (string->char-set str)
|
||||
(let ((char-set (make-string 256 (ascii->char 0))))
|
||||
(let ((s (make-string 256 (ascii->char 0))))
|
||||
(do ((i (- (string-length str) 1) (- i 1)))
|
||||
((< i 0) char-set)
|
||||
(string-set! char-set (char->ascii (string-ref str i))
|
||||
((< i 0) (make-char-set s))
|
||||
(string-set! s (char->ascii (string-ref str i))
|
||||
(ascii->char 1)))))
|
||||
|
||||
(define (ascii-range->char-set lower upper)
|
||||
(let ((char-set (make-string 256 (ascii->char 0))))
|
||||
(string-fill-range! char-set lower upper (ascii->char 1))
|
||||
char-set))
|
||||
(let ((s (make-string 256 (ascii->char 0))))
|
||||
(string-fill-range! s lower upper (ascii->char 1))
|
||||
(make-char-set s)))
|
||||
|
||||
(define (predicate->char-set predicate)
|
||||
(let ((char-set (make-string 256)))
|
||||
(let loop ((code 0))
|
||||
(if (< code 256)
|
||||
(begin (string-set! char-set code
|
||||
(if (predicate (ascii->char code))
|
||||
(ascii->char 1)
|
||||
(ascii->char 0)))
|
||||
(loop (+ 1 code)))))
|
||||
char-set))
|
||||
(let ((s (make-string 256)))
|
||||
(let lp ((i 255))
|
||||
(if (>= i 0)
|
||||
(begin (string-set! s i (if (predicate (ascii->char i))
|
||||
(ascii->char 1)
|
||||
(ascii->char 0)))
|
||||
(lp (- i 1)))))
|
||||
(make-char-set s)))
|
||||
|
||||
|
||||
;;; {string, char, char-set, char predicate} -> char-set
|
||||
|
@ -74,12 +156,13 @@
|
|||
|
||||
;;;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
|
||||
(define (char-set-members char-set)
|
||||
(define (loop code)
|
||||
(cond ((>= code 256) '())
|
||||
((zero? (char->ascii (string-ref char-set code))) (loop (+ 1 code)))
|
||||
(else (cons (ascii->char code) (loop (+ 1 code))))))
|
||||
(loop 0))
|
||||
(define (char-set-members cs)
|
||||
(let ((s (char-set:s cs)))
|
||||
(let lp ((i 255) (ans '()))
|
||||
(if (< i 0) ans
|
||||
(lp (- i 1)
|
||||
(if (zero? (char->ascii (string-ref s i))) ans
|
||||
(cons (ascii->char i) ans)))))))
|
||||
|
||||
;;; De-releasing CHAR-SET-MEMBER?
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
@ -90,108 +173,101 @@
|
|||
;;; break code. I ended up just choosing a new proc name that consistent with
|
||||
;;; its arg order -- (CHAR-SET-CONTAINS? cset char).
|
||||
|
||||
(define (char-set-contains? char-set char)
|
||||
(not (zero? (char->ascii (string-ref char-set (char->ascii char))))))
|
||||
(define (char-set-contains? cs char)
|
||||
(not (zero? (char->ascii (string-ref (char-set:s cs)
|
||||
(char->ascii char))))))
|
||||
|
||||
;;; This actually isn't exported. Just CYA.
|
||||
(define (char-set-member? . args)
|
||||
(error "CHAR-SET-MEMBER? is no longer provided. Use CHAR-SET-CONTAINS? instead."))
|
||||
|
||||
(define (char-set-invert char-set)
|
||||
(predicate->char-set
|
||||
(lambda (char) (not (char-set-contains? char-set char)))))
|
||||
(define (char-set-invert cs)
|
||||
(predicate->char-set (lambda (char)
|
||||
(not (char-set-contains? cs char)))))
|
||||
|
||||
(define (char-set-union char-set-1 char-set-2)
|
||||
(predicate->char-set
|
||||
(lambda (char)
|
||||
(or (char-set-contains? char-set-1 char)
|
||||
(char-set-contains? char-set-2 char)))))
|
||||
;;; The union, intersection, and difference code is ugly,
|
||||
;;; because the ops are n-ary.
|
||||
|
||||
(define (char-set-intersection char-set-1 char-set-2)
|
||||
(predicate->char-set
|
||||
(lambda (char)
|
||||
(and (char-set-contains? char-set-1 char)
|
||||
(char-set-contains? char-set-2 char)))))
|
||||
;;; Apply P to each index and it's char in S: (P I C).
|
||||
;;; Used by the intersection & difference.
|
||||
(define (string-iter s p)
|
||||
(let lp ((i (- (string-length s) 1)))
|
||||
(cond ((>= i 0)
|
||||
(p i (string-ref s i))
|
||||
(lp (- i 1))))))
|
||||
|
||||
(define (char-set-difference char-set-1 char-set-2)
|
||||
(predicate->char-set
|
||||
(lambda (char)
|
||||
(and (char-set-contains? char-set-1 char)
|
||||
(not (char-set-contains? char-set-2 char))))))
|
||||
(define (char-set-union . csets)
|
||||
(if (pair? csets)
|
||||
(let ((cset (copy-char-set (car csets))))
|
||||
(for-each (lambda (cs)
|
||||
(char-set-for-each (lambda (c) (set-char-set! cset c #t))
|
||||
cs))
|
||||
(cdr csets))
|
||||
cset)
|
||||
char-set:empty))
|
||||
|
||||
(define (char-set-intersection . csets)
|
||||
(if (pair? csets)
|
||||
(let* ((cset (copy-char-set (car csets)))
|
||||
(s (char-set:s cset)))
|
||||
(for-each (lambda (cs)
|
||||
(string-iter (char-set:s cs)
|
||||
(lambda (i c)
|
||||
(if (= 0 (char->ascii c))
|
||||
(string-set! s i (ascii->char 0))))))
|
||||
(cdr csets))
|
||||
cset)
|
||||
char-set:full))
|
||||
|
||||
(define (char-set-difference cs1 . csets)
|
||||
(if (pair? csets)
|
||||
(let* ((cset (copy-char-set cs1))
|
||||
(s (char-set:s cset)))
|
||||
(for-each (lambda (cs)
|
||||
(string-iter (char-set:s cs)
|
||||
(lambda (i c)
|
||||
(if (= 1 (char->ascii c))
|
||||
(string-set! s i (ascii->char 0))))))
|
||||
csets)
|
||||
cset)
|
||||
cs1))
|
||||
|
||||
;;;; System Character Sets
|
||||
|
||||
(define char-set:upper-case (ascii-range->char-set #x41 #x5B))
|
||||
(define char-set:lower-case (ascii-range->char-set #x61 #x7B))
|
||||
(define char-set:numeric (ascii-range->char-set #x30 #x3A))
|
||||
(define char-set:graphic (ascii-range->char-set #x20 #x7F))
|
||||
(define char-set:not-graphic (char-set-invert char-set:graphic))
|
||||
(define char-set:whitespace
|
||||
(char-set char:newline char:tab char:linefeed
|
||||
char:page char:return char:space))
|
||||
(define char-set:not-whitespace (char-set-invert char-set:whitespace))
|
||||
(define char-set:upper-case (ascii-range->char-set #x41 #x5B))
|
||||
(define char-set:alphabetic
|
||||
(char-set-union char-set:upper-case char-set:lower-case))
|
||||
(define char-set:numeric (ascii-range->char-set #x30 #x3A))
|
||||
(define char-set:alphanumeric
|
||||
(char-set-union char-set:alphabetic char-set:numeric))
|
||||
(define char-set:standard
|
||||
(char-set-union char-set:graphic (char-set char:newline)))
|
||||
(define char-set:graphic (ascii-range->char-set #x21 #x7F))
|
||||
(define char-set:printing (ascii-range->char-set #x20 #x7F))
|
||||
(define char-set:whitespace (char-set char:tab char:newline char:vtab
|
||||
char:page char:return char:space))
|
||||
(define char-set:blank (char-set char:space char:tab))
|
||||
(define char-set:control (char-set-union (ascii-range->char-set 0 32)
|
||||
(char-set (ascii->char 127))))
|
||||
(define char-set:punctuation
|
||||
(string->char-set "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"))
|
||||
(define char-set:hex-digit (string->char-set "0123456789abcdefABCDEF"))
|
||||
(define char-set:ascii (ascii-range->char-set 0 128))
|
||||
(define char-set:empty (char-set))
|
||||
(define char-set:full (char-set-invert char-set:empty))
|
||||
|
||||
(define (char-upper-case? char)
|
||||
(char-set-contains? char-set:upper-case char))
|
||||
|
||||
(define (char-lower-case? char)
|
||||
(char-set-contains? char-set:lower-case char))
|
||||
|
||||
(define (char-numeric? char)
|
||||
(char-set-contains? char-set:numeric char))
|
||||
|
||||
(define (char-graphic? char)
|
||||
(char-set-contains? char-set:graphic char))
|
||||
|
||||
(define (char-whitespace? char)
|
||||
(char-set-contains? char-set:whitespace char))
|
||||
|
||||
(define (char-alphabetic? char)
|
||||
(char-set-contains? char-set:alphabetic char))
|
||||
|
||||
(define (char-alphanumeric? char)
|
||||
(char-set-contains? char-set:alphanumeric char))
|
||||
|
||||
(define (char-standard? char)
|
||||
(char-set-contains? char-set:standard char))
|
||||
|
||||
;;; Bullshit legalese
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;$Header: /home/flat/Dropbox/Hacks/scsh/scsh-cvs/scsh-0.5/scsh/char-set.scm,v 1.2 1995/11/20 06:20:12 shivers Exp $
|
||||
|
||||
;Copyright (c) 1988 Massachusetts Institute of Technology
|
||||
|
||||
;This material was developed by the Scheme project at the Massachusetts
|
||||
;Institute of Technology, Department of Electrical Engineering and
|
||||
;Computer Science. Permission to copy this software, to redistribute
|
||||
;it, and to use it for any purpose is granted, subject to the following
|
||||
;restrictions and understandings.
|
||||
|
||||
;1. Any copy made of this software must include this copyright notice
|
||||
;in full.
|
||||
|
||||
;2. Users of this software agree to make their best efforts (a) to
|
||||
;return to the MIT Scheme project any improvements or extensions that
|
||||
;they make, so that these may be included in future releases; and (b)
|
||||
;to inform MIT of noteworthy uses of this software.
|
||||
|
||||
;3. All materials developed as a consequence of the use of this
|
||||
;software shall duly acknowledge such use, in accordance with the usual
|
||||
;standards of acknowledging credit in academic research.
|
||||
|
||||
;4. MIT has made no warrantee or representation that the operation of
|
||||
;this software will be error-free, and MIT is under no obligation to
|
||||
;provide any services, by way of maintenance, update, or otherwise.
|
||||
|
||||
;5. In conjunction with products arising from the use of this material,
|
||||
;there shall be no use of the name of the Massachusetts Institute of
|
||||
;Technology nor of any adaptation thereof in any advertising,
|
||||
;promotional, or sales literature without prior written consent from
|
||||
;MIT in each case.
|
||||
(define (char-set->pred cs) (lambda (c) (char-set-contains? cs c)))
|
||||
|
||||
(define char-lower-case? (char-set->pred char-set:lower-case))
|
||||
(define char-upper-case? (char-set->pred char-set:upper-case))
|
||||
(define char-alphabetic? (char-set->pred char-set:alphabetic))
|
||||
(define char-numeric? (char-set->pred char-set:numeric))
|
||||
(define char-alphanumeric? (char-set->pred char-set:alphanumeric))
|
||||
(define char-graphic? (char-set->pred char-set:graphic))
|
||||
(define char-printing? (char-set->pred char-set:printing))
|
||||
(define char-whitespace? (char-set->pred char-set:whitespace))
|
||||
(define char-blank? (char-set->pred char-set:blank))
|
||||
(define char-control? (char-set->pred char-set:control))
|
||||
(define char-punctuation? (char-set->pred char-set:punctuation))
|
||||
(define char-hex-digit? (char-set->pred char-set:hex-digit))
|
||||
(define char-ascii? (char-set->pred char-set:ascii))
|
||||
|
|
|
@ -4,10 +4,10 @@
|
|||
;;; should be quite fast.
|
||||
;;;
|
||||
;;; N.B.:
|
||||
;;; The C primitive %READ-DELIMITED-FDPORT!/ERRNO relies on knowing the
|
||||
;;; representation of character sets. If these are changed from their
|
||||
;;; current representation as 256-element strings, this code must be changed
|
||||
;;; as well.
|
||||
;;; The C primitives %READ-DELIMITED-FDPORT!/ERRNO and
|
||||
;;; %SKIP-CHAR-SET-FDPORT/ERRNO rely on knowing the representation of
|
||||
;;; character sets. If these are changed from their current representation,
|
||||
;;; this code must be changed as well.
|
||||
|
||||
;;; (read-delimited delims [port delim-action])
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
@ -165,14 +165,15 @@
|
|||
(error "Illegal START/END substring indices"
|
||||
buf start end %read-delimited!))
|
||||
|
||||
(let ((delims (->char-set delims)))
|
||||
(let* ((delims (->char-set delims))
|
||||
(sdelims (char-set:s delims)))
|
||||
|
||||
(if (fdport? port)
|
||||
|
||||
;; Direct C support for Unix file ports -- zippy quick.
|
||||
(let lp ((start start) (total 0))
|
||||
(receive (terminator num-read)
|
||||
(%read-delimited-fdport!/errno delims buf gobble?
|
||||
(%read-delimited-fdport!/errno sdelims buf gobble?
|
||||
port start end)
|
||||
(let ((total (+ num-read total)))
|
||||
(cond ((not (integer? terminator)) (values terminator total))
|
||||
|
@ -222,8 +223,9 @@
|
|||
|
||||
|
||||
(define (skip-char-set skip-chars . maybe-port)
|
||||
(let ((port (:optional maybe-port (current-input-port)))
|
||||
(cset (->char-set skip-chars)))
|
||||
(let* ((port (:optional maybe-port (current-input-port)))
|
||||
(cset (->char-set skip-chars))
|
||||
(scset (char-set:s cset)))
|
||||
|
||||
(cond ((not (input-port? port))
|
||||
(error "Illegal value -- not an input port." port))
|
||||
|
@ -231,7 +233,7 @@
|
|||
;; Direct C support for Unix file ports -- zippy quick.
|
||||
((fdport? port)
|
||||
(let lp ((total 0))
|
||||
(receive (err num-read) (%skip-char-set-fdport/errno cset port)
|
||||
(receive (err num-read) (%skip-char-set-fdport/errno scset port)
|
||||
(let ((total (+ total num-read)))
|
||||
(cond ((not err) total)
|
||||
((= errno/intr err) (lp total))
|
||||
|
|
|
@ -679,9 +679,18 @@
|
|||
|
||||
|
||||
(define-interface char-set-interface
|
||||
(export char:newline char:tab char:linefeed char:page char:return char:space
|
||||
(export char:newline char:tab char:page char:return char:space char:vtab
|
||||
char-ascii?
|
||||
|
||||
char-set?
|
||||
copy-char-set
|
||||
char-set=
|
||||
char-set<=
|
||||
char-set-size
|
||||
|
||||
set-char-set!
|
||||
char-set-for-each
|
||||
reduce-char-set
|
||||
|
||||
char-set
|
||||
chars->char-set
|
||||
|
@ -698,22 +707,40 @@
|
|||
char-set-intersection
|
||||
char-set-difference
|
||||
|
||||
char-set:upper-case
|
||||
char-set:lower-case
|
||||
char-set:numeric
|
||||
char-set:whitespace
|
||||
char-set:not-whitespace
|
||||
char-set:upper-case
|
||||
char-set:alphabetic
|
||||
char-set:numeric
|
||||
char-set:alphanumeric
|
||||
char-set:graphic
|
||||
char-set:printing
|
||||
char-set:whitespace
|
||||
char-set:blank
|
||||
char-set:control
|
||||
char-set:punctuation
|
||||
char-set:hex-digit
|
||||
char-set:ascii
|
||||
char-set:empty
|
||||
char-set:full
|
||||
|
||||
char-upper-case?
|
||||
char-lower-case?
|
||||
char-numeric?
|
||||
char-whitespace?
|
||||
char-upper-case?
|
||||
char-alphabetic?
|
||||
char-numeric?
|
||||
char-alphanumeric?
|
||||
char-graphic?))
|
||||
char-graphic?
|
||||
char-printing?
|
||||
char-whitespace?
|
||||
char-blank?
|
||||
char-control?
|
||||
char-punctuation?
|
||||
char-hex-digit?
|
||||
char-ascii?
|
||||
|
||||
;; This is not properly part of the interface,
|
||||
;; and should be moved to an internals interface --
|
||||
;; it is used by rdelim.scm code.
|
||||
char-set:s))
|
||||
|
||||
|
||||
(define-interface scsh-field-reader-interface
|
||||
|
|
|
@ -54,7 +54,10 @@
|
|||
|
||||
|
||||
(define-structure char-set-package char-set-interface
|
||||
(open error-package ascii scheme)
|
||||
(open error-package
|
||||
ascii
|
||||
define-record-types ; JAR's record macro.
|
||||
scheme)
|
||||
(files char-set))
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue