scsh-0.5/scsh/char-set.scm

274 lines
8.7 KiB
Scheme
Raw Normal View History

1995-10-13 23:34:21 -04:00
;;; -*-Scheme-*-
;;;
;;; Character Sets package
1998-06-16 17:04:38 -04:00
;;; - ported from MIT Scheme runtime
;;; by Brian D. Carlstrom
;;; - Rehacked & extended by Olin Shivers 6/98.
;;; This is not great code. Char sets are represented as 256-char
;;; strings. If char i is ASCII 0, then it isn't in the set; if char i
;;; is ASCII 1, then it is in the set.
;;; - Should be rewritten to use bit strings, or at least byte vecs.
;;; - Is ASCII/Latin-1 specific. Would certainly have to be rewritten
;;; for Unicode.
;;; - The standard character sets are not Latin-1 compliant, just ASCII.
;;; This code uses jar's DEFINE-RECORD-TYPE macro to define the char-set
;;; record type, because the scsh-standard DEFINE-RECORD form automatically
;;; defines a COPY-FOO function, which is not the one we want, being a shallow
;;; copy of the record fields.
;;; New dfns:
;;; (char-set= cs1 cs2)
;;; (char-set<= cs1 cs2)
;;; (reduce-char-set kons knil cs)
;;; (set-char-set! cs char in?)
;;; (char-set-for-each f cs)
;;; (copy-char-set cs)
;;; (char-set-size cs)
;;; char-set:printing (char-printing? c)
;;; char-set:blank (char-blank? c)
;;; char-set:control (char-control? c)
;;; char-set:hex-digit (char-hex-digit? c)
;;; char-set:ascii (char-ascii? c)
;;; char-set:empty
;;; char-set:full
1995-10-13 23:34:21 -04:00
(define char:newline (ascii->char 13))
(define char:tab (ascii->char 9))
1998-06-16 17:04:38 -04:00
(define char:vtab (ascii->char 11))
1995-10-13 23:34:21 -04:00
(define char:page (ascii->char 12))
(define char:return (ascii->char 10))
(define char:space (ascii->char 32))
(define (string-fill-range! str lower upper ch)
(do ((index lower (+ index 1)))
((>= index upper) str)
(string-set! str index ch)))
(define (char-ascii? char)
(let ((maybe-ascii (char->ascii char)))
(and (<= 0 maybe-ascii 127) maybe-ascii)))
1995-10-13 23:34:21 -04:00
;;;; Character Sets
1998-06-16 17:04:38 -04:00
;(define-record char-set
; s) ; 256-char string; each char is either ASCII 0 or ASCII 1.
;;; Use jar's record macro.
(define-record-type char-set :char-set
(make-char-set s)
char-set?
(s char-set:s))
(define (copy-char-set cs) (make-char-set (string-copy (char-set:s cs))))
(define (char-set= cs1 cs2)
(let ((s1 (char-set:s cs1))
(s2 (char-set:s cs2)))
(let lp ((i 255))
(or (< i 0)
(and (char=? (string-ref s1 i) (string-ref s2 i))
(lp (- i 1)))))))
(define (char-set<= cs1 cs2)
(let ((s1 (char-set:s cs1))
(s2 (char-set:s cs2)))
(let lp ((i 255))
(or (< i 0)
(and (<= (char->ascii (string-ref s1 i))
(char->ascii (string-ref s2 i)))
(lp (- i 1)))))))
(define (char-set-size cs)
(let ((s (char-set:s cs)))
(let lp ((i 255) (size 0))
(if (< i 0) size
(lp (- i 1)
(if (= 0 (char->ascii (string-ref s i))) size (+ size 1)))))))
(define (set-char-set! cs char in?)
(string-set! (char-set:s cs)
(char->ascii char)
(ascii->char (if in? 1 0))))
(define (char-set-for-each f cs)
(let ((s (char-set:s cs)))
(let lp ((i 255))
(cond ((>= i 0)
(if (not (= 0 (char->ascii (string-ref s i))))
(f (ascii->char i)))
(lp (- i 1)))))))
(define (reduce-char-set kons knil cs)
(let ((s (char-set:s cs)))
(let lp ((i 255) (ans knil))
(if (< i 0) ans
(lp (- i 1)
(if (= 0 (char->ascii (string-ref s i)))
ans
(kons (ascii->char i) ans)))))))
1995-10-13 23:34:21 -04:00
(define (char-set . chars)
(chars->char-set chars))
(define (chars->char-set chars)
1998-06-16 17:04:38 -04:00
(let ((s (make-string 256 (ascii->char 0))))
1995-10-13 23:34:21 -04:00
(for-each (lambda (char)
1998-06-16 17:04:38 -04:00
(string-set! s (char->ascii char) (ascii->char 1)))
1995-10-13 23:34:21 -04:00
chars)
1998-06-16 17:04:38 -04:00
(make-char-set s)))
1995-10-13 23:34:21 -04:00
(define (string->char-set str)
1998-06-16 17:04:38 -04:00
(let ((s (make-string 256 (ascii->char 0))))
1995-10-13 23:34:21 -04:00
(do ((i (- (string-length str) 1) (- i 1)))
1998-06-16 17:04:38 -04:00
((< i 0) (make-char-set s))
(string-set! s (char->ascii (string-ref str i))
1995-10-13 23:34:21 -04:00
(ascii->char 1)))))
(define (ascii-range->char-set lower upper)
1998-06-16 17:04:38 -04:00
(let ((s (make-string 256 (ascii->char 0))))
(string-fill-range! s lower upper (ascii->char 1))
(make-char-set s)))
1995-10-13 23:34:21 -04:00
(define (predicate->char-set predicate)
1998-06-16 17:04:38 -04:00
(let ((s (make-string 256)))
(let lp ((i 255))
(if (>= i 0)
(begin (string-set! s i (if (predicate (ascii->char i))
(ascii->char 1)
(ascii->char 0)))
(lp (- i 1)))))
(make-char-set s)))
1995-10-13 23:34:21 -04:00
;;; {string, char, char-set, char predicate} -> char-set
(define (->char-set x)
(cond ((char-set? x) x)
((string? x) (string->char-set x))
((char? x) (char-set x))
((procedure? x) (predicate->char-set x))
(else (error "->char-set: Not a charset, string, char, or predicate."
x))))
;;;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
1998-06-16 17:04:38 -04:00
(define (char-set-members cs)
(let ((s (char-set:s cs)))
(let lp ((i 255) (ans '()))
(if (< i 0) ans
(lp (- i 1)
(if (zero? (char->ascii (string-ref s i))) ans
(cons (ascii->char i) ans)))))))
1995-10-13 23:34:21 -04:00
;;; De-releasing CHAR-SET-MEMBER?
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; No other way to do it. MIT Scheme defines it (c-s-m? cset char); scsh 0.3
;;; defined it (c-s-m? char cset). MIT Scheme's arg order is not consistent
;;; with the MEMBER? procedure or common math notation, but they were here
;;; first, so I didn't want to just silently invert their arg order -- could
;;; break code. I ended up just choosing a new proc name that consistent with
;;; its arg order -- (CHAR-SET-CONTAINS? cset char).
1998-06-16 17:04:38 -04:00
(define (char-set-contains? cs char)
(not (zero? (char->ascii (string-ref (char-set:s cs)
(char->ascii char))))))
1995-10-13 23:34:21 -04:00
;;; This actually isn't exported. Just CYA.
(define (char-set-member? . args)
(error "CHAR-SET-MEMBER? is no longer provided. Use CHAR-SET-CONTAINS? instead."))
1998-06-16 17:04:38 -04:00
(define (char-set-invert cs)
(predicate->char-set (lambda (char)
(not (char-set-contains? cs char)))))
;;; The union, intersection, and difference code is ugly,
;;; because the ops are n-ary.
;;; Apply P to each index and it's char in S: (P I C).
;;; Used by the intersection & difference.
(define (string-iter s p)
(let lp ((i (- (string-length s) 1)))
(cond ((>= i 0)
(p i (string-ref s i))
(lp (- i 1))))))
(define (char-set-union . csets)
(if (pair? csets)
(let ((cset (copy-char-set (car csets))))
(for-each (lambda (cs)
(char-set-for-each (lambda (c) (set-char-set! cset c #t))
cs))
(cdr csets))
cset)
char-set:empty))
(define (char-set-intersection . csets)
(if (pair? csets)
(let* ((cset (copy-char-set (car csets)))
(s (char-set:s cset)))
(for-each (lambda (cs)
(string-iter (char-set:s cs)
(lambda (i c)
(if (= 0 (char->ascii c))
(string-set! s i (ascii->char 0))))))
(cdr csets))
cset)
char-set:full))
(define (char-set-difference cs1 . csets)
(if (pair? csets)
(let* ((cset (copy-char-set cs1))
(s (char-set:s cset)))
(for-each (lambda (cs)
(string-iter (char-set:s cs)
(lambda (i c)
(if (= 1 (char->ascii c))
(string-set! s i (ascii->char 0))))))
csets)
cset)
cs1))
1995-10-13 23:34:21 -04:00
;;;; System Character Sets
(define char-set:lower-case (ascii-range->char-set #x61 #x7B))
1998-06-16 17:04:38 -04:00
(define char-set:upper-case (ascii-range->char-set #x41 #x5B))
1995-10-13 23:34:21 -04:00
(define char-set:alphabetic
(char-set-union char-set:upper-case char-set:lower-case))
1998-06-16 17:04:38 -04:00
(define char-set:numeric (ascii-range->char-set #x30 #x3A))
1995-10-13 23:34:21 -04:00
(define char-set:alphanumeric
(char-set-union char-set:alphabetic char-set:numeric))
1998-06-16 17:04:38 -04:00
(define char-set:graphic (ascii-range->char-set #x21 #x7F))
(define char-set:printing (ascii-range->char-set #x20 #x7F))
(define char-set:whitespace (char-set char:tab char:newline char:vtab
char:page char:return char:space))
(define char-set:blank (char-set char:space char:tab))
(define char-set:control (char-set-union (ascii-range->char-set 0 32)
(char-set (ascii->char 127))))
(define char-set:punctuation
(string->char-set "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"))
(define char-set:hex-digit (string->char-set "0123456789abcdefABCDEF"))
(define char-set:ascii (ascii-range->char-set 0 128))
(define char-set:empty (char-set))
(define char-set:full (char-set-invert char-set:empty))
(define (char-set->pred cs) (lambda (c) (char-set-contains? cs c)))
(define char-lower-case? (char-set->pred char-set:lower-case))
(define char-upper-case? (char-set->pred char-set:upper-case))
(define char-alphabetic? (char-set->pred char-set:alphabetic))
(define char-numeric? (char-set->pred char-set:numeric))
(define char-alphanumeric? (char-set->pred char-set:alphanumeric))
(define char-graphic? (char-set->pred char-set:graphic))
(define char-printing? (char-set->pred char-set:printing))
(define char-whitespace? (char-set->pred char-set:whitespace))
(define char-blank? (char-set->pred char-set:blank))
(define char-control? (char-set->pred char-set:control))
(define char-punctuation? (char-set->pred char-set:punctuation))
(define char-hex-digit? (char-set->pred char-set:hex-digit))
(define char-ascii? (char-set->pred char-set:ascii))