;;; -*-Scheme-*- ;;; ;;; Character Sets package ;;; - ported from MIT Scheme runtime ;;; by Brian D. Carlstrom ;;; - Rehacked & extended by Olin Shivers 6/98. ;;; This is not great code. Char sets are represented as 256-char ;;; strings. If char i is ASCII 0, then it isn't in the set; if char i ;;; is ASCII 1, then it is in the set. ;;; - Should be rewritten to use bit strings, or at least byte vecs. ;;; - Is ASCII/Latin-1 specific. Would certainly have to be rewritten ;;; for Unicode. ;;; - The standard character sets are not Latin-1 compliant, just ASCII. ;;; This code uses jar's DEFINE-RECORD-TYPE macro to define the char-set ;;; record type, because the scsh-standard DEFINE-RECORD form automatically ;;; defines a COPY-FOO function, which is not the one we want, being a shallow ;;; copy of the record fields. ;;; New dfns: ;;; (char-set= cs1 cs2) ;;; (char-set<= cs1 cs2) ;;; (reduce-char-set kons knil cs) ;;; (set-char-set! cs char in?) ;;; (char-set-for-each f cs) ;;; (copy-char-set cs) ;;; (char-set-size cs) ;;; char-set:printing (char-printing? c) ;;; char-set:blank (char-blank? c) ;;; char-set:control (char-control? c) ;;; char-set:hex-digit (char-hex-digit? c) ;;; char-set:ascii (char-ascii? c) ;;; char-set:empty ;;; char-set:full (define char:newline (ascii->char 13)) (define char:tab (ascii->char 9)) (define char:vtab (ascii->char 11)) (define char:page (ascii->char 12)) (define char:return (ascii->char 10)) (define char:space (ascii->char 32)) (define (string-fill-range! str lower upper ch) (do ((index lower (+ index 1))) ((>= index upper) str) (string-set! str index ch))) (define (char-ascii? char) (let ((maybe-ascii (char->ascii char))) (and (<= 0 maybe-ascii 127) maybe-ascii))) ;;;; Character Sets ;(define-record char-set ; s) ; 256-char string; each char is either ASCII 0 or ASCII 1. ;;; Use jar's record macro. (define-record-type char-set :char-set (make-char-set s) char-set? (s char-set:s)) (define (copy-char-set cs) (make-char-set (string-copy (char-set:s cs)))) (define (char-set= cs1 cs2) (let ((s1 (char-set:s cs1)) (s2 (char-set:s cs2))) (let lp ((i 255)) (or (< i 0) (and (char=? (string-ref s1 i) (string-ref s2 i)) (lp (- i 1))))))) (define (char-set<= cs1 cs2) (let ((s1 (char-set:s cs1)) (s2 (char-set:s cs2))) (let lp ((i 255)) (or (< i 0) (and (<= (char->ascii (string-ref s1 i)) (char->ascii (string-ref s2 i))) (lp (- i 1))))))) (define (char-set-size cs) (let ((s (char-set:s cs))) (let lp ((i 255) (size 0)) (if (< i 0) size (lp (- i 1) (if (= 0 (char->ascii (string-ref s i))) size (+ size 1))))))) (define (set-char-set! cs char in?) (string-set! (char-set:s cs) (char->ascii char) (ascii->char (if in? 1 0)))) (define (char-set-for-each f cs) (let ((s (char-set:s cs))) (let lp ((i 255)) (cond ((>= i 0) (if (not (= 0 (char->ascii (string-ref s i)))) (f (ascii->char i))) (lp (- i 1))))))) (define (reduce-char-set kons knil cs) (let ((s (char-set:s cs))) (let lp ((i 255) (ans knil)) (if (< i 0) ans (lp (- i 1) (if (= 0 (char->ascii (string-ref s i))) ans (kons (ascii->char i) ans))))))) (define (char-set . chars) (chars->char-set chars)) (define (chars->char-set chars) (let ((s (make-string 256 (ascii->char 0)))) (for-each (lambda (char) (string-set! s (char->ascii char) (ascii->char 1))) chars) (make-char-set s))) (define (string->char-set str) (let ((s (make-string 256 (ascii->char 0)))) (do ((i (- (string-length str) 1) (- i 1))) ((< i 0) (make-char-set s)) (string-set! s (char->ascii (string-ref str i)) (ascii->char 1))))) (define (ascii-range->char-set lower upper) (let ((s (make-string 256 (ascii->char 0)))) (string-fill-range! s lower upper (ascii->char 1)) (make-char-set s))) (define (predicate->char-set predicate) (let ((s (make-string 256))) (let lp ((i 255)) (if (>= i 0) (begin (string-set! s i (if (predicate (ascii->char i)) (ascii->char 1) (ascii->char 0))) (lp (- i 1))))) (make-char-set s))) ;;; {string, char, char-set, char predicate} -> char-set (define (->char-set x) (cond ((char-set? x) x) ((string? x) (string->char-set x)) ((char? x) (char-set x)) ((procedure? x) (predicate->char-set x)) (else (error "->char-set: Not a charset, string, char, or predicate." x)))) ;;;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- (define (char-set-members cs) (let ((s (char-set:s cs))) (let lp ((i 255) (ans '())) (if (< i 0) ans (lp (- i 1) (if (zero? (char->ascii (string-ref s i))) ans (cons (ascii->char i) ans))))))) ;;; De-releasing CHAR-SET-MEMBER? ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; No other way to do it. MIT Scheme defines it (c-s-m? cset char); scsh 0.3 ;;; defined it (c-s-m? char cset). MIT Scheme's arg order is not consistent ;;; with the MEMBER? procedure or common math notation, but they were here ;;; first, so I didn't want to just silently invert their arg order -- could ;;; break code. I ended up just choosing a new proc name that consistent with ;;; its arg order -- (CHAR-SET-CONTAINS? cset char). (define (char-set-contains? cs char) (not (zero? (char->ascii (string-ref (char-set:s cs) (char->ascii char)))))) ;;; This actually isn't exported. Just CYA. (define (char-set-member? . args) (error "CHAR-SET-MEMBER? is no longer provided. Use CHAR-SET-CONTAINS? instead.")) (define (char-set-invert cs) (predicate->char-set (lambda (char) (not (char-set-contains? cs char))))) ;;; The union, intersection, and difference code is ugly, ;;; because the ops are n-ary. ;;; Apply P to each index and it's char in S: (P I C). ;;; Used by the intersection & difference. (define (string-iter s p) (let lp ((i (- (string-length s) 1))) (cond ((>= i 0) (p i (string-ref s i)) (lp (- i 1)))))) (define (char-set-union . csets) (if (pair? csets) (let ((cset (copy-char-set (car csets)))) (for-each (lambda (cs) (char-set-for-each (lambda (c) (set-char-set! cset c #t)) cs)) (cdr csets)) cset) char-set:empty)) (define (char-set-intersection . csets) (if (pair? csets) (let* ((cset (copy-char-set (car csets))) (s (char-set:s cset))) (for-each (lambda (cs) (string-iter (char-set:s cs) (lambda (i c) (if (= 0 (char->ascii c)) (string-set! s i (ascii->char 0)))))) (cdr csets)) cset) char-set:full)) (define (char-set-difference cs1 . csets) (if (pair? csets) (let* ((cset (copy-char-set cs1)) (s (char-set:s cset))) (for-each (lambda (cs) (string-iter (char-set:s cs) (lambda (i c) (if (= 1 (char->ascii c)) (string-set! s i (ascii->char 0)))))) csets) cset) cs1)) ;;;; System Character Sets (define char-set:lower-case (ascii-range->char-set #x61 #x7B)) (define char-set:upper-case (ascii-range->char-set #x41 #x5B)) (define char-set:alphabetic (char-set-union char-set:upper-case char-set:lower-case)) (define char-set:numeric (ascii-range->char-set #x30 #x3A)) (define char-set:alphanumeric (char-set-union char-set:alphabetic char-set:numeric)) (define char-set:graphic (ascii-range->char-set #x21 #x7F)) (define char-set:printing (ascii-range->char-set #x20 #x7F)) (define char-set:whitespace (char-set char:tab char:newline char:vtab char:page char:return char:space)) (define char-set:blank (char-set char:space char:tab)) (define char-set:control (char-set-union (ascii-range->char-set 0 32) (char-set (ascii->char 127)))) (define char-set:punctuation (string->char-set "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")) (define char-set:hex-digit (string->char-set "0123456789abcdefABCDEF")) (define char-set:ascii (ascii-range->char-set 0 128)) (define char-set:empty (char-set)) (define char-set:full (char-set-invert char-set:empty)) (define (char-set->pred cs) (lambda (c) (char-set-contains? cs c))) (define char-lower-case? (char-set->pred char-set:lower-case)) (define char-upper-case? (char-set->pred char-set:upper-case)) (define char-alphabetic? (char-set->pred char-set:alphabetic)) (define char-numeric? (char-set->pred char-set:numeric)) (define char-alphanumeric? (char-set->pred char-set:alphanumeric)) (define char-graphic? (char-set->pred char-set:graphic)) (define char-printing? (char-set->pred char-set:printing)) (define char-whitespace? (char-set->pred char-set:whitespace)) (define char-blank? (char-set->pred char-set:blank)) (define char-control? (char-set->pred char-set:control)) (define char-punctuation? (char-set->pred char-set:punctuation)) (define char-hex-digit? (char-set->pred char-set:hex-digit)) (define char-ascii? (char-set->pred char-set:ascii))