829 lines
25 KiB
Scheme
829 lines
25 KiB
Scheme
; Part of Scheme 48 1.9. See file COPYING for notices and license.
|
|
|
|
; Authors: Mike Sperber, Robert Tansom
|
|
|
|
; Copyright (c) 2005-2006 by Basis Technology Corporation.
|
|
|
|
; This is basically a complete re-implementation, suitable for Unicode.
|
|
|
|
; Some bits and pieces from Olin's reference implementation remain,
|
|
; but none from the MIT Scheme code. For whatever remains, the
|
|
; following copyright holds:
|
|
|
|
; Copyright (c) 1994-2003 by Olin Shivers
|
|
;
|
|
; All rights reserved.
|
|
;
|
|
; Redistribution and use in source and binary forms, with or without
|
|
; modification, are permitted provided that the following conditions
|
|
; are met:
|
|
; 1. Redistributions of source code must retain the above copyright
|
|
; notice, this list of conditions and the following disclaimer.
|
|
; 2. Redistributions in binary form must reproduce the above copyright
|
|
; notice, this list of conditions and the following disclaimer in the
|
|
; documentation and/or other materials provided with the distribution.
|
|
; 3. The name of the authors may not be used to endorse or promote products
|
|
; derived from this software without specific prior written permission.
|
|
;
|
|
; THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
|
|
; IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
; OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
; IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
; INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
; NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
; THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
(define-record-type :char-set
|
|
(make-char-set simple i-list)
|
|
char-set?
|
|
;; byte vector for the Latin-1 part
|
|
(simple char-set-simple
|
|
set-char-set-simple!)
|
|
;; inversion list for the rest
|
|
(i-list char-set-i-list
|
|
set-char-set-i-list!))
|
|
|
|
(define-record-discloser :char-set
|
|
(lambda (cs)
|
|
(list 'char-set
|
|
(char-set-size cs))))
|
|
|
|
(define (make-char-set-immutable! char-set)
|
|
(make-immutable! char-set)
|
|
(make-immutable! (char-set-simple char-set)))
|
|
; inversion lists are always immutable
|
|
|
|
;;; "Simple Csets"---we use mutable byte vectors for the Latin-1 part
|
|
|
|
(define *simple-cset-boundary* 256)
|
|
|
|
(define (simple-char? c)
|
|
(< (char->scalar-value c) *simple-cset-boundary*))
|
|
|
|
(define (make-empty-simple-cset)
|
|
(make-byte-vector *simple-cset-boundary* 0))
|
|
|
|
(define (make-full-simple-cset)
|
|
(make-byte-vector *simple-cset-boundary* 1))
|
|
|
|
(define (copy-simple-cset s)
|
|
(byte-vector-copy s))
|
|
|
|
; don't mistake these for abstractions
|
|
(define (simple-cset-code-not-member? s i) (zero? (byte-vector-ref s i)))
|
|
(define (simple-cset-code-member? s i) (not (simple-cset-code-not-member? s i)))
|
|
(define (simple-cset-ref s i) (byte-vector-ref s i))
|
|
(define (simple-cset-set! s i v) (byte-vector-set! s i v))
|
|
(define (simple-cset-remove-code! s i) (byte-vector-set! s i 0))
|
|
(define (simple-cset-adjoin-code! s i) (byte-vector-set! s i 1))
|
|
|
|
(define (simple-cset-contains? s char)
|
|
(simple-cset-code-member? s (char->scalar-value char)))
|
|
|
|
(define (simple-cset=? s1 s2)
|
|
(byte-vector=? s1 s2))
|
|
|
|
(define (simple-cset<=? s1 s2)
|
|
(or (eq? s1 s2)
|
|
(let loop ((i 0))
|
|
(if (>= i *simple-cset-boundary*)
|
|
#t
|
|
(and (<= (simple-cset-ref s1 i) (simple-cset-ref s2 i))
|
|
(loop (+ 1 i)))))))
|
|
|
|
(define (simple-cset-size s)
|
|
(let loop ((i 0) (size 0))
|
|
(if (>= i *simple-cset-boundary*)
|
|
size
|
|
(loop (+ 1 i) (+ size (simple-cset-ref s i))))))
|
|
|
|
(define (simple-cset-count pred s)
|
|
(let loop ((i 0) (count 0))
|
|
(if (>= i *simple-cset-boundary*)
|
|
count
|
|
(loop (+ 1 i)
|
|
(if (and (simple-cset-code-member? s i) (pred (scalar-value->char i)))
|
|
(+ count 1)
|
|
count)))))
|
|
|
|
(define (simple-cset-modify! set s chars)
|
|
(for-each (lambda (c) (set s (char->scalar-value c)))
|
|
chars)
|
|
s)
|
|
|
|
(define (simple-cset-modify set s chars)
|
|
(simple-cset-modify! set (copy-simple-cset s) chars))
|
|
|
|
(define (simple-cset-adjoin s . chars)
|
|
(simple-cset-modify simple-cset-adjoin-code! s chars))
|
|
(define (simple-cset-adjoin! s . chars)
|
|
(simple-cset-modify! simple-cset-adjoin-code! s chars))
|
|
(define (simple-cset-delete s . chars)
|
|
(simple-cset-modify simple-cset-remove-code! s chars))
|
|
(define (simple-cset-delete! s . chars)
|
|
(simple-cset-modify! simple-cset-remove-code! s chars))
|
|
|
|
;;; If we represented char sets as a bit set, we could do the following
|
|
;;; trick to pick the lowest bit out of the set:
|
|
;;; (count-bits (xor (- cset 1) cset))
|
|
;;; (But first mask out the bits already scanned by the cursor first.)
|
|
|
|
(define (simple-cset-cursor-next s cursor)
|
|
(let loop ((cur cursor))
|
|
(let ((cur (- cur 1)))
|
|
(if (or (< cur 0) (simple-cset-code-member? s cur))
|
|
cur
|
|
(loop cur)))))
|
|
|
|
(define (end-of-simple-cset? cursor)
|
|
(negative? cursor))
|
|
|
|
(define (simple-cset-cursor-ref cursor)
|
|
(scalar-value->char cursor))
|
|
|
|
(define (simple-cset-for-each proc s)
|
|
(let loop ((i 0))
|
|
(if (< i *simple-cset-boundary*)
|
|
(begin
|
|
(if (simple-cset-code-member? s i)
|
|
(proc (scalar-value->char i)))
|
|
(loop (+ 1 i))))))
|
|
|
|
(define (simple-cset-fold kons knil s)
|
|
(let loop ((i 0) (ans knil))
|
|
(if (>= i *simple-cset-boundary*)
|
|
ans
|
|
(loop (+ 1 i)
|
|
(if (simple-cset-code-not-member? s i)
|
|
ans
|
|
(kons (scalar-value->char i) ans))))))
|
|
|
|
(define (simple-cset-every? pred s)
|
|
(let loop ((i 0))
|
|
(cond
|
|
((>= i *simple-cset-boundary*)
|
|
#t)
|
|
((or (simple-cset-code-not-member? s i)
|
|
(pred (scalar-value->char i)))
|
|
(loop (+ 1 i)))
|
|
(else
|
|
#f))))
|
|
|
|
(define (simple-cset-any pred s)
|
|
(let loop ((i 0))
|
|
(cond
|
|
((>= i *simple-cset-boundary*) #f)
|
|
((and (simple-cset-code-member? s i)
|
|
(pred (scalar-value->char i))))
|
|
(else
|
|
(loop (+ 1 i))))))
|
|
|
|
(define (ucs-range->simple-cset lower upper)
|
|
(let ((s (make-empty-simple-cset)))
|
|
(let loop ((i lower))
|
|
(if (< i upper)
|
|
(begin
|
|
(simple-cset-adjoin-code! s i)
|
|
(loop (+ 1 i)))))
|
|
s))
|
|
|
|
; Algebra
|
|
|
|
; These do various "s[i] := s[i] op val" operations
|
|
|
|
(define (simple-cset-invert-code! s i v)
|
|
(simple-cset-set! s i (- 1 v)))
|
|
|
|
(define (simple-cset-and-code! s i v)
|
|
(if (zero? v)
|
|
(simple-cset-remove-code! s i)))
|
|
(define (simple-cset-or-code! s i v)
|
|
(if (not (zero? v))
|
|
(simple-cset-adjoin-code! s i)))
|
|
(define (simple-cset-minus-code! s i v)
|
|
(if (not (zero? v))
|
|
(simple-cset-remove-code! s i)))
|
|
(define (simple-cset-xor-code! s i v)
|
|
(if (not (zero? v))
|
|
(simple-cset-set! s i (- 1 (simple-cset-ref s i)))))
|
|
|
|
(define (simple-cset-complement s)
|
|
(simple-cset-complement! (copy-simple-cset s)))
|
|
|
|
(define (simple-cset-complement! s)
|
|
(byte-vector-iter (lambda (i v) (simple-cset-invert-code! s i v)) s)
|
|
s)
|
|
|
|
(define (simple-cset-op! s simple-csets code-op!)
|
|
(for-each (lambda (s2)
|
|
(let loop ((i 0))
|
|
(if (< i *simple-cset-boundary*)
|
|
(begin
|
|
(code-op! s i (simple-cset-ref s2 i))
|
|
(loop (+ 1 i))))))
|
|
simple-csets)
|
|
s)
|
|
|
|
(define (simple-cset-union! s1 . ss)
|
|
(simple-cset-op! s1 ss simple-cset-or-code!))
|
|
|
|
(define (simple-cset-union . ss)
|
|
(if (pair? ss)
|
|
(apply simple-cset-union!
|
|
(byte-vector-copy (car ss))
|
|
(cdr ss))
|
|
(make-empty-simple-cset)))
|
|
|
|
(define (simple-cset-intersection! s1 . ss)
|
|
(simple-cset-op! s1 ss simple-cset-and-code!))
|
|
|
|
(define (simple-cset-intersection . ss)
|
|
(if (pair? ss)
|
|
(apply simple-cset-intersection!
|
|
(byte-vector-copy (car ss))
|
|
(cdr ss))
|
|
(make-full-simple-cset)))
|
|
|
|
(define (simple-cset-difference! s1 . ss)
|
|
(simple-cset-op! s1 ss simple-cset-minus-code!))
|
|
|
|
(define (simple-cset-difference s1 . ss)
|
|
(if (pair? ss)
|
|
(apply simple-cset-difference! (copy-simple-cset s1) ss)
|
|
(copy-simple-cset s1)))
|
|
|
|
(define (simple-cset-xor! s1 . ss)
|
|
(simple-cset-op! s1 ss simple-cset-xor-code!))
|
|
|
|
(define (simple-cset-xor . ss)
|
|
(if (pair? ss)
|
|
(apply simple-cset-xor!
|
|
(byte-vector-copy (car ss))
|
|
(cdr ss))
|
|
(make-empty-simple-cset)))
|
|
|
|
(define (simple-cset-diff+intersection! s1 s2 . ss)
|
|
(byte-vector-iter (lambda (i v)
|
|
(cond
|
|
((zero? v)
|
|
(simple-cset-remove-code! s2 i))
|
|
((simple-cset-code-member? s2 i)
|
|
(simple-cset-remove-code! s1 i))))
|
|
s1)
|
|
|
|
(for-each (lambda (s)
|
|
(byte-vector-iter (lambda (i v)
|
|
(if (and (not (zero? v))
|
|
(simple-cset-code-member? s1 i))
|
|
(begin
|
|
(simple-cset-remove-code! s1 i)
|
|
(simple-cset-adjoin-code! s2 i))))
|
|
s))
|
|
ss)
|
|
|
|
(values s1 s2))
|
|
|
|
|
|
|
|
; Compute (c + 37 c + 37^2 c + ...) modulo BOUND, with sleaze thrown
|
|
; in to keep the intermediate values small. (We do the calculation
|
|
; with just enough bits to represent BOUND, masking off high bits at
|
|
; each step in calculation. If this screws up any important properties
|
|
; of the hash function I'd like to hear about it. -Olin)
|
|
|
|
(define (simple-cset-hash s bound)
|
|
;; The mask that will cover BOUND-1:
|
|
(let ((mask (let loop ((i #x10000)) ; Let's skip first 16 iterations, eh?
|
|
(if (>= i bound) (- i 1) (loop (+ i i))))))
|
|
(let loop ((i (- *simple-cset-boundary* 1)) (ans 0))
|
|
(if (< i 0)
|
|
(modulo ans bound)
|
|
(loop (- i 1)
|
|
(if (simple-cset-code-not-member? s i)
|
|
ans
|
|
(bitwise-and mask (+ (* 37 ans) i))))))))
|
|
|
|
;;; Now for the real character sets
|
|
|
|
(define (make-empty-char-set)
|
|
(make-char-set (make-empty-simple-cset)
|
|
(make-empty-inversion-list *simple-cset-boundary* (+ 1 #x10ffff))))
|
|
(define (make-full-char-set)
|
|
(make-char-set (make-full-simple-cset)
|
|
(range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
|
|
*simple-cset-boundary* (+ 1 #x10ffff))))
|
|
|
|
(define (char-set-copy cs)
|
|
(make-char-set (copy-simple-cset (char-set-simple cs))
|
|
(inversion-list-copy (char-set-i-list cs))))
|
|
|
|
; n-ary version
|
|
(define (char-set= . rest)
|
|
(or (null? rest)
|
|
(let ((cs1 (car rest))
|
|
(rest (cdr rest)))
|
|
(let loop ((rest rest))
|
|
(or (not (pair? rest))
|
|
(and (char-set=/2 cs1 (car rest))
|
|
(loop (cdr rest))))))))
|
|
|
|
; binary version
|
|
(define (char-set=/2 cs-1 cs-2)
|
|
(and (simple-cset=? (char-set-simple cs-1) (char-set-simple cs-2))
|
|
(inversion-list=? (char-set-i-list cs-1)
|
|
(char-set-i-list cs-2))))
|
|
|
|
; n-ary
|
|
(define (char-set<= . rest)
|
|
(or (null? rest)
|
|
(let ((cs1 (car rest))
|
|
(rest (cdr rest)))
|
|
(let loop ((cs1 cs1) (rest rest))
|
|
(or (not (pair? rest))
|
|
(and (char-set<=/2 cs1 (car rest))
|
|
(loop (car rest) (cdr rest))))))))
|
|
|
|
; binary
|
|
(define (char-set<=/2 cs-1 cs-2)
|
|
(and (simple-cset<=? (char-set-simple cs-1) (char-set-simple cs-2))
|
|
(inversion-list<=? (char-set-i-list cs-1)
|
|
(char-set-i-list cs-2))))
|
|
|
|
(define (inversion-list<=? i-list-1 i-list-2)
|
|
(inversion-list=? i-list-1
|
|
(inversion-list-intersection i-list-1 i-list-2)))
|
|
|
|
;;; Hash
|
|
|
|
; We follow Olin's reference implementation:
|
|
;
|
|
; If you keep BOUND small enough, the intermediate calculations will
|
|
; always be fixnums. How small is dependent on the underlying Scheme system;
|
|
; we use a default BOUND of 2^22 = 4194304, which should hack it in
|
|
; Schemes that give you at least 29 signed bits for fixnums. The core
|
|
; calculation that you don't want to overflow is, worst case,
|
|
; (+ 65535 (* 37 (- bound 1)))
|
|
; where 65535 is the max character code. Choose the default BOUND to be the
|
|
; biggest power of two that won't cause this expression to fixnum overflow,
|
|
; and everything will be copacetic.
|
|
|
|
(define char-set-hash
|
|
(opt-lambda (cs (bound 4194304))
|
|
(if (not (and (integer? bound)
|
|
(exact? bound)
|
|
(<= 0 bound)))
|
|
(assertion-violation 'char-set-hash "invalid bound" bound))
|
|
(let ((bound (if (zero? bound) 4194304 bound)))
|
|
(modulo (+ (simple-cset-hash (char-set-simple cs) bound)
|
|
(* 37 (inversion-list-hash (char-set-i-list cs) bound)))
|
|
bound))))
|
|
|
|
(define (char-set-contains? cs char)
|
|
(if (simple-char? char)
|
|
(simple-cset-contains? (char-set-simple cs) char)
|
|
(inversion-list-member? (char->scalar-value char)
|
|
(char-set-i-list cs))))
|
|
|
|
(define (char-set-size cs)
|
|
(+ (simple-cset-size (char-set-simple cs))
|
|
(inversion-list-size (char-set-i-list cs))))
|
|
|
|
(define (char-set-count pred cset)
|
|
(+ (simple-cset-count pred (char-set-simple cset))
|
|
(inversion-list-count pred (char-set-i-list cset))))
|
|
|
|
(define (inversion-list-count pred i-list)
|
|
(inversion-list-fold/done? (lambda (v count)
|
|
(if (pred (scalar-value->char v))
|
|
(+ 1 count)
|
|
count))
|
|
0
|
|
(lambda (v) #f)
|
|
i-list))
|
|
|
|
(define (make-char-set-char-op simple-cset-op inversion-list-op)
|
|
(lambda (cs . chars)
|
|
(call-with-values
|
|
(lambda () (partition-list simple-char? chars))
|
|
(lambda (simple-chars non-simple-chars)
|
|
(make-char-set (apply simple-cset-op (char-set-simple cs) simple-chars)
|
|
(apply inversion-list-op (char-set-i-list cs)
|
|
(map char->scalar-value non-simple-chars)))))))
|
|
|
|
(define (make-char-set-char-op! simple-cset-op! simple-cset-op
|
|
inversion-list-op)
|
|
(lambda (cs . chars)
|
|
(call-with-values
|
|
(lambda () (partition-list simple-char? chars))
|
|
(lambda (simple-chars non-simple-chars)
|
|
(if (null? non-simple-chars)
|
|
(apply simple-cset-op! (char-set-simple cs) simple-chars)
|
|
(begin
|
|
(set-char-set-simple! cs
|
|
(apply simple-cset-op (char-set-simple cs)
|
|
simple-chars))
|
|
(set-char-set-i-list! cs
|
|
(apply inversion-list-op (char-set-i-list cs)
|
|
(map char->scalar-value non-simple-chars)))))))
|
|
cs))
|
|
|
|
(define char-set-adjoin
|
|
(make-char-set-char-op simple-cset-adjoin inversion-list-adjoin))
|
|
(define char-set-adjoin!
|
|
(make-char-set-char-op! simple-cset-adjoin! simple-cset-adjoin
|
|
inversion-list-adjoin))
|
|
(define char-set-delete
|
|
(make-char-set-char-op simple-cset-delete inversion-list-remove))
|
|
(define char-set-delete!
|
|
(make-char-set-char-op! simple-cset-delete! simple-cset-delete
|
|
inversion-list-remove))
|
|
|
|
;;; Cursors
|
|
|
|
; A cursor is either an integer index into the mark vector (-1 for the
|
|
; end-of-char-set cursor) as in the reference implementation, and an
|
|
; inversion-list cursor otherwise.
|
|
|
|
(define (char-set-cursor cset)
|
|
(let ((simple-cursor
|
|
(simple-cset-cursor-next (char-set-simple cset)
|
|
*simple-cset-boundary*)))
|
|
(if (end-of-simple-cset? simple-cursor)
|
|
(inversion-list-cursor (char-set-i-list cset))
|
|
simple-cursor)))
|
|
|
|
(define (end-of-char-set? cursor)
|
|
(and (inversion-list-cursor? cursor)
|
|
(inversion-list-cursor-at-end? cursor)))
|
|
|
|
(define (char-set-ref cset cursor)
|
|
(if (number? cursor)
|
|
(simple-cset-cursor-ref cursor)
|
|
(scalar-value->char (inversion-list-cursor-ref cursor))))
|
|
|
|
(define (char-set-cursor-next cset cursor)
|
|
(cond
|
|
((number? cursor)
|
|
(let ((next (simple-cset-cursor-next (char-set-simple cset) cursor)))
|
|
(if (end-of-simple-cset? next)
|
|
(inversion-list-cursor (char-set-i-list cset))
|
|
next)))
|
|
(else
|
|
(inversion-list-cursor-next (char-set-i-list cset) cursor))))
|
|
|
|
(define (char-set-for-each proc cs)
|
|
(simple-cset-for-each proc (char-set-simple cs))
|
|
(inversion-list-fold/done? (lambda (n _)
|
|
(proc (scalar-value->char n))
|
|
(unspecific))
|
|
#f
|
|
(lambda (_) #f)
|
|
(char-set-i-list cs)))
|
|
|
|
; this is pretty inefficent
|
|
(define (char-set-map proc cs)
|
|
(let ((simple-cset (make-empty-simple-cset))
|
|
(other-scalar-values '()))
|
|
|
|
(define (adjoin! c)
|
|
(let ((c (proc c)))
|
|
(if (simple-char? c)
|
|
(simple-cset-adjoin! simple-cset c)
|
|
(set! other-scalar-values
|
|
(cons (char->scalar-value c) other-scalar-values)))))
|
|
|
|
(char-set-for-each adjoin! cs)
|
|
|
|
(make-char-set simple-cset
|
|
(apply numbers->inversion-list
|
|
*simple-cset-boundary* (+ 1 #x10ffff)
|
|
other-scalar-values))))
|
|
|
|
(define (char-set-fold kons knil cs)
|
|
(inversion-list-fold/done? (lambda (n v)
|
|
(kons (scalar-value->char n) v))
|
|
(simple-cset-fold kons knil (char-set-simple cs))
|
|
(lambda (_) #f)
|
|
(char-set-i-list cs)))
|
|
|
|
(define (char-set-every pred cs)
|
|
(and (simple-cset-every? pred (char-set-simple cs))
|
|
(inversion-list-fold/done? (lambda (n v)
|
|
(and v
|
|
(pred (scalar-value->char n))))
|
|
#t
|
|
not
|
|
(char-set-i-list cs))))
|
|
|
|
(define (char-set-any pred cs)
|
|
(or (simple-cset-any pred (char-set-simple cs))
|
|
(inversion-list-fold/done? (lambda (n v)
|
|
(or v
|
|
(pred (scalar-value->char n))))
|
|
#f
|
|
values
|
|
(char-set-i-list cs))))
|
|
|
|
(define (base-char-set maybe-base-cs)
|
|
(if maybe-base-cs
|
|
(char-set-copy maybe-base-cs)
|
|
(make-empty-char-set)))
|
|
|
|
(define char-set-unfold
|
|
(opt-lambda (p f g seed (maybe-base-cs #f))
|
|
(char-set-unfold! p f g seed
|
|
(base-char-set maybe-base-cs))))
|
|
|
|
(define (char-set-unfold! p f g seed base-cs)
|
|
(let loop ((seed seed) (cs base-cs))
|
|
(if (p seed) cs ; P says we are done.
|
|
(loop (g seed) ; Loop on (G SEED).
|
|
(char-set-adjoin! cs (f seed)))))) ; Add (F SEED) to set.
|
|
|
|
; converting from and to lists
|
|
|
|
(define (char-set . chars)
|
|
(list->char-set chars))
|
|
|
|
(define list->char-set
|
|
(opt-lambda (chars (maybe-base-cs #f))
|
|
(list->char-set! chars
|
|
(base-char-set maybe-base-cs))))
|
|
|
|
(define (list->char-set! chars cs)
|
|
(for-each (lambda (c)
|
|
(char-set-adjoin! cs c))
|
|
chars)
|
|
cs)
|
|
|
|
(define (char-set->list cs)
|
|
(char-set-fold cons '() cs))
|
|
|
|
; converting to and from strings
|
|
|
|
(define string->char-set
|
|
(opt-lambda (str (maybe-base-cs #f))
|
|
(string->char-set! str
|
|
(base-char-set maybe-base-cs))))
|
|
|
|
(define (string->char-set! str cs)
|
|
(do ((i (- (string-length str) 1) (- i 1)))
|
|
((< i 0))
|
|
(char-set-adjoin! cs (string-ref str i)))
|
|
cs)
|
|
|
|
(define (char-set->string cs)
|
|
(let ((ans (make-string (char-set-size cs))))
|
|
(char-set-fold (lambda (ch i)
|
|
(string-set! ans i ch)
|
|
(+ i 1))
|
|
0
|
|
cs)
|
|
ans))
|
|
|
|
(define ucs-range->char-set
|
|
(opt-lambda (lower upper (error? #f) (maybe-base-cs #f))
|
|
(ucs-range->char-set! lower upper error?
|
|
(base-char-set maybe-base-cs))))
|
|
|
|
(define (ucs-range->char-set! lower upper error? base-cs)
|
|
(if (negative? lower)
|
|
(assertion-violation 'ucs-range->char-set! "negative lower bound" lower))
|
|
(if (> lower #x10ffff)
|
|
(assertion-violation 'ucs-range->char-set! "invalid lower bound" lower))
|
|
(if (negative? upper)
|
|
(assertion-violation 'ucs-range->char-set! "negative upper bound" upper))
|
|
(if (> upper #x110000)
|
|
(assertion-violation 'ucs-range->char-set! "invalid lower bound" upper))
|
|
(if (not (<= lower upper))
|
|
(assertion-violation 'ucs-range->char-set! "decreasing bounds" lower upper))
|
|
|
|
(let ((create-inversion-list
|
|
(lambda (lower upper)
|
|
(cond
|
|
((and (>= lower #xD800)
|
|
(>= #xe000 upper))
|
|
(make-empty-inversion-list *simple-cset-boundary* (+ 1 #x10ffff)))
|
|
((<= upper #xe000)
|
|
(range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
|
|
lower (min #xd800 upper)))
|
|
((>= lower #xd800)
|
|
(range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
|
|
(max #xe000 lower) upper))
|
|
(else
|
|
;; hole
|
|
(ranges->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
|
|
(cons lower #xd800)
|
|
(cons #xe000 upper)))))))
|
|
(char-set-union!
|
|
base-cs
|
|
(cond
|
|
((>= lower *simple-cset-boundary*)
|
|
(make-char-set (make-empty-simple-cset)
|
|
(create-inversion-list lower upper)))
|
|
((< upper *simple-cset-boundary*)
|
|
(make-char-set (ucs-range->simple-cset lower upper)
|
|
(make-empty-inversion-list *simple-cset-boundary* (+ 1 #x10ffff))))
|
|
(else
|
|
(make-char-set (ucs-range->simple-cset lower *simple-cset-boundary*)
|
|
(create-inversion-list *simple-cset-boundary* upper)))))))
|
|
|
|
(define char-set-filter
|
|
(opt-lambda (predicate domain (maybe-base-cs #f))
|
|
(char-set-filter! predicate
|
|
domain
|
|
(base-char-set maybe-base-cs))))
|
|
|
|
(define (char-set-filter! predicate domain base-cs)
|
|
(char-set-fold (lambda (ch _)
|
|
(if (predicate ch)
|
|
(char-set-adjoin! base-cs ch)))
|
|
(unspecific)
|
|
domain)
|
|
base-cs)
|
|
|
|
; {string, char, char-set, char predicate} -> char-set
|
|
|
|
; This is called ->CHAR-SET in the SRFI, but that's not a valid R5RS
|
|
; identifier.
|
|
|
|
(define (x->char-set x)
|
|
(cond ((char-set? x) x)
|
|
((string? x) (string->char-set x))
|
|
((char? x) (char-set x))
|
|
(else (assertion-violation 'x->char-set "Not a charset, string or char."))))
|
|
|
|
|
|
; Set algebra
|
|
|
|
(define *surrogate-complement-i-list*
|
|
(inversion-list-complement
|
|
(range->inversion-list *simple-cset-boundary* (+ 1 #x10ffff)
|
|
#xd800 #xe000)))
|
|
|
|
(define (char-set-complement cs)
|
|
(make-char-set (simple-cset-complement (char-set-simple cs))
|
|
(inversion-list-intersection
|
|
(inversion-list-complement (char-set-i-list cs))
|
|
*surrogate-complement-i-list*)))
|
|
|
|
(define (char-set-complement! cs)
|
|
(set-char-set-simple! cs
|
|
(simple-cset-complement! (char-set-simple cs)))
|
|
(set-char-set-i-list! cs
|
|
(inversion-list-intersection
|
|
(inversion-list-complement (char-set-i-list cs))
|
|
*surrogate-complement-i-list*))
|
|
cs)
|
|
|
|
(define (make-char-set-op! simple-cset-op! inversion-list-op)
|
|
(lambda (cset1 . csets)
|
|
(set-char-set-simple! cset1
|
|
(apply simple-cset-op!
|
|
(char-set-simple cset1)
|
|
(map char-set-simple csets)))
|
|
(set-char-set-i-list! cset1
|
|
(apply inversion-list-op
|
|
(char-set-i-list cset1)
|
|
(map char-set-i-list csets)))
|
|
cset1))
|
|
|
|
(define (make-char-set-op char-set-op! make-neutral)
|
|
(lambda csets
|
|
(if (pair? csets)
|
|
(apply char-set-op! (char-set-copy (car csets)) (cdr csets))
|
|
(make-neutral))))
|
|
|
|
(define char-set-union!
|
|
(make-char-set-op! simple-cset-union! inversion-list-union))
|
|
(define char-set-union
|
|
(make-char-set-op char-set-union! make-empty-char-set))
|
|
|
|
(define char-set-intersection!
|
|
(make-char-set-op! simple-cset-intersection! inversion-list-intersection))
|
|
(define char-set-intersection
|
|
(make-char-set-op char-set-intersection! make-full-char-set))
|
|
|
|
(define char-set-difference!
|
|
(make-char-set-op! simple-cset-difference! inversion-list-difference))
|
|
|
|
(define (char-set-difference cset1 . csets)
|
|
(apply char-set-difference! (char-set-copy cset1) csets))
|
|
|
|
; copied from inversion-list.scm
|
|
(define (binary->n-ary proc/2)
|
|
(lambda (arg-1 . args)
|
|
(if (and (pair? args)
|
|
(null? (cdr args)))
|
|
(proc/2 arg-1 (car args))
|
|
(let loop ((args args)
|
|
(result arg-1))
|
|
(if (null? args)
|
|
result
|
|
(loop (cdr args) (proc/2 result (car args))))))))
|
|
|
|
(define inversion-list-xor
|
|
(binary->n-ary
|
|
(lambda (i-list-1 i-list-2)
|
|
(inversion-list-union (inversion-list-intersection
|
|
(inversion-list-complement i-list-1)
|
|
i-list-2)
|
|
(inversion-list-intersection
|
|
i-list-1
|
|
(inversion-list-complement i-list-2))))))
|
|
|
|
; Really inefficient for things outside Latin-1
|
|
; WHO NEEDS THIS NONSENSE, ANYWAY?
|
|
(define char-set-xor!
|
|
(make-char-set-op! simple-cset-xor! inversion-list-xor))
|
|
|
|
(define char-set-xor
|
|
(make-char-set-op char-set-xor! make-empty-char-set))
|
|
|
|
(define (char-set-diff+intersection! cs1 cs2 . csets)
|
|
(call-with-values
|
|
(lambda () (apply simple-cset-diff+intersection!
|
|
(char-set-simple cs1) (char-set-simple cs2)
|
|
(map char-set-simple csets)))
|
|
(lambda (simple-diff simple-intersection)
|
|
(set-char-set-simple! cs1 simple-diff)
|
|
(set-char-set-simple! cs2 simple-intersection)
|
|
(let ((i-list-1 (char-set-i-list cs1))
|
|
(i-list-2 (char-set-i-list cs2))
|
|
(i-list-rest (map char-set-i-list csets)))
|
|
(set-char-set-i-list! cs1
|
|
(apply inversion-list-difference
|
|
i-list-1 i-list-2
|
|
i-list-rest))
|
|
(set-char-set-i-list! cs2
|
|
(inversion-list-intersection
|
|
i-list-1
|
|
(apply inversion-list-union
|
|
i-list-2
|
|
i-list-rest)))
|
|
(values cs1 cs2)))))
|
|
|
|
(define (char-set-diff+intersection cs1 . csets)
|
|
(apply char-set-diff+intersection!
|
|
(char-set-copy cs1)
|
|
(make-empty-char-set)
|
|
csets))
|
|
|
|
;; Byte vector utilities
|
|
|
|
(define (byte-vector-copy b)
|
|
(let* ((size (byte-vector-length b))
|
|
(result (make-byte-vector size 0)))
|
|
(copy-bytes! b 0 result 0 size)
|
|
result))
|
|
|
|
;;; Apply P to each index and its char code in S: (P I VAL).
|
|
;;; Used by the set-algebra ops.
|
|
|
|
(define (byte-vector-iter p s)
|
|
(let loop ((i (- (byte-vector-length s) 1)))
|
|
(if (>= i 0)
|
|
(begin
|
|
(p i (byte-vector-ref s i))
|
|
(loop (- i 1))))))
|
|
|
|
;; Utility for srfi-14-base-char-sets.scm, which follows
|
|
|
|
; The range vector is an even-sized vector with [lower, upper)
|
|
; pairs.
|
|
|
|
(define (range-vector->char-set range-vector)
|
|
(let ((size (vector-length range-vector))
|
|
(simple-cset (make-empty-simple-cset)))
|
|
|
|
(let loop ((index 0) (ranges '()))
|
|
(if (>= index size)
|
|
(make-char-set simple-cset
|
|
(apply ranges->inversion-list
|
|
*simple-cset-boundary* (+ 1 #x10ffff)
|
|
ranges))
|
|
(let ((lower (vector-ref range-vector index))
|
|
(upper (vector-ref range-vector (+ 1 index))))
|
|
|
|
(define (fill-simple-cset! lower upper)
|
|
(let loop ((scalar-value lower))
|
|
(if (< scalar-value upper)
|
|
(begin
|
|
(simple-cset-adjoin-code! simple-cset scalar-value)
|
|
(loop (+ 1 scalar-value))))))
|
|
|
|
|
|
(cond
|
|
((>= lower *simple-cset-boundary*)
|
|
(loop (+ 2 index) (cons (cons lower upper) ranges)))
|
|
((< upper *simple-cset-boundary*)
|
|
(fill-simple-cset! lower upper)
|
|
(loop (+ 2 index) ranges))
|
|
(else
|
|
(fill-simple-cset! lower *simple-cset-boundary*)
|
|
(loop (+ 2 index)
|
|
(cons (cons *simple-cset-boundary* upper) ranges)))))))))
|