scsh-0.5/scsh/lib/ccp.scm

577 lines
18 KiB
Scheme

;;; Char->char partial maps -*- Scheme -*-
;;; Copyright (C) 1998 by Olin Shivers.
;;; CCPs are an efficient data structure for doing simple string transforms,
;;; similar to the kinds of things you would do with the tr(1) program.
;;;
;;; This code is tuned for a 7- or 8-bit character type. Large, 16-bit
;;; character types would need a more sophisticated data structure, tuned
;;; for sparseness. I would suggest something like this:
;;; (define-record ccp
;;; domain ; The domain char-set
;;; map ; Sorted vector of (char . string) pairs
;;; ; specifying the map.
;;; id?) ; If true, mappings not specified by MAP are
;;; ; identity mapping. If false, MAP must
;;; ; specify a mapping for every char in DOMAIN.
;;;
;;; A (char . string) elements in MAP specifies a mapping for the contiguous
;;; sequence of L chars beginning with CHAR (in the sequence of the underlying
;;; char type representation), where L is the length of STRING. These MAP elements
;;; are sorted by CHAR, so that binary search can be used to get from an input
;;; character C to the right MAP element quickly.
;;;
;;; This representation should be reasonably compact for standard mappings on,
;;; say, a Unicode CCP. An implementation might wish to have a cache field
;;; in the record for storing the full 8kb bitset when performing ccp-map
;;; operations. Or, an implementation might want to store the Latin-1 subset
;;; of the map in a dense format, and keep the remainder in a sparse format.
(define num-chars (char-set-size char-set:full)) ; AKA 256.
(define-record ccp
domain ; The domain char-set
dshared? ; Is the domain value shared or linear?
map ; 256-elt string
mshared?) ; Is the map string shared or linear?
;;; Accessors and setters that manage the linear bookkeeping
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (ccp-domain ccp)
(set-ccp:dshared? ccp #t)
(ccp:domain ccp))
;;; CCP is a linear ccp. PROC is a domain->domain function; it must be
;;; linear in its parameter and result.
;;;
;;; Updates the domain of the CCP with PROC, returns the resulting
;;; CCP; reuses the old one to construct the new one.
(define (restrict-linear-ccp-domain ccp proc)
(let ((new-d (proc (if (ccp:dshared? ccp)
(begin (set-ccp:dshared? ccp #f)
(char-set-copy (ccp:domain ccp)))
(ccp:domain ccp)))))
(set-ccp:domain ccp new-d)
ccp))
;;; CCP is a linear CCP. PROC is a domain x cmap -> domain function.
;;; It is passed a linear domain and cmap string. It may side-effect
;;; the cmap string, and returns the resulting updated domain.
;;; We return the resulting CCP, reusing the parameter to construct it.
(define (linear-update-ccp ccp proc)
(let* ((cmap (if (ccp:mshared? ccp)
(begin (set-ccp:mshared? ccp #f)
(string-copy (ccp:map ccp)))
(ccp:map ccp)))
(new-d (proc (if (ccp:dshared? ccp)
(begin (set-ccp:dshared? ccp #f)
(char-set-copy (ccp:domain ccp)))
(ccp:domain ccp))
cmap)))
(set-ccp:domain ccp new-d)
ccp))
;;; Return CCP's map field, and mark it as shared. CCP functions that
;;; restrict a ccp's domain share map strings, so they use this guy.
(define (ccp:map/shared ccp)
(set-ccp:mshared? ccp #t)
(ccp:map ccp))
(define (ccp-copy ccp) (make-ccp (char-set-copy (ccp:domain ccp)) #f
(string-copy (ccp:map ccp)) #f))
;;; N-ary equality relation for partial maps
(define (ccp= ccp1 . rest)
(let ((domain (ccp:domain ccp1))
(cmap (ccp:map ccp1)))
(every (lambda (ccp2)
(and (char-set= domain (ccp:domain ccp2))
(let ((cmap2 (ccp:map ccp2)))
(char-set-every (lambda (c)
(let ((i (char->ascii c)))
(char=? (string-ref cmap i)
(string-ref cmap2 i))))
domain))))
rest)))
;;; N-ary subset relation for partial maps
(define (ccp<= ccp1 . rest)
(let lp ((domain1 (ccp:domain ccp1))
(cmap1 (ccp:map ccp1))
(rest rest))
(or (not (pair? rest))
(let* ((ccp2 (car rest))
(domain2 (ccp:domain ccp2))
(cmap2 (ccp:map ccp2))
(rest (cdr rest)))
(and (char-set<= domain1 domain2)
(let ((cmap2 (ccp:map ccp2)))
(char-set-every (lambda (c)
(let ((i (char->ascii c)))
(char=? (string-ref cmap1 i)
(string-ref cmap2 i))))
domain1))
(lp domain2 cmap2 rest))))))
;;; CCP iterators
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (ccp-fold kons knil ccp)
(let ((cmap (ccp:map ccp)))
(char-set-fold (lambda (c v) (kons c (string-ref cmap (char->ascii c)) v))
knil
(ccp:domain ccp))))
(define (ccp-for-each proc ccp)
(let ((cmap (ccp:map ccp)))
(char-set-for-each (lambda (c) (proc c (string-ref cmap (char->ascii c))))
(ccp:domain ccp))))
(define (ccp->alist ccp)
(ccp-fold (lambda (from to alist) (cons (cons from to) alist))
'()
ccp))
;;; CCP-RESTRICT
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Restrict a ccp's domain.
(define (ccp-restrict ccp cset)
(make-ccp (char-set-intersection cset (ccp:domain ccp))
#f
(ccp:map/shared ccp)
#t))
(define (ccp-restrict! ccp cset)
(restrict-linear-ccp-domain ccp (lambda (d) (char-set-intersection! d cset))))
;;; CCP-ADJOIN ccp from-char1 to-char1 ...
;;; CCP-DELETE ccp char1 ...
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Add & delete mappings to/from a ccp.
(define (ccp-delete ccp . chars)
(make-ccp (apply char-set-delete (ccp:domain ccp) chars)
#f
(ccp:map/shared ccp)
#t))
(define (ccp-delete! ccp . chars)
(restrict-linear-ccp-domain ccp (lambda (d) (apply char-set-delete! d chars))))
(define (ccp-adjoin ccp . chars)
(let ((cmap (string-copy (ccp:map ccp))))
(make-ccp (install-ccp-adjoin! cmap (char-set-copy (ccp:domain ccp)) chars)
#f
cmap
#f)))
(define (ccp-adjoin! ccp . chars)
(linear-update-ccp ccp (lambda (d cmap) (install-ccp-adjoin! cmap d chars))))
(define (install-ccp-adjoin! cmap domain chars)
(let lp ((chars chars) (d domain))
(if (pair? chars)
(let ((from (car chars))
(to (cadr chars))
(chars (cddr chars)))
(string-set! cmap (char->ascii from) to)
(lp chars (char-set-adjoin! d from)))
d)))
;;; CCP-EXTEND ccp1 ...
;;; CCP-EXTEND! ccp1 ccp2 ...
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Extend ccp1 with ccp2, etc.
(define (ccp-extend . ccps)
(if (pair? ccps)
(let ((ccp0 (car ccps))
(ccps (cdr ccps)))
(if (pair? ccps)
(let ((cmap (string-copy (ccp:map ccp0)))) ; Copy cmap.
;; The FOLD installs each ccp in CCPS into CMAP and produces
;; the new domain.
(make-ccp (fold (lambda (ccp d)
(install-ccp-extension! cmap d ccp))
(char-set-copy (ccp:domain ccp0))
ccps)
#f cmap #f))
ccp0)) ; Only 1 parameter
ccp:0)) ; 0 parameters
(define (ccp-extend! ccp0 . ccps)
(linear-update-ccp ccp0
(lambda (domain cmap)
(fold (lambda (ccp d) (install-ccp-extension! cmap d ccp))
domain
ccps))))
;;; Side-effect CMAP, linear-update and return DOMAIN.
(define (install-ccp-extension! cmap domain ccp)
(let ((cmap1 (ccp:map ccp))
(domain1 (ccp:domain ccp)))
(char-set-for-each (lambda (c)
(let ((i (char->ascii c)))
(string-set! cmap i (string-ref cmap1 i))))
domain1)
(char-set-union! domain domain1)))
;;; Compose the CCPs. 0-ary case: (ccp-compose) = ccp:1.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; For each character C-IN in the original domain, we push it
;;; through the pipeline of CCPs. If we ever land outside the
;;; domain of a ccp, we punt C-IN. If we push it all the way
;;; through, we add C-IN to our result domain, and add the mapping
;;; into the cmap we are assembling.
;;;
;;; Looping this way avoids building up intermediate temporary
;;; CCPs. If CCP's were small bitsets, we might be better off
;;; slicing the double-nested loops the other way around.
(define (ccp-compose . ccps)
(cond ((not (pair? ccps)) ccp:1) ; 0 args => ccp:1
((not (pair? (cdr ccps))) (car ccps)) ; 1 arg
(else
(let* ((v (list->vector ccps))
(vlen-2 (- (vector-length v) 2))
(cmap (make-string num-chars))
(d1 (ccp:domain (vector-ref v (+ vlen-2 1))))
(d (char-set-fold (lambda (c-in d)
(let lp ((c c-in) (i vlen-2))
(if (>= i 0)
(let ((ccp (vector-ref v i)))
(if (char-set-contains? (ccp:domain ccp) c)
(lp (string-ref (ccp:map ccp)
(char->ascii c))
(- i 1))
;; Lose: remove c-in from d.
(char-set-delete! d c-in)))
;; Win: C-IN -> C
(begin (string-set! cmap
(char->ascii c-in)
c)
d))))
(char-set-copy d1)
d1)))
(make-ccp d #f cmap #f)))))
;;; ALIST->CPP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (alist->ccp cc-alist . maybe-base-ccp)
(let ((base (:optional maybe-base-ccp ccp:0)))
(if (pair? cc-alist)
(let ((cmap (string-copy (ccp:map base))))
(make-ccp (install-ccp-alist! cmap
(char-set-copy (ccp:domain base))
cc-alist)
#f cmap #f))
base)))
(define (alist->ccp! alist base)
(linear-update-ccp base (lambda (d cmap) (install-ccp-alist! cmap d alist))))
;;; Side-effect CMAP, linear-update and return DOMAIN.
(define (install-ccp-alist! cmap domain alist)
(fold (lambda (from/to d) (let ((from (car from/to))
(to (cdr from/to)))
(string-set! cmap (char->ascii from) to)
(char-set-adjoin! domain from)))
domain
alist))
;;; PROC->CCP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; (proc->ccp proc [domain base-ccp])
(define (proc->ccp proc . args)
(let-optionals args ((proc-domain char-set:full)
(base ccp:0))
(let ((cmap (string-copy (ccp:map base))))
(make-ccp (install-ccp-proc! cmap (char-set-copy (ccp:domain base))
proc proc-domain)
#f cmap #f))))
(define (proc->ccp! proc proc-domain base)
(linear-update-ccp base
(lambda (d cmap) (install-ccp-proc! cmap d proc proc-domain))))
(define (install-ccp-proc! cmap domain proc proc-domain)
(char-set-for-each (lambda (c) (string-set! cmap (char->ascii c) (proc c)))
proc-domain)
(char-set-union! domain proc-domain))
;;; CONSTANT-CCP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; (constant-ccp char [domain base-ccp])
;;; Extend BASE-CCP with the a map taking every char in DOMAIN to CHAR.
;;; DOMAIN defaults to char-set:full. BASE-CCP defaults to CCP:0.
(define (constant-ccp char . args)
(let-optionals args ((char-domain char-set:full) (base ccp:0))
(let ((cmap (string-copy (ccp:map base))))
(make-ccp (install-constant-ccp! cmap (char-set-copy (ccp:domain base))
char char-domain)
#f cmap #f))))
(define (constant-ccp! char char-domain base)
(linear-update-ccp base
(lambda (d cmap) (install-constant-ccp! cmap d char char-domain))))
;;; Install the constant mapping into CMAP0 by side-effect,
;;; linear-update & return DOMAIN0 with the constant-mapping's domain.
(define (install-constant-ccp! cmap0 domain0 char char-domain)
(char-set-for-each (lambda (c) (string-set! cmap0 (char->ascii c) char))
char-domain)
(char-set-union! domain0 char-domain))
;;; CCP/MAPPINGS
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; (ccp/mappings from1 to1 from2 to2 ...) -> ccp
;;; (extend-ccp/mappings base-ccp from1 to1 from2 to2 ...) -> ccp
;;; (extend-ccp/mappings! base-ccp from1 to1 from2 to2 ...) -> ccp
;;; Each FROM element is either a string or a (lo-char . hi-char) range.
;;; Each TO element is either a string or a lo-char. Strings are replicated
;;; to match the length of the corresponding FROM element.
;;; CCP/MAPPINGS's base CCP is CCP:0
;;;
;;; Tedious code.
;;; Internal utility.
;;; Install the FROM->TO mapping pair into DOMAIN & CMAP by side-effect.
;;; Return the new domain.
(define (install-ccp-mapping-pair! cmap domain from to)
;; Tedium -- four possibilities here:
;; str->str, str->lo-char,
;; range->str, range->lo-char.
(if (string? from)
(if (string? to)
;; "abc" -> "ABC"
(let ((len1 (string-length from))
(len2 (string-length to)))
(let lp2 ((i (- len1 1))
(j (modulo (- len2 1) len1))
(d domain))
(if (>= i 0)
(let ((c (string-ref from i)))
(string-set! cmap
(char->ascii c)
(string-ref to i))
(lp2 (- i 1)
(- (if (> j 0) j len2) 1)
(char-set-adjoin! d c)))
d)))
;; "abc" -> #\A
(let lp2 ((i (- (string-length from) 1))
(j (char->ascii to))
(d domain))
(if (>= i 0)
(let ((c (string-ref from i)))
(string-set! cmap
(char->ascii c)
(ascii->char j))
(lp2 (- i 1)
(- j 1)
(char-set-adjoin! d c)))
d)))
(let ((from-start (char->ascii (car from)))
(from-end (char->ascii (cdr from))))
(if (string? to)
(let ((len2-1 (- (string-length to) 1)))
;; (#\a . #\c) -> "ABC"
(let lp2 ((i from-start) (j 0) (d domain))
(if (<= i from-end)
(let ((c (string-ref to j)))
(string-set! cmap i c)
(lp2 (+ i 1)
(if (= j len2-1) 0 (+ j 1))
(char-set-adjoin! d c)))
d)))
;; (#\a . #\c) -> #\A
(do ((i from-start (+ i 1))
(j (char->ascii to) (+ j 1))
(d domain (begin (string-set! cmap i (ascii->char j))
(char-set-adjoin d (ascii->char i)))))
((> i from-end) d))))))
;;; Internal utility -- side-effects CMAP; linear-updates & returns DOMAIN.
(define (install-mapping-pairs cmap domain args)
(let lp ((domain domain) (args args))
(if (pair? args)
(lp (install-ccp-mapping-pair! cmap domain (car args) (cadr args))
(cddr args))
domain)))
(define (ccp/mappings . args)
(let ((cmap (make-string num-chars)))
(make-ccp (install-mapping-pairs (make-string num-chars)
(char-set-copy char-set:empty)
args)
#f cmap #f)))
(define (extend-ccp/mappings base . args)
(let ((cmap (string-copy (ccp:map base))))
(make-ccp (install-mapping-pairs cmap (char-set-copy (ccp:domain base)) args)
#f cmap #f)))
(define (extend-ccp/mappings! base . args)
(linear-update-ccp base (lambda (d cmap) (install-mapping-pairs cmap d args))))
;;; CONSTRUCT-CCP! ccp elt ...
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; The kitchen-sink constructor; static typing be damned.
;;; ELTS are interpreted as follows:
;;; (lo-char . hi-char) to-string|lo-char ; ccp/range
;;; from-string to-string|lo-char ; ccp/range
;;; ccp ; ccp-extend
;;; alist ; alist->ccp
;;; domain char ; ccp-constant
;;; domain proc ; proc->ccp
(define (construct-ccp! ccp . elts)
(linear-update-ccp ccp (lambda (d cmap) (install-ccp-construct! cmap d elts))))
(define (construct-ccp base . elts)
(let ((cmap (string-copy (ccp:map base))))
(make-ccp (install-ccp-construct! cmap (char-set-copy (ccp:domain base)) elts)
#f cmap #f)))
;;; Install the mappings into CMAP by side-effect,
;;; linear-update & return DOMAIN with the final domain.
(define (install-ccp-construct! cmap domain elts)
(let lp ((d domain) (elts elts))
;(format #t "d=~s elts=~s\n" d elts)
(if (not (pair? elts)) d
(let ((elt (car elts))
(elts (cdr elts)))
(cond ((pair? elt)
(cond ((pair? (car elt)) ; ELT is an alist.
(lp (install-ccp-alist! cmap d elt) elts))
((char? (car elt)) ; ELT is (lo-char . hi-char) range.
(lp (install-ccp-mapping-pair! cmap d elt (car elts))
(cdr elts)))
(else (error "Illegal elt to construct-ccp" elt))))
((string? elt)
(lp (install-ccp-mapping-pair! cmap d elt (car elts))
(cdr elts)))
((ccp? elt) (lp (install-ccp-extension! cmap d elt) elts))
((char-set? elt)
(let ((elt2 (car elts))
(elts (cdr elts)))
(lp (cond ((char? elt2)
(install-constant-ccp! cmap d elt2 elt))
((procedure? elt2)
(install-ccp-proc! cmap d elt2 elt))
(else (error "Illegal elt-pair to construct-ccp"
elt elt2)))
elts)))
(else (error "Illegal elt to construct-ccp" elt)))))))
;;; CCP unfold
(define (ccp-unfold p f g seed)
(let lp ((seed seed) (ccp (ccp-copy ccp:0)))
(if (p seed) ccp
(lp (g seed)
(receive (from to) (f seed)
(lp (g seed) (ccp-adjoin! ccp from to)))))))
;;; Using CCPs
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; TR ccp string [start end] -> string
;;; CCP-MAP ccp string [start end] -> string
;;; CCP-MAP! ccp string [start end] -> undefined
;;; CCP-APP ccp char -> char or false
;;; If a char in S is not in CCP's domain, it is dropped from the result.
;;; You can use this to map and delete chars from a string.
(define (tr ccp s . maybe-start+end)
(let-optionals maybe-start+end ((start 0) (end (string-length s)))
;; Count up the chars in S that are in the domain,
;; and allocate the answer string ANS:
(let* ((len (- end start))
(domain (ccp:domain ccp))
(ans-len (string-fold (lambda (c numchars)
(if (char-set-contains? domain c)
(+ numchars 1)
numchars))
0 s start end))
(ans (make-string ans-len)))
;; Apply the map, installing the resulting chars into ANS:
(string-fold (lambda (c i) (cond ((ccp-app ccp c) =>
(lambda (c)
(string-set! ans i c)
(+ i 1)))
(else i))) ; Not in domain -- drop it.
0 s start end)
ans)))
(define (ccp-map ccp s . maybe-start+end)
(apply string-map (lambda (c) (ccp-app ccp c)) s maybe-start+end))
(define (ccp-map! ccp s . maybe-start+end)
(apply string-map! (lambda (c) (ccp-app ccp c)) s maybe-start+end))
(define (ccp-app ccp char)
(and (char-set-contains? (ccp:domain ccp) char)
(string-ref (ccp:map ccp) (char->ascii char))))
;;; Primitive CCPs
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define id-cmap
(let ((m (make-string num-chars)))
(do ((i (- num-chars 1) (- i 1)))
((< i 0))
(string-set! m i (ascii->char i)))
m))
(define ccp:0 (make-ccp char-set:empty #t id-cmap #t))
(define ccp:1 (make-ccp char-set:full #t id-cmap #t))
(define ccp:upcase (proc->ccp char-upcase char-set:full))
(define ccp:downcase (proc->ccp char-downcase char-set:full))