diff --git a/scsh/lib/ccp.scm b/scsh/lib/ccp.scm new file mode 100644 index 0000000..16dcece --- /dev/null +++ b/scsh/lib/ccp.scm @@ -0,0 +1,576 @@ +;;; Char->char partial maps -*- Scheme -*- +;;; Copyright (C) 1998 by Olin Shivers. + +;;; CCPs are an efficient data structure for doing simple string transforms, +;;; similar to the kinds of things you would do with the tr(1) program. +;;; +;;; This code is tuned for a 7- or 8-bit character type. Large, 16-bit +;;; character types would need a more sophisticated data structure, tuned +;;; for sparseness. I would suggest something like this: +;;; (define-record ccp +;;; domain ; The domain char-set +;;; map ; Sorted vector of (char . string) pairs +;;; ; specifying the map. +;;; id?) ; If true, mappings not specified by MAP are +;;; ; identity mapping. If false, MAP must +;;; ; specify a mapping for every char in DOMAIN. +;;; +;;; A (char . string) elements in MAP specifies a mapping for the contiguous +;;; sequence of L chars beginning with CHAR (in the sequence of the underlying +;;; char type representation), where L is the length of STRING. These MAP elements +;;; are sorted by CHAR, so that binary search can be used to get from an input +;;; character C to the right MAP element quickly. +;;; +;;; This representation should be reasonably compact for standard mappings on, +;;; say, a Unicode CCP. An implementation might wish to have a cache field +;;; in the record for storing the full 8kb bitset when performing ccp-map +;;; operations. Or, an implementation might want to store the Latin-1 subset +;;; of the map in a dense format, and keep the remainder in a sparse format. + +(define num-chars (char-set-size char-set:full)) ; AKA 256. + +(define-record ccp + domain ; The domain char-set + dshared? ; Is the domain value shared or linear? + map ; 256-elt string + mshared?) ; Is the map string shared or linear? + + +;;; Accessors and setters that manage the linear bookkeeping +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (ccp-domain ccp) + (set-ccp:dshared? ccp #t) + (ccp:domain ccp)) + +;;; CCP is a linear ccp. PROC is a domain->domain function; it must be +;;; linear in its parameter and result. +;;; +;;; Updates the domain of the CCP with PROC, returns the resulting +;;; CCP; reuses the old one to construct the new one. + +(define (restrict-linear-ccp-domain ccp proc) + (let ((new-d (proc (if (ccp:dshared? ccp) + (begin (set-ccp:dshared? ccp #f) + (char-set-copy (ccp:domain ccp))) + (ccp:domain ccp))))) + (set-ccp:domain ccp new-d) + ccp)) + +;;; CCP is a linear CCP. PROC is a domain x cmap -> domain function. +;;; It is passed a linear domain and cmap string. It may side-effect +;;; the cmap string, and returns the resulting updated domain. +;;; We return the resulting CCP, reusing the parameter to construct it. + +(define (linear-update-ccp ccp proc) + (let* ((cmap (if (ccp:mshared? ccp) + (begin (set-ccp:mshared? ccp #f) + (string-copy (ccp:map ccp))) + (ccp:map ccp))) + + (new-d (proc (if (ccp:dshared? ccp) + (begin (set-ccp:dshared? ccp #f) + (char-set-copy (ccp:domain ccp))) + (ccp:domain ccp)) + cmap))) + (set-ccp:domain ccp new-d) + ccp)) + + + +;;; Return CCP's map field, and mark it as shared. CCP functions that +;;; restrict a ccp's domain share map strings, so they use this guy. +(define (ccp:map/shared ccp) + (set-ccp:mshared? ccp #t) + (ccp:map ccp)) + +(define (ccp-copy ccp) (make-ccp (char-set-copy (ccp:domain ccp)) #f + (string-copy (ccp:map ccp)) #f)) + +;;; N-ary equality relation for partial maps + +(define (ccp= ccp1 . rest) + (let ((domain (ccp:domain ccp1)) + (cmap (ccp:map ccp1))) + (every (lambda (ccp2) + (and (char-set= domain (ccp:domain ccp2)) + (let ((cmap2 (ccp:map ccp2))) + (char-set-every? (lambda (c) + (let ((i (char->ascii c))) + (char=? (string-ref cmap i) + (string-ref cmap2 i)))) + domain)))) + rest))) + + +;;; N-ary subset relation for partial maps + +(define (ccp<= ccp1 . rest) + (let lp ((domain1 (ccp:domain ccp1)) + (cmap1 (ccp:map ccp1)) + (rest rest)) + (or (not (pair? rest)) + (let* ((ccp2 (car rest)) + (domain2 (ccp:domain ccp2)) + (cmap2 (ccp:map ccp2)) + (rest (cdr rest))) + (and (char-set<= domain1 domain2) + (let ((cmap2 (ccp:map ccp2))) + (char-set-every? (lambda (c) + (let ((i (char->ascii c))) + (char=? (string-ref cmap1 i) + (string-ref cmap2 i)))) + domain1)) + (lp domain2 cmap2 rest)))))) + + +;;; CCP iterators +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (ccp-fold kons knil ccp) + (let ((cmap (ccp:map ccp))) + (char-set-fold (lambda (c v) (kons c (string-ref cmap (char->ascii c)) v)) + knil + (ccp:domain ccp)))) + +(define (ccp-for-each proc ccp) + (let ((cmap (ccp:map ccp))) + (char-set-for-each (lambda (c) (proc c (string-ref cmap (char->ascii c)))) + (ccp:domain ccp)))) + +(define (ccp->alist ccp) + (ccp-fold (lambda (from to alist) (cons (cons from to) alist)) + '() + ccp)) + + +;;; CCP-RESTRICT +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Restrict a ccp's domain. + +(define (ccp-restrict ccp cset) + (make-ccp (char-set-intersection cset (ccp:domain ccp)) + #f + (ccp:map/shared ccp) + #t)) + +(define (ccp-restrict! ccp cset) + (restrict-linear-ccp-domain ccp (lambda (d) (char-set-intersection! d cset)))) + + +;;; CCP-ADJOIN ccp from-char1 to-char1 ... +;;; CCP-DELETE ccp char1 ... +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Add & delete mappings to/from a ccp. + +(define (ccp-delete ccp . chars) + (make-ccp (apply char-set-delete (ccp:domain ccp) chars) + #f + (ccp:map/shared ccp) + #t)) + +(define (ccp-delete! ccp . chars) + (restrict-linear-ccp-domain ccp (lambda (d) (apply char-set-delete! d chars)))) + + +(define (ccp-adjoin ccp . chars) + (let ((cmap (string-copy (ccp:map ccp)))) + (make-ccp (install-ccp-adjoin! cmap (char-set-copy (ccp:domain ccp)) chars) + #f + cmap + #f))) + +(define (ccp-adjoin! ccp . chars) + (linear-update-ccp ccp (lambda (d cmap) (install-ccp-adjoin! cmap d chars)))) + +(define (install-ccp-adjoin! cmap domain chars) + (let lp ((chars chars) (d domain)) + (if (pair? chars) + (let ((from (car chars)) + (to (cadr chars)) + (chars (cddr chars))) + (string-set! cmap (char->ascii from) to) + (lp chars (char-set-adjoin! d from))) + d))) + + +;;; CCP-EXTEND ccp1 ... +;;; CCP-EXTEND! ccp1 ccp2 ... +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Extend ccp1 with ccp2, etc. + +(define (ccp-extend . ccps) + (if (pair? ccps) + (let ((ccp0 (car ccps)) + (ccps (cdr ccps))) + (if (pair? ccps) + (let ((cmap (string-copy (ccp:map ccp0)))) ; Copy cmap. + ;; The FOLD installs each ccp in CCPS into CMAP and produces + ;; the new domain. + (make-ccp (fold (lambda (ccp d) + (install-ccp-extension! cmap d ccp)) + (char-set-copy (ccp:domain ccp0)) + ccps) + #f cmap #f)) + + ccp0)) ; Only 1 parameter + + ccp:0)) ; 0 parameters + +(define (ccp-extend! ccp0 . ccps) + (linear-update-ccp ccp0 + (lambda (domain cmap) + (fold (lambda (ccp d) (install-ccp-extension! cmap d ccp)) + domain + ccps)))) + + +;;; Side-effect CMAP, linear-update and return DOMAIN. +(define (install-ccp-extension! cmap domain ccp) + (let ((cmap1 (ccp:map ccp)) + (domain1 (ccp:domain ccp))) + (char-set-for-each (lambda (c) + (let ((i (char->ascii c))) + (string-set! cmap i (string-ref cmap1 i)))) + domain1) + (char-set-union! domain domain1))) + + +;;; Compose the CCPs. 0-ary case: (ccp-compose) = ccp:1. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; For each character C-IN in the original domain, we push it +;;; through the pipeline of CCPs. If we ever land outside the +;;; domain of a ccp, we punt C-IN. If we push it all the way +;;; through, we add C-IN to our result domain, and add the mapping +;;; into the cmap we are assembling. +;;; +;;; Looping this way avoids building up intermediate temporary +;;; CCPs. If CCP's were small bitsets, we might be better off +;;; slicing the double-nested loops the other way around. + +(define (ccp-compose . ccps) + (cond ((not (pair? ccps)) ccp:1) ; 0 args => ccp:1 + ((not (pair? (cdr ccps))) (car ccps)) ; 1 arg + (else + (let* ((v (list->vector ccps)) + (vlen-2 (- (vector-length v) 2)) + (cmap (make-string num-chars)) + (d1 (ccp:domain (vector-ref v (+ vlen-2 1)))) + (d (char-set-fold (lambda (c-in d) + (let lp ((c c-in) (i vlen-2)) + (if (>= i 0) + (let ((ccp (vector-ref v i))) + (if (char-set-contains? (ccp:domain ccp) c) + (lp (string-ref (ccp:map ccp) + (char->ascii c)) + (- i 1)) + + ;; Lose: remove c-in from d. + (char-set-delete! d c-in))) + + ;; Win: C-IN -> C + (begin (string-set! cmap + (char->ascii c-in) + c) + d)))) + (char-set-copy d1) + d1))) + (make-ccp d #f cmap #f))))) + + + +;;; ALIST->CPP +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define (alist->ccp cc-alist . maybe-base-ccp) + (let ((base (:optional maybe-base-ccp ccp:0))) + (if (pair? cc-alist) + (let ((cmap (string-copy (ccp:map base)))) + (make-ccp (install-ccp-alist! cmap + (char-set-copy (ccp:domain base)) + cc-alist) + #f cmap #f)) + base))) + +(define (alist->ccp! alist base) + (linear-update-ccp base (lambda (d cmap) (install-ccp-alist! cmap d alist)))) + +;;; Side-effect CMAP, linear-update and return DOMAIN. +(define (install-ccp-alist! cmap domain alist) + (fold (lambda (from/to d) (let ((from (car from/to)) + (to (cdr from/to))) + (string-set! cmap (char->ascii from) to) + (char-set-adjoin! domain from))) + domain + alist)) + + +;;; PROC->CCP +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; (proc->ccp proc [domain base-ccp]) + +(define (proc->ccp proc . args) + (let-optionals args ((proc-domain char-set:full) + (base ccp:0)) + (let ((cmap (string-copy (ccp:map base)))) + (make-ccp (install-ccp-proc! cmap (char-set-copy (ccp:domain base)) + proc proc-domain) + #f cmap #f)))) + +(define (proc->ccp! proc proc-domain base) + (linear-update-ccp base + (lambda (d cmap) (install-ccp-proc! cmap d proc proc-domain)))) + +(define (install-ccp-proc! cmap domain proc proc-domain) + (char-set-for-each (lambda (c) (string-set! cmap (char->ascii c) (proc c))) + proc-domain) + (char-set-union! domain proc-domain)) + + +;;; CONSTANT-CCP +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; (constant-ccp char [domain base-ccp]) +;;; Extend BASE-CCP with the a map taking every char in DOMAIN to CHAR. +;;; DOMAIN defaults to char-set:full. BASE-CCP defaults to CCP:0. + +(define (constant-ccp char . args) + (let-optionals args ((char-domain char-set:full) (base ccp:0)) + (let ((cmap (string-copy (ccp:map base)))) + (make-ccp (install-constant-ccp! cmap (char-set-copy (ccp:domain base)) + char char-domain) + #f cmap #f)))) + +(define (constant-ccp! char char-domain base) + (linear-update-ccp base + (lambda (d cmap) (install-constant-ccp! cmap d char char-domain)))) + +;;; Install the constant mapping into CMAP0 by side-effect, +;;; linear-update & return DOMAIN0 with the constant-mapping's domain. +(define (install-constant-ccp! cmap0 domain0 char char-domain) + (char-set-for-each (lambda (c) (string-set! cmap0 (char->ascii c) char)) + char-domain) + (char-set-union! domain0 char-domain)) + + +;;; CCP/MAPPINGS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; (ccp/mappings from1 to1 from2 to2 ...) -> ccp +;;; (extend-ccp/mappings base-ccp from1 to1 from2 to2 ...) -> ccp +;;; (extend-ccp/mappings! base-ccp from1 to1 from2 to2 ...) -> ccp +;;; Each FROM element is either a string or a (lo-char . hi-char) range. +;;; Each TO element is either a string or a lo-char. Strings are replicated +;;; to match the length of the corresponding FROM element. +;;; CCP/MAPPINGS's base CCP is CCP:0 +;;; +;;; Tedious code. + +;;; Internal utility. +;;; Install the FROM->TO mapping pair into DOMAIN & CMAP by side-effect. +;;; Return the new domain. + +(define (install-ccp-mapping-pair! cmap domain from to) + ;; Tedium -- four possibilities here: + ;; str->str, str->lo-char, + ;; range->str, range->lo-char. + (if (string? from) + (if (string? to) + ;; "abc" -> "ABC" + (let ((len1 (string-length from)) + (len2 (string-length to))) + (let lp2 ((i (- len1 1)) + (j (modulo (- len2 1) len1)) + (d domain)) + (if (>= i 0) + (let ((c (string-ref from i))) + (string-set! cmap + (char->ascii c) + (string-ref to i)) + (lp2 (- i 1) + (- (if (> j 0) j len2) 1) + (char-set-adjoin! d c))) + d))) + + ;; "abc" -> #\A + (let lp2 ((i (- (string-length from) 1)) + (j (char->ascii to)) + (d domain)) + (if (>= i 0) + (let ((c (string-ref from i))) + (string-set! cmap + (char->ascii c) + (ascii->char j)) + (lp2 (- i 1) + (- j 1) + (char-set-adjoin! d c))) + d))) + + (let ((from-start (char->ascii (car from))) + (from-end (char->ascii (cdr from)))) + (if (string? to) + (let ((len2-1 (- (string-length to) 1))) + ;; (#\a . #\c) -> "ABC" + (let lp2 ((i from-start) (j 0) (d domain)) + (if (<= i from-end) + (let ((c (string-ref to j))) + (string-set! cmap i c) + (lp2 (+ i 1) + (if (= j len2-1) 0 (+ j 1)) + (char-set-adjoin! d c))) + d))) + + ;; (#\a . #\c) -> #\A + (do ((i from-start (+ i 1)) + (j (char->ascii to) (+ j 1)) + (d domain (begin (string-set! cmap i (ascii->char j)) + (char-set-adjoin d (ascii->char i))))) + ((> i from-end) d)))))) + +;;; Internal utility -- side-effects CMAP; linear-updates & returns DOMAIN. +(define (install-mapping-pairs cmap domain args) + (let lp ((domain domain) (args args)) + (if (pair? args) + (lp (install-ccp-mapping-pair! cmap domain (car args) (cadr args)) + (cddr args)) + domain))) + +(define (ccp/mappings . args) + (let ((cmap (make-string num-chars))) + (make-ccp (install-mapping-pairs (make-string num-chars) + (char-set-copy char-set:empty) + args) + #f cmap #f))) + +(define (extend-ccp/mappings base . args) + (let ((cmap (string-copy (ccp:map base)))) + (make-ccp (install-mapping-pairs cmap (char-set-copy (ccp:domain base)) args) + #f cmap #f))) + +(define (extend-ccp/mappings! base . args) + (linear-update-ccp base (lambda (d cmap) (install-mapping-pairs cmap d args)))) + + +;;; CONSTRUCT-CCP! ccp elt ... +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; The kitchen-sink constructor; static typing be damned. +;;; ELTS are interpreted as follows: +;;; (lo-char . hi-char) to-string|lo-char ; ccp/range +;;; from-string to-string|lo-char ; ccp/range +;;; ccp ; ccp-extend +;;; alist ; alist->ccp +;;; domain char ; ccp-constant +;;; domain proc ; proc->ccp + +(define (construct-ccp! ccp . elts) + (linear-update-ccp ccp (lambda (d cmap) (install-ccp-construct! cmap d elts)))) + +(define (construct-ccp base . elts) + (let ((cmap (string-copy (ccp:map base)))) + (make-ccp (install-ccp-construct! cmap (char-set-copy (ccp:domain base)) elts) + #f cmap #f))) + +;;; Install the mappings into CMAP by side-effect, +;;; linear-update & return DOMAIN with the final domain. + +(define (install-ccp-construct! cmap domain elts) + (let lp ((d domain) (elts elts)) + ;(format #t "d=~s elts=~s\n" d elts) + (if (not (pair? elts)) d + (let ((elt (car elts)) + (elts (cdr elts))) + (cond ((pair? elt) + (cond ((pair? (car elt)) ; ELT is an alist. + (lp (install-ccp-alist! cmap d elt) elts)) + ((char? (car elt)) ; ELT is (lo-char . hi-char) range. + (lp (install-ccp-mapping-pair! cmap d elt (car elts)) + (cdr elts))) + (else (error "Illegal elt to construct-ccp" elt)))) + + ((string? elt) + (lp (install-ccp-mapping-pair! cmap d elt (car elts)) + (cdr elts))) + + ((ccp? elt) (lp (install-ccp-extension! cmap d elt) elts)) + + ((char-set? elt) + (let ((elt2 (car elts)) + (elts (cdr elts))) + (lp (cond ((char? elt2) + (install-constant-ccp! cmap d elt2 elt)) + ((procedure? elt2) + (install-ccp-proc! cmap d elt2 elt)) + (else (error "Illegal elt-pair to construct-ccp" + elt elt2))) + elts))) + + (else (error "Illegal elt to construct-ccp" elt))))))) + + +;;; CCP unfold + +(define (ccp-unfold p f g seed) + (let lp ((seed seed) (ccp (ccp-copy ccp:0))) + (if (p seed) ccp + (lp (g seed) + (receive (from to) (f seed) + (lp (g seed) (ccp-adjoin! ccp from to))))))) + + + +;;; Using CCPs +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; TR ccp string [start end] -> string +;;; CCP-MAP ccp string [start end] -> string +;;; CCP-MAP! ccp string [start end] -> undefined +;;; CCP-APP ccp char -> char or false + +;;; If a char in S is not in CCP's domain, it is dropped from the result. +;;; You can use this to map and delete chars from a string. + +(define (tr ccp s . maybe-start+end) + (let-optionals maybe-start+end ((start 0) (end (string-length s))) + ;; Count up the chars in S that are in the domain, + ;; and allocate the answer string ANS: + (let* ((len (- end start)) + (domain (ccp:domain ccp)) + (ans-len (string-fold (lambda (c numchars) + (if (char-set-contains? domain c) + (+ numchars 1) + numchars)) + 0 s start end)) + (ans (make-string ans-len))) + + ;; Apply the map, installing the resulting chars into ANS: + (string-fold (lambda (c i) (cond ((ccp-app ccp c) => + (lambda (c) + (string-set! ans i c) + (+ i 1))) + (else i))) ; Not in domain -- drop it. + 0 s start end) + ans))) + +(define (ccp-map ccp s . maybe-start+end) + (apply string-map (lambda (c) (ccp-app ccp c)) s maybe-start+end)) + +(define (ccp-map! ccp s . maybe-start+end) + (apply string-map! (lambda (c) (ccp-app ccp c)) s maybe-start+end)) + +(define (ccp-app ccp char) + (and (char-set-contains? (ccp:domain ccp) char) + (string-ref (ccp:map ccp) (char->ascii char)))) + + +;;; Primitive CCPs +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define id-cmap + (let ((m (make-string num-chars))) + (do ((i (- num-chars 1) (- i 1))) + ((< i 0)) + (string-set! m i (ascii->char i))) + m)) + +(define ccp:0 (make-ccp char-set:empty #t id-cmap #t)) +(define ccp:1 (make-ccp char-set:full #t id-cmap #t)) + +(define ccp:upcase (proc->ccp char-upcase char-set:full)) +(define ccp:downcase (proc->ccp char-downcase char-set:full))