From e18289f61c461b8d0bd7f770d17d75d2c79071ed Mon Sep 17 00:00:00 2001 From: olin-shivers Date: Wed, 21 Mar 2001 22:28:31 +0000 Subject: [PATCH] Added a small fix to string-lib.scm. Removed the obsolete strings.txt. --- scsh/lib/string-lib.scm | 2129 +++++++++++++++++++++++++-------------- scsh/lib/strings.txt | 578 ----------- 2 files changed, 1384 insertions(+), 1323 deletions(-) delete mode 100644 scsh/lib/strings.txt diff --git a/scsh/lib/string-lib.scm b/scsh/lib/string-lib.scm index 0282b36..beedda1 100644 --- a/scsh/lib/string-lib.scm +++ b/scsh/lib/string-lib.scm @@ -1,229 +1,325 @@ -;;; Scheme Underground string-processing library -*- Scheme -*- -;;; Olin Shivers 11/98 - -;;; SRFI DRAFT -- SRFI DRAFT -- SRFI DRAFT -- SRFI DRAFT -- SRFI DRAFT -;;; This is *draft* code for a SRFI proposal. If you see this notice in -;;; production code, you've got obsolete, bad source -- go find the final -;;; non-draft code on the Net. -;;; SRFI DRAFT -- SRFI DRAFT -- SRFI DRAFT -- SRFI DRAFT -- SRFI DRAFT - -;;; Some of this code had (extremely distant) origins in MIT Scheme's string -;;; lib, and was substantially reworked by Olin Shivers (shivers@ai.mit.edu) -;;; 9/98. As such, it is -;;; Copyright (c) 1988-1994 Massachusetts Institute of Technology. -;;; The copyright terms are essentially open-software terms; -;;; the precise terms are at the end of this file. -;;; -;;; The KMP string-search code was massively rehacked from Stephen Bevan's -;;; code, written for scmlib, and is thus covered by the GPL. If that's a -;;; problem, write one from scratch (there are citations to standard textbooks -;;; in the comments), or rip it out and use the ten-line doubly-nested loop -;;; that's commented out just above this code. +;;; SRFI 13 string library reference implementation -*- Scheme -*- +;;; Olin Shivers 7/2000 ;;; -;;; I wish I could mark definitions in this code to be inlined. -;;; Certain functions could benefit from compiler support. -;;; -;;; My policy on checking start/end substring specs is not uniform. -;;; I avoided doing arg checks when the function directly calls another -;;; lower-level function that will check the start/end specs as well. -;;; This has the advantage of not doing redundant checks, but the disadvantage -;;; is that errors are not reported early, at the highest possible call. -;;; There's not much high-level error checking of the other args, anyway. -;;; -Olin +;;; Copyright (c) 1988-1994 Massachusetts Institute of Technology. +;;; Copyright (c) 1998, 1999, 2000 Olin Shivers. All rights reserved. +;;; The details of the copyrights appear at the end of the file. Short +;;; summary: BSD-style open source. ;;; Exports: -;;; string-map string-map! +;;; string-map string-map! ;;; string-fold string-unfold ;;; string-fold-right string-unfold-right -;;; string-tabulate -;;; string-for-each string-iter +;;; string-tabulate string-for-each string-for-each-index ;;; string-every string-any +;;; string-hash string-hash-ci ;;; string-compare string-compare-ci -;;; substring-compare substring-compare-ci -;;; string= string< string> string<= string>= string<> +;;; string= string< string> string<= string>= string<> ;;; string-ci= string-ci< string-ci> string-ci<= string-ci>= string-ci<> -;;; substring= substring<> substring-ci= substring-ci<> -;;; substring< substring> substring-ci< substring-ci> -;;; substring<= substring>= substring-ci<= substring-ci>= -;;; string-upper-case? string-lower-case? -;;; capitalize-string capitalize-words string-downcase string-upcase -;;; capitalize-string! capitalize-words! string-downcase! string-upcase! +;;; string-downcase string-upcase string-titlecase +;;; string-downcase! string-upcase! string-titlecase! ;;; string-take string-take-right ;;; string-drop string-drop-right ;;; string-pad string-pad-right ;;; string-trim string-trim-right string-trim-both ;;; string-filter string-delete -;;; string-index string-index-right string-skip string-skip-right -;;; string-prefix-count string-prefix-count-ci -;;; string-suffix-count string-suffix-count-ci -;;; substring-prefix-count substring-prefix-count-ci -;;; substring-suffix-count substring-suffix-count-ci +;;; string-index string-index-right +;;; string-skip string-skip-right +;;; string-count +;;; string-prefix-length string-prefix-length-ci +;;; string-suffix-length string-suffix-length-ci ;;; string-prefix? string-prefix-ci? ;;; string-suffix? string-suffix-ci? -;;; substring-prefix? substring-prefix-ci? -;;; substring-suffix? substring-suffix-ci? -;;; substring? substring-ci? -;;; string-fill! string-copy! string-copy substring +;;; string-contains string-contains-ci +;;; string-copy! substring/shared ;;; string-reverse string-reverse! reverse-list->string -;;; string->list -;;; string-concat string-concat/shared string-append/shared +;;; string-concatenate string-concatenate/shared string-concatenate-reverse +;;; string-append/shared ;;; xsubstring string-xcopy! ;;; string-null? -;;; join-strings +;;; string-join +;;; string-tokenize +;;; string-replace ;;; -;;; string? make-string string string-length string-ref string-set! -;;; string-append list->string +;;; R5RS extended: +;;; string->list string-copy string-fill! ;;; -;;; make-kmp-restart-vector -;;; parse-final-start+end -;;; parse-start+end +;;; R5RS re-exports: +;;; string? make-string string-length string-ref string-set! +;;; +;;; R5RS re-exports (also defined here but commented-out): +;;; string string-append list->string +;;; +;;; Low-level routines: +;;; make-kmp-restart-vector string-kmp-partial-search kmp-step +;;; string-parse-start+end +;;; string-parse-final-start+end +;;; let-string-start+end ;;; check-substring-spec +;;; substring-spec-ok? ;;; Imports +;;; This is a fairly large library. While it was written for portability, you +;;; must be aware of its dependencies in order to run it in a given scheme +;;; implementation. Here is a complete list of the dependencies it has and the +;;; assumptions it makes beyond stock R5RS Scheme: +;;; ;;; This code has the following non-R5RS dependencies: -;;; - (RECEIVE (var ...) mv-exp body ...) multiple-value binding macro -;;; - Various imports from the char-set library -;;; - ERROR -;;; - LET-OPTIONALS and :OPTIONAL macros for handling optional arguments -;;; - The R5RS SUBSTRING function is accessed using the Scheme 48 -;;; STRUCTURE-REF magic accessor. +;;; - (RECEIVE (var ...) mv-exp body ...) multiple-value binding macro; +;;; +;;; - Various imports from the char-set library for the routines that can +;;; take char-set arguments; +;;; +;;; - An n-ary ERROR procedure; +;;; +;;; - BITWISE-AND for the hash functions; +;;; +;;; - A simple CHECK-ARG procedure for checking parameter values; it is +;;; (lambda (pred val proc) +;;; (if (pred val) val (error "Bad arg" val pred proc))) +;;; +;;; - :OPTIONAL and LET-OPTIONALS* macros for parsing, defaulting & +;;; type-checking optional parameters from a rest argument; +;;; +;;; - CHAR-CASED? and CHAR-TITLECASE for the STRING-TITLECASE & +;;; STRING-TITLECASE! procedures. The former returns true iff a character is +;;; one that has case distinctions; in ASCII it returns true on a-z and A-Z. +;;; CHAR-TITLECASE is analagous to CHAR-UPCASE and CHAR-DOWNCASE. In ASCII & +;;; Latin-1, it is the same as CHAR-UPCASE. +;;; +;;; The code depends upon a small set of core string primitives from R5RS: +;;; MAKE-STRING STRING-REF STRING-SET! STRING? STRING-LENGTH SUBSTRING +;;; (Actually, SUBSTRING is not a primitive, but we assume that an +;;; implementation's native version is probably faster than one we could +;;; define, so we import it from R5RS.) +;;; +;;; The code depends upon a small set of R5RS character primitives: +;;; char? char=? char-ci=? charinteger (for the hash functions) +;;; +;;; We assume the following: +;;; - CHAR-DOWNCASE o CHAR-UPCASE = CHAR-DOWNCASE +;;; - CHAR-CI=? is equivalent to +;;; (lambda (c1 c2) (char=? (char-downcase (char-upcase c1)) +;;; (char-downcase (char-upcase c2)))) +;;; - CHAR-UPCASE, CHAR-DOWNCASE and CHAR-TITLECASE are locale-insensitive +;;; and consistent with Unicode's 1-1 char-mapping spec. +;;; These things are typically true, but if not, you would need to modify +;;; the case-mapping and case-insensitive routines. +;;; Enough introductory blather. On to the source code. (But see the end of +;;; the file for further notes on porting & performance tuning.) + ;;; Support for START/END substring specs ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; This macro parses optional start/end arguments from arg lists, defaulting ;;; them to 0/(string-length s), and checks them for correctness. -(define-syntax let-start+end +(define-syntax let-string-start+end (syntax-rules () - ((let-start+end (start end) proc s-exp args-exp body ...) - (receive (start end) (parse-final-start+end proc s-exp args-exp) + ((let-string-start+end (start end) proc s-exp args-exp body ...) + (receive (start end) (string-parse-final-start+end proc s-exp args-exp) + body ...)) + ((let-string-start+end (start end rest) proc s-exp args-exp body ...) + (receive (rest start end) (string-parse-start+end proc s-exp args-exp) body ...)))) +;;; This one parses out a *pair* of final start/end indices. +;;; Not exported; for internal use. +(define-syntax let-string-start+end2 + (syntax-rules () + ((l-s-s+e2 (start1 end1 start2 end2) proc s1 s2 args body ...) + (let ((procv proc)) ; Make sure PROC is only evaluated once. + (let-string-start+end (start1 end1 rest) procv s1 args + (let-string-start+end (start2 end2) procv s2 rest + body ...)))))) -;;; Returns three values: start end rest -(define (parse-start+end proc s args) +;;; Returns three values: rest start end + +(define (string-parse-start+end proc s args) + (if (not (string? s)) (error "Non-string value" proc s)) (let ((slen (string-length s))) (if (pair? args) (let ((start (car args)) (args (cdr args))) - (if (or (not (integer? start)) (< start 0)) - (error "Illegal substring START spec" proc start s) + (if (and (integer? start) (exact? start) (>= start 0)) (receive (end args) (if (pair? args) (let ((end (car args)) (args (cdr args))) - (if (or (not (integer? end)) (< slen end)) - (error "Illegal substring END spec" proc end s) - (values end args))) + (if (and (integer? end) (exact? end) (<= end slen)) + (values end args) + (error "Illegal substring END spec" proc end s))) (values slen args)) - (if (<= start end) (values start end args) + (if (<= start end) (values args start end) (error "Illegal substring START/END spec" - proc start end s))))) + proc start end s))) + (error "Illegal substring START spec" proc start s))) - (values 0 (string-length s) '())))) + (values '() 0 slen)))) -(define (parse-final-start+end proc s args) - (receive (start end rest) (parse-start+end proc s args) +(define (string-parse-final-start+end proc s args) + (receive (rest start end) (string-parse-start+end proc s args) (if (pair? rest) (error "Extra arguments to procedure" proc rest) (values start end)))) +(define (substring-spec-ok? s start end) + (and (string? s) + (integer? start) + (exact? start) + (integer? end) + (exact? end) + (<= 0 start) + (<= start end) + (<= end (string-length s)))) + (define (check-substring-spec proc s start end) - (if (or (< start 0) - (< (string-length s) end) - (< end start)) - (error "Illegal substring START/END spec." proc s start end))) + (if (not (substring-spec-ok? s start end)) + (error "Illegal substring spec." proc s start end))) + + +;;; Defined by R5RS, so commented out here. +;(define (string . chars) +; (let* ((len (length chars)) +; (ans (make-string len))) +; (do ((i 0 (+ i 1)) +; (chars chars (cdr chars))) +; ((>= i len)) +; (string-set! ans i (car chars))) +; ans)) +; +;(define (string . chars) (string-unfold null? car cdr chars)) -;;; substring S START [END] -;;; string-copy S [START END] +;;; substring/shared S START [END] +;;; string-copy S [START END] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; Redefine SUBSTRING so that the END parameter is optional. -;;; SUBSTRINGX is the underlying R5RS SUBSTRING function. All -;;; the code in this file uses the simple SUBSTRINGX, so you can -;;; easily port this code. -(define substringx (structure-ref scheme substring)) ; Simple R5RS SUBSTRING +;;; All this goop is just arg parsing & checking surrounding a call to the +;;; actual primitive, %SUBSTRING/SHARED. -(define (substring s start . maybe-end) ; Our SUBSTRING - (substringx s start (:optional maybe-end (string-length s)))) +(define (substring/shared s start . maybe-end) + (check-arg string? s substring/shared) + (let ((slen (string-length s))) + (check-arg (lambda (start) (and (integer? start) (exact? start) (<= 0 start))) + start substring/shared) + (%substring/shared s start + (:optional maybe-end slen + (lambda (end) (and (integer? end) + (exact? end) + (<= start end) + (<= end slen))))))) + +;;; Split out so that other routines in this library can avoid arg-parsing +;;; overhead for END parameter. +(define (%substring/shared s start end) + (if (and (zero? start) (= end (string-length s))) s + (substring s start end))) (define (string-copy s . maybe-start+end) - (let-start+end (start end) string-copy s maybe-start+end - (substringx s start end))) - + (let-string-start+end (start end) string-copy s maybe-start+end + (substring s start end))) +;This library uses the R5RS SUBSTRING, but doesn't export it. +;Here is a definition, just for completeness. +;(define (substring s start end) +; (check-substring-spec substring s start end) +; (let* ((slen (- end start)) +; (ans (make-string slen))) +; (do ((i 0 (+ i 1)) +; (j start (+ j 1))) +; ((>= i slen) ans) +; (string-set! ans i (string-ref s j))))) ;;; Basic iterators and other higher-order abstractions ;;; (string-map proc s [start end]) ;;; (string-map! proc s [start end]) ;;; (string-fold kons knil s [start end]) ;;; (string-fold-right kons knil s [start end]) -;;; (string-unfold p f g seed) -;;; (string-for-each proc s [start end]) -;;; (string-iter proc s [start end]) -;;; (string-every pred s [start end]) -;;; (string-any pred s [start end]) +;;; (string-unfold p f g seed [base make-final]) +;;; (string-unfold-right p f g seed [base make-final]) +;;; (string-for-each proc s [start end]) +;;; (string-for-each-index proc s [start end]) +;;; (string-every char-set/char/pred s [start end]) +;;; (string-any char-set/char/pred s [start end]) ;;; (string-tabulate proc len) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; No guarantees about order in MAP, FOR-EACH, EVERY, ANY procs. -;;; ;;; You want compiler support for high-level transforms on fold and unfold ops. ;;; You'd at least like a lot of inlining for clients of these procedures. -;;; Hold your breath. +;;; Don't hold your breath. (define (string-map proc s . maybe-start+end) - (let-start+end (start end) string-map s maybe-start+end - (let* ((len (- end start)) - (ans (make-string len))) - (do ((i (- end 1) (- i 1)) - (j (- len 1) (- j 1))) - ((< j 0)) - (string-set! ans j (proc (string-ref s i)))) - ans))) + (check-arg procedure? proc string-map) + (let-string-start+end (start end) string-map s maybe-start+end + (%string-map proc s start end))) + +(define (%string-map proc s start end) ; Internal utility + (let* ((len (- end start)) + (ans (make-string len))) + (do ((i (- end 1) (- i 1)) + (j (- len 1) (- j 1))) + ((< j 0)) + (string-set! ans j (proc (string-ref s i)))) + ans)) (define (string-map! proc s . maybe-start+end) - (let-start+end (start end) string-map! s maybe-start+end - (do ((i (- end 1) (- i 1))) - ((< i start)) - (string-set! s i (proc (string-ref s i)))))) + (check-arg procedure? proc string-map!) + (let-string-start+end (start end) string-map! s maybe-start+end + (%string-map! proc s start end))) + +(define (%string-map! proc s start end) + (do ((i (- end 1) (- i 1))) + ((< i start)) + (string-set! s i (proc (string-ref s i))))) (define (string-fold kons knil s . maybe-start+end) - (let-start+end (start end) string-fold s maybe-start+end + (check-arg procedure? kons string-fold) + (let-string-start+end (start end) string-fold s maybe-start+end (let lp ((v knil) (i start)) (if (< i end) (lp (kons (string-ref s i) v) (+ i 1)) v)))) (define (string-fold-right kons knil s . maybe-start+end) - (let-start+end (start end) string-fold-right s maybe-start+end + (check-arg procedure? kons string-fold-right) + (let-string-start+end (start end) string-fold-right s maybe-start+end (let lp ((v knil) (i (- end 1))) (if (>= i start) (lp (kons (string-ref s i) v) (- i 1)) v)))) -;;; (string-unfold p f g seed) +;;; (string-unfold p f g seed [base make-final]) ;;; This is the fundamental constructor for strings. ;;; - G is used to generate a series of "seed" values from the initial seed: ;;; SEED, (G SEED), (G^2 SEED), (G^3 SEED), ... ;;; - P tells us when to stop -- when it returns true when applied to one ;;; of these seed values. ;;; - F maps each seed value to the corresponding character -;;; in the result string. +;;; in the result string. These chars are assembled into the +;;; string in a left-to-right order. +;;; - BASE is the optional initial/leftmost portion of the constructed string; +;;; it defaults to the empty string "". +;;; - MAKE-FINAL is applied to the terminal seed value (on which P returns +;;; true) to produce the final/rightmost portion of the constructed string. +;;; It defaults to (LAMBDA (X) ""). ;;; ;;; In other words, the following (simple, inefficient) definition holds: -;;; (string-unfold p f g seed) = -;;; (if (p seed) "" -;;; (string-append (string (f seed)) -;;; (string-unfold p f g (g seed)))) +;;; (define (string-unfold p f g seed base make-final) +;;; (string-append base +;;; (let recur ((seed seed)) +;;; (if (p seed) (make-final seed) +;;; (string-append (string (f seed)) +;;; (recur (g seed))))))) ;;; ;;; STRING-UNFOLD is a fairly powerful constructor -- you can use it to ;;; reverse a string, copy a string, convert a list to a string, read ;;; a port into a string, and so forth. Examples: ;;; (port->string port) = ;;; (string-unfold (compose eof-object? peek-char) -;;; read-char identity port) +;;; read-char values port) ;;; ;;; (list->string lis) = (string-unfold null? car cdr lis) ;;; @@ -235,89 +331,195 @@ ;;; it to give a clear, straightforward description of what the function ;;; does. -;(define (string-unfold p f g seed) -; (let recur ((seed seed) (i 0)) -; (if (p seed) (make-string i) -; (let* ((c (f seed)) -; (s (recur (g seed) (+ i 1)))) -; (string-set! s i c) -; s)))) - -;;; This formulation chunks up the constructed string into 1024-char chunks, -;;; popping the stack frames. So it'll reduce stack growth by a factor of -;;; 1024. Marc Feeley alerted me to this issue and its general solution. - -(define (string-unfold p f g seed) - (string-concat/shared - (let recur ((seed seed)) - (receive (s seed done?) - (let recur2 ((seed seed) (i 0)) - (cond ((p seed) (values (make-string i) seed #t)) - ((>= i 1024) (values (make-string i) seed #f)) - (else (let ((c (f seed))) - (receive (s seed done?) - (recur2 (g seed) (+ i 1)) - (string-set! s i c) - (values s seed done?)))))) - - (if done? (list s) - (cons s (recur seed))))))) - - -;;; This is the same as STRING-UNFOLD, but defined for multiple -;;; seed parameters. If you pass N seeds, then -;;; - P maps N parameters to a boolean. -;;; - F maps N parameters to a character. -;;; - G maps N parameters to N return values. -;;; This definition does a lot of consing; it would need a fair amount -;;; of compiler support to be efficient. - -; Not released -;(define (string-unfoldn p f g . seeds) -; (apply string-append -; (let recur ((seeds seeds)) -; (receive (s seeds done?) -; (let recur2 ((seeds seeds) (i 0)) -; (cond ((apply p seeds) (values (make-string i) seeds #t)) -; ((>= i 1024) (values (make-string i) seeds #f)) -; (else (let ((c (apply f seeds))) -; (receive seeds (apply g seeds) -; (receive (s seeds done?) -; (recur2 seeds (+ i 1)) -; (string-set! s i c) -; (values s seeds done?))))))) +;(define (string-unfold p f g seed base make-final) +; (let ((ans (let recur ((seed seed) (i (string-length base))) +; (if (p seed) +; (let* ((final (make-final seed)) +; (ans (make-string (+ i (string-length final))))) +; (string-copy! ans i final) +; ans) ; -; (if done? (list s) -; (cons s (recur seeds))))))) +; (let* ((c (f seed)) +; (s (recur (g seed) (+ i 1)))) +; (string-set! s i c) +; s))))) +; (string-copy! ans 0 base) +; ans)) + +;;; The strategy is to allocate a series of chunks into which we stash the +;;; chars as we generate them. Chunk size goes up in powers of two starting +;;; with 40 and levelling out at 4k, i.e. +;;; 40 40 80 160 320 640 1280 2560 4096 4096 4096 4096 4096... +;;; This should work pretty well for short strings, 1-line (80 char) strings, +;;; and longer ones. When done, we allocate an answer string and copy the +;;; chars over from the chunk buffers. + +(define (string-unfold p f g seed . base+make-final) + (check-arg procedure? p string-unfold) + (check-arg procedure? f string-unfold) + (check-arg procedure? g string-unfold) + (let-optionals* base+make-final + ((base "" (string? base)) + (make-final (lambda (x) "") (procedure? make-final))) + (let lp ((chunks '()) ; Previously filled chunks + (nchars 0) ; Number of chars in CHUNKS + (chunk (make-string 40)) ; Current chunk into which we write + (chunk-len 40) + (i 0) ; Number of chars written into CHUNK + (seed seed)) + (let lp2 ((i i) (seed seed)) + (if (not (p seed)) + (let ((c (f seed)) + (seed (g seed))) + (if (< i chunk-len) + (begin (string-set! chunk i c) + (lp2 (+ i 1) seed)) + + (let* ((nchars2 (+ chunk-len nchars)) + (chunk-len2 (min 4096 nchars2)) + (new-chunk (make-string chunk-len2))) + (string-set! new-chunk 0 c) + (lp (cons chunk chunks) (+ nchars chunk-len) + new-chunk chunk-len2 1 seed)))) + + ;; We're done. Make the answer string & install the bits. + (let* ((final (make-final seed)) + (flen (string-length final)) + (base-len (string-length base)) + (j (+ base-len nchars i)) + (ans (make-string (+ j flen)))) + (%string-copy! ans j final 0 flen) ; Install FINAL. + (let ((j (- j i))) + (%string-copy! ans j chunk 0 i) ; Install CHUNK[0,I). + (let lp ((j j) (chunks chunks)) ; Install CHUNKS. + (if (pair? chunks) + (let* ((chunk (car chunks)) + (chunks (cdr chunks)) + (chunk-len (string-length chunk)) + (j (- j chunk-len))) + (%string-copy! ans j chunk 0 chunk-len) + (lp j chunks))))) + (%string-copy! ans 0 base 0 base-len) ; Install BASE. + ans)))))) + +(define (string-unfold-right p f g seed . base+make-final) + (let-optionals* base+make-final + ((base "" (string? base)) + (make-final (lambda (x) "") (procedure? make-final))) + (let lp ((chunks '()) ; Previously filled chunks + (nchars 0) ; Number of chars in CHUNKS + (chunk (make-string 40)) ; Current chunk into which we write + (chunk-len 40) + (i 40) ; Number of chars available in CHUNK + (seed seed)) + (let lp2 ((i i) (seed seed)) ; Fill up CHUNK from right + (if (not (p seed)) ; to left. + (let ((c (f seed)) + (seed (g seed))) + (if (> i 0) + (let ((i (- i 1))) + (string-set! chunk i c) + (lp2 i seed)) + + (let* ((nchars2 (+ chunk-len nchars)) + (chunk-len2 (min 4096 nchars2)) + (new-chunk (make-string chunk-len2)) + (i (- chunk-len2 1))) + (string-set! new-chunk i c) + (lp (cons chunk chunks) (+ nchars chunk-len) + new-chunk chunk-len2 i seed)))) + + ;; We're done. Make the answer string & install the bits. + (let* ((final (make-final seed)) + (flen (string-length final)) + (base-len (string-length base)) + (chunk-used (- chunk-len i)) + (j (+ base-len nchars chunk-used)) + (ans (make-string (+ j flen)))) + (%string-copy! ans 0 final 0 flen) ; Install FINAL. + (%string-copy! ans flen chunk i chunk-len); Install CHUNK[I,). + (let lp ((j (+ flen chunk-used)) ; Install CHUNKS. + (chunks chunks)) + (if (pair? chunks) + (let* ((chunk (car chunks)) + (chunks (cdr chunks)) + (chunk-len (string-length chunk))) + (%string-copy! ans j chunk 0 chunk-len) + (lp (+ j chunk-len) chunks)) + (%string-copy! ans j base 0 base-len))); Install BASE. + ans)))))) + (define (string-for-each proc s . maybe-start+end) - (let-start+end (start end) string-for-each s maybe-start+end - (do ((i (- end 1) (- i 1))) - ((< i start)) - (proc (string-ref s i))))) + (check-arg procedure? proc string-for-each) + (let-string-start+end (start end) string-for-each s maybe-start+end + (let lp ((i start)) + (if (< i end) + (begin (proc (string-ref s i)) + (lp (+ i 1))))))) -(define (string-iter proc s . maybe-start+end) - (let-start+end (start end) string-iter s maybe-start+end - (do ((i start (+ i 1))) - ((>= i end)) - (proc (string-ref s i))))) +(define (string-for-each-index proc s . maybe-start+end) + (check-arg procedure? proc string-for-each-index) + (let-string-start+end (start end) string-for-each-index s maybe-start+end + (let lp ((i start)) + (if (< i end) (begin (proc i) (lp (+ i 1))))))) -(define (string-every pred s . maybe-start+end) - (let-start+end (start end) string-every s maybe-start+end - (let lp ((i (- end 1))) - (or (< i start) - (and (pred (string-ref s i)) - (lp (- i 1))))))) +(define (string-every criterion s . maybe-start+end) + (let-string-start+end (start end) string-every s maybe-start+end + (cond ((char? criterion) + (let lp ((i start)) + (or (>= i end) + (and (char=? criterion (string-ref s i)) + (lp (+ i 1)))))) -(define (string-any pred s . maybe-start+end) - (let-start+end (start end) string-any s maybe-start+end - (let lp ((i (- end 1))) - (and (>= i start) - (or (pred (string-ref s i)) - (lp (- i 1))))))) + ((char-set? criterion) + (let lp ((i start)) + (or (>= i end) + (and (char-set-contains? criterion (string-ref s i)) + (lp (+ i 1)))))) + + ((procedure? criterion) ; Slightly funky loop so that + (or (= start end) ; final (PRED S[END-1]) call + (let lp ((i start)) ; is a tail call. + (let ((c (string-ref s i)) + (i1 (+ i 1))) + (if (= i1 end) (criterion c) ; Tail call. + (and (criterion c) (lp i1))))))) + + (else (error "Second param is neither char-set, char, or predicate procedure." + string-every criterion))))) + + +(define (string-any criterion s . maybe-start+end) + (let-string-start+end (start end) string-any s maybe-start+end + (cond ((char? criterion) + (let lp ((i start)) + (and (< i end) + (or (char=? criterion (string-ref s i)) + (lp (+ i 1)))))) + + ((char-set? criterion) + (let lp ((i start)) + (and (< i end) + (or (char-set-contains? criterion (string-ref s i)) + (lp (+ i 1)))))) + + ((procedure? criterion) ; Slightly funky loop so that + (and (< start end) ; final (PRED S[END-1]) call + (let lp ((i start)) ; is a tail call. + (let ((c (string-ref s i)) + (i1 (+ i 1))) + (if (= i1 end) (criterion c) ; Tail call + (or (criterion c) (lp i1))))))) + + (else (error "Second param is neither char-set, char, or predicate procedure." + string-any criterion))))) (define (string-tabulate proc len) + (check-arg procedure? proc string-tabulate) + (check-arg (lambda (val) (and (integer? val) (exact? val) (<= 0 val))) + len string-tabulate) (let ((s (make-string len))) (do ((i (- len 1) (- i 1))) ((< i 0)) @@ -326,145 +528,162 @@ -;;; string-prefix-count[-ci] s1 s2 -;;; string-suffix-count[-ci] s1 s2 -;;; substring-prefix-count[-ci] s1 start1 end1 s2 start2 end2 -;;; substring-suffix-count[-ci] s1 start1 end1 s2 start2 end2 +;;; string-prefix-length[-ci] s1 s2 [start1 end1 start2 end2] +;;; string-suffix-length[-ci] s1 s2 [start1 end1 start2 end2] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Find the length of the common prefix/suffix. ;;; It is not required that the two substrings passed be of equal length. ;;; This was microcode in MIT Scheme -- a very tightly bummed primitive. +;;; %STRING-PREFIX-LENGTH is the core routine of all string-comparisons, +;;; so should be as tense as possible. -(define (substring-prefix-count s1 start1 end1 s2 start2 end2) - (check-substring-spec substring-prefix-count s1 start1 end1) - (check-substring-spec substring-prefix-count s2 start2 end2) +(define (%string-prefix-length s1 start1 end1 s2 start2 end2) (let* ((delta (min (- end1 start1) (- end2 start2))) (end1 (+ start1 delta))) - (let lp ((i start1) (j start2)) - (if (or (>= i end1) - (not (char=? (string-ref s1 i) - (string-ref s2 j)))) - (- i start1) - (lp (+ i 1) (+ j 1)))))) -(define (substring-suffix-count s1 start1 end1 s2 start2 end2) - (check-substring-spec substring-suffix-count s1 start1 end1) - (check-substring-spec substring-suffix-count s2 start2 end2) + (if (and (eq? s1 s2) (= start1 start2)) ; EQ fast path + delta + + (let lp ((i start1) (j start2)) ; Regular path + (if (or (>= i end1) + (not (char=? (string-ref s1 i) + (string-ref s2 j)))) + (- i start1) + (lp (+ i 1) (+ j 1))))))) + +(define (%string-suffix-length s1 start1 end1 s2 start2 end2) (let* ((delta (min (- end1 start1) (- end2 start2))) (start1 (- end1 delta))) - (let lp ((i (- end1 1)) (j (- end2 1))) - (if (or (< i start1) - (not (char=? (string-ref s1 i) - (string-ref s2 j)))) - (- (- end1 i) 1) - (lp (- i 1) (- j 1)))))) -(define (substring-prefix-count-ci s1 start1 end1 s2 start2 end2) - (check-substring-spec substring-prefix-count-ci s1 start1 end1) - (check-substring-spec substring-prefix-count-ci s2 start2 end2) + (if (and (eq? s1 s2) (= end1 end2)) ; EQ fast path + delta + + (let lp ((i (- end1 1)) (j (- end2 1))) ; Regular path + (if (or (< i start1) + (not (char=? (string-ref s1 i) + (string-ref s2 j)))) + (- (- end1 i) 1) + (lp (- i 1) (- j 1))))))) + +(define (%string-prefix-length-ci s1 start1 end1 s2 start2 end2) (let* ((delta (min (- end1 start1) (- end2 start2))) (end1 (+ start1 delta))) - (let lp ((i start1) (j start2)) - (if (or (>= i end1) - (not (char-ci=? (string-ref s1 i) - (string-ref s2 j)))) - (- i start1) - (lp (+ i 1) (+ j 1)))))) -(define (substring-suffix-count-ci s1 start1 end1 s2 start2 end2) - (check-substring-spec substring-suffix-count-ci s1 start1 end1) - (check-substring-spec substring-suffix-count-ci s2 start2 end2) + (if (and (eq? s1 s2) (= start1 start2)) ; EQ fast path + delta + + (let lp ((i start1) (j start2)) ; Regular path + (if (or (>= i end1) + (not (char-ci=? (string-ref s1 i) + (string-ref s2 j)))) + (- i start1) + (lp (+ i 1) (+ j 1))))))) + +(define (%string-suffix-length-ci s1 start1 end1 s2 start2 end2) (let* ((delta (min (- end1 start1) (- end2 start2))) (start1 (- end1 delta))) - (let lp ((i (- end1 1)) (j (- end2 1))) - (if (or (< i start1) - (not (char-ci=? (string-ref s1 i) - (string-ref s2 j)))) - (- (- end1 i) 1) - (lp (- i 1) (- j 1)))))) + + (if (and (eq? s1 s2) (= end1 end2)) ; EQ fast path + delta + + (let lp ((i (- end1 1)) (j (- end2 1))) ; Regular path + (if (or (< i start1) + (not (char-ci=? (string-ref s1 i) + (string-ref s2 j)))) + (- (- end1 i) 1) + (lp (- i 1) (- j 1))))))) -(define (string-prefix-count s1 s2) - (substring-prefix-count s1 0 (string-length s1) s2 0 (string-length s2))) +(define (string-prefix-length s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-prefix-length s1 s2 maybe-starts+ends + (%string-prefix-length s1 start1 end1 s2 start2 end2))) -(define (string-suffix-count s1 s2) - (substring-suffix-count s1 0 (string-length s1) s2 0 (string-length s2))) +(define (string-suffix-length s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-suffix-length s1 s2 maybe-starts+ends + (%string-suffix-length s1 start1 end1 s2 start2 end2))) -(define (string-prefix-count-ci s1 s2) - (substring-prefix-count-ci s1 0 (string-length s1) s2 0 (string-length s2))) +(define (string-prefix-length-ci s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-prefix-length-ci s1 s2 maybe-starts+ends + (%string-prefix-length-ci s1 start1 end1 s2 start2 end2))) -(define (string-suffix-count-ci s1 s2) - (substring-suffix-count-ci s1 0 (string-length s1) s2 0 (string-length s2))) +(define (string-suffix-length-ci s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-suffix-length-ci s1 s2 maybe-starts+ends + (%string-suffix-length-ci s1 start1 end1 s2 start2 end2))) - -;;; string-prefix? s1 s2 -;;; string-suffix? s1 s2 -;;; string-prefix-ci? s1 s2 -;;; string-suffix-ci? s1 s2 -;;; -;;; substring-prefix? s1 start1 end1 s2 start2 end2 -;;; substring-suffix? s1 start1 end1 s2 start2 end2 -;;; substring-prefix-ci? s1 start1 end1 s2 start2 end2 -;;; substring-suffix-ci? s1 start1 end1 s2 start2 end2 +;;; string-prefix? s1 s2 [start1 end1 start2 end2] +;;; string-suffix? s1 s2 [start1 end1 start2 end2] +;;; string-prefix-ci? s1 s2 [start1 end1 start2 end2] +;;; string-suffix-ci? s1 s2 [start1 end1 start2 end2] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; These are all simple derivatives of the previous counting funs. -(define (string-prefix? s1 s2) - (substring-prefix? s1 0 (string-length s1) s2 0 (string-length s2))) +(define (string-prefix? s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-prefix? s1 s2 maybe-starts+ends + (%string-prefix? s1 start1 end1 s2 start2 end2))) -(define (string-suffix? s1 s2) - (substring-suffix-ci? s1 0 (string-length s1) s2 0 (string-length s2))) +(define (string-suffix? s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-suffix? s1 s2 maybe-starts+ends + (%string-suffix? s1 start1 end1 s2 start2 end2))) -(define (string-prefix-ci? s1 s2) - (substring-prefix-ci? s1 0 (string-length s1) s2 0 (string-length s2))) +(define (string-prefix-ci? s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-prefix-ci? s1 s2 maybe-starts+ends + (%string-prefix-ci? s1 start1 end1 s2 start2 end2))) -(define (string-suffix-ci? s1 s2) - (substring-suffix-ci? s1 0 (string-length s1) s2 0 (string-length s2))) +(define (string-suffix-ci? s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-suffix-ci? s1 s2 maybe-starts+ends + (%string-suffix-ci? s1 start1 end1 s2 start2 end2))) -(define (substring-prefix? s1 start1 end1 s2 start2 end2) + +;;; Here are the internal routines that do the real work. + +(define (%string-prefix? s1 start1 end1 s2 start2 end2) (let ((len1 (- end1 start1))) (and (<= len1 (- end2 start2)) ; Quick check - (= (substring-prefix-count s1 start1 end1 - s2 start2 end2) + (= (%string-prefix-length s1 start1 end1 + s2 start2 end2) len1)))) -(define (substring-suffix? s1 start1 end1 s2 start2 end2) +(define (%string-suffix? s1 start1 end1 s2 start2 end2) (let ((len1 (- end1 start1))) (and (<= len1 (- end2 start2)) ; Quick check - (= len1 (substring-suffix-count s1 start1 end1 - s2 start2 end2))))) + (= len1 (%string-suffix-length s1 start1 end1 + s2 start2 end2))))) -(define (substring-prefix-ci? s1 start1 end1 s2 start2 end2) +(define (%string-prefix-ci? s1 start1 end1 s2 start2 end2) (let ((len1 (- end1 start1))) (and (<= len1 (- end2 start2)) ; Quick check - (= len1 (substring-prefix-count-ci s1 start1 end1 - s2 start2 end2))))) + (= len1 (%string-prefix-length-ci s1 start1 end1 + s2 start2 end2))))) -(define (substring-suffix-ci? s1 start1 end1 s2 start2 end2) +(define (%string-suffix-ci? s1 start1 end1 s2 start2 end2) (let ((len1 (- end1 start1))) (and (<= len1 (- end2 start2)) ; Quick check - (= len1 (substring-suffix-count-ci s1 start1 end1 - s2 start2 end2))))) + (= len1 (%string-suffix-length-ci s1 start1 end1 + s2 start2 end2))))) -;;; string-compare s1 s2 lt-proc eq-proc gt-proc -;;; string-compare-ci s1 s2 eq-proc lt-proc gt-proc -;;; substring-compare s1 start1 end1 s2 start2 end2 -;;; lt-proc eq-proc gt-proc -;;; substring-compare-ci s1 start1 end1 s2 start2 end2 -;;; lt-proc eq-proc gt-proc +;;; string-compare s1 s2 proc< proc= proc> [start1 end1 start2 end2] +;;; string-compare-ci s1 s2 proc< proc= proc> [start1 end1 start2 end2] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Primitive string-comparison functions. ;;; Continuation order is different from MIT Scheme. ;;; Continuations are applied to s1's mismatch index; ;;; in the case of equality, this is END1. -(define (substring-compare s1 start1 end1 s2 start2 end2 +(define (%string-compare s1 start1 end1 s2 start2 end2 proc< proc= proc>) (let ((size1 (- end1 start1)) (size2 (- end2 start2))) - (let ((match (substring-prefix-count s1 start1 end1 s2 start2 end2))) + (let ((match (%string-prefix-length s1 start1 end1 s2 start2 end2))) (if (= match size1) ((if (= match size2) proc= proc<) end1) ((if (= match size2) @@ -474,11 +693,11 @@ proc< proc>)) (+ match start1)))))) -(define (substring-compare-ci s1 start1 end1 s2 start2 end2 +(define (%string-compare-ci s1 start1 end1 s2 start2 end2 proc< proc= proc>) (let ((size1 (- end1 start1)) (size2 (- end2 start2))) - (let ((match (substring-prefix-count-ci s1 start1 end1 s2 start2 end2))) + (let ((match (%string-prefix-length-ci s1 start1 end1 s2 start2 end2))) (if (= match size1) ((if (= match size2) proc= proc<) end1) ((if (= match size2) proc> @@ -487,15 +706,22 @@ proc< proc>)) (+ start1 match)))))) -(define (string-compare s1 s2 proc< proc= proc>) - (substring-compare s1 0 (string-length s1) - s2 0 (string-length s2) - proc< proc= proc>)) +(define (string-compare s1 s2 proc< proc= proc> . maybe-starts+ends) + (check-arg procedure? proc< string-compare) + (check-arg procedure? proc= string-compare) + (check-arg procedure? proc> string-compare) + (let-string-start+end2 (start1 end1 start2 end2) + string-compare s1 s2 maybe-starts+ends + (%string-compare s1 start1 end1 s2 start2 end2 proc< proc= proc>))) + +(define (string-compare-ci s1 s2 proc< proc= proc> . maybe-starts+ends) + (check-arg procedure? proc< string-compare-ci) + (check-arg procedure? proc= string-compare-ci) + (check-arg procedure? proc> string-compare-ci) + (let-string-start+end2 (start1 end1 start2 end2) + string-compare-ci s1 s2 maybe-starts+ends + (%string-compare-ci s1 start1 end1 s2 start2 end2 proc< proc= proc>))) -(define (string-compare-ci s1 s2 proc< proc= proc>) - (substring-compare-ci s1 0 (string-length s1) - s2 0 (string-length s2) - proc< proc= proc>)) ;;; string= string<> string-ci= string-ci<> @@ -503,225 +729,242 @@ ;;; string<= string>= string-ci<= string-ci>= ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Simple definitions in terms of the previous comparison funs. -;;; Inequality predicates return #f or mismatch index. -;;; I sure hope these defns get integrated. +;;; I sure hope the %STRING-COMPARE calls get integrated. -(define (string= s1 s2) - (if (eq? s1 s2) (string-length s1) ; Fast path - (string-compare s1 s2 (lambda (i) #f) (lambda (i) i) (lambda (i) #f)))) +(define (string= s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string= s1 s2 maybe-starts+ends + (and (= (- end1 start1) (- end2 start2)) ; Quick filter + (or (and (eq? s1 s2) (= start1 start2)) ; Fast path + (%string-compare s1 start1 end1 s2 start2 end2 ; Real test + (lambda (i) #f) + values + (lambda (i) #f)))))) -(define (string< s1 s2) - (and (not (eq? s1 s2)) ; Fast path - (string-compare s1 s2 (lambda (i) i) (lambda (i) #f) (lambda (i) #f)))) +(define (string<> s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string<> s1 s2 maybe-starts+ends + (or (not (= (- end1 start1) (- end2 start2))) ; Fast path + (and (not (and (eq? s1 s2) (= start1 start2))) ; Quick filter + (%string-compare s1 start1 end1 s2 start2 end2 ; Real test + values + (lambda (i) #f) + values))))) -(define (string> s1 s2) - (and (not (eq? s1 s2)) ; Fast path - (string-compare s1 s2 (lambda (i) #f) (lambda (i) #f) (lambda (i) i)))) +(define (string< s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string< s1 s2 maybe-starts+ends + (if (and (eq? s1 s2) (= start1 start2)) ; Fast path + (< end1 end2) -(define (string<= s1 s2) - (if (eq? s1 s2) (string-length s1) ; Fast path - (string-compare s1 s2 (lambda (i) i) (lambda (i) i) (lambda (i) #f)))) + (%string-compare s1 start1 end1 s2 start2 end2 ; Real test + values + (lambda (i) #f) + (lambda (i) #f))))) -(define (string>= s1 s2) - (if (eq? s1 s2) (string-length s1) ; Fast path - (string-compare s1 s2 (lambda (i) #f) (lambda (i) i) (lambda (i) i)))) +(define (string> s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string> s1 s2 maybe-starts+ends + (if (and (eq? s1 s2) (= start1 start2)) ; Fast path + (> end1 end2) -(define (string<> s1 s2) - (and (not (eq? s1 s2)) ; Fast path - (string-compare s1 s2 (lambda (i) i) (lambda (i) #f) (lambda (i) i)))) + (%string-compare s1 start1 end1 s2 start2 end2 ; Real test + (lambda (i) #f) + (lambda (i) #f) + values)))) +(define (string<= s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string<= s1 s2 maybe-starts+ends + (if (and (eq? s1 s2) (= start1 start2)) ; Fast path + (<= end1 end2) -(define (string-ci= s1 s2) - (if (eq? s1 s2) (string-length s1) ; Fast path - (string-compare-ci s1 s2 (lambda (i) #f) (lambda (i) i) (lambda (i) #f)))) + (%string-compare s1 start1 end1 s2 start2 end2 ; Real test + values + values + (lambda (i) #f))))) -(define (string-ci< s1 s2) - (and (not (eq? s1 s2)) ; Fast path - (string-compare-ci s1 s2 (lambda (i) i) (lambda (i) #f) (lambda (i) #f)))) +(define (string>= s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string>= s1 s2 maybe-starts+ends + (if (and (eq? s1 s2) (= start1 start2)) ; Fast path + (>= end1 end2) -(define (string-ci> s1 s2) - (and (not (eq? s1 s2)) ; Fast path - (string-compare-ci s1 s2 (lambda (i) #f) (lambda (i) #f) (lambda (i) i)))) + (%string-compare s1 start1 end1 s2 start2 end2 ; Real test + (lambda (i) #f) + values + values)))) -(define (string-ci<= s1 s2) - (if (eq? s1 s2) (string-length s1) ; Fast path - (string-compare-ci s1 s2 (lambda (i) i) (lambda (i) i) (lambda (i) #f)))) +(define (string-ci= s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-ci= s1 s2 maybe-starts+ends + (and (= (- end1 start1) (- end2 start2)) ; Quick filter + (or (and (eq? s1 s2) (= start1 start2)) ; Fast path + (%string-compare-ci s1 start1 end1 s2 start2 end2 ; Real test + (lambda (i) #f) + values + (lambda (i) #f)))))) -(define (string-ci>= s1 s2) - (if (eq? s1 s2) (string-length s1) ; Fast path - (string-compare-ci s1 s2 (lambda (i) #f) (lambda (i) i) (lambda (i) i)))) +(define (string-ci<> s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-ci<> s1 s2 maybe-starts+ends + (or (not (= (- end1 start1) (- end2 start2))) ; Fast path + (and (not (and (eq? s1 s2) (= start1 start2))) ; Quick filter + (%string-compare-ci s1 start1 end1 s2 start2 end2 ; Real test + values + (lambda (i) #f) + values))))) -(define (string-ci<> s1 s2) - (and (not (eq? s1 s2)) ; Fast path - (string-compare-ci s1 s2 (lambda (i) i) (lambda (i) #f) (lambda (i) i)))) +(define (string-ci< s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-ci< s1 s2 maybe-starts+ends + (if (and (eq? s1 s2) (= start1 start2)) ; Fast path + (< end1 end2) + (%string-compare-ci s1 start1 end1 s2 start2 end2 ; Real test + values + (lambda (i) #f) + (lambda (i) #f))))) -(define (substring= s1 start1 end1 s2 start2 end2) - (substring-compare s1 start1 end1 - s2 start2 end2 - (lambda (i) #f) - (lambda (i) i) - (lambda (i) #f))) +(define (string-ci> s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-ci> s1 s2 maybe-starts+ends + (if (and (eq? s1 s2) (= start1 start2)) ; Fast path + (> end1 end2) -(define (substring<> s1 start1 end1 s2 start2 end2) - (substring-compare s1 start1 end1 - s2 start2 end2 - (lambda (i) i) - (lambda (i) #f) - (lambda (i) i))) + (%string-compare-ci s1 start1 end1 s2 start2 end2 ; Real test + (lambda (i) #f) + (lambda (i) #f) + values)))) -(define (substring< s1 start1 end1 s2 start2 end2) - (substring-compare s1 start1 end1 - s2 start2 end2 - (lambda (i) i) - (lambda (i) #f) - (lambda (i) #f))) +(define (string-ci<= s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-ci<= s1 s2 maybe-starts+ends + (if (and (eq? s1 s2) (= start1 start2)) ; Fast path + (<= end1 end2) -(define (substring> s1 start1 end1 s2 start2 end2) - (substring< s2 start2 end2 s1 start1 end1)) + (%string-compare-ci s1 start1 end1 s2 start2 end2 ; Real test + values + values + (lambda (i) #f))))) -(define (substring<= s1 start1 end1 s2 start2 end2) - (substring-compare s1 start1 end1 - s2 start2 end2 - (lambda (i) i) - (lambda (i) i) - (lambda (i) #f))) - -(define (substring>= s1 start1 end1 s2 start2 end2) - (substring<= s2 start2 end2 s1 start1 end1)) - -(define (substring-ci= s1 start1 end1 s2 start2 end2) - (substring-compare-ci s1 start1 end1 - s2 start2 end2 - (lambda (i) #f) - (lambda (i) i) - (lambda (i) #f))) - -(define (substring-ci<> s1 start1 end1 s2 start2 end2) - (substring-compare-ci s1 start1 end1 - s2 start2 end2 - (lambda (i) i) - (lambda (i) #f) - (lambda (i) i))) - -(define (substring-ci< s1 start1 end1 s2 start2 end2) - (substring-compare-ci s1 start1 end1 - s2 start2 end2 - (lambda (i) i) - (lambda (i) #f) - (lambda (i) #f))) - -(define (substring-ci> s1 start1 end1 s2 start2 end2) - (substring-ci< s2 start2 end2 s1 start1 end1)) - -(define (substring-ci<= s1 start1 end1 s2 start2 end2) - (substring-compare-ci s1 start1 end1 - s2 start2 end2 - (lambda (i) i) - (lambda (i) i) - (lambda (i) #f))) - -(define (substring-ci>= s1 start1 end1 s2 start2 end2) - (substring-ci<= s2 start2 end2 s1 start1 end1)) +(define (string-ci>= s1 s2 . maybe-starts+ends) + (let-string-start+end2 (start1 end1 start2 end2) + string-ci>= s1 s2 maybe-starts+ends + (if (and (eq? s1 s2) (= start1 start2)) ; Fast path + (>= end1 end2) + (%string-compare-ci s1 start1 end1 s2 start2 end2 ; Real test + (lambda (i) #f) + values + values)))) +;;; Hash +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Compute (c + 37 c + 37^2 c + ...) modulo BOUND, with sleaze thrown in +;;; to keep the intermediate values small. (We do the calculation with just +;;; enough bits to represent BOUND, masking off high bits at each step in +;;; calculation. If this screws up any important properties of the hash +;;; function I'd like to hear about it. -Olin) +;;; +;;; If you keep BOUND small enough, the intermediate calculations will +;;; always be fixnums. How small is dependent on the underlying Scheme system; +;;; we use a default BOUND of 2^22 = 4194304, which should hack it in +;;; Schemes that give you at least 29 signed bits for fixnums. The core +;;; calculation that you don't want to overflow is, worst case, +;;; (+ 65535 (* 37 (- bound 1))) +;;; where 65535 is the max character code. Choose the default BOUND to be the +;;; biggest power of two that won't cause this expression to fixnum overflow, +;;; and everything will be copacetic. + +(define (%string-hash s char->int bound start end) + (let ((iref (lambda (s i) (char->int (string-ref s i)))) + ;; Compute a 111...1 mask that will cover BOUND-1: + (mask (let lp ((i #x10000)) ; Let's skip first 16 iterations, eh? + (if (>= i bound) (- i 1) (lp (+ i i)))))) + (let lp ((i start) (ans 0)) + (if (>= i end) (modulo ans bound) + (lp (+ i 1) (bitwise-and mask (+ (* 37 ans) (iref s i)))))))) + +(define (string-hash s . maybe-bound+start+end) + (let-optionals* maybe-bound+start+end ((bound 4194304 (and (integer? bound) + (exact? bound) + (<= 0 bound))) + rest) + (let ((bound (if (zero? bound) 4194304 bound))) ; 0 means default. + (let-string-start+end (start end) string-hash s rest + (%string-hash s char->integer bound start end))))) + +(define (string-hash-ci s . maybe-bound+start+end) + (let-optionals* maybe-bound+start+end ((bound 4194304 (and (integer? bound) + (exact? bound) + (<= 0 bound))) + rest) + (let ((bound (if (zero? bound) 4194304 bound))) ; 0 means default. + (let-string-start+end (start end) string-hash-ci s rest + (%string-hash s (lambda (c) (char->integer (char-downcase c))) + bound start end))))) + ;;; Case hacking ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; string-upper-case? -;;; string-lower-case? -;;; ;;; string-upcase s [start end] ;;; string-upcase! s [start end] ;;; string-downcase s [start end] ;;; string-downcase! s [start end] ;;; -;;; capitalize-string s [start end] -;;; capitalize-string! s [start end] -;;; Uppercase first alphanum char, lowercase rest. -;;; -;;; capitalize-words s [start end] -;;; capitalize-words! s [start end] -;;; Capitalize every contiguous alphanum sequence: uppercase +;;; string-titlecase s [start end] +;;; string-titlecase! s [start end] +;;; Capitalize every contiguous alpha sequence: capitalise ;;; first char, lowercase rest. -;;; These two use a different definition of an "upper-/lower-case string" -;;; than MIT Scheme uses: - -(define (string-upper-case? s . maybe-start+end) - (not (apply string-any char-lower-case? s maybe-start+end))) - -(define (string-lower-case? s . maybe-start+end) - (not (apply string-any char-upper-case? s maybe-start+end))) - - (define (string-upcase s . maybe-start+end) - (apply string-map char-upcase s maybe-start+end)) + (let-string-start+end (start end) string-upcase s maybe-start+end + (%string-map char-upcase s start end))) (define (string-upcase! s . maybe-start+end) - (apply string-map! char-upcase s maybe-start+end)) + (let-string-start+end (start end) string-upcase! s maybe-start+end + (%string-map! char-upcase s start end))) (define (string-downcase s . maybe-start+end) - (apply string-map char-downcase s maybe-start+end)) + (let-string-start+end (start end) string-downcase s maybe-start+end + (%string-map char-downcase s start end))) (define (string-downcase! s . maybe-start+end) - (apply string-map! char-downcase s maybe-start+end)) + (let-string-start+end (start end) string-downcase! s maybe-start+end + (%string-map! char-downcase s start end))) - -;;; capitalize-string s [start end] -;;; capitalize-string! s [start end] -;;; Uppercase first alphanum char, lowercase rest. - -(define (really-capitalize-string! s start end) - (cond ((string-index s char-set:alphanumeric start end) => - (lambda (i) - (string-set! s i (char-upcase (string-ref s i))) - (string-downcase! s i))))) - -(define (capitalize-string! s . maybe-start+end) - (let-start+end (start end) capitalize-string! s maybe-start+end - (really-capitalize-string! s start end))) - -(define (capitalize-string s . maybe-start+end) - (let-start+end (start end) capitalize-string s maybe-start+end - (let ((ans (substringx s start end))) - (really-capitalize-string! ans 0 (- end start)) - ans))) - -;;; capitalize-words s [start end] -;;; capitalize-words! s [start end] -;;; Capitalize every contiguous alphanum sequence: uppercase -;;; first char, lowercase rest. - -(define (really-capitalize-words! s start end) +(define (%string-titlecase! s start end) (let lp ((i start)) - (cond ((string-index s char-set:alphanumeric i end) => + (cond ((string-index s char-cased? i end) => (lambda (i) - (string-set! s i (char-upcase (string-ref s i))) + (string-set! s i (char-titlecase (string-ref s i))) (let ((i1 (+ i 1))) - (cond ((string-skip s char-set:alphanumeric i1 end) => + (cond ((string-skip s char-cased? i1 end) => (lambda (j) (string-downcase! s i1 j) (lp (+ j 1)))) (else (string-downcase! s i1 end))))))))) -(define (capitalize-words! s . maybe-start+end) - (let-start+end (start end) capitalize-string! s maybe-start+end - (really-capitalize-words! s start end))) +(define (string-titlecase! s . maybe-start+end) + (let-string-start+end (start end) string-titlecase! s maybe-start+end + (%string-titlecase! s start end))) -(define (capitalize-words s . maybe-start+end) - (let-start+end (start end) capitalize-string! s maybe-start+end - (let ((ans (substringx s start end))) - (really-capitalize-words! ans 0 (- end start)) +(define (string-titlecase s . maybe-start+end) + (let-string-start+end (start end) string-titlecase! s maybe-start+end + (let ((ans (substring s start end))) + (%string-titlecase! ans 0 (- end start)) ans))) - ;;; Cutting & pasting strings ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; string-take string nchars ;;; string-drop string nchars ;;; +;;; string-take-right string nchars +;;; string-drop-right string nchars +;;; ;;; string-pad string k [char start end] ;;; string-pad-right string k [char start end] ;;; @@ -733,67 +976,80 @@ ;;; say what you want to trim. (define (string-take s n) - (if (> n 0) - (substringx s 0 n) - (let ((len (string-length s))) - (substringx s (+ len n) len)))) + (check-arg string? s string-take) + (check-arg (lambda (val) (and (integer? n) (exact? n) + (<= 0 n (string-length s)))) + n string-take) + (%substring/shared s 0 n)) + +(define (string-take-right s n) + (check-arg string? s string-take-right) + (let ((len (string-length s))) + (check-arg (lambda (val) (and (integer? n) (exact? n) (<= 0 n len))) + n string-take-right) + (%substring/shared s (- len n) len))) (define (string-drop s n) + (check-arg string? s string-drop) (let ((len (string-length s))) - (if (> n 0) - (substringx s n len) - (substringx s 0 (+ len n))))) + (check-arg (lambda (val) (and (integer? n) (exact? n) (<= 0 n len))) + n string-drop) + (%substring/shared s n len))) -(define (string-trim s . args) - (let-optionals args ((criteria char-set:whitespace) - (start 0) - (end (string-length s))) - (cond ((string-skip s criteria start end) => - (lambda (i) (substringx s i end))) - (else "")))) - -(define (string-trim-right s . args) - (let-optionals args ((criteria char-set:whitespace) - (start 0) - (end (string-length s))) - (cond ((string-skip-right s criteria end start) => - (lambda (i) (substringx s 0 (+ 1 i)))) - (else "")))) - -(define (string-trim-both s . args) - (let-optionals args ((criteria char-set:whitespace) - (start 0) - (end (string-length s))) - (cond ((string-skip s criteria start end) => - (lambda (i) (substringx s i (+ 1 (string-skip-right s criteria end))))) - (else "")))) +(define (string-drop-right s n) + (check-arg string? s string-drop-right) + (let ((len (string-length s))) + (check-arg (lambda (val) (and (integer? n) (exact? n) (<= 0 n len))) + n string-drop-right) + (%substring/shared s 0 (- len n)))) -(define (string-pad-right s n . args) - (let-optionals args ((char #\space) (start 0) (end (string-length s))) - (check-substring-spec string-pad-right s start end) - (let ((len (- end start))) - (cond ((= n len) ; No pad. - (if (zero? start) s (substringx s start end))) +(define (string-trim s . criterion+start+end) + (let-optionals* criterion+start+end ((criterion char-set:whitespace) rest) + (let-string-start+end (start end) string-trim s rest + (cond ((string-skip s criterion start end) => + (lambda (i) (%substring/shared s i end))) + (else ""))))) - ((< n len) (substringx s start (+ start n))) ; Trim. +(define (string-trim-right s . criterion+start+end) + (let-optionals* criterion+start+end ((criterion char-set:whitespace) rest) + (let-string-start+end (start end) string-trim-right s rest + (cond ((string-skip-right s criterion start end) => + (lambda (i) (%substring/shared s 0 (+ 1 i)))) + (else ""))))) - (else (let ((ans (make-string n char))) - (string-copy! ans 0 s start end) - ans)))))) +(define (string-trim-both s . criterion+start+end) + (let-optionals* criterion+start+end ((criterion char-set:whitespace) rest) + (let-string-start+end (start end) string-trim-both s rest + (cond ((string-skip s criterion start end) => + (lambda (i) + (%substring/shared s i (+ 1 (string-skip-right s criterion i end))))) + (else ""))))) -(define (string-pad s n . args) - (let-optionals args ((char #\space) (start 0) (end (string-length s))) - (check-substring-spec string-pad s start end) - (let ((len (- end start))) - (cond ((= n len) ; No pad. - (if (zero? start) s (substringx s start end))) - ((< n len) (substringx s (- end n) end)) ; Trim. +(define (string-pad-right s n . char+start+end) + (let-optionals* char+start+end ((char #\space (char? char)) rest) + (let-string-start+end (start end) string-pad-right s rest + (check-arg (lambda (n) (and (integer? n) (exact? n) (<= 0 n))) + n string-pad-right) + (let ((len (- end start))) + (if (<= n len) + (%substring/shared s start (+ start n)) + (let ((ans (make-string n char))) + (%string-copy! ans 0 s start end) + ans)))))) - (else (let ((ans (make-string n char))) - (string-copy! ans (- n len) s start end) - ans)))))) +(define (string-pad s n . char+start+end) + (let-optionals* char+start+end ((char #\space (char? char)) rest) + (let-string-start+end (start end) string-pad s rest + (check-arg (lambda (n) (and (integer? n) (exact? n) (<= 0 n))) + n string-pad) + (let ((len (- end start))) + (if (<= n len) + (%substring/shared s (- end n) end) + (let ((ans (make-string n char))) + (%string-copy! ans (- n len) s start end) + ans)))))) @@ -802,30 +1058,30 @@ ;;; string-delete char/char-set/pred string [start end] ;;; string-filter char/char-set/pred string [start end] ;;; -;;; If the filter criteria is a char or char-set, we scan the string twice -;;; with string-fold -- once to determine the length of the result string, +;;; If the criterion is a char or char-set, we scan the string twice with +;;; string-fold -- once to determine the length of the result string, ;;; and once to do the filtered copy. -;;; If the filter criteria is a predicate, we don't do this double-scan -;;; strategy, because the predicate might have side-effects or be very -;;; expensive to compute. So we preallocate a temp buffer pessimistically, -;;; and only do one scan over S. This is likely to be faster and more -;;; space-efficient than consing a list. +;;; If the criterion is a predicate, we don't do this double-scan strategy, +;;; because the predicate might have side-effects or be very expensive to +;;; compute. So we preallocate a temp buffer pessimistically, and only do +;;; one scan over S. This is likely to be faster and more space-efficient +;;; than consing a list. -(define (string-delete criteria s . maybe-start+end) - (let-start+end (start end) string-delete s maybe-start+end - (if (procedure? criteria) +(define (string-delete criterion s . maybe-start+end) + (let-string-start+end (start end) string-delete s maybe-start+end + (if (procedure? criterion) (let* ((slen (- end start)) (temp (make-string slen)) (ans-len (string-fold (lambda (c i) - (if (criteria c) i + (if (criterion c) i (begin (string-set! temp i c) (+ i 1)))) 0 s start end))) - (if (= ans-len slen) temp (substringx temp 0 ans-len))) + (if (= ans-len slen) temp (substring temp 0 ans-len))) - (let* ((cset (cond ((char-set? criteria) criteria) - ((char? criteria) (char-set criteria)) - (else (error "string-delete criteria not predicate, char or char-set" criteria)))) + (let* ((cset (cond ((char-set? criterion) criterion) + ((char? criterion) (char-set criterion)) + (else (error "string-delete criterion not predicate, char or char-set" criterion)))) (len (string-fold (lambda (c i) (if (char-set-contains? cset c) i (+ i 1))) @@ -838,22 +1094,22 @@ 0 s start end) ans)))) -(define (string-filter criteria s . maybe-start+end) - (let-start+end (start end) string-filter s maybe-start+end - (if (procedure? criteria) +(define (string-filter criterion s . maybe-start+end) + (let-string-start+end (start end) string-filter s maybe-start+end + (if (procedure? criterion) (let* ((slen (- end start)) (temp (make-string slen)) (ans-len (string-fold (lambda (c i) - (if (criteria c) + (if (criterion c) (begin (string-set! temp i c) (+ i 1)) i)) 0 s start end))) - (if (= ans-len slen) temp (substringx temp 0 ans-len))) + (if (= ans-len slen) temp (substring temp 0 ans-len))) - (let* ((cset (cond ((char-set? criteria) criteria) - ((char? criteria) (char-set criteria)) - (else (error "string-delete criteria not predicate, char or char-set" criteria)))) + (let* ((cset (cond ((char-set? criterion) criterion) + ((char? criterion) (char-set criterion)) + (else (error "string-delete criterion not predicate, char or char-set" criterion)))) (len (string-fold (lambda (c i) (if (char-set-contains? cset c) (+ i 1) @@ -867,104 +1123,126 @@ 0 s start end) ans)))) - ;;; String search ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; string-index string char/char-set/pred [start end] -;;; string-index-right string char/char-set/pred [end start] +;;; string-index-right string char/char-set/pred [start end] ;;; string-skip string char/char-set/pred [start end] -;;; string-skip-right string char/char-set/pred [end start] -;;; Note the odd start/end ordering of index-right and skip-right params. +;;; string-skip-right string char/char-set/pred [start end] +;;; string-count string char/char-set/pred [start end] ;;; There's a lot of replicated code here for efficiency. ;;; For example, the char/char-set/pred discrimination has ;;; been lifted above the inner loop of each proc. -(define (string-index str criteria . maybe-start+end) - (let-start+end (start end) string-index str maybe-start+end - (cond ((char? criteria) +(define (string-index str criterion . maybe-start+end) + (let-string-start+end (start end) string-index str maybe-start+end + (cond ((char? criterion) (let lp ((i start)) (and (< i end) - (if (char=? criteria (string-ref str i)) i + (if (char=? criterion (string-ref str i)) i (lp (+ i 1)))))) - ((char-set? criteria) + ((char-set? criterion) (let lp ((i start)) (and (< i end) - (if (char-set-contains? criteria (string-ref str i)) i + (if (char-set-contains? criterion (string-ref str i)) i (lp (+ i 1)))))) - ((procedure? criteria) + ((procedure? criterion) (let lp ((i start)) (and (< i end) - (if (criteria (string-ref str i)) i + (if (criterion (string-ref str i)) i (lp (+ i 1)))))) (else (error "Second param is neither char-set, char, or predicate procedure." - string-index criteria))))) + string-index criterion))))) -(define (string-index-right str criteria . maybe-end+start) - (let-optionals maybe-end+start ((start 0) (end (string-length str))) - (check-substring-spec string-index-right str start end) - (cond ((char? criteria) +(define (string-index-right str criterion . maybe-start+end) + (let-string-start+end (start end) string-index-right str maybe-start+end + (cond ((char? criterion) (let lp ((i (- end 1))) (and (>= i 0) - (if (char=? criteria (string-ref str i)) i + (if (char=? criterion (string-ref str i)) i (lp (- i 1)))))) - ((char-set? criteria) + ((char-set? criterion) (let lp ((i (- end 1))) (and (>= i 0) - (if (char-set-contains? criteria (string-ref str i)) i + (if (char-set-contains? criterion (string-ref str i)) i (lp (- i 1)))))) - ((procedure? criteria) + ((procedure? criterion) (let lp ((i (- end 1))) (and (>= i 0) - (if (criteria (string-ref str i)) i + (if (criterion (string-ref str i)) i (lp (- i 1)))))) (else (error "Second param is neither char-set, char, or predicate procedure." - string-index-right criteria))))) + string-index-right criterion))))) -(define (string-skip str criteria . maybe-start+end) - (let-start+end (start end) string-skip str maybe-start+end - (cond ((char? criteria) +(define (string-skip str criterion . maybe-start+end) + (let-string-start+end (start end) string-skip str maybe-start+end + (cond ((char? criterion) (let lp ((i start)) (and (< i end) - (if (char=? criteria (string-ref str i)) + (if (char=? criterion (string-ref str i)) (lp (+ i 1)) i)))) - ((char-set? criteria) + ((char-set? criterion) (let lp ((i start)) (and (< i end) - (if (char-set-contains? criteria (string-ref str i)) + (if (char-set-contains? criterion (string-ref str i)) (lp (+ i 1)) i)))) - ((char-set? criteria) + ((procedure? criterion) (let lp ((i start)) (and (< i end) - (if (criteria (string-ref str i)) (lp (+ i 1)) + (if (criterion (string-ref str i)) (lp (+ i 1)) i)))) (else (error "Second param is neither char-set, char, or predicate procedure." - string-skip criteria))))) + string-skip criterion))))) -(define (string-skip-right str criteria . maybe-end+start) - (let-optionals maybe-end+start ((start 0) (end (string-length str))) - (check-substring-spec string-index-right str start end) - (cond ((char? criteria) +(define (string-skip-right str criterion . maybe-start+end) + (let-string-start+end (start end) string-skip-right str maybe-start+end + (cond ((char? criterion) (let lp ((i (- end 1))) (and (>= i 0) - (if (char=? criteria (string-ref str i)) + (if (char=? criterion (string-ref str i)) (lp (- i 1)) i)))) - ((char-set? criteria) + ((char-set? criterion) (let lp ((i (- end 1))) (and (>= i 0) - (if (char-set-contains? criteria (string-ref str i)) + (if (char-set-contains? criterion (string-ref str i)) (lp (- i 1)) i)))) - ((procedure? criteria) + ((procedure? criterion) (let lp ((i (- end 1))) (and (>= i 0) - (if (criteria (string-ref str i)) (lp (- i 1)) + (if (criterion (string-ref str i)) (lp (- i 1)) i)))) - (else (error "CRITERIA param is neither char-set or char." - string-skip-right criteria))))) + (else (error "CRITERION param is neither char-set or char." + string-skip-right criterion))))) + + +(define (string-count s criterion . maybe-start+end) + (let-string-start+end (start end) string-count s maybe-start+end + (cond ((char? criterion) + (do ((i start (+ i 1)) + (count 0 (if (char=? criterion (string-ref s i)) + (+ count 1) + count))) + ((>= i end) count))) + + ((char-set? criterion) + (do ((i start (+ i 1)) + (count 0 (if (char-set-contains? criterion (string-ref s i)) + (+ count 1) + count))) + ((>= i end) count))) + + ((procedure? criterion) + (do ((i start (+ i 1)) + (count 0 (if (criterion (string-ref s i)) (+ count 1) count))) + ((>= i end) count))) + + (else (error "CRITERION param is neither char-set or char." + string-count criterion))))) @@ -975,146 +1253,251 @@ ;;; Guaranteed to work, even if s1 eq s2. (define (string-fill! s char . maybe-start+end) - (let-start+end (start end) string-fill! s maybe-start+end + (check-arg char? char string-fill!) + (let-string-start+end (start end) string-fill! s maybe-start+end (do ((i (- end 1) (- i 1))) ((< i start)) (string-set! s i char)))) (define (string-copy! to tstart from . maybe-fstart+fend) - (let-start+end (fstart fend) string-copy! from maybe-fstart+fend - (let ((tend (+ tstart (- fend fstart)))) - (check-substring-spec string-copy! to tstart tend) - (if (> fstart tstart) - (do ((i fstart (+ i 1)) - (j tstart (+ j 1))) - ((>= i fend)) - (string-set! to j (string-ref from i))) + (let-string-start+end (fstart fend) string-copy! from maybe-fstart+fend + (check-arg integer? tstart string-copy!) + (check-substring-spec string-copy! to tstart (+ tstart (- fend fstart))) + (%string-copy! to tstart from fstart fend))) - (do ((i (- fend 1) (- i 1)) - (j (- tend 1) (- j 1))) - ((< i fstart)) - (string-set! to j (string-ref from i))))))) +;;; Library-internal routine +(define (%string-copy! to tstart from fstart fend) + (if (> fstart tstart) + (do ((i fstart (+ i 1)) + (j tstart (+ j 1))) + ((>= i fend)) + (string-set! to j (string-ref from i))) + (do ((i (- fend 1) (- i 1)) + (j (+ -1 tstart (- fend fstart)) (- j 1))) + ((< i fstart)) + (string-set! to j (string-ref from i))))) -;;; Returns starting-position or #f if not true. -;;; This implementation is slow & simple. See below for KMP. -;;; Boyer-Moore would be nice. -;(define (substring? substring string . maybe-start+end) -; (let-start+end (start end) string substring? maybe-start+end -; (if (string-null? substring) start -; (let* ((len (string-length substring)) -; (i-bound (- end len)) -; (char1 (string-ref substring start))) -; (let lp ((i 0)) -; (cond ((string-index string char1 i i-bound) => -; (lambda (i) -; (if (substring= substring 0 len string i (+ i len)) -; i -; (lp (+ i 1))))) -; (else #f))))))) - -;;; Searching for an occurence of a substring +;;; Returns starting-position in STRING or #f if not true. +;;; This implementation is slow & simple. It is useful as a "spec" or for +;;; comparison testing with fancier implementations. +;;; See below for fast KMP version. + +;(define (string-contains string substring . maybe-starts+ends) +; (let-string-start+end2 (start1 end1 start2 end2) +; string-contains string substring maybe-starts+ends +; (let* ((len (- end2 start2)) +; (i-bound (- end1 len))) +; (let lp ((i start1)) +; (and (< i i-bound) +; (if (string= string substring i (+ i len) start2 end2) +; i +; (lp (+ i 1)))))))) + + +;;; Searching for an occurrence of a substring ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; This uses the KMP algorithm -;;; "Fast Pattern Matching in Strings" + +(define (string-contains text pattern . maybe-starts+ends) + (let-string-start+end2 (t-start t-end p-start p-end) + string-contains text pattern maybe-starts+ends + (%kmp-search pattern text char=? p-start p-end t-start t-end))) + +(define (string-contains-ci text pattern . maybe-starts+ends) + (let-string-start+end2 (t-start t-end p-start p-end) + string-contains-ci text pattern maybe-starts+ends + (%kmp-search pattern text char-ci=? p-start p-end t-start t-end))) + + +;;; Knuth-Morris-Pratt string searching +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; See +;;; "Fast pattern matching in strings" ;;; SIAM J. Computing 6(2):323-350 1977 ;;; D. E. Knuth, J. H. Morris and V. R. Pratt ;;; also described in -;;; "Pattern Matching in Strings" +;;; "Pattern matching in strings" ;;; Alfred V. Aho ;;; Formal Language Theory - Perspectives and Open Problems ;;; Ronald V. Brook (editor) ;;; This algorithm is O(m + n) where m and n are the ;;; lengths of the pattern and string respectively -;;; Original version of this code by bevan; I have substantially rehacked it. -(define (substring? pattern source . maybe-start+end) - (let-start+end (start end) substring? source maybe-start+end - (really-substring? char=? pattern source start end))) +;;; KMP search source[start,end) for PATTERN. Return starting index of +;;; leftmost match or #f. -(define (substring-ci? pattern source . maybe-start+end) - (let-start+end (start end) substring-ci? source maybe-start+end - (really-substring? char-ci=? pattern source start end))) +(define (%kmp-search pattern text c= p-start p-end t-start t-end) + (let ((plen (- p-end p-start)) + (rv (make-kmp-restart-vector pattern c= p-start p-end))) -;;; Compute the Knuth-Morris-Pratt restart vector RV for string PATTERN. If + ;; The search loop. TJ & PJ are redundant state. + (let lp ((ti t-start) (pi 0) + (tj (- t-end t-start)) ; (- tlen ti) -- how many chars left. + (pj plen)) ; (- plen pi) -- how many chars left. + + (if (= pi plen) (- ti plen) ; Win. + + (and (<= pj tj) ; Lose. + + (if (c= (string-ref text ti) ; Search. + (string-ref pattern (+ p-start pi))) + (lp (+ 1 ti) (+ 1 pi) (- tj 1) (- pj 1)) ; Advance. + + (let ((pi (vector-ref rv pi))) ; Retreat. + (if (= pi -1) + (lp (+ ti 1) 0 (- tj 1) plen) ; Punt. + (lp ti pi tj (- plen pi)))))))))) + +;;; (make-kmp-restart-vector pattern [c= start end]) -> integer-vector +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Compute the KMP restart vector RV for string PATTERN. If ;;; we have matched chars 0..i-1 of PATTERN against a search string S, and ;;; PATTERN[i] doesn't match S[k], then reset i := RV[i], and try again to ;;; match S[k]. If RV[i] = -1, then punt S[k] completely, and move on to -;;; S[k+1] and PATTERN[0]. +;;; S[k+1] and PATTERN[0] -- no possible match of PAT[0..i] contains S[k]. ;;; ;;; In other words, if you have matched the first i chars of PATTERN, but ;;; the i+1'th char doesn't match, RV[i] tells you what the next-longest ;;; prefix of PATTERN is that you have matched. ;;; -;;; C= is the character comparator -- usefully CHAR= or CHAR-CI=. +;;; - C= (default CHAR=?) is used to compare characters for equality. +;;; Pass in CHAR-CI=? for case-folded string search. +;;; +;;; - START & END restrict the pattern to the indicated substring; the +;;; returned vector will be of length END - START. The numbers stored +;;; in the vector will be values in the range [0,END-START) -- that is, +;;; they are valid indices into the restart vector; you have to add START +;;; to them to use them as indices into PATTERN. ;;; ;;; I've split this out as a separate function in case other constant-string ;;; searchers might want to use it. +;;; +;;; E.g.: +;;; a b d a b x +;;; #(-1 0 0 -1 1 2) -(define (make-kmp-restart-vector pattern c=) - (let* ((plen (string-length pattern)) - (rv (make-vector plen))) - (if (> plen 0) - (let ((plen-1 (- plen 1))) - (vector-set! rv 0 -1) - (let lp ((i 0) (j -1)) - (if (< i plen-1) - (if (or (= j -1) - (c= (string-ref pattern i) - (string-ref pattern j))) - (let ((i (+ 1 i)) - (j (+ 1 j))) - (vector-set! rv i j) - (lp i j)) - (lp i (vector-ref rv j))))))) - rv)) +(define (make-kmp-restart-vector pattern . maybe-c=+start+end) + (let-optionals* maybe-c=+start+end + ((c= char=? (procedure? c=)) + ((start end) (lambda (args) + (string-parse-start+end make-kmp-restart-vector + pattern args)))) + (let* ((rvlen (- end start)) + (rv (make-vector rvlen -1))) + (if (> rvlen 0) + (let ((rvlen-1 (- rvlen 1)) + (c0 (string-ref pattern start))) -(define (really-substring? c= pattern source start end) - (let ((plen (string-length pattern)) - (rv (make-kmp-restart-vector pattern c=))) + ;; Here's the main loop. We have set rv[0] ... rv[i]. + ;; K = I + START -- it is the corresponding index into PATTERN. + (let lp1 ((i 0) (j -1) (k start)) + (if (< i rvlen-1) - ;; The search loop. SJ & PJ are redundant state. - (let lp ((si start) (pi 0) - (sj (- end start)) ; (- end si) -- how many chars left. - (pj plen)) ; (- plen pi) -- how many chars left. + (let ((ck (string-ref pattern k))) + ;; lp2 invariant: + ;; pat[(k-j) .. k-1] matches pat[start .. start+j-1] + ;; or j = -1. + (let lp2 ((j j)) - (if (= pi plen) (- si plen) ; Win. + (cond ((= j -1) + (let ((i1 (+ i 1))) + (vector-set! rv i1 (if (c= ck c0) -1 0)) + (lp1 i1 0 (+ k 1)))) - (and (<= pj sj) ; Lose. - - (if (c= (string-ref source si) ; Search. - (string-ref pattern pi)) - (lp (+ 1 si) (+ 1 pi) (- sj 1) (- pj 1)) ; Advance. - - (let ((pi (vector-ref rv pi))) ; Retreat. - (if (= pi -1) - (lp (+ si 1) 0 (- sj 1) plen) ; Punt. - (lp si pi sj (- plen pi)))))))))) + ;; pat[(k-j) .. k] matches pat[start..start+j]. + ((c= ck (string-ref pattern (+ j start))) + (let* ((i1 (+ 1 i)) + (j1 (+ 1 j))) + (vector-set! rv i1 j1) + (lp1 i1 j1 (+ k 1)))) + (else (lp2 (vector-ref rv j)))))))))) + rv))) + + +;;; We've matched I chars from PAT. C is the next char from the search string. +;;; Return the new I after handling C. +;;; +;;; The pattern is (VECTOR-LENGTH RV) chars long, beginning at index PAT-START +;;; in PAT (PAT-START is usually 0). The I chars of the pattern we've matched +;;; are +;;; PAT[PAT-START .. PAT-START + I]. +;;; +;;; It's *not* an oversight that there is no friendly error checking or +;;; defaulting of arguments. This is a low-level, inner-loop procedure +;;; that we want integrated/inlined into the point of call. + +(define (kmp-step pat rv c i c= p-start) + (let lp ((i i)) + (if (c= c (string-ref pat (+ i p-start))) ; Match => + (+ i 1) ; Done. + (let ((i (vector-ref rv i))) ; Back up in PAT. + (if (= i -1) 0 ; Can't back up further. + (lp i)))))) ; Keep trying for match. + +;;; Zip through S[start,end), looking for a match of PAT. Assume we've +;;; already matched the first I chars of PAT when we commence at S[start]. +;;; - <0: If we find a match *ending* at index J, return -J. +;;; - >=0: If we get to the end of the S[start,end) span without finding +;;; a complete match, return the number of chars from PAT we'd matched +;;; when we ran off the end. +;;; +;;; This is useful for searching *across* buffers -- that is, when your +;;; input comes in chunks of text. We hand-integrate the KMP-STEP loop +;;; for speed. + +(define (string-kmp-partial-search pat rv s i . c=+p-start+s-start+s-end) + (check-arg vector? rv string-kmp-partial-search) + (let-optionals* c=+p-start+s-start+s-end + ((c= char=? (procedure? c=)) + (p-start 0 (and (integer? p-start) (exact? p-start) (<= 0 p-start))) + ((s-start s-end) (lambda (args) + (string-parse-start+end string-kmp-partial-search + s args)))) + (let ((patlen (vector-length rv))) + (check-arg (lambda (i) (and (integer? i) (exact? i) (<= 0 i) (< i patlen))) + i string-kmp-partial-search) + + ;; Enough prelude. Here's the actual code. + (let lp ((si s-start) ; An index into S. + (vi i)) ; An index into RV. + (cond ((= vi patlen) (- si)) ; Win. + ((= si s-end) vi) ; Ran off the end. + (else ; Match s[si] & loop. + (let ((c (string-ref s si))) + (lp (+ si 1) + (let lp2 ((vi vi)) ; This is just KMP-STEP. + (if (c= c (string-ref pat (+ vi p-start))) + (+ vi 1) + (let ((vi (vector-ref rv vi))) + (if (= vi -1) 0 + (lp2 vi))))))))))))) ;;; Misc ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; (string-null? s) ;;; (string-reverse s [start end]) ;;; (string-reverse! s [start end]) -;;; (string-null? s) +;;; (reverse-list->string clist) +;;; (string->list s [start end]) (define (string-null? s) (zero? (string-length s))) (define (string-reverse s . maybe-start+end) - (let-start+end (start end) string-reverse s maybe-start+end - (let ((ans (make-string (- end start)))) - (do ((i (- end 1) (- i 1)) - (j start (+ j 1))) - ((< i j)) - (string-set! ans i (string-ref s j)) + (let-string-start+end (start end) string-reverse s maybe-start+end + (let* ((len (- end start)) + (ans (make-string len))) + (do ((i start (+ i 1)) + (j (- len 1) (- j 1))) + ((< j 0)) (string-set! ans j (string-ref s i))) ans))) (define (string-reverse! s . maybe-start+end) - (let-start+end (start end) string-reverse! s maybe-start+end + (let-string-start+end (start end) string-reverse! s maybe-start+end (do ((i (- end 1) (- i 1)) (j start (+ j 1))) ((<= i j)) @@ -1133,18 +1516,20 @@ ;(define (string->list s . maybe-start+end) -; (let-start+end (start end) string->list s maybe-start+end -; (do ((i (- end 1) (- i 1)) -; (ans '() (cons (string-ref s i) ans))) -; ((< i start) ans)))) +; (apply string-fold-right cons '() s maybe-start+end)) (define (string->list s . maybe-start+end) - (apply string-fold-right cons '() s maybe-start+end)) + (let-string-start+end (start end) string->list s maybe-start+end + (do ((i (- end 1) (- i 1)) + (ans '() (cons (string-ref s i) ans))) + ((< i start) ans)))) + +;;; Defined by R5RS, so commented out here. +;(define (list->string lis) (string-unfold null? car cdr lis)) - -;;; string-concat string-list -> string -;;; string-concat/shared string-list -> string +;;; string-concatenate string-list -> string +;;; string-concatenate/shared string-list -> string ;;; string-append/shared s ... -> string ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; STRING-APPEND/SHARED has license to return a string that shares storage @@ -1152,52 +1537,162 @@ ;;; string amongst its parameters, it is permitted to return that string as ;;; its result. STRING-APPEND, by contrast, always allocates new storage. ;;; -;;; STRING-CONCAT & STRING-CONCAT/SHARED are passed a list of strings, -;;; which they concatenate into a result string. STRING-CONCAT always -;;; allocates a fresh string; STRING-CONCAT/SHARED may (or may not) return -;;; a result that shares storage with any of its arguments. In particular, -;;; if it is applied to a singleton list, it is permitted to return the -;;; car of that list as its value. -;;; -;;; This is portable code, but could be much more efficient w/compiler -;;; support. Especially the n-ary guys. +;;; STRING-CONCATENATE & STRING-CONCATENATE/SHARED are passed a list of +;;; strings, which they concatenate into a result string. STRING-CONCATENATE +;;; always allocates a fresh string; STRING-CONCATENATE/SHARED may (or may +;;; not) return a result that shares storage with any of its arguments. In +;;; particular, if it is applied to a singleton list, it is permitted to +;;; return the car of that list as its value. -;;; We delete the empty strings from the parameter list before handing -;;; off to string-concat/shared. I wrote the recursion out by hand instead -;;; of using list-lib's FILTER or FILTER! to minimize non-R5RS dependencies. +(define (string-append/shared . strings) (string-concatenate/shared strings)) -(define (string-append/shared . strings) (string-concat/shared strings)) +(define (string-concatenate/shared strings) + (let lp ((strings strings) (nchars 0) (first #f)) + (cond ((pair? strings) ; Scan the args, add up total + (let* ((string (car strings)) ; length, remember 1st + (tail (cdr strings)) ; non-empty string. + (slen (string-length string))) + (if (zero? slen) + (lp tail nchars first) + (lp tail (+ nchars slen) (or first strings))))) -(define (string-concat/shared strings) - (let ((strings (let recur ((strings strings)) ; Delete empty strings. - (if (pair? strings) - (let ((s (car strings)) - (tail (recur (cdr strings)))) - (if (string-null? s) tail (cons s tail))) - '())))) + ((zero? nchars) "") - (cond ((not (pair? strings)) "") ; () => "". - ((not (pair? (cdr strings))) (car strings)) ; (s) => s. - (else (string-concat strings))))) ; Allocate & concat. + ;; Just one non-empty string! Return it. + ((= nchars (string-length (car first))) (car first)) + + (else (let ((ans (make-string nchars))) + (let lp ((strings first) (i 0)) + (if (pair? strings) + (let* ((s (car strings)) + (slen (string-length s))) + (%string-copy! ans i s 0 slen) + (lp (cdr strings) (+ i slen))))) + ans))))) + ; Alas, Scheme 48's APPLY blows up if you have many, many arguments. -;(define (string-concat strings) (apply string-append strings)) +;(define (string-concatenate strings) (apply string-append strings)) ;;; Here it is written out. I avoid using REDUCE to add up string lengths ;;; to avoid non-R5RS dependencies. -(define (string-concat strings) +(define (string-concatenate strings) (let* ((total (do ((strings strings (cdr strings)) (i 0 (+ i (string-length (car strings))))) ((not (pair? strings)) i))) (ans (make-string total))) (let lp ((i 0) (strings strings)) (if (pair? strings) - (let ((s (car strings))) - (string-copy! ans i s) - (lp (+ i (string-length s)) (cdr strings))))) + (let* ((s (car strings)) + (slen (string-length s))) + (%string-copy! ans i s 0 slen) + (lp (+ i slen) (cdr strings))))) ans)) +;;; Defined by R5RS, so commented out here. +;(define (string-append . strings) (string-concatenate strings)) + +;;; string-concatenate-reverse string-list [final-string end] -> string +;;; string-concatenate-reverse/shared string-list [final-string end] -> string +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Return +;;; (string-concatenate +;;; (reverse +;;; (cons (substring final-string 0 end) string-list))) + +(define (string-concatenate-reverse string-list . maybe-final+end) + (let-optionals* maybe-final+end ((final "" (string? final)) + (end (string-length final) + (and (integer? end) + (exact? end) + (<= 0 end (string-length final))))) + (let ((len (let lp ((sum 0) (lis string-list)) + (if (pair? lis) + (lp (+ sum (string-length (car lis))) (cdr lis)) + sum)))) + + (%finish-string-concatenate-reverse len string-list final end)))) + +(define (string-concatenate-reverse/shared string-list . maybe-final+end) + (let-optionals* maybe-final+end ((final "" (string? final)) + (end (string-length final) + (and (integer? end) + (exact? end) + (<= 0 end (string-length final))))) + ;; Add up the lengths of all the strings in STRING-LIST; also get a + ;; pointer NZLIST into STRING-LIST showing where the first non-zero-length + ;; string starts. + (let lp ((len 0) (nzlist #f) (lis string-list)) + (if (pair? lis) + (let ((slen (string-length (car string-list)))) + (lp (+ len slen) + (if (or nzlist (zero? slen)) nzlist lis) + (cdr lis))) + + (cond ((zero? len) (substring/shared final 0 end)) + + ;; LEN > 0, so NZLIST is non-empty. + + ((and (zero? end) (= len (string-length (car nzlist)))) + (car nzlist)) + + (else (%finish-string-concatenate-reverse len nzlist final end))))))) + +(define (%finish-string-concatenate-reverse len string-list final end) + (let ((ans (make-string (+ end len)))) + (%string-copy! ans len final 0 end) + (let lp ((i len) (lis string-list)) + (if (pair? lis) + (let* ((s (car lis)) + (lis (cdr lis)) + (slen (string-length s)) + (i (- i slen))) + (%string-copy! ans i s 0 slen) + (lp i lis)))) + ans)) + + + + +;;; string-replace s1 s2 start1 end1 [start2 end2] -> string +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Replace S1[START1,END1) with S2[START2,END2). + +(define (string-replace s1 s2 start1 end1 . maybe-start+end) + (check-substring-spec string-replace s1 start1 end1) + (let-string-start+end (start2 end2) string-replace s2 maybe-start+end + (let* ((slen1 (string-length s1)) + (sublen2 (- end2 start2)) + (alen (+ (- slen1 (- end1 start1)) sublen2)) + (ans (make-string alen))) + (%string-copy! ans 0 s1 0 start1) + (%string-copy! ans start1 s2 start2 end2) + (%string-copy! ans (+ start1 sublen2) s1 end1 slen1) + ans))) + + +;;; string-tokenize s [token-set start end] -> list +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Break S up into a list of token strings, where a token is a maximal +;;; non-empty contiguous sequence of chars belonging to TOKEN-SET. +;;; (string-tokenize "hello, world") => ("hello," "world") + +(define (string-tokenize s . token-chars+start+end) + (let-optionals* token-chars+start+end + ((token-chars char-set:graphic (char-set? token-chars)) rest) + (let-string-start+end (start end) string-tokenize s rest + (let lp ((i end) (ans '())) + (cond ((and (< start i) (string-index-right s token-chars start i)) => + (lambda (tend-1) + (let ((tend (+ 1 tend-1))) + (cond ((string-skip-right s token-chars start tend-1) => + (lambda (tstart-1) + (lp tstart-1 + (cons (substring s (+ 1 tstart-1) tend) + ans)))) + (else (cons (substring s start tend) ans)))))) + (else ans)))))) ;;; xsubstring s from [to start end] -> string @@ -1228,33 +1723,36 @@ ;;; dispensation when FROM=TO. (define (xsubstring s from . maybe-to+start+end) + (check-arg (lambda (val) (and (integer? val) (exact? val))) + from xsubstring) (receive (to start end) (if (pair? maybe-to+start+end) - (let-start+end (start end) xsubstring s (cdr maybe-to+start+end) - (values (car maybe-to+start+end) start end)) - (let ((slen (string-length s))) + (let-string-start+end (start end) xsubstring s (cdr maybe-to+start+end) + (let ((to (car maybe-to+start+end))) + (check-arg (lambda (val) (and (integer? val) + (exact? val) + (<= from val))) + to xsubstring) + (values to start end))) + (let ((slen (string-length (check-arg string? s xsubstring)))) (values (+ from slen) 0 slen))) (let ((slen (- end start)) (anslen (- to from))) - (cond ((< anslen 0) - (error "Illegal FROM/TO spec passed to xsubstring -- FROM > TO." - s from to start end)) - - ((zero? anslen) "") - ((zero? slen) (error "Empty (sub)string passed to xsubstring" - s from to start end)) + (cond ((zero? anslen) "") + ((zero? slen) (error "Cannot replicate empty (sub)string" + xsubstring s from to start end)) ((= 1 slen) ; Fast path for 1-char replication. (make-string anslen (string-ref s start))) ;; Selected text falls entirely within one span. ((= (floor (/ from slen)) (floor (/ to slen))) - (substringx s (+ start (modulo from slen)) - (+ start (modulo to slen)))) + (substring s (+ start (modulo from slen)) + (+ start (modulo to slen)))) ;; Selected text requires multiple spans. (else (let ((ans (make-string anslen))) - (multispan-repcopy! ans 0 s from to start end) + (%multispan-repcopy! ans 0 s from to start end) ans)))))) @@ -1266,10 +1764,15 @@ ;;; a string on top of itself. (define (string-xcopy! target tstart s sfrom . maybe-sto+start+end) + (check-arg (lambda (val) (and (integer? val) (exact? val))) + sfrom string-xcopy!) (receive (sto start end) (if (pair? maybe-sto+start+end) - (let-start+end (start end) string-xcopy! s (cdr maybe-sto+start+end) - (values (car maybe-sto+start+end) start end)) + (let-string-start+end (start end) string-xcopy! s (cdr maybe-sto+start+end) + (let ((sto (car maybe-sto+start+end))) + (check-arg (lambda (val) (and (integer? val) (exact? val))) + sto string-xcopy!) + (values sto start end))) (let ((slen (string-length s))) (values (+ sfrom slen) 0 slen))) @@ -1277,34 +1780,32 @@ (tend (+ tstart tocopy)) (slen (- end start))) (check-substring-spec string-xcopy! target tstart tend) - (cond ((< tocopy 0) - (error "Illegal FROM/TO spec passed to string-xcopy! -- FROM > TO." - target tstart s sfrom sto start end)) - ((zero? tocopy)) - ((zero? slen) (error "Empty (sub)string passed to string-xcopy!" - target tstart s sfrom sto start end)) + (cond ((zero? tocopy)) + ((zero? slen) (error "Cannot replicate empty (sub)string" + string-xcopy! + target tstart s sfrom sto start end)) ((= 1 slen) ; Fast path for 1-char replication. (string-fill! target (string-ref s start) tstart tend)) ;; Selected text falls entirely within one span. ((= (floor (/ sfrom slen)) (floor (/ sto slen))) - (string-copy! target tstart s - (+ start (modulo sfrom slen)) - (+ start (modulo sto slen)))) + (%string-copy! target tstart s + (+ start (modulo sfrom slen)) + (+ start (modulo sto slen)))) ;; Multi-span copy. - (else (multispan-repcopy! target tstart s sfrom sto start end)))))) + (else (%multispan-repcopy! target tstart s sfrom sto start end)))))) ;;; This is the core copying loop for XSUBSTRING and STRING-XCOPY! ;;; Internal -- not exported, no careful arg checking. -(define (multispan-repcopy! target tstart s sfrom sto start end) +(define (%multispan-repcopy! target tstart s sfrom sto start end) (let* ((slen (- end start)) (i0 (+ start (modulo sfrom slen))) (total-chars (- sto sfrom))) ;; Copy the partial span @ the beginning - (string-copy! target tstart s i0 end) + (%string-copy! target tstart s i0 end) (let* ((ncopied (- end i0)) ; We've copied this many. (nleft (- total-chars ncopied)) ; # chars left to copy. @@ -1315,43 +1816,155 @@ (nspans nspans (- nspans 1))) ; # spans to copy ((zero? nspans) ;; Copy the partial-span @ the end & we're done. - (string-copy! target i s start (+ start (- total-chars (- i tstart))))) + (%string-copy! target i s start (+ start (- total-chars (- i tstart))))) - (string-copy! target i s start end))))) ; Copy a whole span. + (%string-copy! target i s start end))))); Copy a whole span. -;;; (join-strings string-list [delimiter grammar]) => string +;;; (string-join string-list [delimiter grammar]) => string ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Paste strings together using the delimiter string. ;;; ;;; (join-strings '("foo" "bar" "baz") ":") => "foo:bar:baz" ;;; ;;; DELIMITER defaults to a single space " " -;;; GRAMMAR is one of the symbols {infix, suffix} and defaults to 'infix. +;;; GRAMMAR is one of the symbols {prefix, infix, strict-infix, suffix} +;;; and defaults to 'infix. +;;; +;;; I could rewrite this more efficiently -- precompute the length of the +;;; answer string, then allocate & fill it in iteratively. Using +;;; STRING-CONCATENATE is less efficient. -;;; (join-strings strings [delim grammar]) +(define (string-join strings . delim+grammar) + (let-optionals* delim+grammar ((delim " " (string? delim)) + (grammar 'infix)) + (let ((buildit (lambda (lis final) + (let recur ((lis lis)) + (if (pair? lis) + (cons delim (cons (car lis) (recur (cdr lis)))) + final))))) -(define (join-strings strings . args) - (if (pair? strings) - (let-optionals args ((delim " ") (grammar 'infix)) - (let ((strings (reverse strings))) - (let lp ((strings (cdr strings)) - (ans (case grammar - ((infix) (list (car strings))) - ((suffix) (list (car strings) delim)) - (else (error "Illegal join-strings grammar" grammar))))) - (if (pair? strings) - (lp (cdr strings) - (cons (car strings) (cons delim ans))) - - ; All done - (string-concat ans))))) + (cond ((pair? strings) + (string-concatenate + (case grammar - "")) ; Special-cased for infix grammar. + ((infix strict-infix) + (cons (car strings) (buildit (cdr strings) '()))) + ((prefix) (buildit strings '())) + + ((suffix) + (cons (car strings) (buildit (cdr strings) (list delim)))) + + (else (error "Illegal join grammar" + grammar string-join))))) + + ((not (null? strings)) + (error "STRINGS parameter not list." strings string-join)) + + ;; STRINGS is () + + ((eq? grammar 'strict-infix) + (error "Empty list cannot be joined with STRICT-INFIX grammar." + string-join)) + + (else ""))))) ; Special-cased for infix grammar. +;;; Porting & performance-tuning notes +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; See the section at the beginning of this file on external dependencies. +;;; +;;; The biggest issue with respect to porting is the LET-OPTIONALS* macro. +;;; There are many, many optional arguments in this library; the complexity +;;; of parsing, defaulting & type-testing these parameters is handled with the +;;; aid of this macro. There are about 15 uses of LET-OPTIONALS*. You can +;;; rewrite the uses, port the hairy macro definition (which is implemented +;;; using a Clinger-Rees low-level explicit-renaming macro system), or port +;;; the simple, high-level definition, which is less efficient. +;;; +;;; There is a fair amount of argument checking. This is, strictly speaking, +;;; unnecessary -- the actual body of the procedures will blow up if, say, a +;;; START/END index is improper. However, the error message will not be as +;;; good as if the error were caught at the "higher level." Also, a very, very +;;; smart Scheme compiler may be able to exploit having the type checks done +;;; early, so that the actual body of the procedures can assume proper values. +;;; This isn't likely; this kind of compiler technology isn't common any +;;; longer. +;;; +;;; The overhead of optional-argument parsing is irritating. The optional +;;; arguments must be consed into a rest list on entry, and then parsed out. +;;; Function call should be a matter of a few register moves and a jump; it +;;; should not involve heap allocation! Your Scheme system may have a superior +;;; non-R5RS optional-argument system that can eliminate this overhead. If so, +;;; then this is a prime candidate for optimising these procedures, +;;; *especially* the many optional START/END index parameters. +;;; +;;; Note that optional arguments are also a barrier to procedure integration. +;;; If your Scheme system permits you to specify alternate entry points +;;; for a call when the number of optional arguments is known in a manner +;;; that enables inlining/integration, this can provide performance +;;; improvements. +;;; +;;; There is enough *explicit* error checking that *all* string-index +;;; operations should *never* produce a bounds error. Period. Feel like +;;; living dangerously? *Big* performance win to be had by replacing +;;; STRING-REF's and STRING-SET!'s with unsafe equivalents in the loops. +;;; Similarly, fixnum-specific operators can speed up the arithmetic done on +;;; the index values in the inner loops. The only arguments that are not +;;; completely error checked are +;;; - string lists (complete checking requires time proportional to the +;;; length of the list) +;;; - procedure arguments, such as char->char maps & predicates. +;;; There is no way to check the range & domain of procedures in Scheme. +;;; Procedures that take these parameters cannot fully check their +;;; arguments. But all other types to all other procedures are fully +;;; checked. +;;; +;;; This does open up the alternate possibility of simply *removing* these +;;; checks, and letting the safe primitives raise the errors. On a dumb +;;; Scheme system, this would provide speed (by eliminating the redundant +;;; error checks) at the cost of error-message clarity. +;;; +;;; See the comments preceding the hash function code for notes on tuning +;;; the default bound so that the code never overflows your implementation's +;;; fixnum size into bignum calculation. +;;; +;;; In an interpreted Scheme, some of these procedures, or the internal +;;; routines with % prefixes, are excellent candidates for being rewritten +;;; in C. Consider STRING-HASH, %STRING-COMPARE, the +;;; %STRING-{SUF,PRE}FIX-LENGTH routines, STRING-COPY!, STRING-INDEX & +;;; STRING-SKIP (char-set & char cases), SUBSTRING and SUBSTRING/SHARED, +;;; %KMP-SEARCH, and %MULTISPAN-REPCOPY!. +;;; +;;; It would also be nice to have the ability to mark some of these +;;; routines as candidates for inlining/integration. +;;; +;;; All the %-prefixed routines in this source code are written +;;; to be called internally to this library. They do *not* perform +;;; friendly error checks on the inputs; they assume everything is +;;; proper. They also do not take optional arguments. These two properties +;;; save calling overhead and enable procedure integration -- but they +;;; are not appropriate for exported routines. + + +;;; Copyright details +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; The prefix/suffix and comparison routines in this code had (extremely +;;; distant) origins in MIT Scheme's string lib, and was substantially +;;; reworked by Olin Shivers (shivers@ai.mit.edu) 9/98. As such, it is +;;; covered by MIT Scheme's open source copyright. See below for details. +;;; +;;; The KMP string-search code was influenced by implementations written +;;; by Stephen Bevan, Brian Dehneyer and Will Fitzgerald. However, this +;;; version was written from scratch by myself. +;;; +;;; The remainder of this code was written from scratch by myself for scsh. +;;; The scsh copyright is a BSD-style open source copyright. See below for +;;; details. +;;; -Olin Shivers + ;;; MIT Scheme copyright terms ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; This material was developed by the Scheme project at the Massachusetts @@ -1382,3 +1995,29 @@ ;;; Technology nor of any adaptation thereof in any advertising, ;;; promotional, or sales literature without prior written consent from ;;; MIT in each case. + +;;; Scsh copyright terms +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; All rights reserved. +;;; +;;; Redistribution and use in source and binary forms, with or without +;;; modification, are permitted provided that the following conditions +;;; are met: +;;; 1. Redistributions of source code must retain the above copyright +;;; notice, this list of conditions and the following disclaimer. +;;; 2. Redistributions in binary form must reproduce the above copyright +;;; notice, this list of conditions and the following disclaimer in the +;;; documentation and/or other materials provided with the distribution. +;;; 3. The name of the authors may not be used to endorse or promote products +;;; derived from this software without specific prior written permission. +;;; +;;; THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR +;;; IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +;;; OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +;;; IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, +;;; INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +;;; NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +;;; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +;;; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +;;; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +;;; THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/scsh/lib/strings.txt b/scsh/lib/strings.txt deleted file mode 100644 index fa2d32d..0000000 --- a/scsh/lib/strings.txt +++ /dev/null @@ -1,578 +0,0 @@ -Todo: - parse-start+end parse-final-start+end need "string" in the name - Also, export macro binder. - What's up w/quotient? (quotient -1 3) = 0. - regexp-foldl - type regexp interface - land* - Let-optional: - A let-optional that parses a prefix of the args. - Arg checking forms that get used if it parses, but are not - applied to the default. - -The Scheme Underground string library includes a rich set of operations -for manipulating strings. These are frequently useful for scripting and -other text-manipulation applications. - -The library's design was influenced by the string libraries found in MIT -Scheme, Gambit, RScheme, MzScheme, slib, Common Lisp, Bigloo, guile, APL and -the SML standard basis. Some of the code bears a distant family relation to -the MIT Scheme implementation, and being derived from that code, is covered by -the MIT Scheme copyright (which is a fairly generic "free" copyright -- see -the source file for details). The fast KMP string-search code used in -SUBSTRING? was loosely adapted from old slib code by Stephen Bevan. - -The library has the following design principles: -- *All* procedures involving character comparison are available in - both case-sensitive and case-insensitive forms. - -- *All* functionality is available in substring and full-string forms. - -- The procedures are spec'd so as to permit efficient implementation in a - Scheme that provided shared-text substrings (e.g., guile). This means that - you should not rely on many of the substring-selecting procedures to return - freshly-allocated strings. Careful attention is paid to the issue of which - procedures allocate fresh storage, and which are permitted to return results - that share storage with the arguments. - -- Common Lisp theft: - + inequality functions return mismatch index. - I generalised this so that this "protocol" is extended even to - the equality functions. This means that clients can be handed any generic - string-comparison function and rely on the meaning of the true value. - - + Common Lisp capitalisation definition - -The library addresses some problems with the R5RS string procedures: - - Question marks after string-comparison functions (string=?, etc.) - This is inconsistent with numeric comparison functions, and ugly, too. - - String-comparison functions do not provide useful true value. - - STRING-COPY should have optional start/end args; - SUBSTRING shouldn't specify if it copies or returns shared bits. - - STRING-FILL! and STRING->LIST should take optional start/end args. - - No <> function provided. - -In the following procedure specifications: - - Any S parameter is a string; - - - START and END parameters are half-open string indices specifying - a substring within a string parameter; when optional, they default - to 0 and the length of the string, respectively. When specified, it - must be the case that 0 <= START <= END <= (string-length S), for - the corresponding parameter S. They typically restrict a procedure's - action to the indicated substring. - - - A CHAR/CHAR-SET/PRED parameter is a value used to select/search - for a character in a string. If it is a character, it is used in - an equality test; if it is a character set, it is used as a - membership test; if it is a procedure, it is applied to the - characters as a test predicate. - -This library contains a large number of procedures, but they follow -a consistent naming scheme. The names are composed of smaller lexemes -in a regular way that exposes the structure and relationships between the -procedures. This should help the programmer to recall or reconstitute the name -of the particular procedure that he needs when writing his own code. In -particular - - Procedures whose names end in "-ci" are case-insensitive variants. - - Procedures whose names end in "!" are side-effecting variants. - These procedures generally return an unspecified value. - - The order of common parameters is fairly consistent across the - different procedures. - -For more text-manipulation functionality, see also the regular expression, -file-name, character set, and character->character partial map packages. - -------------------------------------------------------------------------------- -* R4RS/R5RS procedures - -The R4RS and R5RS reports define 22 string procedures. The string-lib -package includes 8 of these exactly as defined, 4 in an extended, -backwards-compatible way, and drops the remaining 10 (whose functionality -is available via other bindings). - -The 8 procedures provided exactly as documented in the reports are - string? - make-string - string - string-length - string-ref - string-set! - string-append - list->string - -The ten functions not included are the R4RS string-comparison functions: - string=? string-ci=? - string? string-ci>? - string<=? string-ci<=? - string>=? string-ci>=? -The string-lib package provides alternate bindings. - -Additionally, the four extended procedures are - - string-fill! s char [start end] -> unspecific - string->list s [start end] -> char-list - substring s start [end] -> string - string-copy s [start end] -> string - -These procedures are documented in the following section. In brief, they are -extended to take optional start/end parameters specifying substring ranges; -Additionally, SUBSTRING is allowed to return a value that shares storage with -its argument. - - -* Procedures - -These procedures are contained in the Scheme 48 package "string-lib", -which is open in the default user package. They are not found in the -"scsh" package; script writers and other programmers that use the Scheme -48 module system must open string-lib explicitly. - -string-map proc s [start end] -> string -string-map! proc s [start end] -> unspecified - PROC is a char->char procedure; it is mapped over S. - Note: no sequence order is specified. - -string-fold kons knil s [start end] -> value -string-fold-right kons knil s [start end] -> value - These are the fundamental iterators for strings. - The left-fold operator maps the KONS procedure across the - string from left to right - (... (kons s[2] (kons s[1] (kons s[0] knil)))) - In other words, string-fold obeys the recursion - (string-fold kons knil s start end) = - (string-fold kons (kons s[start] knil) start+1 end) - - The right-fold operator maps the KONS procedure across the - string from right to left - (kons s[0] (... (kons s[end-3] (kons s[end-2] (kons s[end-1] knil))))) - obeying the recursion - (string-fold-right kons knil s start end) = - (string-fold-right kons (kons s[end-1] knil) start end-1) - - Examples: - To convert a string to a list of chars: - (string-fold-right cons '() s) - - To count the number of lower-case characters in a string: - (string-fold (lambda (c count) - (if (char-set-contains? char-set:lower c) - (+ count 1) - count)) - 0 - s) - -string-unfold p f g seed -> string - This is the fundamental constructor for strings. - - G is used to generate a series of "seed" values from the initial seed: - SEED, (G SEED), (G^2 SEED), (G^3 SEED), ... - - P tells us when to stop -- when it returns true when applied to one - of these seed values. - - F maps each seed value to the corresponding character - in the result string. - - More precisely, the following (simple, inefficient) definition holds: - (define (string-unfold p f g seed) - (if (p seed) "" - (string-append (string (f seed)) - (string-unfold p f g (g seed))))) - - STRING-UNFOLD is a fairly powerful constructor -- you can use it to - reverse a string, copy a string, convert a list to a string, read - a port into a string, and so forth. Examples: - (port->string p) = (string-unfold eof-object? values - (lambda (x) (read-char p)) - (read-char p)) - - (list->string lis) = (string-unfold null? car cdr lis) - - (tabulate-string f size) = (string-unfold (lambda (i) (= i size)) f add1 0) - - To map F over a list LIS, producing a string: - (string-unfold null? (compose f car) cdr lis) - -string-tabulate proc len -> string - PROC is an integer->char procedure. Construct a string of size LEN - by applying PROC to each index to produce the corresponding string - element. The order in which PROC is applied to the indices is not - specified. - -string-for-each proc s [start end] -> unspecified -string-iter proc s [start end] -> unspecified - Apply PROC to each character in S. - STRING-FOR-EACH has no specified iteration order. - STRING-ITER is required to iterate from START to END - in increasing order. - -string-every? pred s [start end] -> boolean -string-any? pred s [start end] -> value - Note: no sequence order specified. - Checks to see if predicate PRED is true of every / any character in S. - STRING-ANY? is witness-generating -- it applies PRED to the elements - of S, returning the first true value it finds, otherwise false. - -string-compare s1 s2 lt-proc eq-proc gt-proc -> values -string-compare-ci s1 s2 lt-proc eq-proc gt-proc -> values - Apply LT-PROC, EQ-PROC, GT-PROC to the mismatch index, depending - upon whether S1 is less than, equal to, or greater than S2. - The "mismatch index" is the largest index i such that for - every 0 <= j < i, s1[j] = s2[j] -- that is, I is the first - position that doesn't match. If S1 = S2, the mismatch index - is simply the length of the strings; we observe the protocol - in this redundant case for uniformity. - -substring-compare s1 start1 end1 s2 start2 end2 lt-proc eq-proc gt-proc -> values -substring-compare-ci s1 start1 end1 s2 start2 end2 lt-proc eq-proc gt-proc -> values - The continuation procedures are applied to S1's mismatch index (as defined - above). In the case of EQ-PROC, this is always END1. - -string= s1 s2 -> #f or integer -string<> s1 s2 -> #f or integer -string< s1 s2 -> #f or integer -string> s1 s2 -> #f or integer -string<= s1 s2 -> #f or integer -string>= s1 s2 -> #f or integer - If the comparison operation is true, the function returns the - mismatch index (as defined for the previous comparator functions). - -string-ci= s1 s2 -> #f or integer -string-ci<> s1 s2 -> #f or integer -string-ci< s1 s2 -> #f or integer -string-ci> s1 s2 -> #f or integer -string-ci<= s1 s2 -> #f or integer -string-ci>= s1 s2 -> #f or integer - Case-insensitive variants. - -substring= s1 start1 end1 s2 start2 end2 -> #f or integer -substring<> s1 start1 end1 s2 start2 end2 -> #f or integer -substring< s1 start1 end1 s2 start2 end2 -> #f or integer -substring> s1 start1 end1 s2 start2 end2 -> #f or integer -substring<= s1 start1 end1 s2 start2 end2 -> #f or integer -substring>= s1 start1 end1 s2 start2 end2 -> #f or integer - -substring-ci= s1 start1 end1 s2 start2 end2 -> #f or integer -substring-ci<> s1 start1 end1 s2 start2 end2 -> #f or integer -substring-ci< s1 start1 end1 s2 start2 end2 -> #f or integer -substring-ci> s1 start1 end1 s2 start2 end2 -> #f or integer -substring-ci<= s1 start1 end1 s2 start2 end2 -> #f or integer -substring-ci>= s1 start1 end1 s2 start2 end2 -> #f or integer - These variants restrict the comparison to the indicated - substrings of S1 and S2. - -string-upper-case? s [start end] -> boolean -string-lower-case? s [start end] -> boolean - STRING-UPPER-CASE? returns true iff the string contains - no lower-case characters. STRING-LOWER-CASE returns true - iff the string contains no upper-case characters. - (string-upper-case? "") => #t - (string-lower-case? "") => #t - (string-upper-case? "FOOb") => #f - (string-upper-case? "U.S.A.") => #t - -capitalize-string s [start end] -> string -capitalize-string! s [start end] -> unspecified - Capitalize the string: upcase the first alphanumeric character, - and downcase the rest of the string. CAPITALIZE-STRING returns - a freshly allocated string. - - (capitalize-string "--capitalize tHIS sentence.") => - "--Capitalize this sentence." - - (capitalize-string "see Spot run. see Nix run.") => - "See spot run. see nix run." - - (capitalize-string "3com makes routers.") => - "3com makes routers." - -capitalize-words s [start end] -> string -capitalize-words! s [start end] -> unspecified - A "word" is a maximal contiguous sequence of alphanumeric characters. - Upcase the first character of every word; downcase the rest of the word. - CAPITALIZE-WORDS returns a freshly allocated string. - - (capitalize-words "HELLO, 3THErE, my nAME IS olin") => - "Hello, 3there, My Name Is Olin" - - More sophisticated capitalisation procedures can be synthesized - using CAPITALIZE-STRING and pattern matchers. In this context, - the REGEXP-SUBSTITUTE/GLOBAL procedure may be useful for picking - out the units to be capitalised and applying CAPITALIZE-STRING to - their components. - -string-upcase s [start end] -> string -string-upcase! s [start end] -> unspecified -string-downcase s [start end] -> string -string-downcase! s [start end] -> unspecified - Raise or lower the case of the alphabetic characters in the string. - STRING-UPCASE and STRING-DOWNCASE return freshly allocated strings. - -string-take s nchars -> string -string-drop s nchars -> string -string-take-right s nchars -> string -string-drop-right s nchars -> string - STRING-TAKE returns the first NCHARS of STRING; - STRING-DROP returns all but the first NCHARS of STRING. - STRING-TAKE-RIGHT returns the last NCHARS of STRING; - STRING-DROP-RIGHT returns all but the last NCHARS of STRING. - These generalise MIT Scheme's HEAD & TAIL functions. - If these procedures produce the entire string, they may return either - S or a copy of S; in some implementations, proper substrings may share - memory with S. - -string-pad s k [char start end] -> string -string-pad-right s k [char start end] -> string - Build a string of length K comprised of S padded on the left (right) - by as many occurences of the character CHAR as needed. If S has more - than K chars, it is truncated on the left (right) to length k. CHAR - defaults to #\space. - - If K is exactly the length of S, these functions may return - either S or a copy of S. - -string-trim s [char/char-set/pred start end] -> string -string-trim-right s [char/char-set/pred start end] -> string -string-trim-both s [char/char-set/pred start end] -> string - Trim S by skipping over all characters on the left / on the right / - on both sides that satisfy the second parameter CHAR/CHAR-SET/PRED: - - If it is a character CHAR, characters equal to CHAR are trimmed. - - If it is a char set CHAR-SET, characters contained in CHAR-SET - are trimmed. - - If it is a predicate PRED, it is a test predicate that is applied - to the characters in S; a character causing it to return true - is skipped. - CHAR/CHAR/SET-PRED defaults to CHAR-SET:WHITESPACE. - - If no trimming occurs, these functions may return either S or a copy of S; - in some implementations, proper substrings may share memory with S. - - (string-trim-both " The outlook wasn't brilliant, \n\r") - => "The outlook wasn't brilliant," - -string-filter s char/char-set/pred [start end] -> string -string-delete s char/char-set/pred [start end] -> string - Filter the string S, retaining only those characters that - satisfy / do not satisfy the CHAR/CHAR-SET/PRED argument. If - this argument is a procedure, it is applied to the character - as a predicate; if it is a char-set, the character is tested - for membership; if it is a character, it is used in an equality test. - - If the string is unaltered by the filtering operation, these - functions may return either S or a copy of S. - -string-index s char/char-set/pred [start end] -> integer or #f -string-index-right s char/char-set/pred [end start] -> integer or #f -string-skip s char/char-set/pred [start end] -> integer or #f -string-skip-right s char/char-set/pred [end start] -> integer or #f - Note the inverted start/end ordering of index-right and skip-right's - parameters. - - Index (index-right) searches through the string from the left (right), - returning the index of the first occurence of a character which - - equals CHAR/CHAR-SET/PRED (if it is a character); - - is in CHAR/CHAR-SET/PRED (if it is a char-set); - - satisfies the predicate CHAR/CHAR-SET/PRED (if it is a procedure). - If no match is found, the functions return false. - - The skip functions are similar, but use the complement of the criteria: - they search for the first char that *doesn't* satisfy the test. E.g., - to skip over initial whitespace, say - (cond ((string-skip s char-set:whitespace) => - (lambda (i) - ;; (string-ref s i) is not whitespace. - ...))) - -string-prefix-count s1 s2 -> integer -string-suffix-count s1 s2 -> integer -string-prefix-count-ci s1 s2 -> integer -string-suffix-count-ci s1 s2 -> integer - Return the length of the longest common prefix/suffix of the two strings. - This is equivalent to the "mismatch index" for the strings. - -substring-prefix-count s1 start1 end1 s2 start2 end2 -> integer -substring-suffix-count s1 start1 end1 s2 start2 end2 -> integer -substring-prefix-count-ci s1 start1 end1 s2 start2 end2 -> integer -substring-suffix-count-ci s1 start1 end1 s2 start2 end2 -> integer - Substring variants. - -string-prefix? s1 s2 -> boolean -string-suffix? s1 s2 -> boolean -string-prefix-ci? s1 s2 -> boolean -string-suffix-ci? s1 s2 -> boolean - Is S1 a prefix/suffix of S2? - -substring-prefix? s1 start1 end1 s2 start2 end2 -> boolean -substring-suffix? s1 start1 end1 s2 start2 end2 -> boolean -substring-prefix-ci? s1 start1 end1 s2 start2 end2 -> boolean -substring-suffix-ci? s1 start1 end1 s2 start2 end2 -> boolean - Substring variants. - -substring? s1 s2 [start end] -> integer or false -substring-ci? s1 s2 [start end] -> integer or false - Return the index in S2 where S1 occurs as a substring, or false. - The returned index is in the range [start,end). - The current implementation uses the Knuth-Morris-Pratt algorithm. - -string-fill! s char [start end] -> unspecified - Store CHAR into the elements of S. - This is the R4RS procedure extended to have optional START/END parameters. - -string-copy! target tstart s [start end] -> unspecified - Copy the sequence of characters from index range [START,END) in - string S to string TARGET, beginning at index TSTART. The characters - are copied left-to-right or right-to-left as needed -- the copy is - guaranteed to work, even if TARGET and S are the same string. - -substring s start [end] -> string -string-copy s [start end] -> string - These R4RS procedures are extended to have optional START/END parameters. - Use STRING-COPY when you want to indicate explicitly in your code that you - wish to allocate new storage; use SUBSTRING when you don't care if you - get a fresh copy or share storage with the original string. - E.g.: - (string-copy "Beta substitution") => "Beta substitution" - (string-copy "Beta substitution" 1 10) - => "eta subst" - (string-copy "Beta substitution" 5) => "substitution" - - SUBSTRING may return a value with shares memory with S. - -string-reverse s [start end] -> string -string-reverse! s [start end] -> unspecific - Reverse the string. - -reverse-list->string char-list -> string - An efficient implementation of (compose string->list reverse): - (reverse-list->string '(#\a #\B #\c)) -> "cBa" - This is a common idiom in the epilog of string-processing loops - that accumulate an answer in a reverse-order list. - -string-concat string-list -> string - Append the elements of STRING-LIST together into a single list. - Guaranteed to return a freshly allocated list. Appears sufficiently - often as to warrant being named. - -string-concat/shared string-list -> string -string-append/shared s ... -> string - These two procedures are variants of STRING-CONCAT and STRING-APPEND - that are permitted to return results that share storage with their - parameters. In particular, if STRING-APPEND/SHARED is applied to just - one argument, it may return exactly that argument, whereas STRING-APPEND - is required to allocate a fresh string. - -string->list s [start end] -> char-list - The R5RS STRING->LIST procedure is extended to take optional START/END - arguments. - -string-null? s -> bool - Is S the empty string? - -xsubstring s from [to start end] -> string - This is the "extended substring" procedure that implements replicated - copying of a substring of some string. - - S is a string; START and END are optional arguments that demarcate - a substring of S, defaulting to 0 and the length of S (e.g., the whole - string). Replicate this substring up and down index space, in both the - positive and negative directions. For example, if S = "abcdefg", START=3, - and END=6, then we have the conceptual bidirectionally-infinite string - ... d e f d e f d e f d e f d e f d e f d e f ... - ... -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 ... - XSUBSTRING returns the substring of this string beginning at index FROM, - and ending at TO (which defaults to FROM+(END-START)). - - You can use XSUBSTRING to perform a variety of tasks: - - To rotate a string left: (xsubstring "abcdef" 2) => "cdefab" - - To rotate a string right: (xsubstring "abcdef" -2) => "efabcd" - - To replicate a string: (xsubstring "abc" 0 7) => "abcabca" - - Note that - - The FROM/TO indices give a half-open range -- the characters from - index FROM up to, but not including, index TO. - - The FROM/TO indices are not in terms of the index space for string S. - They are in terms of the replicated index space of the substring - defined by S, START, and END. - - It is an error if START=END -- although this is allowed by special - dispensation when FROM=TO. - -string-xcopy! target tstart s sfrom [sto start end] -> unspecific - Exactly the same as XSUBSTRING, but the extracted text is written - into the string TARGET starting at index TSTART. - This operation is not defined if (EQ? TARGET S) -- you cannot copy - a string on top of itself. - - -* Lower-level procedures - -The following procedures are useful for writing other string-processing -functions, and are contained in the string-lib-internals package. - -parse-start+end proc s args -> [start end rest] -parse-final-start+end proc s args -> [start end] - PARSE-START+END may be used to parse a pair of optional START/END arguments - from an argument list, defaulting them to 0 and the length of some string - S, respectively. Let the length of string S be SLEN. - - If ARGS = (), the function returns (values 0 slen '()) - - If ARGS = (i), I is checked to ensure it is an integer, and - that 0 <= i <= slen. Returns (values i slen (cdr rest)). - - If ARGS = (i j ...), I and J are checked to ensure they are - integers, and that 0 <= i <= j <= slen. Returns (values i j (cddr rest)). - If any of the checks fail, an error condition is raised, and PROC is used - as part of the error condition -- it should be the name of the client - procedure whose argument list PARSE-START+END is parsing. - - parse-final-start+end is exactly the same, except that the args list - passed to it is required to be of length two or less; if it is longer, - an error condition is raised. It may be used when the optional START/END - parameters are final arguments to the procedure. - -check-substring-spec proc s start end -> unspecific - Check values START and END to ensure they specify a valid substring - in S. This means that START and END are exact integers, and - 0 <= START <= END <= (STRING-LENGTH S) - If this is not the case, an error condition is raised. PROC is used - as part of error condition, and should be the procedure whose START/END - parameters we are checking. - -make-kmp-restart-vector s c= -> vector - Build the Knuth-Morris-Pratt "restart vector," which is useful - for quickly searching character sequences for the occurrence of - string S. C= is a character-equality function used to construct - the restart vector; it is usefully CHAR=? or CHAR-CI=?. - - The definition of the restart vector RV for string S is: - If we have matched chars 0..i-1 of S against some search string SS, and - S[i] doesn't match SS[k], then reset i := RV[i], and try again to - match SS[k]. If RV[i] = -1, then punt SS[k] completely, and move on to - SS[k+1] and S[0]. - - In other words, if you have matched the first i chars of S, but - the i+1'th char doesn't match, RV[i] tells you what the next-longest - prefix of PATTERN is that you have matched. - - The following string-search function shows how a restart vector - is used to search. It can be easily adapted to search other character - sequences (such as ports). - - (define (find-substring pattern source start end) - (let ((plen (string-length pattern)) - (rv (make-kmp-restart-vector pattern char=?))) - - ;; The search loop. SJ & PJ are redundant state. - (let lp ((si start) (pi 0) - (sj (- end start)) ; (- end si) -- how many chars left. - (pj plen)) ; (- plen pi) -- how many chars left. - - (if (= pi plen) (- si plen) ; Win. - - (and (<= pj sj) ; Lose. - - (if (char=? (string-ref source si) ; Search. - (string-ref pattern pi)) - (lp (+ 1 si) (+ 1 pi) (- sj 1) (- pj 1)) ; Advance. - - (let ((pi (vector-ref rv pi))) ; Retreat. - (if (= pi -1) - (lp (+ si 1) 0 (- sj 1) plen) ; Punt. - (lp si pi sj (- plen pi))))))))))