scsh-0.6/scsh/stringcoll.scm

167 lines
6.4 KiB
Scheme
Raw Normal View History

2003-05-01 06:21:33 -04:00
;;; Copyright (c) 1994 by Olin Shivers
;;; String collectors
;;; ===========================================================================
;;; string-colllector
;;; (make-string-collector)
;;; (collect-string! SC S)
;;; (clear-string-collector! SC)
;;; (string-collector->string SC)
;;;
;;; A string collector is a data structure that reduces the overhead of
;;; accumulating a large string in bits and pieces. It is basically a
;;; "chunk list," where a chunk is a string of at least 128 chars. In this
;;; way, the list overhead is kept under 2% of the whole data structure.
;;; When a new string is added to the collection, it is added to the current
;;; chunk. When the chunk reaches 128 chars, it is added to the chunk list,
;;; and a new chunk is started. If a large string is added to the collection,
;;; it is added as a chunk itself, so large strings are not split into small
;;; pieces. (Actually, a *copy* of the original large string is saved as a
;;; single chunk, so the collector's chunks are not shared with client data.)
;;;
;;; MAKE-STRING-COLLECTOR allocates a new string collector data structure.
;;; COLLECT-STRING! appends a string to the current collection.
;;; CLEAR-STRING-COLLECTOR! clears out accumulated strings from a collector.
;;; STRING-COLLECTOR->STRING converts a collector into a contiguous string.
;;;
;;; This facility makes it reasonably efficient to accumulate strings
;;; of any size in increments of any size.
(define-record string-collector
(len 0) ; How many chars have we accumulated?
(chunks '()) ; The chunk list.
(chunk #f) ; The current chunk being filled, if any.
(chunk-left 0)) ; How many chars left to fill in the current chunk.
(define (clear-string-collector! sc)
(set-string-collector:len sc 0)
(set-string-collector:chunks sc '())
(set-string-collector:chunk sc #f)
sc)
;;; (COLLECT-STRING! sc s)
;;; ----------------------
;;; S is a string. Append it to the string being collected in the
;;; string-collector SC.
;;;
;;; The algorithm:
;;; First, do nothing if S is the empty string. Otherwise:
;;; If there is a current chunk:
;;; Copy characters from S into it.
;;; If we filled up the chunk
;;; Put the chunk on the chunk list.
;;; Look at the remaining chars from S we haven't copied yet.
;;; If there a lot of characters left (>= 128)
;;; Save them as a single chunk on the chunk list.
;;; No current chunk.
;;; Else if there a just a few characters left (> 0, < 128)
;;; Start a new current chunk, copy the chars left into it.
;;; Else if there aren't any characters left
;;; No current chunk.
;;;
;;; If there is no current chunk:
;;; If there are a lot of characters in S (>= 128)
;;; Save a copy of S as a single chunk on the chunk list.
;;; Still no current chunk.
;;; Else if there are a few characters in S (> 0, < 128)
;;; Start a new current chunk, copy the S into it.
(define (collect-string! sc s)
(let ((slen (string-length s))
(chunk (string-collector:chunk sc))
(chunk-left (string-collector:chunk-left sc))
;; Add the chunk C to the collector's chunk list.
(push-chunk! (lambda (c)
(set-string-collector:chunks sc
(cons c (string-collector:chunks sc)))))
;; Copy nchars characters from src[j] to dest[i]
;; *Way* too much bounds checking going on in this loop.
(copy-substring! (lambda (dest i src j nchars)
(do ((i i (+ i 1))
(j j (+ j 1))
(nchars nchars (- nchars 1)))
((zero? nchars))
(string-set! dest i (string-ref src j))))))
(cond ((zero? slen)) ; Empty string, do nothing.
(chunk
(let ((ncopy (min slen chunk-left)))
(copy-substring! chunk (- 128 chunk-left) s 0 ncopy)
(if (> chunk-left slen)
(set-string-collector:chunk-left sc (- chunk-left slen))
;; Current chunk is full.
(let ((s-left (- slen chunk-left)))
(push-chunk! chunk) ; Push the current chunk.
;; Handle remaining chars from S that weren't copied into
;; the current chunk we just pushed:
(cond ((>= s-left 128)
;; A lot more chars left. Push them as one chunk.
(push-chunk! (substring s chunk-left slen))
(set-string-collector:chunk sc #f))
((> s-left 0)
;; A few more chars left. Start a new chunk.
(let ((new-chunk (make-string 128)))
(copy-substring! new-chunk 0 s chunk-left s-left)
(set-string-collector:chunk sc new-chunk)
(set-string-collector:chunk-left sc
(- 128 s-left))))
;; No more chars left. No current chunk.
(else (set-string-collector:chunk sc #f)))))))
(else ; No current chunk.
(if (>= slen 128) ; How many chars is S?
(push-chunk! (string-copy s)) ; A lot. Push as one chunk.
(let ((chunk (make-string 128))) ; Not many. Start a new chunk.
(set-string-collector:chunk sc chunk)
(copy-substring! chunk 0 s 0 slen)
(set-string-collector:chunk-left sc (- 128 slen))))))
;; We don't actually do anything with this, but we keep it updated anyway.
(set-string-collector:len sc (+ (string-collector:len sc) slen))
sc))
;;; A bummed version for collecting a single character.
(define (collect-char! sc c)
(let ((chunk (string-collector:chunk sc))
(chunk-left (string-collector:chunk-left sc)))
(cond (chunk
(string-set! chunk (- 128 chunk-left) c)
(cond ((> chunk-left 1)
(set-string-collector:chunk-left sc (- chunk-left 1)))
(else
(set-string-collector:chunks sc
(cons chunk (string-collector:chunks sc)))
(set-string-collector:chunk sc #f))))
(else
(let ((new-chunk (make-string 128 c)))
(set-string-collector:chunk-left sc 127)
(set-string-collector:chunk sc new-chunk)))))
;; We don't actually do anything with this, but we keep it updated anyway.
(set-string-collector:len sc (+ (string-collector:len sc) 1))
sc)
;;; Convert the data in the string-collector SC to a single contiguous
;;; string and return it.
(define (string-collector->string sc)
(let ((chunk (string-collector:chunk sc))
(chunks (string-collector:chunks sc)))
(apply string-append
(reverse (if chunk
(cons (substring chunk 0
(- 128
(string-collector:chunk-left sc)))
chunks)
chunks)))))
;;; It's too bad we can't side-effect the string-collector's chunk list
;;; to be a single chunk after this coalescing operation, but we don't
;;; want to share the string we return -- the user might side-effect it.