scsh-0.5/scsh/rdelim.scm

301 lines
11 KiB
Scheme

;;; Delimited readers
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; These procedures run their inner I/O loop in a C primitive, so they
;;; should be quite fast.
;;;
;;; N.B.:
;;; The C primitives %READ-DELIMITED-FDPORT!/ERRNO and
;;; %SKIP-CHAR-SET-FDPORT/ERRNO rely on knowing the representation of
;;; character sets. If these are changed from their current representation,
;;; this code must be changed as well.
;;; (read-delimited delims [port delim-action])
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Returns a string or the EOF object. DELIM-ACTION determines what to do
;;; with the terminating delimiter:
;;; - PEEK
;;; Leave it in the input stream for later reading.
;;; - TRIM (the default)
;;; Drop it on the floor.
;;; - CONCAT
;;; Append it to the returned string.
;;; - SPLIT
;;; Return it as a second return value.
;;;
;;; We repeatedly allocate a buffer and fill it with READ-DELIMITED!
;;; until we hit a delimiter or EOF. Each time through the loop, we
;;; double the total buffer space, so the loop terminates with a log
;;; number of reads, but uses at most double the optimal buffer space.
(define (read-delimited delims . args)
(let-optionals args ((port (current-input-port))
(delim-action 'trim))
(let ((substr (lambda (s end) ; Smart substring.
(if (= end (string-length s)) s
(substring s 0 end))))
(delims (->char-set delims))
(gobble? (not (eq? delim-action 'peek))))
;; BUFLEN is total amount of buffer space allocated to date.
(let lp ((strs '()) (buflen 80) (buf (make-string 80)))
(receive (terminator num-read)
(%read-delimited! delims buf gobble? port)
(if terminator
;; We are done. NUM-READ is either a read count or EOF.
(let ((retval (if (and (zero? num-read)
(eof-object? terminator)
(null? strs))
terminator ; EOF -- got nothing.
;; Got something. Stick all the strings
;; together, plus the terminator if the
;; client said 'CONCAT.
(let ((s (substr buf num-read)))
(cond ((and (eq? delim-action 'concat)
(char? terminator))
(apply string-append
(reverse `(,(string terminator)
,s . ,strs))))
((null? strs) s) ; Gratuitous opt.
(else (apply string-append
(reverse (cons s strs)))))))))
(if (eq? delim-action 'split)
(values retval terminator)
retval))
;; We are not done. Loop and read in some more.
(lp (cons buf strs)
(+ buflen buflen)
(make-string buflen))))))))
;;; (read-delimited! delims buf [port delim-action start end])
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Returns:
;;; - EOF if at end of file, and a non-zero read was requested.
;;; - Integer j if that many chars read into BUF.
;;; - #f if the buffer was filled w/o finding a delimiter.
;;;
;;; DELIM-ACTION determines what to do with the terminating delimiter;
;;; it is as in READ-DELIMITED.
;;;
;;; In determining the return value, there is an ambiguous case: when the
;;; buffer is full, *and* the following char is a delimiter char or EOF.
;;; Ties are broken favoring termination over #f -- after filling the buffer,
;;; READ-DELIMITED! won't return #f until it has peeked one past the end
;;; of the buffer to ensure the next char doesn't terminate input (or is EOF).
;;; However, this rule is relaxed with delim-action = CONCAT -- if the buffer
;;; is full, READ-DELIMITED! won't wait around trying to peek at the following
;;; char to determine whether or not it is a delimiter char, since it doesn't
;;; have space to store the character anyway. It simply immediately returns #f;
;;; a following read can pick up the delimiter char.
(define (read-delimited! delims buf . args) ; [port delim-action start end]
(let-optionals args ((port (current-input-port))
(delim-action 'trim)
(start 0)
(end (string-length buf)))
(receive (terminator num-read)
(%read-delimited! delims buf
(not (eq? delim-action 'peek)) ;Gobble delim?
port
start
(if (eq? delim-action 'concat)
(- end 1) ; Room for terminator.
end))
(if terminator ; Check for buffer overflow.
(let ((retval (if (and (zero? num-read)
(eof-object? terminator))
terminator ; EOF -- got nothing.
num-read))) ; Got something.
(case delim-action
((peek trim) retval)
((split) (values retval terminator))
((concat) (cond ((char? terminator)
(string-set! buf (+ start num-read) terminator)
(+ num-read 1))
(else retval)))))
;; Buffer overflow.
(case delim-action
((peek trim) #f)
((split) (values #f #f))
((concat) (let ((last (read-char port)))
(if (char? last)
(string-set! buf (+ start num-read) last))
(and (or (eof-object? last)
(char-set-contains? (->char-set delims)
last))
(+ num-read 1)))))))))
;;; (%read-delimited! delims buf gobble? [port start end])
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; This low-level routine uses a different interface. It returns two values:
;;; - TERMINATOR: A value describing why the read was terminated:
;;; + character or eof-object => read terminated by this value;
;;; + #f => filled buffer w/o terminating read.
;;; - NUM-READ: Number of chars read into buf.
;;;
;;; Note:
;;; - Invariant: TERMINATOR = #f => NUM-READ = END - START.
;;; - Invariant: TERMINATOR = eof-object and NUM-READ = 0 => at EOF.
;;; - When determining the TERMINATOR return value, ties are broken
;;; favoring character or the eof-object over #f. That is, if the buffer
;;; fills up, %READ-DELIMITED! will peek at one more character from the
;;; input stream to determine if it terminates the input. If so, that
;;; is returned, not #f.
;;;
;;; If GOBBLE? is true, then a terminator character is removed from
;;; the input stream. Otherwise, it is left in place for a following input
;;; operation.
(define (%read-delimited! delims buf gobble? . args)
(let-optionals args ((port (current-input-port))
(start 0)
(end (string-length buf)))
(check-arg input-port? port %read-delimited!) ; Arg checking.
(check-arg char-set? delims %read-delimited!) ; Required, since
(if (bogus-substring-spec? buf start end) ; we're calling C.
(error "Illegal START/END substring indices"
buf start end %read-delimited!))
(let* ((delims (->char-set delims))
(sdelims (char-set:s delims)))
(if (fdport? port)
;; Direct C support for Unix file ports -- zippy quick.
(let lp ((start start) (total 0))
(receive (terminator num-read)
(%read-delimited-fdport!/errno sdelims buf gobble?
port start end)
(let ((total (+ num-read total)))
(cond ((not (integer? terminator)) (values terminator total))
((= terminator errno/intr) (lp (+ start num-read) total))
(else (errno-error terminator %read-delimited!
num-read total
delims buf gobble? port start end))))))
;; This is the code for other kinds of ports.
;; Mighty slow -- we read each char twice (peek first, then read).
(let lp ((i start))
(let ((c (peek-char port)))
(cond ((or (eof-object? c) ; Found terminating char or eof
(char-set-contains? delims c))
(if gobble? (read-char port))
(values c (- i start)))
((>= i end) ; Filled the buffer.
(if gobble? (read-char port))
(values #f (- i start)))
(else (string-set! buf i (read-char port))
(lp (+ i 1))))))))))
(foreign-source
"#include <sys/types.h>"
""
"/* Make sure foreign-function stubs interface to the C funs correctly: */"
"#include \"fdports1.h\""
"" "")
(define-foreign %read-delimited-fdport!/errno (read_delim (string delims)
(var-string buf)
(bool gobble?)
(desc port)
(fixnum start)
(fixnum end))
desc ; int => errno; char => terminating char; eof-object; #f => buf ovflow
fixnum) ; number of chars read into BUF.
(define-foreign %skip-char-set-fdport/errno (skip_chars (string skip-set)
(desc port))
desc ; int => errno; #f => win.
fixnum) ; number of chars skipped.
(define (skip-char-set skip-chars . maybe-port)
(let* ((port (:optional maybe-port (current-input-port)))
(cset (->char-set skip-chars))
(scset (char-set:s cset)))
(cond ((not (input-port? port))
(error "Illegal value -- not an input port." port))
;; Direct C support for Unix file ports -- zippy quick.
((fdport? port)
(let lp ((total 0))
(receive (err num-read) (%skip-char-set-fdport/errno scset port)
(let ((total (+ total num-read)))
(cond ((not err) total)
((= errno/intr err) (lp total))
(errno-error err skip-char-set cset port total))))))
;; This is the code for other kinds of ports.
;; Mighty slow -- we read each char twice (peek first, then read).
(else (let lp ((i 0))
(let ((c (peek-char port)))
(cond ((and (char? c) (char-set-contains? cset c))
(read-char port)
(lp (+ i 1)))
(else i))))))))
;;; (read-line [port delim-action])
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Read in a line of data. Input is terminated by either a newline or EOF.
;;; The newline is trimmed from the string by default.
(define charset:newline (char-set #\newline))
(define (read-line . rest) (apply read-delimited charset:newline rest))
;;; (read-paragraph [port handle-delim])
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define blank-line-regexp (rx bos (* white) #\newline eos))
(define (read-paragraph . args)
(let-optionals args ((port (current-input-port))
(handle-delim 'trim))
;; First, skip all blank lines.
(let lp ()
(let ((line (read-line port 'concat)))
(cond ((eof-object? line)
(if (eq? handle-delim 'split) (values line line) line))
((regexp-search? blank-line-regexp line) (lp))
;; Then, read in non-blank lines.
(else
(let lp ((lines (list line)))
(let ((line (read-line port 'concat)))
(if (and (string? line)
(not (regexp-search? blank-line-regexp line)))
(lp (cons line lines))
;; Return the paragraph
(let ((->str (lambda (lns) (apply string-append (reverse lns)))))
(case handle-delim
((trim) (->str lines))
((concat)
(->str (if (eof-object? line) lines (cons line lines))))
((split)
(values (->str lines) line))
(else (error "Illegal HANDLE-DELIM parameter to READ-PARAGRAPH")))))))))))))