sunet/scheme/lib/rfc822.scm

172 lines
6.8 KiB
Scheme

;;; RFC 822 field-parsing code
;;; This file is part of the Scheme Untergrund Networking package.
;;; Copyright (c) 1995 by Olin Shivers <shivers@lcs.mit.edu>
;;; For copyright information, see the file COPYING which comes with
;;; the distribution.
;;; RFC 822 is the "Standard for the format of ARPA Internet text messages"
;;; -- the document that essentially tells how the fields in email headers
;;; (e.g., the Subject: and To: fields) are formatted. This code is for
;;; parsing these headers.
;;; Here is a pointer to the document:
;;; http://www.ietf.org/rfc/rfc0822.txt
;;; RFC 822 parsing is useful in other contexts as well -- the HTTP protocol
;;; uses it, and it tends to pop up here and there.
;;;
;;; RFC 822 header syntax has two levels: the general syntax for headers,
;;; and the syntax for specific headers. For example, once you have figured
;;; out which chunk of text is the To: line, there are more rules telling
;;; how to split the To: line up into a list of addresses. Another example:
;;; lines with dates, e.g., the Date: header, have a specific syntax for
;;; the time and date.
;;;
;;; This code currently *only* provides routines for parsing the gross
;;; structure -- splitting the message header into its distinct fields.
;;; It would be nice to provide the finer-detail parsers, too. You do it.
;;; -Olin
;;; A note on line-terminators:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Line-terminating sequences are always a drag, because there's no agreement
;;; on them -- the Net protocols and DOS use cr/lf; Unix uses lf; the Mac
;;; uses cr. One one hand, you'd like to use the code for all of the above,
;;; on the other, you'd also like to use the code for strict applications
;;; that need definitely not to recognise bare cr's or lf's as terminators.
;;;
;;; RFC 822 requires a cr/lf (carriage-return/line-feed) pair to terminate
;;; lines of text. On the other hand, careful perusal of the text shows up
;;; some ambiguities (there are maybe three or four of these, and I'm too
;;; lazy to write them all down). Furthermore, it is an unfortunate fact
;;; that many Unix apps separate lines of RFC 822 text with simple linefeeds
;;; (e.g., messages kept in /usr/spool/mail). As a result, this code takes a
;;; broad-minded view of line-terminators: lines can be terminated by either
;;; cr/lf or just lf, and either terminating sequence is trimmed.
;;;
;;; If you need stricter parsing, you can pass a read-line procedure
;;; as an extra parameter. This means that you can pass in a procedure
;;; that recognizes only cr/lf's, or only cr's (for a Mac app,
;;; perhaps), and you can determine whether or not the terminators get
;;; trimmed. However, your read-line procedure must indicate the
;;; header-terminating empty line by returning *either* the empty
;;; string or the two-char string cr/lf (or the EOF object).
(define htab (ascii->char 9))
;;; Convert to a symbol using the Scheme implementation's preferred case,
;;; so we can compare these things against quoted constants.
(define string->symbol-pref
(if (char=? #\a (string-ref (symbol->string 'a) 0)) ; Is it #\a or #\A?
(lambda (s) (string->symbol (string-map char-downcase s)))
(lambda (s) (string->symbol (string-map char-upcase s)))))
(define (read-rfc822-field . args)
(let-optionals args ((port (current-input-port))
(read-line read-crlf-line))
(let ((line1 (read-line port)))
(if (or (eof-object? line1)
(zero? (string-length line1))
(string=? line1 "\r\n")) ; In case read-line doesn't trim.
(values #f #f) ; Blank line or EOF terminates header text.
(cond
((string-index line1 #\:) => ; Find the colon and
(lambda (colon) ; split out field name.
(let ((name (string->symbol-pref (substring line1 0 colon))))
;; Read in continuation lines.
(let lp ((lines (list (substring line1
(+ colon 1)
(string-length line1)))))
(let ((c (peek-char port))) ; Could return EOF.
;; RFC822: continuous lines has to start with a space or a htab
(if (or (eqv? c #\space) (eqv? c htab))
(lp (cons (read-line port) lines))
(values name (reverse lines))))))))
(else (error "Illegal RFC 822 field syntax." line1))))))) ; No :
(define (read-rfc822-headers . args)
(let-optionals args ((port (current-input-port))
(read-line read-crlf-line))
(let lp ((alist '()))
(receive (field val) (read-rfc822-field port read-line)
(cond (field (cond ((assq field alist) =>
(lambda (entry)
(set-cdr! entry (cons val (cdr entry)))
(lp alist)))
(else (lp (cons (list field val) alist)))))
;; We are done. Reverse the order of each entry and return.
(else (for-each (lambda (entry)
(set-cdr! entry (reverse (cdr entry))))
alist)
alist))))))
;;; (rejoin-header-lines alist [separator])
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Takes a field alist such as is returned by READ-RFC822-HEADERS and
;;; returns an equivalent alist. Each body (string list) in the input alist
;;; is joined into a single list in the output alist. SEPARATOR is the
;;; string used to join these elements together; it defaults to a single
;;; space " ", but can usefully be "\n" or "\r\n".
;;;
;;; To rejoin a single body list, use scsh's STRING-JOIN procedure.
(define (rejoin-header-lines alist . maybe-separator)
(let-optionals maybe-separator ((sep " "))
(map (lambda (entry)
(cons (car entry)
(map (lambda (body) (string-join body sep))
(cdr entry))))
alist)))
;;; Given a set of RFC822 headers like this:
;;; From: shivers
;;; To: ziggy,
;;; newts
;;; To: gjs, tk
;;;
;;; We have the following definitions:
;;; (get-header-all hdrs 'to) -> ((" ziggy," " newts") (" gjs, tk"))
;;; - All entries, or #f
;;; (get-header-lines hdrs 'to) -> (" ziggy," " newts")
;;; - All lines of the first entry, or #f.
;;; (get-header hdrs 'to) -> "ziggy,\n newts"
;;; - First entry, with the lines joined together by newlines.
(define (get-header-all headers name)
(let ((entry (assq name headers)))
(and entry (cdr entry))))
(define (get-header-lines headers name)
(let ((entry (assq name headers)))
(and entry
(pair? entry)
(cadr entry))))
(define (get-header headers name . maybe-sep)
(let ((entry (assq name headers)))
(and entry
(pair? entry)
(string-join (cadr entry)
(:optional maybe-sep "\n")))))
(define (delete-headers headers name)
(fold (lambda (entry rest)
(if (eq? (car entry) name)
rest
(cons entry rest)))
'()
headers))
;;; Other desirable functionality
;;; - Unfolding long lines.
;;; - Lexing structured fields.
;;; - Unlexing structured fields into canonical form.
;;; - Parsing and unparsing dates.
;;; - Parsing and unparsing addresses.