220 lines
9.6 KiB
Scheme
220 lines
9.6 KiB
Scheme
;;; RFC 822 field-parsing code -*- Scheme -*-
|
|
;;; Copyright (c) 1995 by Olin Shivers.
|
|
;;; <shivers@lcs.mit.edu>
|
|
;;;
|
|
;;; Imports and non-R4RS'isms
|
|
;;; downcase-string upcase-string (string->symbol conversion)
|
|
;;; read-crlf-line
|
|
;;; let-optionals, :optional
|
|
;;; receive values (MV return)
|
|
;;; "\r\n" in string for cr/lf
|
|
;;; ascii->char (defining the tab char)
|
|
;;; index
|
|
;;; join-strings (reassembling body lines)
|
|
;;; error
|
|
;;; ? (COND)
|
|
|
|
;;; RFC 822 is the "Standard for the format of ARPA Internet text messages"
|
|
;;; -- the document that essentially tells how the fields in email headers
|
|
;;; (e.g., the Subject: and To: fields) are formatted. This code is for
|
|
;;; parsing these headers. Here are two pointers to the document:
|
|
;;; Emacs/ange /ftp@ftp.internic.net:/rfc/rfc822.txt
|
|
;;; URL ftp://ftp.internic.net/rfc/rfc822.txt
|
|
;;; RFC 822 parsing is useful in other contexts as well -- the HTTP protocol
|
|
;;; uses it, and it tends to pop up here and there.
|
|
;;;
|
|
;;; RFC 822 header syntax has two levels: the general syntax for headers,
|
|
;;; and the syntax for specific headers. For example, once you have figured
|
|
;;; out which chunk of text is the To: line, there are more rules telling
|
|
;;; how to split the To: line up into a list of addresses. Another example:
|
|
;;; lines with dates, e.g., the Date: header, have a specific syntax for
|
|
;;; the time and date.
|
|
;;;
|
|
;;; This code currently *only* provides routines for parsing the gross
|
|
;;; structure -- splitting the message header into its distinct fields.
|
|
;;; It would be nice to provide the finer-detail parsers, too. You do it.
|
|
;;; -Olin
|
|
|
|
;;; A note on line-terminators:
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; Line-terminating sequences are always a drag, because there's no agreement
|
|
;;; on them -- the Net protocols and DOS use cr/lf; Unix uses lf; the Mac
|
|
;;; uses cr. One one hand, you'd like to use the code for all of the above,
|
|
;;; on the other, you'd also like to use the code for strict applications
|
|
;;; that need definitely not to recognise bare cr's or lf's as terminators.
|
|
;;;
|
|
;;; RFC 822 requires a cr/lf (carriage-return/line-feed) pair to terminate
|
|
;;; lines of text. On the other hand, careful perusal of the text shows up
|
|
;;; some ambiguities (there are maybe three or four of these, and I'm too
|
|
;;; lazy to write them all down). Furthermore, it is an unfortunate fact
|
|
;;; that many Unix apps separate lines of RFC 822 text with simple linefeeds
|
|
;;; (e.g., messages kept in /usr/spool/mail). As a result, this code takes a
|
|
;;; broad-minded view of line-terminators: lines can be terminated by either
|
|
;;; cr/lf or just lf, and either terminating sequence is trimmed.
|
|
;;;
|
|
;;; If you need stricter parsing, you can call the lower-level procedure
|
|
;;; %READ-RFC-822-FIELD and %READ-RFC822-HEADERS procs. They take the
|
|
;;; read-line procedure as an extra parameter. This means that you can
|
|
;;; pass in a procedure that recognises only cr/lf's, or only cr's (for a
|
|
;;; Mac app, perhaps), and you can determine whether or not the terminators
|
|
;;; get trimmed. However, your read-line procedure must indicate the
|
|
;;; header-terminating empty line by returning *either* the empty string or
|
|
;;; the two-char string cr/lf (or the EOF object).
|
|
|
|
;;; (read-rfc822-field [port])
|
|
;;; (%read-rfc822-field read-line port)
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; Read one field from the port, and return two values [NAME BODY]:
|
|
;;; - NAME Symbol such as 'subject or 'to. The field name is converted
|
|
;;; to a symbol using the Scheme implementation's preferred
|
|
;;; case. If the implementation reads symbols in a case-sensitive
|
|
;;; fashion (e.g., scsh), lowercase is used. This means you can
|
|
;;; compare these symbols to quoted constants using EQ?. When
|
|
;;; printing these field names out, it looks best if you capitalise
|
|
;;; them with (CAPITALIZE-STRING (SYMBOL->STRING FIELD-NAME)).
|
|
;;; - BODY List of strings which are the field's body, e.g.
|
|
;;; ("shivers@lcs.mit.edu"). Each list element is one line
|
|
;;; from the field's body, so if the field spreads out
|
|
;;; over three lines, then the body is a list of three
|
|
;;; strings. The terminating cr/lf's are trimmed from each
|
|
;;; string. A leading space or a leading horizontal tab
|
|
;;; is also trimmed, but one and only one.
|
|
;;; When there are no more fields -- EOF or a blank line has terminated the
|
|
;;; header section -- then the procedure returns [#f #f].
|
|
;;;
|
|
;;; The %READ-RFC822-FIELD variant allows you to specify your own
|
|
;;; read-line procedure. The one used by READ-RFC822-FIELD terminates
|
|
;;; lines with either cr/lf or just lf, and it trims the terminator
|
|
;;; from the line. Your read-line procedure should trim the terminator
|
|
;;; of a line so an empty line is returned just as an empty string.
|
|
|
|
(define htab (ascii->char 9))
|
|
|
|
;;; Convert to a symbol using the Scheme implementation's preferred case,
|
|
;;; so we can compare these things against quoted constants.
|
|
(define string->symbol-pref
|
|
(if (char=? #\a (string-ref (symbol->string 'a) 0)) ; Is it #\a or #\A?
|
|
(lambda (s) (string->symbol (downcase-string s)))
|
|
(lambda (s) (string->symbol (upcase-string s)))))
|
|
|
|
(define (read-rfc822-field . maybe-port)
|
|
(let-optionals maybe-port ((port (current-input-port)))
|
|
(%read-rfc822-field read-crlf-line port)))
|
|
|
|
(define (%read-rfc822-field read-line port)
|
|
(let ((line1 (read-line port)))
|
|
(if (or (eof-object? line1)
|
|
(zero? (string-length line1))
|
|
(string=? line1 "\r\n")) ; In case read-line doesn't trim.
|
|
|
|
(values #f #f) ; Blank line or EOF terminates header text.
|
|
|
|
(cond
|
|
((string-index line1 #\:) => ; Find the colon and
|
|
(lambda (colon) ; split out field name.
|
|
(let ((name (string->symbol-pref (substring line1 0 colon))))
|
|
;; Read in continuation lines.
|
|
(let lp ((lines (list (substring line1
|
|
(+ colon 1)
|
|
(string-length line1)))))
|
|
(let ((c (peek-char port))) ; Could return EOF.
|
|
;;; RFC822: continuous lines has to start with a space or a htab
|
|
(if (or (eqv? c #\space) (eqv? c htab))
|
|
(lp (cons (read-line port) lines))
|
|
(values name (reverse lines))))))))
|
|
(else (error "Illegal RFC 822 field syntax." line1)))))) ; No :
|
|
|
|
|
|
;;; (read-rfc822-headers [port])
|
|
;;; (%read-rfc822-headers read-line port)
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; Read in and parse up a section of text that looks like the header portion
|
|
;;; of an RFC 822 message. Return an alist mapping a field name (a symbol
|
|
;;; such as 'date or 'subject) to a list of field bodies -- one for
|
|
;;; each occurence of the field in the header. So if there are five
|
|
;;; "Received-by:" fields in the header, the alist maps 'received-by
|
|
;;; to a five element list. Each body is in turn represented by a list
|
|
;;; of strings -- one for each line of the field. So a field spread across
|
|
;;; three lines would produce a three element body.
|
|
;;;
|
|
;;; The %READ-RFC822-HEADERS variant allows you to specify your own read-line
|
|
;;; procedure. See notes above for reasons why.
|
|
|
|
(define (read-rfc822-headers . maybe-port)
|
|
(let-optionals maybe-port ((port (current-input-port)))
|
|
(%read-rfc822-headers read-crlf-line port)))
|
|
|
|
(define (%read-rfc822-headers read-line port)
|
|
(let lp ((alist '()))
|
|
(receive (field val) (%read-rfc822-field read-line port)
|
|
(cond (field (cond ((assq field alist) =>
|
|
(lambda (entry)
|
|
(set-cdr! entry (cons val (cdr entry)))
|
|
(lp alist)))
|
|
(else (lp (cons (list field val) alist)))))
|
|
|
|
;; We are done. Reverse the order of each entry and return.
|
|
(else (for-each (lambda (entry)
|
|
(set-cdr! entry (reverse (cdr entry))))
|
|
alist)
|
|
alist)))))
|
|
|
|
;;; (rejoin-header-lines alist [separator])
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; Takes a field alist such as is returned by READ-RFC822-HEADERS and
|
|
;;; returns an equivalent alist. Each body (string list) in the input alist
|
|
;;; is joined into a single list in the output alist. SEPARATOR is the
|
|
;;; string used to join these elements together; it defaults to a single
|
|
;;; space " ", but can usefully be "\n" or "\r\n".
|
|
;;;
|
|
;;; To rejoin a single body list, use scsh's JOIN-STRINGS procedure.
|
|
|
|
(define (rejoin-header-lines alist . maybe-separator)
|
|
(let-optionals maybe-separator ((sep " "))
|
|
(map (lambda (entry)
|
|
(cons (car entry)
|
|
(map (lambda (body) (string-join (list body sep)))
|
|
(cdr entry))))
|
|
alist)))
|
|
|
|
|
|
;;; Given a set of RFC822 headers like this:
|
|
;;; From: shivers
|
|
;;; To: ziggy,
|
|
;;; newts
|
|
;;; To: gjs, tk
|
|
;;;
|
|
;;; We have the following definitions:
|
|
;;; (get-header-all hdrs 'to) -> ((" ziggy," " newts") (" gjs, tk"))
|
|
;;; - All entries, or #f
|
|
;;; (get-header-lines hdrs 'to) -> (" ziggy," " newts")
|
|
;;; - All lines of the first entry, or #f.
|
|
;;; (get-header hdrs 'to) -> "ziggy,\n newts"
|
|
;;; - First entry, with the lines joined together by newlines.
|
|
|
|
(define (get-header-all headers name)
|
|
(let ((entry (assq name headers)))
|
|
(and entry (cdr entry))))
|
|
|
|
(define (get-header-lines headers name)
|
|
(let ((entry (assq name headers)))
|
|
(and entry
|
|
(pair? entry)
|
|
(cadr entry))))
|
|
|
|
(define (get-header headers name . maybe-sep)
|
|
(let ((entry (assq name headers)))
|
|
(and entry
|
|
(pair? entry)
|
|
(string-join (list (cadr entry)
|
|
(:optional maybe-sep "\n"))))))
|
|
|
|
|
|
|
|
;;; Other desireable functionality
|
|
;;; - Unfolding long lines.
|
|
;;; - Lexing structured fields.
|
|
;;; - Unlexing structured fields into canonical form.
|
|
;;; - Parsing and unparsing dates.
|
|
;;; - Parsing and unparsing addresses.
|