;;; HTTP 1.1 Request-URI parsing and unparsing -*- Scheme -*- ;;; Copyright (c) 1995 by Olin Shivers. ;;; For copyright information, see the file COPYING which comes with ;;; the distribution. ;;; References: ;;; RFC 2616 Hypertext Transfer Protocol -- HTTP/1.1 ;;; RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax ;;; ;;; RFC 2616 adopts definitions of regexps from RFC 2396. ;;; RFC 2616 is ambiguous in defining Request_URIS: ;;; ;;; section 5.1.2 states: ;;; HTTP 1.1 Request-URIS are of the form ;;; Request-URI = "*" | absoluteURI | abs_path | authority ;;; ;;; whilst section 3.2.2 defines the 'http_URL' ;;; http_URL = "http://" host [ ":" port ] [ abs_path [ "?" query ]] ;;; Since allowing for general absoluteURIs doesn't make too much sense ;;; we implement only Request_URIs as follows: ;;; Request-URI = ( http_URL | abs_path) ["#" fragment] ;;; ;;; where http_URL is a subset of absoluteURI ;;; [ "#" fragment ] is allowed even though ;;; RFC 2616 disallowes the #fragment part ;;; (while RFC 1945 for HTTP/1.0 allowed it). ;;; (This is for compatibility with buggy clients). ;;; RexExps for Request-URIs as scsh SREs ;;; stick to RFC terminology throughout (define digit (rx numeric)) (define alpha (rx alphanum)) (define alphanum (rx alphanumeric)) (define hex (rx hex-digit)) (define escaped (rx "%" ,hex ,hex)) (define mark (rx ( "-_.!~*'()"))) (define unreserved (rx (| ,alphanum ,mark))) (define reserved (rx ( ";/?:@&=+$,"))) (define uric (rx (| ,reserved ,unreserved ,escaped))) (define fragment (rx (* ,uric))) (define query (rx (* ,uric))) (define pchar-charset (rx ( ":@&=+$,"))) (define pchar (rx (| ,unreserved ,escaped ,pchar-charset))) (define param (rx (* ,pchar))) (define segment (rx (: (* ,pchar) (* (: ";" ,param))))) (define path-segments (rx (: ,segment (* (: "/" ,segment))))) (define abs_path (rx (: "/" ,path-segments))) (define port (rx (* ,digit))) (define IPv4address (rx (+ ,digit) "." (+ ,digit) "." (+ ,digit) "." (+ ,digit))) (define toplabel (rx (: (| ,alpha (: ,alpha (* (| ,alphanum "-")) ,alphanum))))) (define domainlabel (rx (: (| ,alphanum (: ,alphanum (* (| ,alphanum "-")) ,alphanum))))) (define hostname (rx (: (* (: ,domainlabel ".")) ,toplabel (? ".")))) (define host (rx (| ,hostname ,IPv4address))) (define http_URL (rx (: "http://" (submatch ,host) (? (: ":" (submatch ,port))) (? (: (submatch ,abs_path) (? (: "?" (submatch ,query)))))))) (define http_URL_with_frag (rx (: bos ,@http_URL (? "#" ,fragment) eos))) (define abs_path_with_frag (rx (: bos (submatch ,abs_path) (? "#" ,fragment) eos))) (define Request-URI (rx (| ,@http_URL_with_frag ,@abs_path_with_frag))) ;;parse HTTP 1.1 Request-URI (define (parse-uri request-uri) (cond ((regexp-search abs_path_with_frag request-uri) => (lambda (match) (values #f #f (split-abs-path (match:substring match 1)) #f))) ((regexp-search http_URL_with_frag request-uri) =>(lambda (match) (let ((host (match:substring match 1)) (port (match:substring match 2)) (path (split-abs-path (match:substring match 3))) (query (match:substring match 4))) (values host port path query)))) (else (fatal-syntax-error "Request-URI syntactically faulty")))) ;; split the string abs-path at slashes, return list of segments. ;; SPLIT-PATH assumes abs-path matches the RegExp abs_path, no checks are done. ;; minor remark: abs_path allows for strings containing several consecutive slashes; ;; SPLIT-ABS-PATH treats them as one slash. (define (split-abs-path abs-path) (regexp-fold-right (rx (+ (~ ("/")))) (lambda (match i res) (cons (match:substring match 0) res)) '() abs-path)) ;;; HTTP URL parsing ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; The PATH slot of this record is the URL's path split at slashes, ;;; e.g., "foo/bar//baz/" => ("foo" "bar" "" "baz" "") ;;; These elements are in raw, unescaped format. To convert back to ;;; a string, use (uri-path->uri (map escape-uri pathlist)). (define-record-type http-url :http-url (make-http-url server path search fragment-identifier) http-url? (server http-url-server) ; Initial //anonymous@clark.lcs.mit.edu:80/ (path http-url-path) ; Rest of path, split at slashes & decoded. (search http-url-search) (fragment-identifier http-url-fragment-identifier)) ;(define-new-record-type http-url :http-url ; (make-http-url hostname port path query) ; http-url? ; (hostname http-url-hostname) ; (port http-url-port) ; (path http-url-path) ; (query http-url-query)) ;;; The URI parser (parse-uri in uri.scm) maps a string to four parts: ;;; : ? # , , and ;;; are strings; is a non-empty string list -- the ;;; URI's path split at slashes. Optional parts of the URI, when ;;; missing, are specified as #f. If is "http", then the ;;; other three parts can be passed to PARSE-HTTP-URL, which parses ;;; them into a HTTP-URL record. All strings come back from the URI ;;; parser encoded. SEARCH and FRAG-ID are left that way; this parser ;;; decodes the path elements. ;;; ;;; Returns a HTTP-URL record, if possible. Otherwise ;;; FATAL-SYNTAX-ERROR is called. (define (parse-http-url path search frag-id) (let ((uh (parse-server path default-http-server))) (if (or (server-user uh) (server-password uh)) (fatal-syntax-error "HTTP URL's may not specify a user or password field" path)) (make-http-url uh (map unescape-uri (cdddr path)) search frag-id))) (define (parse-http-url-string string) (call-with-values (lambda () (parse-uri string)) (lambda (scheme path search frag-id) (if (string=? scheme "http") (parse-http-url path search frag-id) (fatal-syntax-error "not an HTTP URL" path))))) ;;; Default http port is 80. (define default-http-server (make-server #f #f #f "80")) ;;; Unparse. (define (http-url->string url) (string-append "http://" (server->string (http-url-server url)) "/" (uri-path->uri (map escape-uri (http-url-path url))) (cond ((http-url-search url) => (lambda (s) (string-append "?" s))) (else "")) (cond ((http-url-fragment-identifier url) => (lambda (fi) (string-append "#" fi))) (else ""))))