;;; HTTP 1.1 Request-URI parsing and unparsing -*- Scheme -*- ;;; Copyright (c) 2005 by Viola Brunner. ;;; For copyright information, see the file COPYING which comes with ;;; the distribution. ;;; References: ;;; RFC 2616 Hypertext Transfer Protocol -- HTTP/1.1 ;;; RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax ;;; ;;; RFC 2616 adopts definitions of regexps from RFC 2396 ;;; (see copy of Appendix A of RFC 2396 below) ;;; Note: there are 2 Problems in RFC 2616 concerning URIS: ;;; Problem 1: ;;; RFC 2616 is ambiguous in defining Request_URIS: ;;; ;;; section 5.1.2 states: ;;; HTTP 1.1 Request-URIS are of the form ;;; Request-URI = "*" | absoluteURI | abs_path | authority ;;; ;;; whilst section 3.2.2 defines the 'http_URL' ;;; http_URL = "http://" host [ ":" port ] [ abs_path [ "?" query ]] ;;; ;;; Solution to Problem 1: ;;; Since allowing for general absoluteURIs doesn't make too much sense ;;; we implement Request_URIs of the form ;;; Request-URI = ( http_URL | abs_path) ["#" fragment] ;;; where http_URL is a only a subset of absoluteURI ;;; Problem 2: ;;; according to RFC 2616, section 5.1.2, the Request-URI may only ;;; have a [? query] part if it's an absoluteURI; on the other hand ;;; only requests being made to proxies are supposed to use ;;; absoluteURIs; abs_path is the normal case. So this must be a mistake. ;;; See also http://skrb.org/ietf/http_errata.html#uriquery ;;; ;;; Solution to Problem 2: ;;, we implement Request_URIs of the form ;;; Request-URI = ( http_URL | abs_path ["?" query] ) ["#" fragment] ;;; Note: we don't have to support Request-URIS of the form "*" or ;;; authority, because these are not used with the any of the methods ;;; HEAD, GET and POST, which are the only methods we implement so ;;; far. ;;; Here we depart from the RFCs: ;;; RFC 2616 and 1945 disallow a #fragment-suffix of the Request-URI. ;;; For compatibility with buggy clients we _do_ allow for it. ;;; (Apache does so, too). ;;; RexExps for Request-URIs as scsh SREs ;;; stick to RFC terminology throughout ;;; (see copy of Appendix A of RFC 2396 below) ;;; ;;; we implement Request_URIs of the form ;;; Request-URI = ( http_URL | abs_path ["?" query] ) ["#" fragment] (define digit (rx numeric)) (define alpha (rx alphabetic)) (define alphanum (rx alphanumeric)) (define hex (rx hex-digit)) (define escaped (rx (: "%" ,hex ,hex))) (define mark (rx ( "-_.!~*'()"))) (define unreserved (rx (~ (~ (| ,alphanum ,mark))))) (define reserved (rx ( ";/?:@&=+$,"))) (define uric (rx (| ,reserved ,unreserved ,escaped))) (define fragment (rx (* ,uric))) (define query (rx (* ,uric))) (define pchar-charset (rx ( ":@&=+$,"))) (define pchar (rx (| ,unreserved ,escaped ,pchar-charset))) (define param (rx (* ,pchar))) (define segment (rx (: (* ,pchar) (* (: ";" ,param))))) (define path-segments (rx (: ,segment (* (: "/" ,segment))))) (define abs_path (rx (: "/" ,path-segments))) (define port (rx (* ,digit))) (define IPv4address (rx (+ ,digit) "." (+ ,digit) "." (+ ,digit) "." (+ ,digit))) (define toplabel (rx (: (| ,alpha (: ,alpha (* (| ,alphanum "-")) ,alphanum))))) (define domainlabel (rx (: (| ,alphanum (: ,alphanum (* (| ,alphanum "-")) ,alphanum))))) (define hostname (rx (: (* (: ,domainlabel ".")) ,toplabel (? ".")))) (define host (rx (| ,hostname ,IPv4address))) (define http_URL (rx (: "http://" (submatch ,host) (? (: ":" (submatch ,port))) (? (: (submatch ,abs_path) (? (: "?" (submatch ,query)))))))) (define http_URL_with_frag (rx (: bos ,@http_URL (? (: "#" ,fragment)) eos))) (define abs_path_with_frag (rx (: bos (submatch ,abs_path) (? (: "?" (submatch ,query))) (? (: "#" ,fragment)) eos))) (define Request-URI (rx (| ,@http_URL_with_frag ,@abs_path_with_frag))) ;;; parse a HTTP 1.1 Request_URI ;;; ;;; return matches of regexps host, port, abs_path, query; ;;; ;;; If request-uri is a relative URI, host and port are #f; ;;; port and query are also #f if they are not given. ;;; If there's no abs_path given, or abs_path is "/", path is the empty list; ;;; otherwise it is a list containing the path's segments. ;;; ;;; Caution: parse-url doesn't unescape anything yet! (define (parse-url request-uri) (cond ((regexp-search abs_path_with_frag request-uri) => (lambda (match) (let ((path (split-abs-path (match:substring match 1))) (query (match:substring match 2))) (values #f #f path query)))) ((regexp-search http_URL_with_frag request-uri) =>(lambda (match) (let ((host (match:substring match 1)) (port (match:substring match 2)) (path (split-abs-path (match:substring match 3))) (query (match:substring match 4))) (values host port path query)))) (else (fatal-syntax-error "Request-URI syntactically faulty")))) ;;; split the string abs-path at slashes, return list of 'segments' (see RegExp definition above). ;;; ;;; SPLIT-PATH assumes abs-path if either #f or matches the RegExp abs_path, ;;; no checks are done. ;;; ;;; remark: abs_path allows for strings containing several consecutive slashes; ;;; SPLIT-ABS-PATH treats them as one slash. ;;; (e.g., "/foo///bar//baz/" => ("foo" "bar" "baz")) (define (split-abs-path abs-path) (if abs-path (regexp-fold-right (rx (+ (~ ("/")))) (lambda (match i res) (cons (match:substring match 0) res)) '() abs-path) '())) ;;; record type HTTP-URL for Request_URIs ;;; ;;; The HOST slot is a non-empty-string or #f. ;;; ;;; The PORT slot is an integer or #f. ;;; ;;; The PATH slot is the Request_URI's path split at slashes ;;; (e.g., "/foo///bar//baz/" => ("foo" "bar" "baz")) ;;; and unescaped. ;;; ;;; The QUERY slot is an non-empty-string, still in its escaped ;;; representation, or #f. ;;; Caution: the path slot of a http-url record has already been ;;; UNESCAPED; don't unescape it a second time! ;;; The query slot is still in its escaped representation. (define-record-type http-url :http-url (make-http-url host port path query) http-url? (host http-url-host) (port http-url-port) (path http-url-path) (query http-url-query)) ;;; parse a HTTP 1.1. Request_URI into a http-url record (define (url-string->http-url uri-string) (receive (host port path query) (parse-url uri-string) (let ((portnumber (and port (string->number port))) (unescaped-path (map unescape path))) (make-http-url host portnumber unescaped-path query)))) ;;; Unparse a http-url record into its corresponding Request_URI (define (http-url->url-string http-url) (let* ((host (http-url-host http-url)) (scheme-and-host-string (if host (string-append "http://" host) "")) (port (http-url-port http-url)) (port-string (if port (string-append ":" (number->string port)) "")) (path (http-url-path http-url)) (path-string (fold-right (lambda (segment res) (string-append "/" (escape-segment segment) res)) "" path)) (query (http-url-query http-url)) (query-string (if query (string-append "?" query) ""))) (string-append scheme-and-host-string port-string path-string query-string))) ;;; Unparse the http-url-path field of an http-url record into its corresponding part of the Request_URI (define (http-url-path->path-string http-url-path) (fold-right (lambda (segment res) (string-append "/" (escape-segment segment) res)) "" http-url-path)) ;;; decoding and encoding Request-URIs: ;;; to decode Request-URIs use UNESCAPE from uri.scm ;;; encode Request-URIs: ;;; Each component of a URI may have its own set of characters that are reserved, ;;; -> differentiate between components. ;;; not allowed within component 'segment' in 'abs_path' (define segment-reserved-and-excluded (rx (~ ,unreserved ,pchar-charset (";")))) ;;; not allowed within component 'query' (define query-reserved-and-excluded (rx (~ ,unreserved ,reserved ))) ;;; encode 'abs_path' portion of a URI: ;;; use SPLIT-PATH to split abs_path into its segments, ;;; then apply ESCAPE-SEGMENT to the segments. (define (escape-segment segment) (escape segment segment-reserved-and-excluded)) ;;; encode 'query' portion of a URI (define (escape-query query) (escape query query-reserved-and-excluded)) ;;; encode something we don't know: escape all but the unreserved characters. (define (escape-not-unreserved-chars something) (escape something (rx (~ ,unreserved)))) ;; Appendix A of RFC 2396 ;; ;A. Collected BNF for URI ; URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] ; absoluteURI = scheme ":" ( hier_part | opaque_part ) ; relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] ; hier_part = ( net_path | abs_path ) [ "?" query ] ; opaque_part = uric_no_slash *uric ; uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" | ; "&" | "=" | "+" | "$" | "," ; net_path = "//" authority [ abs_path ] ; abs_path = "/" path_segments ; rel_path = rel_segment [ abs_path ] ; rel_segment = 1*( unreserved | escaped | ; ";" | "@" | "&" | "=" | "+" | "$" | "," ) ; scheme = alpha *( alpha | digit | "+" | "-" | "." ) ; authority = server | reg_name ; reg_name = 1*( unreserved | escaped | "$" | "," | ; ";" | ":" | "@" | "&" | "=" | "+" ) ; server = [ [ userinfo "@" ] hostport ] ; userinfo = *( unreserved | escaped | ; ";" | ":" | "&" | "=" | "+" | "$" | "," ) ; hostport = host [ ":" port ] ; host = hostname | IPv4address ; hostname = *( domainlabel "." ) toplabel [ "." ] ; domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum ; toplabel = alpha | alpha *( alphanum | "-" ) alphanum ; IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit ; port = *digit ; path = [ abs_path | opaque_part ] ; path_segments = segment *( "/" segment ) ; segment = *pchar *( ";" param ) ; param = *pchar ; pchar = unreserved | escaped | ; ":" | "@" | "&" | "=" | "+" | "$" | "," ; query = *uric ; fragment = *uric ; uric = reserved | unreserved | escaped ; reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | ; "$" | "," ; unreserved = alphanum | mark ; mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | ; "(" | ")" ; escaped = "%" hex hex ; hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | ; "a" | "b" | "c" | "d" | "e" | "f" ; alphanum = alpha | digit ; alpha = lowalpha | upalpha ; lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | ; "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | ; "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" ; upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | ; "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | ; "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" ; digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | ; "8" | "9"