;;; HTTP 1.1 Request-URI parsing and unparsing -*- Scheme -*- ;;; Copyright (c) 1995 by Olin Shivers. ;;; For copyright information, see the file COPYING which comes with ;;; the distribution. ;;; References: ;;; RFC 2616 Hypertext Transfer Protocol -- HTTP/1.1 ;;; RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax ;;; ;;; RFC 2616 adopts definitions of regexps from RFC 2396. ;;; RFC 2616 is ambiguous in defining Request_URIS: ;;; ;;; section 5.1.2 states: ;;; HTTP 1.1 Request-URIS are of the form ;;; Request-URI = "*" | absoluteURI | abs_path | authority ;;; ;;; whilst section 3.2.2 defines the 'http_URL' ;;; http_URL = "http://" host [ ":" port ] [ abs_path [ "?" query ]] ;;; Since allowing for general absoluteURIs doesn't make too much sense ;;; we implement only Request_URIs as follows: ;;; Request-URI = ( http_URL | abs_path) ["#" fragment] ;;; ;;; where http_URL is a subset of absoluteURI ;;; [ "#" fragment ] is allowed even though ;;; RFC 2616 disallowes the #fragment part ;;; (while RFC 1945 for HTTP/1.0 allowed it). ;;; (This is for compatibility with buggy clients). ;;; RexExps for Request-URIs as scsh SREs ;;; stick to RFC terminology throughout (define digit (rx numeric)) (define alpha (rx alphabetic)) (define alphanum (rx alphanumeric)) (define hex (rx hex-digit)) (define escaped (rx (: "%" ,hex ,hex))) (define mark (rx ( "-_.!~*'()"))) (define unreserved (rx (| ,alphanum ,mark))) (define reserved (rx ( ";/?:@&=+$,"))) (define uric (rx (| ,reserved ,unreserved ,escaped))) (define fragment (rx (* ,uric))) (define query (rx (* ,uric))) (define pchar-charset (rx ( ":@&=+$,"))) (define pchar (rx (| ,unreserved ,escaped ,pchar-charset))) (define param (rx (* ,pchar))) (define segment (rx (: (* ,pchar) (* (: ";" ,param))))) (define path-segments (rx (: ,segment (* (: "/" ,segment))))) (define abs_path (rx (: "/" ,path-segments))) (define port (rx (* ,digit))) (define IPv4address (rx (+ ,digit) "." (+ ,digit) "." (+ ,digit) "." (+ ,digit))) (define toplabel (rx (: (| ,alpha (: ,alpha (* (| ,alphanum "-")) ,alphanum))))) (define domainlabel (rx (: (| ,alphanum (: ,alphanum (* (| ,alphanum "-")) ,alphanum))))) (define hostname (rx (: (* (: ,domainlabel ".")) ,toplabel (? ".")))) (define host (rx (| ,hostname ,IPv4address))) (define http_URL (rx (: "http://" (submatch ,host) (? (: ":" (submatch ,port))) (? (: (submatch ,abs_path) (? (: "?" (submatch ,query)))))))) (define http_URL_with_frag (rx (: bos ,@http_URL (? "#" ,fragment) eos))) (define abs_path_with_frag (rx (: bos (submatch ,abs_path) (? "#" ,fragment) eos))) (define Request-URI (rx (| ,@http_URL_with_frag ,@abs_path_with_frag))) ;;; parse a HTTP 1.1 Request_URI ;;; ;;; return matches of regexps host, port, abs_path, query; ;;; ;;; If request-uri is a relative URI, host, port and query are #f; ;;; port and query are also #f if they are not given in an absolute URI. ;;; If there's no abs_path given, or abs_path is "/", path is the empty list; ;;; otherwise it is a list containing the path's segments. ;;; ;;; don't decode yet! (define (parse-uri request-uri) (cond ((regexp-search abs_path_with_frag request-uri) => (lambda (match) (values #f #f (split-abs-path (match:substring match 1)) #f))) ((regexp-search http_URL_with_frag request-uri) =>(lambda (match) (let ((host (match:substring match 1)) (port (match:substring match 2)) (path (split-abs-path (match:substring match 3))) (query (match:substring match 4))) (values host port path query)))) (else (fatal-syntax-error "Request-URI syntactically faulty")))) ;;; split the string abs-path at slashes, return list of 'segments' (see RegExp definition above). ;;; ;;; SPLIT-PATH assumes abs-path if either #f or matches the RegExp abs_path, ;;; no checks are done. ;;; ;;; remark: abs_path allows for strings containing several consecutive slashes; ;;; SPLIT-ABS-PATH treats them as one slash. ;;; (e.g., "/foo///bar//baz/" => ("foo" "bar" "baz")) (define (split-abs-path abs-path) (if abs-path (regexp-fold-right (rx (+ (~ ("/")))) (lambda (match i res) (cons (match:substring match 0) res)) '() abs-path) '())) ;;; record type HTTP-URL for Request_URIs ;;; ;;; The HOST slot is a non-empty-string or #f. ;;; ;;; The PORT slot is an integer or #f. ;;; ;;; The PATH slot is the Request_URI's path split at slashes ;;; (e.g., "/foo///bar//baz/" => ("foo" "bar" "baz")) ;;; and decoded. ;;; ;;; The QUERY slot is a decoded non-empty-string or #f. (define-record-type http-url :http-url (make-http-url host port path query) http-url? (host http-url-host) (port http-url-port) (path http-url-path) (query http-url-query)) ;;; decode various parts of the Request_URI as returned by PARSE-URI; ;;; returns a HTTP-URL record. (define (parsed-uri->http-url host port path query) (let ((portnumber (and port (string->number port))) (decoded-path (map unescape path)) (decoded-query (and query (unescape query)))) (make-http-url host portnumber decoded-path decoded-query))) ;;really decode query here?? ;;; parse a HTTP 1.1. Request_URI into a http-url record (define (parse-http-url uri-string) (call-with-values (lambda () (parse-uri uri-string)) parsed-uri->http-url)) ;;; Unparse a http-url record into its corresponding Request_URI (define (http-url->uri-string http-url) (let* ((host (http-url-host http-url)) (scheme-and-host-string (if host (string-append "http://" host) "")) (port (http-url-port http-url)) (port-string (if port (string-append ":" (number->string port)) "")) (path (http-url-path http-url)) (path-string (fold-right (lambda (segment res) (string-append "/" (escape-uri segment) res)) "" path)) (query (http-url-query http-url)) (query-string (if query (string-append "?" (escape-uri query)) ""))) (string-append scheme-and-host-string port-string path-string query-string))) ;;TODO: separate procedures for encoding path and query ;;; decode a URI ;;; walk over string s and unescape all occurrences of RegExp 'escaped' (see above). ;;; Remark: ;;; we assume no non-ASCII characters occur in the URI; therefore the ;;; ascii table is used for conversion of the octet the hexnumber ;;; represents to a char. ;;; Caution: ;;; a URI must be separated into its components (i.e. parsed by ;;; PARSE-URI) before the escaped characters within those components ;;; can be safely decoded. Don't use UNESCAPE on an unparsed URI. (define (unescape s) (regexp-fold escaped (lambda (start-search match res) (let* ((start-match (match:start match)) (hexchar-low (string-ref s (+ start-match 2))) (hexchar-high (string-ref s (+ start-match 1))) (hex-low (hexchar->int hexchar-low)) (hex-high (hexchar->int hexchar-high)) (ascii (+ (* 16 hex-high) hex-low))) (string-append res (substring s start-search start-match) (string (ascii->char ascii))))) "" s (lambda (start-search res) (string-append res (substring s start-search (string-length s)))))) ; make use of the fact that numbers and characters are in order in the ascii table (define (hexchar->int c) (- (char->ascii c) (if (char-numeric? c) (char->ascii #\0) (- (if (char-upper-case? c) (char->ascii #\A) (char->ascii #\a)) 10))))