400 lines
12 KiB
Scheme
400 lines
12 KiB
Scheme
;;; HTTP 1.1 Request-URI parsing and unparsing -*- Scheme -*-
|
|
|
|
;;; This file is part of the Scheme Untergrund Networking package.
|
|
|
|
;;; For copyright information, see the file COPYING which comes with
|
|
;;; the distribution.
|
|
|
|
;;; References:
|
|
;;; RFC 2616 Hypertext Transfer Protocol -- HTTP/1.1
|
|
;;; RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax
|
|
;;;
|
|
;;; RFC 2616 adopts definitions of regexps from RFC 2396
|
|
;;; (see copy of Appendix A of RFC 2396 below)
|
|
|
|
|
|
;;; Note: there are 2 Problems in RFC 2616 concerning URIS:
|
|
|
|
;;; Problem 1:
|
|
;;; RFC 2616 is ambiguous in defining Request_URIS:
|
|
;;;
|
|
;;; section 5.1.2 states:
|
|
;;; HTTP 1.1 Request-URIS are of the form
|
|
;;; Request-URI = "*" | absoluteURI | abs_path | authority
|
|
;;;
|
|
;;; whilst section 3.2.2 defines the 'http_URL'
|
|
;;; http_URL = "http://" host [ ":" port ] [ abs_path [ "?" query ]]
|
|
;;;
|
|
;;; Solution to Problem 1:
|
|
;;; Since allowing for general absoluteURIs doesn't make too much sense
|
|
;;; we implement Request_URIs of the form
|
|
;;; Request-URI = ( http_URL | abs_path) ["#" fragment]
|
|
;;; where http_URL is a only a subset of absoluteURI
|
|
|
|
|
|
;;; Problem 2:
|
|
;;; according to RFC 2616, section 5.1.2, the Request-URI may only
|
|
;;; have a [? query] part if it's an absoluteURI; on the other hand
|
|
;;; only requests being made to proxies are supposed to use
|
|
;;; absoluteURIs; abs_path is the normal case. So this must be a mistake.
|
|
;;; See also http://skrb.org/ietf/http_errata.html#uriquery
|
|
;;;
|
|
;;; Solution to Problem 2:
|
|
;;, we implement Request_URIs of the form
|
|
;;; Request-URI = ( http_URL | abs_path ["?" query] ) ["#" fragment]
|
|
|
|
|
|
;;; Note: we don't have to support Request-URIS of the form "*" or
|
|
;;; authority, because these are not used with the any of the methods
|
|
;;; HEAD, GET and POST, which are the only methods we implement so
|
|
;;; far.
|
|
|
|
|
|
;;; Here we depart from the RFCs:
|
|
;;; RFC 2616 and 1945 disallow a #fragment-suffix of the Request-URI.
|
|
;;; For compatibility with buggy clients we _do_ allow for it.
|
|
;;; (Apache does so, too).
|
|
|
|
|
|
|
|
;;; RexExps for Request-URIs as scsh SREs
|
|
;;; stick to RFC terminology throughout
|
|
;;; (see copy of Appendix A of RFC 2396 below)
|
|
;;;
|
|
;;; we implement Request_URIs of the form
|
|
;;; Request-URI = ( http_URL | abs_path ["?" query] ) ["#" fragment]
|
|
|
|
(define digit (rx numeric))
|
|
|
|
(define alpha (rx alphabetic))
|
|
|
|
(define alphanum (rx alphanumeric))
|
|
|
|
(define hex (rx hex-digit))
|
|
|
|
(define escaped (rx (: "%" ,hex ,hex)))
|
|
|
|
(define mark (rx ( "-_.!~*'()")))
|
|
|
|
(define unreserved (rx (~ (~ (| ,alphanum ,mark)))))
|
|
|
|
(define reserved (rx ( ";/?:@&=+$,")))
|
|
|
|
(define uric (rx (| ,reserved ,unreserved ,escaped)))
|
|
|
|
(define fragment (rx (* ,uric)))
|
|
|
|
(define query (rx (* ,uric)))
|
|
|
|
(define pchar-charset (rx ( ":@&=+$,")))
|
|
|
|
(define pchar (rx (| ,unreserved ,escaped ,pchar-charset)))
|
|
|
|
(define param (rx (* ,pchar)))
|
|
|
|
(define segment (rx (:
|
|
(* ,pchar)
|
|
(* (: ";" ,param)))))
|
|
|
|
(define path-segments (rx (:
|
|
,segment
|
|
(* (: "/" ,segment)))))
|
|
|
|
(define abs_path (rx (:
|
|
"/"
|
|
,path-segments)))
|
|
|
|
|
|
(define port (rx (* ,digit)))
|
|
|
|
(define IPv4address (rx (+ ,digit) "." (+ ,digit) "." (+ ,digit) "." (+ ,digit)))
|
|
|
|
(define toplabel (rx (:
|
|
(|
|
|
,alpha
|
|
(:
|
|
,alpha
|
|
(* (| ,alphanum "-"))
|
|
,alphanum)))))
|
|
|
|
(define domainlabel (rx (:
|
|
(|
|
|
,alphanum
|
|
(: ,alphanum
|
|
(* (| ,alphanum "-"))
|
|
,alphanum)))))
|
|
|
|
(define hostname (rx (:
|
|
(* (: ,domainlabel "."))
|
|
,toplabel
|
|
(? "."))))
|
|
|
|
(define host (rx (| ,hostname ,IPv4address)))
|
|
|
|
(define http_URL (rx (:
|
|
"http://"
|
|
(submatch ,host)
|
|
(?
|
|
(: ":" (submatch ,port)))
|
|
(?
|
|
(: (submatch ,abs_path)
|
|
(?
|
|
(: "?" (submatch ,query))))))))
|
|
|
|
(define http_URL_with_frag (rx (: bos
|
|
,@http_URL
|
|
(? (: "#" ,fragment))
|
|
eos)))
|
|
|
|
|
|
(define abs_path_with_frag (rx (: bos
|
|
(submatch ,abs_path)
|
|
(? (: "?" (submatch ,query)))
|
|
(? (: "#" ,fragment))
|
|
eos)))
|
|
|
|
(define Request-URI (rx (| ,@http_URL_with_frag ,@abs_path_with_frag)))
|
|
|
|
|
|
|
|
;;; parse a HTTP 1.1 Request_URI
|
|
;;;
|
|
;;; return matches of regexps host, port, abs_path, query;
|
|
;;;
|
|
;;; If request-uri is a relative URI, host and port are #f;
|
|
;;; port and query are also #f if they are not given.
|
|
;;; If there's no abs_path given, or abs_path is "/", path is the empty list;
|
|
;;; otherwise it is a list containing the path's segments.
|
|
;;;
|
|
|
|
;;; Caution: parse-url doesn't unescape anything yet!
|
|
|
|
(define (parse-url request-uri)
|
|
(cond
|
|
|
|
((regexp-search abs_path_with_frag request-uri)
|
|
=> (lambda (match)
|
|
(let ((path (split-abs-path (match:substring match 1)))
|
|
(query (match:substring match 2)))
|
|
(values #f #f path query))))
|
|
|
|
((regexp-search http_URL_with_frag request-uri)
|
|
=>(lambda (match)
|
|
(let ((host (match:substring match 1))
|
|
(port (match:substring match 2))
|
|
(path (split-abs-path (match:substring match 3)))
|
|
(query (match:substring match 4)))
|
|
(values host port path query))))
|
|
|
|
(else
|
|
(fatal-syntax-error "Request-URI syntactically faulty"))))
|
|
|
|
|
|
;;; split the string abs-path at slashes, return list of 'segments' (see RegExp definition above).
|
|
;;;
|
|
;;; SPLIT-PATH assumes abs-path if either #f or matches the RegExp abs_path,
|
|
;;; no checks are done.
|
|
;;;
|
|
;;; Remark: abs_path allows for strings containing several consecutive slashes;
|
|
;;; SPLIT-ABS-PATH treats them as one slash.
|
|
;;; (e.g., "/foo///bar//baz" => ("foo" "bar" "baz"))
|
|
;;;
|
|
;;; Note: we have to differentiate between paths with trailing
|
|
;;; slash(es) and paths without and hand that information over
|
|
;;; to the request handler. (See
|
|
;;; http://httpd.apache.org/docs-2.0/misc/rewriteguide.html ->
|
|
;;;"Trailing Slash problem" for the reasons.)
|
|
;;; If there is one or more trailing slash(es) the last element of the
|
|
;;; returned list will be an empty string.
|
|
;;; (e.g., "/foo///bar//baz//" => ("foo" "bar" "baz" ""))
|
|
|
|
|
|
(define (split-abs-path abs-path)
|
|
|
|
(if abs-path
|
|
|
|
(let* ((trailing-slash (char=? #\/ (string-ref abs-path (- (string-length abs-path) 1))))
|
|
(last-element (if trailing-slash '("") '())))
|
|
(regexp-fold-right
|
|
(rx (+ (~ ("/"))))
|
|
(lambda (match i res)
|
|
(cons (match:substring match 0) res))
|
|
last-element
|
|
abs-path))
|
|
|
|
'()))
|
|
|
|
|
|
;;; record type HTTP-URL for Request_URIs
|
|
;;;
|
|
;;; The HOST slot is a non-empty-string or #f.
|
|
;;;
|
|
;;; The PORT slot is an integer or #f.
|
|
;;;
|
|
;;; The PATH slot is a list of strings containing the Request_URI's
|
|
;;; path split at slashes and unescaped. If the Request_URI's path
|
|
;;; ends with a slash, an empty string is inserted as the last element
|
|
;;; of the list.
|
|
;;; (e.g., "/foo///bar//baz" => ("foo" "bar" "baz"))
|
|
;;; (e.g., "/foo///bar//baz//" => ("foo" "bar" "baz" ""))
|
|
;;;
|
|
;;; The QUERY slot is an non-empty-string, still in its escaped
|
|
;;; representation, or #f.
|
|
|
|
;;; Caution: the path slot of a http-url record has already been
|
|
;;; UNESCAPED; don't unescape it a second time!
|
|
;;; The query slot is still in its escaped representation.
|
|
|
|
(define-record-type http-url :http-url
|
|
(make-http-url host port path query)
|
|
http-url?
|
|
(host http-url-host)
|
|
(port http-url-port)
|
|
(path http-url-path)
|
|
(query http-url-query))
|
|
|
|
;;; Is http-url of the form http_URL, i.e. absolute?
|
|
(define (absolute-url? http-url)
|
|
(http-url-host http-url))
|
|
|
|
;;; parse a HTTP 1.1. Request_URI into a http-url record
|
|
|
|
(define (url-string->http-url uri-string)
|
|
(receive (host port path query)
|
|
(parse-url uri-string)
|
|
(let ((portnumber (and port (string->number port)))
|
|
(unescaped-path (map unescape path)))
|
|
(make-http-url host portnumber unescaped-path query))))
|
|
|
|
|
|
;;; Unparse a http-url record into its corresponding Request_URI
|
|
|
|
;;; The following holds (apart from multiple slashes in the path,
|
|
;;; which are removed by url-string->http-url):
|
|
;;; (http-url->url-string (url-string->http-url <request-uri-string>)) == <request-uri-string>
|
|
|
|
(define (http-url->url-string http-url)
|
|
|
|
(let* ((host (http-url-host http-url))
|
|
(scheme-and-host-string
|
|
(if host
|
|
(string-append "http://" host)
|
|
""))
|
|
|
|
(port (http-url-port http-url))
|
|
(port-string
|
|
(if port
|
|
(string-append ":" (number->string port))
|
|
""))
|
|
|
|
(path (http-url-path http-url))
|
|
(path-string
|
|
(fold-right
|
|
(lambda (segment res)
|
|
(string-append "/" (escape-segment segment) res))
|
|
""
|
|
path))
|
|
|
|
(query (http-url-query http-url))
|
|
(query-string (if query
|
|
(string-append "?" query)
|
|
"")))
|
|
|
|
(string-append scheme-and-host-string port-string path-string query-string)))
|
|
|
|
;;; Unparse the http-url-path field of an http-url record into its
|
|
;;; corresponding part of the Request_URI
|
|
|
|
(define (http-url-path->path-string http-url-path)
|
|
(fold-right
|
|
(lambda (segment res)
|
|
(string-append "/" (escape-segment segment) res))
|
|
""
|
|
http-url-path))
|
|
|
|
;;; decoding and encoding Request-URIs:
|
|
|
|
;;; to decode Request-URIs use UNESCAPE from uri.scm
|
|
|
|
;;; encode Request-URIs:
|
|
;;; Each component of a URI may have its own set of characters that are reserved,
|
|
;;; -> differentiate between components.
|
|
|
|
;;; not allowed within component 'segment' in 'abs_path'
|
|
(define segment-reserved-and-excluded (rx (~ ,unreserved ,pchar-charset (";"))))
|
|
|
|
;;; not allowed within component 'query'
|
|
(define query-reserved-and-excluded (rx (~ ,unreserved ,reserved )))
|
|
|
|
;;; encode 'abs_path' portion of a URI:
|
|
;;; use SPLIT-PATH to split abs_path into its segments,
|
|
;;; then apply ESCAPE-SEGMENT to the segments.
|
|
(define (escape-segment segment)
|
|
(escape segment segment-reserved-and-excluded))
|
|
|
|
;;; encode 'query' portion of a URI
|
|
(define (escape-query query)
|
|
(escape query query-reserved-and-excluded))
|
|
|
|
;;; encode something we don't know: escape all but the unreserved characters.
|
|
(define (escape-not-unreserved-chars something)
|
|
(escape something (rx (~ ,unreserved))))
|
|
|
|
;; Appendix A of RFC 2396
|
|
;;
|
|
;A. Collected BNF for URI
|
|
|
|
; URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
|
|
; absoluteURI = scheme ":" ( hier_part | opaque_part )
|
|
; relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
|
|
; hier_part = ( net_path | abs_path ) [ "?" query ]
|
|
; opaque_part = uric_no_slash *uric
|
|
; uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
|
|
; "&" | "=" | "+" | "$" | ","
|
|
; net_path = "//" authority [ abs_path ]
|
|
; abs_path = "/" path_segments
|
|
; rel_path = rel_segment [ abs_path ]
|
|
; rel_segment = 1*( unreserved | escaped |
|
|
; ";" | "@" | "&" | "=" | "+" | "$" | "," )
|
|
; scheme = alpha *( alpha | digit | "+" | "-" | "." )
|
|
; authority = server | reg_name
|
|
; reg_name = 1*( unreserved | escaped | "$" | "," |
|
|
; ";" | ":" | "@" | "&" | "=" | "+" )
|
|
; server = [ [ userinfo "@" ] hostport ]
|
|
; userinfo = *( unreserved | escaped |
|
|
; ";" | ":" | "&" | "=" | "+" | "$" | "," )
|
|
; hostport = host [ ":" port ]
|
|
; host = hostname | IPv4address
|
|
; hostname = *( domainlabel "." ) toplabel [ "." ]
|
|
; domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
|
|
; toplabel = alpha | alpha *( alphanum | "-" ) alphanum
|
|
; IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
|
|
; port = *digit
|
|
; path = [ abs_path | opaque_part ]
|
|
; path_segments = segment *( "/" segment )
|
|
; segment = *pchar *( ";" param )
|
|
; param = *pchar
|
|
; pchar = unreserved | escaped |
|
|
; ":" | "@" | "&" | "=" | "+" | "$" | ","
|
|
; query = *uric
|
|
; fragment = *uric
|
|
; uric = reserved | unreserved | escaped
|
|
; reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
|
|
; "$" | ","
|
|
; unreserved = alphanum | mark
|
|
; mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
|
|
; "(" | ")"
|
|
; escaped = "%" hex hex
|
|
; hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
|
|
; "a" | "b" | "c" | "d" | "e" | "f"
|
|
; alphanum = alpha | digit
|
|
; alpha = lowalpha | upalpha
|
|
; lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
|
|
; "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
|
|
; "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
|
|
; upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
|
|
; "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
|
|
; "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
|
|
; digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
|
|
; "8" | "9"
|