diff --git a/scheme/lib/url.scm b/scheme/lib/url.scm index cfd2b64..6de4130 100644 --- a/scheme/lib/url.scm +++ b/scheme/lib/url.scm @@ -1,6 +1,6 @@ ;;; HTTP 1.1 Request-URI parsing and unparsing -*- Scheme -*- -;;; Copyright (c) 1995 by Olin Shivers. +;;; Copyright (c) 2005 by Viola Brunner. ;;; For copyright information, see the file COPYING which comes with ;;; the distribution. @@ -8,9 +8,13 @@ ;;; RFC 2616 Hypertext Transfer Protocol -- HTTP/1.1 ;;; RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax ;;; -;;; RFC 2616 adopts definitions of regexps from RFC 2396. +;;; RFC 2616 adopts definitions of regexps from RFC 2396 +;;; (see copy of Appendix A of RFC 2396 below) +;;; Note: there are two problems in RFC 2616 concerning URIs: + +;;; Problem 1: ;;; RFC 2616 is ambiguous in defining Request_URIS: ;;; ;;; section 5.1.2 states: @@ -19,24 +23,39 @@ ;;; ;;; whilst section 3.2.2 defines the 'http_URL' ;;; http_URL = "http://" host [ ":" port ] [ abs_path [ "?" query ]] - - -;;; Since allowing for general absoluteURIs doesn't make too much sense -;;; we implement only Request_URIs as follows: -;;; Request-URI = ( http_URL | abs_path) ["#" fragment] ;;; -;;; where http_URL is a subset of absoluteURI +;;; Solution to Problem 1: +;;; Since allowing for general absoluteURIs doesn't make too much sense +;;; we implement Request_URIs of the form +;;; Request-URI = ( http_URL | abs_path) ["#" fragment] +;;; where http_URL is only a subset of absoluteURI -;;; [ "#" fragment ] is allowed even though -;;; RFC 2616 disallowes the #fragment part -;;; (while RFC 1945 for HTTP/1.0 allowed it). -;;; (This is for compatibility with buggy clients). +;;; Problem 2: +;;; according to RFC 2616, section 5.1.2, the Request-URI may only +;;; have a [? query] part if it's an absoluteURI; on the other hand +;;; only requests being made to proxies are supposed to use +;;; absoluteURIs; abs_path is the normal case. So this must be a mistake. 
+;;; See also http://skrb.org/ietf/http_errata.html#uriquery +;;; +;;; Solution to Problem 2: +;;; we implement Request_URIs of the form +;;; Request-URI = ( http_URL | abs_path ["?" query] ) ["#" fragment] + + +;;; Here we depart from the RFCs: +;;; RFC 2616 and 1945 disallow a #fragment-suffix of the Request-URI. +;;; For compatibility with buggy clients we _do_ allow for it. +;;; (Apache does so, too). ;;; RexExps for Request-URIs as scsh SREs ;;; stick to RFC terminology throughout +;;; (see copy of Appendix A of RFC 2396 below) +;;; +;;; we implement Request_URIs of the form +;;; Request-URI = ( http_URL | abs_path ["?" query] ) ["#" fragment] (define digit (rx numeric)) @@ -276,3 +295,60 @@ (define (escape-query query) (escape query query-reserved-and-excluded)) +;; Appendix A of RFC 2396 +;; +;A. Collected BNF for URI + +; URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] +; absoluteURI = scheme ":" ( hier_part | opaque_part ) +; relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] +; hier_part = ( net_path | abs_path ) [ "?" query ] +; opaque_part = uric_no_slash *uric +; uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" | +; "&" | "=" | "+" | "$" | "," +; net_path = "//" authority [ abs_path ] +; abs_path = "/" path_segments +; rel_path = rel_segment [ abs_path ] +; rel_segment = 1*( unreserved | escaped | +; ";" | "@" | "&" | "=" | "+" | "$" | "," ) +; scheme = alpha *( alpha | digit | "+" | "-" | "." ) +; authority = server | reg_name +; reg_name = 1*( unreserved | escaped | "$" | "," | +; ";" | ":" | "@" | "&" | "=" | "+" ) +; server = [ [ userinfo "@" ] hostport ] +; userinfo = *( unreserved | escaped | +; ";" | ":" | "&" | "=" | "+" | "$" | "," ) +; hostport = host [ ":" port ] +; host = hostname | IPv4address +; hostname = *( domainlabel "." ) toplabel [ "." ] +; domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum +; toplabel = alpha | alpha *( alphanum | "-" ) alphanum +; IPv4address = 1*digit "." 1*digit "." 
1*digit "." 1*digit +; port = *digit +; path = [ abs_path | opaque_part ] +; path_segments = segment *( "/" segment ) +; segment = *pchar *( ";" param ) +; param = *pchar +; pchar = unreserved | escaped | +; ":" | "@" | "&" | "=" | "+" | "$" | "," +; query = *uric +; fragment = *uric +; uric = reserved | unreserved | escaped +; reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | +; "$" | "," +; unreserved = alphanum | mark +; mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | +; "(" | ")" +; escaped = "%" hex hex +; hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | +; "a" | "b" | "c" | "d" | "e" | "f" +; alphanum = alpha | digit +; alpha = lowalpha | upalpha +; lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | +; "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | +; "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" +; upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | +; "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | +; "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" +; digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | +; "8" | "9"