From a1e79c4fc7913db7fcd6a05c26ac98f1dab7462a Mon Sep 17 00:00:00 2001 From: vibr Date: Wed, 6 Oct 2004 19:10:49 +0000 Subject: [PATCH] parse HTTP 1.1 URIs: * add RegExps * add proc PARSE-URI * add proc SPLIT-ABS-PATH --- scheme/lib/url.scm | 160 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 151 insertions(+), 9 deletions(-) diff --git a/scheme/lib/url.scm b/scheme/lib/url.scm index 92a0042..5496c5f 100644 --- a/scheme/lib/url.scm +++ b/scheme/lib/url.scm @@ -1,18 +1,152 @@ -;;; URL parsing and unparsing -*- Scheme -*- - -;;; This file is part of the Scheme Untergrund Networking package. +;;; HTTP 1.1 Request-URI parsing and unparsing -*- Scheme -*- ;;; Copyright (c) 1995 by Olin Shivers. ;;; For copyright information, see the file COPYING which comes with ;;; the distribution. -;;; I'm only implementing HTTP URL's right now. - ;;; References: -;;; - http://www.w3.org/Addressing/rfc1738.txt -;;; Original RFC -;;; - http://www.w3.org/hypertext/WWW/Addressing/URL/Overview.html -;;; General Web page of URI pointers. +;;; RFC 2616 Hypertext Transfer Protocol -- HTTP/1.1 +;;; RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax + +;;; HTTP 1.1 Request-URIS are of the form +;;; Request-URI = "*" | absoluteURI | abs_path | authority +;;; +;;; We implement only the subset +;;; Request-URI = ( http_URL | abs_path) ["#" fragment] + +;;; where http_URL is a subset of absoluteURI + +;;; and [ "#" fragment ] is allowed even though +;;; RFC 2616 disallowes the #fragment part +;;; (while RFC 1945 for HTTP/1.0 allowed it). +;;; (This is for compatibility with buggy clients). + + + + +;;; RexExps for Request-URIs as scsh SREs +;;; stick to RFC terminology throughout + +(define digit (rx numeric)) + +(define alpha (rx alphanum)) + +(define alphanum (rx alphanumeric)) + +(define hex (rx hex-digit)) + +(define escaped (rx "%" ,hex ,hex)) + +(define mark (rx ( "-_.!~*'()"))) + +(define unreserved (rx (| ,alphanum ,mark))) + +(define reserved (rx ( ";/?:@&=+$,"))) + +(define uric (rx (| ,reserved ,unreserved ,escaped))) + +(define fragment (rx (* ,uric))) + +(define query (rx (* ,uric))) + +(define pchar-charset (rx ( ":@&=+$,"))) + +(define pchar (rx (| ,unreserved ,escaped ,pchar-charset))) + +(define param (rx (* ,pchar))) + +(define segment (rx (: + (* ,pchar) + (* (: ";" ,param))))) + +(define path-segments (rx (: + ,segment + (* (: "/" ,segment))))) + +(define abs_path (rx (: + "/" + ,path-segments))) + + +(define port (rx (* ,digit))) + +(define IPv4address (rx (+ ,digit) "." (+ ,digit) "." (+ ,digit) "." (+ ,digit))) + +(define toplabel (rx (: + (| + ,alpha + (: + ,alpha + (* (| ,alphanum "-")) + ,alphanum))))) + +(define domainlabel (rx (: + (| + ,alphanum + (: ,alphanum + (* (| ,alphanum "-")) + ,alphanum))))) + +(define hostname (rx (: + (* (: ,domainlabel ".")) + ,toplabel + (? ".")))) + +(define host (rx (| ,hostname ,IPv4address))) + +(define http_URL (rx (: + "http://" + (submatch + ,host) + (? + (: ":" (submatch ,port))) + (? + (: (submatch ,abs_path) + (? + (: "?" (submatch ,query)))))))) + +(define http_URL_with_frag (rx (: bos ,@http_URL (? "#" ,fragment) eos))) + +(define abs_path_with_frag (rx (: bos (submatch ,abs_path) (? "#" ,fragment) eos))) + +(define Request-URI (rx (| ,@http_URL_with_frag ,@abs_path_with_frag))) + + + +;;parse HTTP 1.1 Request-URI + +(define (parse-uri request-uri) + (cond + + ((regexp-search abs_path_with_frag request-uri) + => (lambda (match) + (values #f #f (split-abs-path (match:substring match 1)) #f))) + + ((regexp-search http_URL_with_frag request-uri) + =>(lambda (match) + (let ((host (match:substring match 1)) + (port (match:substring match 2)) + (path (split-abs-path (match:substring match 3))) + (query (match:substring match 4))) + (values host port path query)))) + + (else + (fatal-syntax-error "Request-URI syntactically faulty")))) + + +;; split the string abs-path at slashes, return list of segments. +;; SPLIT-PATH assumes abs-path matches the RegExp abs_path, no checks are done. + +;; minor remark: abs_path allows for strings containing several consecutive slashes; +;; SPLIT-ABS-PATH treats them as one slash. + +(define (split-abs-path abs-path) + (regexp-fold-right + (rx (+ (~ ("/")))) + (lambda (match i res) + (cons (match:substring match 0) res)) + '() + abs-path)) ;;; Unresolved issues: @@ -115,6 +249,14 @@ (search http-url-search) (fragment-identifier http-url-fragment-identifier)) +;(define-new-record-type http-url :http-url +; (make-http-url hostname port path query) +; http-url? +; (hostname http-url-hostname) +; (port http-url-port) +; (path http-url-path) +; (query http-url-query)) + ;;; The URI parser (parse-uri in uri.scm) maps a string to four parts: ;;; : ? # , , and ;;; are strings; is a non-empty string list -- the