parse HTTP 1.1 URIs:

* add RegExps
* add proc PARSE-URI
* add proc SPLIT-ABS-PATH
This commit is contained in:
vibr 2004-10-06 19:10:49 +00:00
parent d9950a9b0b
commit a1e79c4fc7
1 changed files with 151 additions and 9 deletions

View File

@ -1,18 +1,152 @@
;;; URL parsing and unparsing -*- Scheme -*-
;;; This file is part of the Scheme Untergrund Networking package.
;;; HTTP 1.1 Request-URI parsing and unparsing -*- Scheme -*-
;;; Copyright (c) 1995 by Olin Shivers.
;;; For copyright information, see the file COPYING which comes with
;;; the distribution.
;;; I'm only implementing HTTP URL's right now.
;;; References:
;;; - http://www.w3.org/Addressing/rfc1738.txt
;;; Original RFC
;;; - http://www.w3.org/hypertext/WWW/Addressing/URL/Overview.html
;;; General Web page of URI pointers.
;;; RFC 2616 Hypertext Transfer Protocol -- HTTP/1.1
;;; RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax
;;; HTTP 1.1 Request-URIS are of the form
;;; Request-URI = "*" | absoluteURI | abs_path | authority
;;;
;;; We implement only the subset
;;; Request-URI = ( http_URL | abs_path) ["#" fragment]
;;; where http_URL is a subset of absoluteURI
;;; and [ "#" fragment ] is allowed even though
;;; RFC 2616 disallowes the #fragment part
;;; (while RFC 1945 for HTTP/1.0 allowed it).
;;; (This is for compatibility with buggy clients).
;;; RexExps for Request-URIs as scsh SREs
;;; stick to RFC terminology throughout
(define digit (rx numeric))
(define alpha (rx alphanum))
(define alphanum (rx alphanumeric))
(define hex (rx hex-digit))
(define escaped (rx "%" ,hex ,hex))
(define mark (rx ( "-_.!~*'()")))
(define unreserved (rx (| ,alphanum ,mark)))
(define reserved (rx ( ";/?:@&=+$,")))
(define uric (rx (| ,reserved ,unreserved ,escaped)))
(define fragment (rx (* ,uric)))
(define query (rx (* ,uric)))
(define pchar-charset (rx ( ":@&=+$,")))
(define pchar (rx (| ,unreserved ,escaped ,pchar-charset)))
(define param (rx (* ,pchar)))
(define segment (rx (:
(* ,pchar)
(* (: ";" ,param)))))
(define path-segments (rx (:
,segment
(* (: "/" ,segment)))))
(define abs_path (rx (:
"/"
,path-segments)))
(define port (rx (* ,digit)))
(define IPv4address (rx (+ ,digit) "." (+ ,digit) "." (+ ,digit) "." (+ ,digit)))
(define toplabel (rx (:
(|
,alpha
(:
,alpha
(* (| ,alphanum "-"))
,alphanum)))))
(define domainlabel (rx (:
(|
,alphanum
(: ,alphanum
(* (| ,alphanum "-"))
,alphanum)))))
(define hostname (rx (:
(* (: ,domainlabel "."))
,toplabel
(? "."))))
(define host (rx (| ,hostname ,IPv4address)))
(define http_URL (rx (:
"http://"
(submatch
,host)
(?
(: ":" (submatch ,port)))
(?
(: (submatch ,abs_path)
(?
(: "?" (submatch ,query))))))))
(define http_URL_with_frag (rx (: bos ,@http_URL (? "#" ,fragment) eos)))
(define abs_path_with_frag (rx (: bos (submatch ,abs_path) (? "#" ,fragment) eos)))
(define Request-URI (rx (| ,@http_URL_with_frag ,@abs_path_with_frag)))
;;parse HTTP 1.1 Request-URI
(define (parse-uri request-uri)
(cond
((regexp-search abs_path_with_frag request-uri)
=> (lambda (match)
(values #f #f (split-abs-path (match:substring match 1)) #f)))
((regexp-search http_URL_with_frag request-uri)
=>(lambda (match)
(let ((host (match:substring match 1))
(port (match:substring match 2))
(path (split-abs-path (match:substring match 3)))
(query (match:substring match 4)))
(values host port path query))))
(else
(fatal-syntax-error "Request-URI syntactically faulty"))))
;; split the string abs-path at slashes, return list of segments.
;; SPLIT-PATH assumes abs-path matches the RegExp abs_path, no checks are done.
;; minor remark: abs_path allows for strings containing several consecutive slashes;
;; SPLIT-ABS-PATH treats them as one slash.
(define (split-abs-path abs-path)
(regexp-fold-right
(rx (+ (~ ("/"))))
(lambda (match i res)
(cons (match:substring match 0) res))
'()
abs-path))
;;; Unresolved issues:
@ -115,6 +249,14 @@
(search http-url-search)
(fragment-identifier http-url-fragment-identifier))
;(define-new-record-type http-url :http-url
; (make-http-url hostname port path query)
; http-url?
; (hostname http-url-hostname)
; (port http-url-port)
; (path http-url-path)
; (query http-url-query))
;;; The URI parser (parse-uri in uri.scm) maps a string to four parts:
;;; <scheme> : <path> ? <search> # <frag-id> <scheme>, <search>, and
;;; <frag-id> are strings; <path> is a non-empty string list -- the