sunet/scheme/lib/url.scm

323 lines
9.5 KiB
Scheme
Raw Normal View History

;;; HTTP 1.1 Request-URI parsing and unparsing -*- Scheme -*-
2002-08-27 05:03:22 -04:00
2002-06-08 11:07:01 -04:00
;;; Copyright (c) 1995 by Olin Shivers.
2002-08-27 05:03:22 -04:00
;;; For copyright information, see the file COPYING which comes with
;;; the distribution.
2002-06-08 11:07:01 -04:00
;;; References:
;;; RFC 2616 Hypertext Transfer Protocol -- HTTP/1.1
;;; RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax
;;;
;;; RFC 2616 adopts definitions of regexps from RFC 2396.
;;; RFC 2616 is ambiguous in defining Request_URIS:
;;;
;;; section 5.1.2 states:
;;; HTTP 1.1 Request-URIS are of the form
;;; Request-URI = "*" | absoluteURI | abs_path | authority
;;;
;;; whilst section 3.2.2 defines the 'http_URL'
;;; http_URL = "http://" host [ ":" port ] [ abs_path [ "?" query ]]
;;; Since allowing for general absoluteURIs doesn't make too much sense
;;; we implement only Request_URIs as follows:
;;; Request-URI = ( http_URL | abs_path) ["#" fragment]
;;;
;;; where http_URL is a subset of absoluteURI
;;; [ "#" fragment ] is allowed even though
;;; RFC 2616 disallowes the #fragment part
;;; (while RFC 1945 for HTTP/1.0 allowed it).
;;; (This is for compatibility with buggy clients).
;;; RexExps for Request-URIs as scsh SREs
;;; stick to RFC terminology throughout
(define digit (rx numeric))
(define alpha (rx alphanum))
(define alphanum (rx alphanumeric))
(define hex (rx hex-digit))
(define escaped (rx "%" ,hex ,hex))
(define mark (rx ( "-_.!~*'()")))
(define unreserved (rx (| ,alphanum ,mark)))
(define reserved (rx ( ";/?:@&=+$,")))
(define uric (rx (| ,reserved ,unreserved ,escaped)))
(define fragment (rx (* ,uric)))
(define query (rx (* ,uric)))
(define pchar-charset (rx ( ":@&=+$,")))
(define pchar (rx (| ,unreserved ,escaped ,pchar-charset)))
(define param (rx (* ,pchar)))
(define segment (rx (:
(* ,pchar)
(* (: ";" ,param)))))
(define path-segments (rx (:
,segment
(* (: "/" ,segment)))))
(define abs_path (rx (:
"/"
,path-segments)))
(define port (rx (* ,digit)))
(define IPv4address (rx (+ ,digit) "." (+ ,digit) "." (+ ,digit) "." (+ ,digit)))
(define toplabel (rx (:
(|
,alpha
(:
,alpha
(* (| ,alphanum "-"))
,alphanum)))))
(define domainlabel (rx (:
(|
,alphanum
(: ,alphanum
(* (| ,alphanum "-"))
,alphanum)))))
(define hostname (rx (:
(* (: ,domainlabel "."))
,toplabel
(? "."))))
(define host (rx (| ,hostname ,IPv4address)))
(define http_URL (rx (:
"http://"
(submatch
,host)
(?
(: ":" (submatch ,port)))
(?
(: (submatch ,abs_path)
(?
(: "?" (submatch ,query))))))))
(define http_URL_with_frag (rx (: bos ,@http_URL (? "#" ,fragment) eos)))
(define abs_path_with_frag (rx (: bos (submatch ,abs_path) (? "#" ,fragment) eos)))
(define Request-URI (rx (| ,@http_URL_with_frag ,@abs_path_with_frag)))
;;parse HTTP 1.1 Request-URI
(define (parse-uri request-uri)
(cond
((regexp-search abs_path_with_frag request-uri)
=> (lambda (match)
(values #f #f (split-abs-path (match:substring match 1)) #f)))
((regexp-search http_URL_with_frag request-uri)
=>(lambda (match)
(let ((host (match:substring match 1))
(port (match:substring match 2))
(path (split-abs-path (match:substring match 3)))
(query (match:substring match 4)))
(values host port path query))))
(else
(fatal-syntax-error "Request-URI syntactically faulty"))))
;; split the string abs-path at slashes, return list of segments.
;; SPLIT-PATH assumes abs-path matches the RegExp abs_path, no checks are done.
;; minor remark: abs_path allows for strings containing several consecutive slashes;
;; SPLIT-ABS-PATH treats them as one slash.
(define (split-abs-path abs-path)
(regexp-fold-right
(rx (+ (~ ("/"))))
(lambda (match i res)
(cons (match:substring match 0) res))
'()
abs-path))
2002-06-08 11:07:01 -04:00
;;; Unresolved issues:
;;; - The server parser shouldn't substitute default values --
2002-06-08 11:07:01 -04:00
;;; that should happen in a separate step.
;;; The steps in hacking a URL are:
;;; - Take the UID, parse it, and resolve it with the context UID, if any.
;;; - Consult the UID's <scheme>. Pick the appropriate URL parser and parse.
;;; Server strings: //<user>:<password>@<host>:<port>/
2002-06-08 11:07:01 -04:00
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; A SERVER record describes path-prefixes of the form
2002-06-08 11:07:01 -04:00
;;; //<user>:<password>@<host>:<port>/
;;; These are frequently used as the initial prefix of URL's describing
;;; Internet resources.
;;Note: the server record-type and its associated procedures are
;;a relict of the parsing of general URIs.
;;Neither HTTP 1.0 nor HTTP 1.1 allow for the 'host'-part (see regexp above)
;;of the Request_URI to contain a <user>:<password>@ section
;;(as long as the ambiguity of the definition of Request_URIs is 'solved' as explained above).
(define-record-type server :server ; Each slot is a decoded string or #f.
(make-server user password host port)
server?
(user server-user)
(password server-password)
(host server-host)
(port server-port))
2002-06-08 11:07:01 -04:00
;;; Parse a URI path (a list representing a path, not a string!) into
;;; a server record. Default values are taken from the server
;;; record DEFAULT except for the host. Returns a server record if
;;; it wins. CADDR drops the server portion of the path. In fact,
2002-06-08 11:07:01 -04:00
;;; fatal-syntax-error is called, if the path doesn't start with '//'.
;
(define (parse-server path default)
(if (and (pair? path) ; The thing better begin
(string=? (car path) "") ; with // (i.e., have two
(pair? (cdr path)) ; initial "" elements).
2002-06-08 11:07:01 -04:00
(string=? (cadr path) ""))
(let* ((uhs (caddr path)) ; Server string.
2002-06-08 11:07:01 -04:00
(uhs-len (string-length uhs))
(at (string-index uhs #\@)) ; Usr:passwd at-sign, if any.
2002-06-08 11:07:01 -04:00
(colon1 (and at (string-index uhs #\:))) ; Usr:passwd colon,
(colon1 (and colon1 (< colon1 at) colon1)) ; if any.
2002-06-08 11:07:01 -04:00
(colon2 (string-index uhs #\: (or at 0)))) ; Host:port colon, if any.
(make-server (if at
2003-01-15 06:00:22 -05:00
(unescape-uri uhs 0 (or colon1 at))
(server-user default))
(if colon1
(unescape-uri uhs (+ colon1 1) at)
(server-password default))
(unescape-uri uhs (if at (+ at 1) 0)
(or colon2 uhs-len))
(if colon2
(unescape-uri uhs (+ colon2 1) uhs-len)
(server-port default))))
2002-06-08 11:07:01 -04:00
(fatal-syntax-error "URL must begin with //..." path)))
;;; Unparser
(define server-escaped-chars
(char-set-union uri-escaped-chars ; @ and : are also special
(string->char-set "@:"))) ; in UH strings.
2002-06-08 11:07:01 -04:00
(define (server->string uh)
(let* ((us (server-user uh))
(pw (server-password uh))
(ho (server-host uh))
(po (server-port uh))
2002-06-08 11:07:01 -04:00
;; Encode before assembly in case pieces contain colons or at-signs.
(e (lambda (s) (escape-uri s server-escaped-chars)))
2002-06-08 11:07:01 -04:00
(user/passwd (if us
`(,(e us) . ,(if pw `(":" ,(e pw) "@") '("@")))
2002-06-08 11:07:01 -04:00
'()))
(host/port (if ho
`(,(e ho) . ,(if po `(":" ,(e po)) '()))
2002-06-08 11:07:01 -04:00
'())))
(apply string-append (append user/passwd host/port))))
;;; HTTP URL parsing
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; The PATH slot of this record is the URL's path split at slashes,
;;; e.g., "foo/bar//baz/" => ("foo" "bar" "" "baz" "")
;;; These elements are in raw, unescaped format. To convert back to
;;; a string, use (uri-path->uri (map escape-uri pathlist)).
2002-06-08 11:07:01 -04:00
(define-record-type http-url :http-url
(make-http-url server path search fragment-identifier)
http-url?
(server http-url-server) ; Initial //anonymous@clark.lcs.mit.edu:80/
(path http-url-path) ; Rest of path, split at slashes & decoded.
(search http-url-search)
(fragment-identifier http-url-fragment-identifier))
2002-06-08 11:07:01 -04:00
;(define-new-record-type http-url :http-url
; (make-http-url hostname port path query)
; http-url?
; (hostname http-url-hostname)
; (port http-url-port)
; (path http-url-path)
; (query http-url-query))
2002-06-08 11:07:01 -04:00
;;; The URI parser (parse-uri in uri.scm) maps a string to four parts:
;;; <scheme> : <path> ? <search> # <frag-id> <scheme>, <search>, and
;;; <frag-id> are strings; <path> is a non-empty string list -- the
;;; URI's path split at slashes. Optional parts of the URI, when
;;; missing, are specified as #f. If <scheme> is "http", then the
;;; other three parts can be passed to PARSE-HTTP-URL, which parses
;;; them into a HTTP-URL record. All strings come back from the URI
;;; parser encoded. SEARCH and FRAG-ID are left that way; this parser
;;; decodes the path elements.
;;;
;;; Returns a HTTP-URL record, if possible. Otherwise
;;; FATAL-SYNTAX-ERROR is called.
(define (parse-http-url path search frag-id)
(let ((uh (parse-server path default-http-server)))
(if (or (server-user uh) (server-password uh))
2002-06-08 11:07:01 -04:00
(fatal-syntax-error
"HTTP URL's may not specify a user or password field" path))
2002-06-08 11:07:01 -04:00
(make-http-url uh (map unescape-uri (cdddr path)) search frag-id)))
2002-06-08 11:07:01 -04:00
(define (parse-http-url-string string)
(call-with-values
(lambda () (parse-uri string))
(lambda (scheme path search frag-id)
(if (string=? scheme "http")
(parse-http-url path search frag-id)
(fatal-syntax-error "not an HTTP URL" path)))))
2002-06-08 11:07:01 -04:00
;;; Default http port is 80.
(define default-http-server (make-server #f #f #f "80"))
2002-06-08 11:07:01 -04:00
;;; Unparse.
(define (http-url->string url)
(string-append "http://"
(server->string (http-url-server url))
2002-06-08 11:07:01 -04:00
"/"
(uri-path->uri (map escape-uri (http-url-path url)))
(cond ((http-url-search url) =>
2002-06-08 11:07:01 -04:00
(lambda (s) (string-append "?" s)))
(else ""))
(cond ((http-url-fragment-identifier url) =>
2002-06-08 11:07:01 -04:00
(lambda (fi) (string-append "#" fi)))
(else ""))))