2000-09-26 10:35:26 -04:00
|
|
|
;;; -*- Scheme -*-
|
|
|
|
;;; Copyright (c) 1995 by Olin Shivers.
|
|
|
|
|
|
|
|
;;; URI syntax -- [scheme] : path [? search ] [# fragmentid]
|
|
|
|
|
|
|
|
;;; Imports and non-R4RS'isms
|
|
|
|
;;; let-optionals
|
|
|
|
;;; receive values (MV return)
|
|
|
|
;;; ascii->char char->ascii
|
|
|
|
;;; index rindex
|
|
|
|
;;; char-set-index char-set-rindex
|
|
|
|
;;; string-reduce
|
|
|
|
;;; char-set package
|
|
|
|
;;; bitwise logical funs and arithmetic-shift
|
|
|
|
;;; join-strings (scsh field-reader code.)
|
|
|
|
|
|
|
|
|
|
|
|
;;; References:
|
|
|
|
;;; - ftp://ftp.internic.net/rfc/rfc1630.txt
|
|
|
|
;;; Original RFC
|
|
|
|
;;; - http://www.w3.org/hypertext/WWW/Addressing/URL/URI_Overview.html
|
|
|
|
;;; General Web page of URI pointers.
|
|
|
|
|
|
|
|
;;; I wrote a URI parser that slavishly obeyed Tim Berners-Lee's
|
|
|
|
;;; spec (rfc 1630). This was a waste of time, as most URL's do not
|
|
|
|
;;; obey his spec, which is incomplete and inconsistent with the URL spec
|
|
|
|
;;; in any event. This parser is much simpler. It parses a URI into four
|
|
|
|
;;; fields:
|
|
|
|
;;; [ <scheme> ] : <path> [ ? <search> ] [ # fragid ]
|
|
|
|
;;; The returned fields are *not* unescaped, as the rules for parsing the
|
|
|
|
;;; <path> component in particular need unescaped text, and are dependent
|
|
|
|
;;; on <scheme>. The URL parser is responsible for doing this.
|
|
|
|
;;; If the <scheme>, <search> or <fragid> portions are not specified,
|
|
|
|
;;; they are #f. Otherwise, <scheme>, <search>, and <fragid> are strings;
|
|
|
|
;;; <path> is a non-empty string list.
|
|
|
|
|
|
|
|
;;; The parsing technique is inwards from both ends.
|
|
|
|
;;; - First we search forwards for the first reserved char (= ; / # ? : space)
|
|
|
|
;;; If it's a colon, then that's the <scheme> part, otw no <scheme> part.
|
|
|
|
;;; Remove it.
|
|
|
|
;;; - Then we search backwards from the end for the last reserved char.
|
|
|
|
;;; If it's a sharp, then that's the <fragment-id> part -- remove it.
|
|
|
|
;;; - Then we search backwards from the end for the last reserved char.
|
|
|
|
;;; If it's a question-mark, then that's the <search> part -- remove it.
|
|
|
|
;;; - What's left is the path. Split at slashes. "" -> ("")
|
|
|
|
;;;
|
2001-07-16 07:31:41 -04:00
|
|
|
;;; This scheme is tolerant of the various ways people build broken
|
|
|
|
;;; URI's out there on the Net , p.e. \#= is a reserved character, but
|
|
|
|
;;; used unescaped in the search-part. It was given to me by Dan
|
|
|
|
;;; Connolly of the W3C and slightly modified.
|
2000-09-26 10:35:26 -04:00
|
|
|
|
2001-05-17 12:48:41 -04:00
|
|
|
;;; Returns four values: scheme, path, search, frag-id. Each value is
|
|
|
|
;;; either #f or a string except of the path, which is a nonempty list
|
|
|
|
;;; of string (as mentioned above).
|
2000-09-26 10:35:26 -04:00
|
|
|
|
2001-04-27 12:19:34 -04:00
|
|
|
|
2001-07-16 07:31:41 -04:00
|
|
|
(define uri-reserved (string->char-set ";/#?: ="))
|
2000-09-26 10:35:26 -04:00
|
|
|
|
|
|
|
(define (parse-uri s)
|
|
|
|
(let* ((slen (string-length s))
|
|
|
|
;; Search forwards for colon (or intervening reserved char).
|
2002-04-21 14:55:18 -04:00
|
|
|
(rs1 (string-index s uri-reserved)) ; 1st reserved char
|
2000-09-26 10:35:26 -04:00
|
|
|
(colon (and rs1 (char=? (string-ref s rs1) #\:) rs1))
|
|
|
|
(path-start (if colon (+ colon 1) 0))
|
|
|
|
|
|
|
|
;; Search backwards for # (or intervening reserved char).
|
2002-04-21 14:55:18 -04:00
|
|
|
(rs-last (string-index-right s uri-reserved))
|
2000-09-26 10:35:26 -04:00
|
|
|
(sharp (and rs-last (char=? (string-ref s rs-last) #\#) rs-last))
|
|
|
|
|
|
|
|
;; Search backwards for ? (or intervening reserved char).
|
2001-07-16 07:31:41 -04:00
|
|
|
;; (NB: #\= may be after #\? and before #\#)
|
2002-04-21 14:55:18 -04:00
|
|
|
(rs-penult (string-index-right
|
2001-07-16 07:31:41 -04:00
|
|
|
s
|
|
|
|
(char-set-delete uri-reserved #\=)
|
|
|
|
(or sharp slen)))
|
2000-09-26 10:35:26 -04:00
|
|
|
(ques (and rs-penult (char=? (string-ref s rs-penult) #\?) rs-penult))
|
|
|
|
|
|
|
|
(path-end (or ques sharp slen)))
|
|
|
|
(values (and colon (substring s 0 colon))
|
|
|
|
(split-uri-path s path-start path-end)
|
|
|
|
(and ques (substring s (+ ques 1) (or sharp slen)))
|
|
|
|
(and sharp (substring s (+ sharp 1) slen)))))
|
|
|
|
|
|
|
|
;;; Caution:
|
|
|
|
;;; Don't use this proc until *after* you've parsed the URL -- unescaping
|
|
|
|
;;; might introduce reserved chars (like slashes and colons) that could
|
|
|
|
;;; blow your parse.
|
|
|
|
|
|
|
|
(define (unescape-uri s . maybe-start/end)
|
|
|
|
(let-optionals maybe-start/end ((start 0)
|
|
|
|
(end (string-length s)))
|
|
|
|
(let* ((esc-seq? (lambda (i) (and (< (+ i 2) end)
|
|
|
|
(char=? (string-ref s i) #\%)
|
|
|
|
(hex-digit? (string-ref s (+ i 1)))
|
|
|
|
(hex-digit? (string-ref s (+ i 2))))))
|
|
|
|
(hits (let lp ((i start) (hits 0)) ; count # of esc seqs.
|
|
|
|
(if (< i end)
|
|
|
|
(if (esc-seq? i)
|
|
|
|
(lp (+ i 3) (+ hits 1))
|
|
|
|
(lp (+ i 1) hits))
|
|
|
|
hits))))
|
|
|
|
|
|
|
|
(if (and (zero? hits) (zero? start) (= end (string-length s))) s
|
|
|
|
|
2001-05-17 12:48:41 -04:00
|
|
|
(let* ((nlen (- (- end start) (* hits 2))) ; the new
|
|
|
|
; length of the
|
|
|
|
; unescaped
|
|
|
|
; string
|
2001-05-20 14:57:31 -04:00
|
|
|
(ns (make-string nlen))) ; stores the result
|
2000-09-26 10:35:26 -04:00
|
|
|
|
2001-05-17 12:48:41 -04:00
|
|
|
(let lp ((i start) (j 0)) ; sweap over the string
|
2000-09-26 10:35:26 -04:00
|
|
|
(if (< j nlen)
|
2001-08-20 07:31:03 -04:00
|
|
|
(lp (cond
|
|
|
|
((esc-seq? i) ; unescape
|
2001-05-17 12:48:41 -04:00
|
|
|
; escape-sequence
|
2001-08-20 07:31:03 -04:00
|
|
|
(string-set! ns j
|
|
|
|
(let ((d1 (string-ref s (+ i 1)))
|
|
|
|
(d2 (string-ref s (+ i 2))))
|
|
|
|
(ascii->char (+ (* 16 (hexchar->int d1))
|
|
|
|
(hexchar->int d2)))))
|
|
|
|
(+ i 3))
|
|
|
|
(else (string-set! ns j (string-ref s i))
|
|
|
|
(+ i 1)))
|
2000-09-26 10:35:26 -04:00
|
|
|
(+ j 1))))
|
|
|
|
ns)))))
|
|
|
|
|
|
|
|
(define hex-digit?
|
|
|
|
(let ((hex-digits (string->char-set "0123456789abcdefABCDEF")))
|
|
|
|
(lambda (c) (char-set-contains? hex-digits c))))
|
|
|
|
|
2001-05-17 12:48:41 -04:00
|
|
|
; make use of the fact that numbers and characters are in order in the ascii table
|
|
|
|
(define (hexchar->int c)
|
2000-09-26 10:35:26 -04:00
|
|
|
(- (char->ascii c)
|
|
|
|
(if (char-numeric? c)
|
2001-05-17 12:48:41 -04:00
|
|
|
(char->ascii #\0)
|
2000-09-26 10:35:26 -04:00
|
|
|
(- (if (char-upper-case? c)
|
|
|
|
(char->ascii #\A)
|
|
|
|
(char->ascii #\a))
|
|
|
|
10))))
|
|
|
|
|
|
|
|
(define int->hexchar
|
|
|
|
(let ((table '#(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9
|
|
|
|
#\A #\B #\C #\D #\E #\F)))
|
|
|
|
(lambda (i) (vector-ref table i))))
|
|
|
|
|
|
|
|
|
|
|
|
;;; Caution:
|
|
|
|
;;; All reserved chars (e.g., slash, sharp, colon) get escaped: "=;/#?: "
|
|
|
|
;;; So don't apply this proc to chunks of text with syntactically meaningful
|
|
|
|
;;; reserved chars (e.g., paths with URI slashes or colons) -- they'll be
|
|
|
|
;;; escaped, and lose their special meaning. E.g. it would be a mistake
|
|
|
|
;;; to apply ESCAPE-URI to "//lcs.mit.edu:8001/foo/bar.html" because the
|
|
|
|
;;; slashes and colons would be escaped.
|
|
|
|
|
|
|
|
(define uri-escaped-chars
|
2002-02-23 12:14:48 -05:00
|
|
|
(char-set-complement (char-set-union char-set:letter+digit
|
|
|
|
(string->char-set "$-_@.&!*\"'(),+"))))
|
2000-09-26 10:35:26 -04:00
|
|
|
|
|
|
|
;;; Takes a set of chars to escape. This is because we sometimes need to
|
|
|
|
;;; escape larger sets of chars for different parts of a URI.
|
|
|
|
|
|
|
|
(define (escape-uri s . maybe-escaped-chars)
|
|
|
|
(let-optionals maybe-escaped-chars ((escaped-chars uri-escaped-chars))
|
2002-04-21 14:55:18 -04:00
|
|
|
(let ((nlen (string-fold-right
|
|
|
|
(lambda (c i)
|
|
|
|
(+ i
|
|
|
|
(if (char-set-contains? escaped-chars c)
|
|
|
|
3 1)))
|
|
|
|
0
|
|
|
|
s))) ; new length of escaped string
|
2000-09-26 10:35:26 -04:00
|
|
|
(if (= nlen (string-length s)) s
|
2001-05-17 12:48:41 -04:00
|
|
|
(let ((ns (make-string nlen)))
|
2002-04-25 12:30:14 -04:00
|
|
|
(string-fold-left
|
2001-05-17 12:48:41 -04:00
|
|
|
(lambda (c i) ; replace each occurance of an
|
|
|
|
; character to escape with %ff where ff
|
|
|
|
; is the ascii-code in hexadecimal
|
|
|
|
; notation
|
2001-08-20 07:31:03 -04:00
|
|
|
(+ i (cond
|
|
|
|
((char-set-contains? escaped-chars c)
|
|
|
|
(string-set! ns i #\%)
|
|
|
|
(let* ((d (char->ascii c))
|
|
|
|
(dhi (bitwise-and (arithmetic-shift d -4) #xF))
|
|
|
|
(dlo (bitwise-and d #xF)))
|
|
|
|
(string-set! ns (+ i 1)
|
|
|
|
(int->hexchar dhi))
|
|
|
|
(string-set! ns (+ i 2)
|
|
|
|
(int->hexchar dlo)))
|
|
|
|
3)
|
|
|
|
(else (string-set! ns i c)
|
|
|
|
1))))
|
2002-04-21 14:55:18 -04:00
|
|
|
0
|
2000-09-26 10:35:26 -04:00
|
|
|
s)
|
|
|
|
ns)))))
|
|
|
|
|
|
|
|
|
|
|
|
;;; Four args: context URI's <scheme> : <path> values, and
|
|
|
|
;;; main URI's <scheme> : <path> values.
|
|
|
|
;;; If the path cannot be resolved, return #f #f (this occurs if <path>
|
|
|
|
;;; begins with n sequential slashes, and <context-path> doesn't
|
|
|
|
;;; have that many sequential slashes anywhere). All paths are
|
|
|
|
;;; represented as non-empty lists.
|
|
|
|
|
|
|
|
(define (resolve-uri cscheme cp scheme p)
|
|
|
|
(if scheme (values scheme p) ; If URI has own <scheme>, it is absolute.
|
|
|
|
|
|
|
|
(if (and (pair? p) (string=? (car p) "")) ; Path P begins with a slash.
|
|
|
|
|
|
|
|
(receive (numsl p) ; Count and strip off initial
|
|
|
|
(do ((i 1 (+ i 1)) ; slashes (i.e., initial ""'s)
|
|
|
|
(q (cdr p) (cdr q)))
|
|
|
|
((or (null? q) (not (string=? (car q) "")))
|
|
|
|
(values i q)))
|
|
|
|
|
|
|
|
;; Skip through CP until we find that many sequential /'s.
|
|
|
|
(let lp ((cp-tail cp)
|
|
|
|
(rhead '()) ; CP prefix, reversed.
|
|
|
|
(j 0)) ; J counts sequential /
|
|
|
|
|
2001-08-20 07:31:03 -04:00
|
|
|
(cond
|
|
|
|
((and (pair? cp-tail) (string=? (car cp-tail) "")) ; More ""'s
|
|
|
|
(lp (cdr cp-tail)
|
|
|
|
(cons (car cp-tail) rhead)
|
|
|
|
(+ j 0)))
|
2000-09-26 10:35:26 -04:00
|
|
|
|
2001-08-20 07:31:03 -04:00
|
|
|
((= j numsl) ; Win
|
|
|
|
(values cscheme (simplify-uri-path (rev-append rhead p))))
|
2000-09-26 10:35:26 -04:00
|
|
|
|
2001-08-20 07:31:03 -04:00
|
|
|
((pair? cp-tail) ; Keep looking.
|
|
|
|
(lp (cdr cp-tail)
|
|
|
|
(cons (car cp-tail) rhead)
|
|
|
|
1))
|
2000-09-26 10:35:26 -04:00
|
|
|
|
2001-08-20 07:31:03 -04:00
|
|
|
(else (values #f #f))))) ; Lose.
|
2000-09-26 10:35:26 -04:00
|
|
|
|
|
|
|
|
|
|
|
;; P doesn't begin with a slash.
|
|
|
|
(values cscheme (simplify-uri-path
|
|
|
|
(rev-append (cdr (reverse cp)) ; Drop non-dir part
|
|
|
|
p)))))) ; and append P.
|
|
|
|
|
|
|
|
|
|
|
|
(define (rev-append a b) ; (append (reverse a) b)
|
|
|
|
(let rev-app ((a a) (b b)) ; Should be defined in a list-proc
|
|
|
|
(if (pair? a) ; package, not here.
|
|
|
|
(rev-app (cdr a) (cons (car a) b))
|
|
|
|
b)))
|
|
|
|
|
|
|
|
;;; Cribbed from scsh's fname.scm
|
|
|
|
|
|
|
|
(define (split-uri-path uri start end) ; Split at /'s (infix grammar).
|
|
|
|
(let split ((i start)) ; "" -> ("")
|
2001-08-20 07:31:03 -04:00
|
|
|
(cond
|
|
|
|
((>= i end) '(""))
|
|
|
|
((string-index uri #\/ i) =>
|
|
|
|
(lambda (slash)
|
|
|
|
(cons (substring uri i slash)
|
|
|
|
(split (+ slash 1)))))
|
|
|
|
(else (list (substring uri i end))))))
|
2000-09-26 10:35:26 -04:00
|
|
|
|
|
|
|
|
|
|
|
;;; The elements of PLIST must be escaped in case they contain slashes.
|
|
|
|
;;; This procedure doesn't escape them for you; you must do that yourself:
|
|
|
|
;;; (uri-path-list->path (map escape-uri pathlist))
|
|
|
|
|
|
|
|
(define (uri-path-list->path plist)
|
2001-10-08 13:33:13 -04:00
|
|
|
(string-join plist "/")) ; Insert slashes between elts of PLIST.
|
2000-09-26 10:35:26 -04:00
|
|
|
|
|
|
|
|
2001-05-20 14:57:31 -04:00
|
|
|
;;; Remove . and <segment>/.. elements from path. The result is a
|
|
|
|
;;; (maybe empty) list representing a path that does not contain "."
|
|
|
|
;;; and ".." elements neither at the beginning nor somewhere else. I
|
|
|
|
;;; tried to follow RFC2396 here. The procedure returns #f if the path
|
|
|
|
;;; tries to back up past root (like "//.." or "/foo/../.."). "//" may
|
|
|
|
;;; occur somewhere in the path but not being backed up. Usually,
|
|
|
|
;;; relative paths are intended to be used with a base
|
|
|
|
;;; url. Accordingly to RFC2396 (as I hope) relative paths are
|
|
|
|
;;; considered not to start with "/". They are appended to a base
|
|
|
|
;;; URL-path and then simplified. So before you start to simplify a
|
|
|
|
;;; URL try to find out if it is a relative path (i.e. it does not
|
|
|
|
;;; start with a "/").
|
|
|
|
|
2001-07-13 13:21:39 -04:00
|
|
|
(define (simplify-uri-path p)
|
|
|
|
(if (null? p) #f ; P must be non-null
|
2001-05-20 14:57:31 -04:00
|
|
|
(let lp ((path-list (cdr p))
|
|
|
|
(stack (list (car p))))
|
|
|
|
(if (null? path-list) ; we're done
|
|
|
|
(reverse stack)
|
|
|
|
(cond
|
|
|
|
((string=? (car path-list) "..") ; back up
|
|
|
|
; neither the empty path nor root
|
|
|
|
(if (not (or (null? stack) (string=? (car stack) "")))
|
|
|
|
(lp (cdr path-list) (cdr stack))
|
|
|
|
#f))
|
|
|
|
((string=? (car path-list) ".") ; leave this
|
|
|
|
(lp (cdr path-list) stack))
|
|
|
|
((string=? (car path-list) "") ; back to root
|
|
|
|
(lp (cdr path-list) '("")))
|
|
|
|
(else ; usual segment
|
|
|
|
(lp (cdr path-list) (cons (car path-list) stack))))))))
|
|
|
|
|
2001-07-13 13:21:39 -04:00
|
|
|
|