;;; -*- Scheme -*- ;;; Copyright (c) 1995 by Olin Shivers. ;;; URI syntax -- [scheme] : path [? search ] [# fragmentid] ;;; References: ;;; - http://www.w3.org/Addressing/rfc1630.txt ;;; Original RFC ;;; - http://www.w3.org/hypertext/WWW/Addressing/URL/URI_Overview.html ;;; General Web page of URI pointers. ;;; I wrote a URI parser that slavishly obeyed Tim Berners-Lee's ;;; spec (rfc 1630). This was a waste of time, as most URL's do not ;;; obey his spec, which is incomplete and inconsistent with the URL spec ;;; in any event. This parser is much simpler. It parses a URI into four ;;; fields: ;;; [ ] : [ ? ] [ # fragid ] ;;; The returned fields are *not* unescaped, as the rules for parsing the ;;; component in particular need unescaped text, and are dependent ;;; on . The URL parser is responsible for doing this. ;;; If the , or portions are not specified, ;;; they are #f. Otherwise, , , and are strings; ;;; is a non-empty string list. ;;; The parsing technique is inwards from both ends. ;;; - First we search forwards for the first reserved char (= ; / # ? : space) ;;; If it's a colon, then that's the part, otw no part. ;;; Remove it. ;;; - Then we search backwards from the end for the last reserved char. ;;; If it's a sharp, then that's the part -- remove it. ;;; - Then we search backwards from the end for the last reserved char. ;;; If it's a question-mark, then that's the part -- remove it. ;;; - What's left is the path. Split at slashes. "" -> ("") ;;; ;;; This scheme is tolerant of the various ways people build broken ;;; URI's out there on the Net , p.e. \#= is a reserved character, but ;;; used unescaped in the search-part. It was given to me by Dan ;;; Connolly of the W3C and slightly modified. ;;; Returns four values: scheme, path, search, frag-id. Each value is ;;; either #f or a string except of the path, which is a nonempty list ;;; of string (as mentioned above). (define uri-reserved (string->char-set ";/#?: =")) (define (parse-uri s) (let* ((slen (string-length s)) ;; Search forwards for colon (or intervening reserved char). (rs1 (string-index s uri-reserved)) ; 1st reserved char (colon (and rs1 (char=? (string-ref s rs1) #\:) rs1)) (path-start (if colon (+ colon 1) 0)) ;; Search backwards for # (or intervening reserved char). (rs-last (string-index-right s uri-reserved)) (sharp (and rs-last (char=? (string-ref s rs-last) #\#) rs-last)) ;; Search backwards for ? (or intervening reserved char). ;; (NB: #\= may be after #\? and before #\#) (rs-penult (string-index-right s (char-set-delete uri-reserved #\=) (or sharp slen))) (ques (and rs-penult (char=? (string-ref s rs-penult) #\?) rs-penult)) (path-end (or ques sharp slen))) (values (and colon (substring s 0 colon)) (split-uri-path s path-start path-end) (and ques (substring s (+ ques 1) (or sharp slen))) (and sharp (substring s (+ sharp 1) slen))))) ;;; Caution: ;;; Don't use this proc until *after* you've parsed the URL -- unescaping ;;; might introduce reserved chars (like slashes and colons) that could ;;; blow your parse. (define (unescape-uri s . maybe-start/end) (let-optionals maybe-start/end ((start 0) (end (string-length s))) (let* ((esc-seq? (lambda (i) (and (< (+ i 2) end) (char=? (string-ref s i) #\%) (hex-digit? (string-ref s (+ i 1))) (hex-digit? (string-ref s (+ i 2)))))) (hits (let lp ((i start) (hits 0)) ; count # of esc seqs. (if (< i end) (if (esc-seq? i) (lp (+ i 3) (+ hits 1)) (lp (+ i 1) hits)) hits)))) (if (and (zero? hits) (zero? start) (= end (string-length s))) s (let* ((nlen (- (- end start) (* hits 2))) ; the new ; length of the ; unescaped ; string (ns (make-string nlen))) ; stores the result (let lp ((i start) (j 0)) ; sweap over the string (if (< j nlen) (lp (cond ((esc-seq? i) ; unescape ; escape-sequence (string-set! ns j (let ((d1 (string-ref s (+ i 1))) (d2 (string-ref s (+ i 2)))) (ascii->char (+ (* 16 (hexchar->int d1)) (hexchar->int d2))))) (+ i 3)) (else (string-set! ns j (string-ref s i)) (+ i 1))) (+ j 1)))) ns))))) (define hex-digit? (let ((hex-digits (string->char-set "0123456789abcdefABCDEF"))) (lambda (c) (char-set-contains? hex-digits c)))) ; make use of the fact that numbers and characters are in order in the ascii table (define (hexchar->int c) (- (char->ascii c) (if (char-numeric? c) (char->ascii #\0) (- (if (char-upper-case? c) (char->ascii #\A) (char->ascii #\a)) 10)))) (define int->hexchar (let ((table '#(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9 #\A #\B #\C #\D #\E #\F))) (lambda (i) (vector-ref table i)))) ;;; Caution: ;;; All reserved chars (e.g., slash, sharp, colon) get escaped: "=;/#?: " ;;; So don't apply this proc to chunks of text with syntactically meaningful ;;; reserved chars (e.g., paths with URI slashes or colons) -- they'll be ;;; escaped, and lose their special meaning. E.g. it would be a mistake ;;; to apply ESCAPE-URI to "//lcs.mit.edu:8001/foo/bar.html" because the ;;; slashes and colons would be escaped. (define uri-escaped-chars (char-set-complement (char-set-union char-set:letter+digit (string->char-set "$-_@.&!*\"'(),+")))) ;;; Takes a set of chars to escape. This is because we sometimes need to ;;; escape larger sets of chars for different parts of a URI. (define (escape-uri s . maybe-escaped-chars) (let-optionals maybe-escaped-chars ((escaped-chars uri-escaped-chars)) (let ((nlen (string-fold (lambda (c i) (+ i (if (char-set-contains? escaped-chars c) 3 1))) 0 s))) ; new length of escaped string (if (= nlen (string-length s)) s (let ((ns (make-string nlen))) (string-fold (lambda (c i) ; replace each occurance of an ; character to escape with %ff where ff ; is the ascii-code in hexadecimal ; notation (+ i (cond ((char-set-contains? escaped-chars c) (string-set! ns i #\%) (let* ((d (char->ascii c)) (dhi (bitwise-and (arithmetic-shift d -4) #xF)) (dlo (bitwise-and d #xF))) (string-set! ns (+ i 1) (int->hexchar dhi)) (string-set! ns (+ i 2) (int->hexchar dlo))) 3) (else (string-set! ns i c) 1)))) 0 s) ns))))) ;;; Four args: context URI's : values, and ;;; main URI's : values. ;;; If the path cannot be resolved, return #f #f (this occurs if ;;; begins with n sequential slashes, and doesn't ;;; have that many sequential slashes anywhere). All paths are ;;; represented as non-empty lists. (define (resolve-uri cscheme cp scheme p) (if scheme (values scheme p) ; If URI has own , it is absolute. (if (and (pair? p) (string=? (car p) "")) ; Path P begins with a slash. (receive (numsl p) ; Count and strip off initial (do ((i 1 (+ i 1)) ; slashes (i.e., initial ""'s) (q (cdr p) (cdr q))) ((or (null? q) (not (string=? (car q) ""))) (values i q))) ;; Skip through CP until we find that many sequential /'s. (let lp ((cp-tail cp) (rhead '()) ; CP prefix, reversed. (j 0)) ; J counts sequential / (cond ((and (pair? cp-tail) (string=? (car cp-tail) "")) ; More ""'s (lp (cdr cp-tail) (cons (car cp-tail) rhead) (+ j 0))) ((= j numsl) ; Win (values cscheme (simplify-uri-path (rev-append rhead p)))) ((pair? cp-tail) ; Keep looking. (lp (cdr cp-tail) (cons (car cp-tail) rhead) 1)) (else (values #f #f))))) ; Lose. ;; P doesn't begin with a slash. (values cscheme (simplify-uri-path (rev-append (cdr (reverse cp)) ; Drop non-dir part p)))))) ; and append P. (define (rev-append a b) ; (append (reverse a) b) (let rev-app ((a a) (b b)) ; Should be defined in a list-proc (if (pair? a) ; package, not here. (rev-app (cdr a) (cons (car a) b)) b))) ;;; Cribbed from scsh's fname.scm (define (split-uri-path uri start end) ; Split at /'s (infix grammar). (let split ((i start)) ; "" -> ("") (cond ((>= i end) '("")) ((string-index uri #\/ i) => (lambda (slash) (cons (substring uri i slash) (split (+ slash 1))))) (else (list (substring uri i end)))))) ;;; The elements of PLIST must be escaped in case they contain slashes. ;;; This procedure doesn't escape them for you; you must do that yourself: ;;; (uri-path-list->path (map escape-uri pathlist)) (define (uri-path-list->path plist) (string-join plist "/")) ; Insert slashes between elts of PLIST. ;;; Remove . and /.. elements from path. The result is a ;;; (maybe empty) list representing a path that does not contain "." ;;; and ".." elements neither at the beginning nor somewhere else. I ;;; tried to follow RFC2396 here. The procedure returns #f if the path ;;; tries to back up past root (like "//.." or "/foo/../.."). "//" may ;;; occur somewhere in the path but not being backed up. Usually, ;;; relative paths are intended to be used with a base ;;; url. Accordingly to RFC2396 (as I hope) relative paths are ;;; considered not to start with "/". They are appended to a base ;;; URL-path and then simplified. So before you start to simplify a ;;; URL try to find out if it is a relative path (i.e. it does not ;;; start with a "/"). (define (simplify-uri-path p) (if (null? p) #f ; P must be non-null (let lp ((path-list (cdr p)) (stack (list (car p)))) (if (null? path-list) ; we're done (reverse stack) (cond ((string=? (car path-list) "..") ; back up ; neither the empty path nor root (if (not (or (null? stack) (string=? (car stack) ""))) (lp (cdr path-list) (cdr stack)) #f)) ((string=? (car path-list) ".") ; leave this (lp (cdr path-list) stack)) ((string=? (car path-list) "") ; back to root (lp (cdr path-list) '(""))) (else ; usual segment (lp (cdr path-list) (cons (car path-list) stack))))))))