sunet/scheme/lib/uri.scm

;;; -*- Scheme -*-
;;; Copyright (c) 1995 by Olin Shivers.

;;; URI syntax -- [scheme] : path [? search ] [# fragmentid]

;;; References:
;;; - http://www.w3.org/Addressing/rfc1630.txt
;;;   Original RFC
;;; - http://www.w3.org/hypertext/WWW/Addressing/URL/URI_Overview.html
;;;   General Web page of URI pointers.

;;; I wrote a URI parser that slavishly obeyed Tim Berners-Lee's
;;; spec (rfc 1630). This was a waste of time, as most URL's do not
;;; obey his spec, which is incomplete and inconsistent with the URL spec
;;; in any event. This parser is much simpler. It parses a URI into four
;;; fields:
;;;     [ <scheme> ] : <path> [ ? <search> ] [ # fragid ]
;;; The returned fields are *not* unescaped, as the rules for parsing the
;;; <path> component in particular need unescaped text, and are dependent
;;; on <scheme>. The URL parser is responsible for doing this.
;;; If the <scheme>, <search> or <fragid> portions are not specified,
;;; they are #f. Otherwise, <scheme>, <search>, and <fragid> are strings;
;;; <path> is a non-empty string list.

;;; The parsing technique is inwards from both ends.
;;; - First we search forwards for the first reserved char (= ; / # ? : space)
;;;   If it's a colon, then that's the <scheme> part, otw no <scheme> part.
;;;   Remove it.
;;; - Then we search backwards from the end for the last reserved char.
;;;   If it's a sharp, then that's the <fragment-id> part -- remove it.
;;; - Then we search backwards from the end for the last reserved char.
;;;   If it's a question-mark, then that's the <search> part -- remove it.
;;; - What's left is the path. Split at slashes. "" -> ("")
;;;
;;; This scheme is tolerant of the various ways people build broken
;;; URI's out there on the Net , p.e. \#= is a reserved character, but
;;; used unescaped in the search-part. It was given to me by Dan
;;; Connolly of the W3C and slightly modified.

;;; Returns four values: scheme, path, search, frag-id.  Each value is
;;; either #f or a string except of the path, which is a nonempty list
;;; of string (as mentioned above).


(define uri-reserved (string->char-set ";/#?: ="))

(define (parse-uri s)
  (let* ((slen (string-length s))
	 ;; Search forwards for colon (or intervening reserved char).
	 (rs1 (string-index s uri-reserved))	; 1st reserved char
	 (colon (and rs1 (char=? (string-ref s rs1) #\:) rs1))
	 (path-start (if colon (+ colon 1) 0))

	 ;; Search backwards for # (or intervening reserved char).
	 (rs-last (string-index-right s uri-reserved))
	 (sharp (and rs-last (char=? (string-ref s rs-last) #\#) rs-last))

	 ;; Search backwards for ? (or intervening reserved char).
	 ;; (NB: #\= may be after #\? and before #\#)
	 (rs-penult (string-index-right
		     s                      
		     (char-set-delete uri-reserved #\=)
		     (or sharp slen))) 
	 (ques (and rs-penult (char=? (string-ref s rs-penult) #\?) rs-penult))

	 (path-end (or ques sharp slen)))
    (values (and colon (substring s 0 colon))
	    (split-uri-path s path-start path-end)
	    (and ques (substring s (+ ques 1) (or sharp slen)))
	    (and sharp (substring s (+ sharp 1) slen)))))

;;; Caution:
;;; Don't use this proc until *after* you've parsed the URL -- unescaping
;;; might introduce reserved chars (like slashes and colons) that could
;;; blow your parse.

(define (unescape-uri s . maybe-start/end)
  (let-optionals maybe-start/end ((start 0)
				  (end (string-length s)))
    (let* ((esc-seq? (lambda (i) (and (< (+ i 2) end)
				      (char=? (string-ref s i) #\%)
				      (hex-digit? (string-ref s (+ i 1)))
				      (hex-digit? (string-ref s (+ i 2))))))
	   (hits (let lp ((i start) (hits 0)) ; count # of esc seqs.
		   (if (< i end)
		       (if (esc-seq? i)
			   (lp (+ i 3) (+ hits 1))
			   (lp (+ i 1) hits))
		       hits))))
	 
      (if (and (zero? hits) (zero? start) (= end (string-length s))) s

	  (let* ((nlen (- (- end start) (* hits 2)))   ; the new
						       ; length of the
						       ; unescaped
						       ; string
		 (ns (make-string nlen)))              ; stores the result

	    (let lp ((i start) (j 0))                  ; sweap over the string
	      (if (< j nlen)
		  (lp (cond 
		       ((esc-seq? i)                 ; unescape
						       ; escape-sequence
			(string-set! ns j
				     (let ((d1 (string-ref s (+ i 1)))
					   (d2 (string-ref s (+ i 2))))
				       (ascii->char (+ (* 16 (hexchar->int d1))
						       (hexchar->int d2)))))
			(+ i 3))
		       (else (string-set! ns j (string-ref s i))  
			     (+ i 1)))
		      (+ j 1))))
	    ns)))))

(define hex-digit?
  (let ((hex-digits (string->char-set "0123456789abcdefABCDEF")))
    (lambda (c) (char-set-contains? hex-digits c))))

; make use of the fact that numbers and characters are in order in the ascii table
(define (hexchar->int c)      
  (- (char->ascii c) 
     (if (char-numeric? c)
	 (char->ascii #\0)         
	 (- (if (char-upper-case? c)
		(char->ascii #\A)
		(char->ascii #\a))
	    10))))

(define int->hexchar
  (let ((table '#(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9
		  #\A #\B #\C #\D #\E #\F)))
    (lambda (i) (vector-ref table i))))
  

;;; Caution:
;;; All reserved chars (e.g., slash, sharp, colon) get escaped: "=;/#?: "
;;; So don't apply this proc to chunks of text with syntactically meaningful
;;; reserved chars (e.g., paths with URI slashes or colons) -- they'll be 
;;; escaped, and lose their special meaning. E.g. it would be a mistake
;;; to apply ESCAPE-URI to "//lcs.mit.edu:8001/foo/bar.html" because the
;;; slashes and colons would be escaped.

(define uri-escaped-chars
  (char-set-complement (char-set-union char-set:letter+digit
				       (string->char-set "$-_@.&!*\"'(),+"))))

;;; Takes a set of chars to escape. This is because we sometimes need to
;;; escape larger sets of chars for different parts of a URI.

(define (escape-uri s . maybe-escaped-chars)
  (let-optionals maybe-escaped-chars ((escaped-chars uri-escaped-chars))
    (let ((nlen (string-fold
		 (lambda (c i)
		   (+ i
		      (if (char-set-contains? escaped-chars c)
			  3 1)))
		 0
		 s)))    ; new length of escaped string
      (if (= nlen (string-length s)) s
	  (let ((ns (make-string nlen))) 
	    (string-fold
	     (lambda (c i)     ; replace each occurance of an
			       ; character to escape with %ff where ff
			       ; is the ascii-code in hexadecimal
			       ; notation
	       (+ i (cond 
		     ((char-set-contains? escaped-chars c)
		      (string-set! ns i #\%)
		      (let* ((d (char->ascii c))
			     (dhi (bitwise-and (arithmetic-shift d -4) #xF))
			     (dlo (bitwise-and d #xF)))
			(string-set! ns (+ i 1)
				     (int->hexchar dhi))
			(string-set! ns (+ i 2)
				     (int->hexchar dlo)))
		      3)
		     (else (string-set! ns i c)
			   1))))
	     0
	     s)
	    ns)))))


;;; Four args: context URI's  <scheme> : <path> values, and
;;;            main URI's     <scheme> : <path> values.
;;; If the path cannot be resolved, return #f #f (this occurs if <path>
;;; begins with n sequential slashes, and <context-path> doesn't
;;; have that many sequential slashes anywhere). All paths are
;;; represented as non-empty lists.

(define (resolve-uri cscheme cp  scheme p)
  (if scheme (values scheme p)	; If URI has own <scheme>, it is absolute.

      (if (and (pair? p) (string=? (car p) ""))	  ; Path P begins with a slash.

	  (receive (numsl p)			; Count and strip off initial 
	      (do ((i 1       (+ i 1))  	; slashes (i.e., initial ""'s)
		   (q (cdr p) (cdr q)))
		  ((or (null? q) (not (string=? (car q) "")))
		   (values i q)))

	    ;; Skip through CP until we find that many sequential /'s.
	    (let lp ((cp-tail cp)
		     (rhead '())			; CP prefix, reversed.
		     (j 0))				; J counts sequential /
	      
	      (cond
	       ((and (pair? cp-tail) (string=? (car cp-tail) "")) ; More ""'s
		(lp (cdr cp-tail)
		    (cons (car cp-tail) rhead)
		    (+ j 0)))

	       ((= j numsl)				; Win
		(values cscheme (simplify-uri-path (rev-append rhead p))))

	       ((pair? cp-tail)				; Keep looking.
		(lp (cdr cp-tail)
		    (cons (car cp-tail) rhead)
		    1))

	       (else (values #f #f)))))		; Lose.


	  ;; P doesn't begin with a slash.
	  (values cscheme (simplify-uri-path
			    (rev-append (cdr (reverse cp)) ; Drop non-dir part
					p))))))		   ; and append P.


(define (rev-append a b)		; (append (reverse a) b)
  (let rev-app ((a a) (b b))		; Should be defined in a list-proc
    (if (pair? a)			; package, not here.
	(rev-app (cdr a) (cons (car a) b))
	b)))

;;; Cribbed from scsh's fname.scm

(define (split-uri-path uri start end)		; Split at /'s (infix grammar).
  (let split ((i start))			; "" -> ("")
    (cond
     ((>= i end) '(""))
     ((string-index uri #\/ i) =>
      (lambda (slash)
	(cons (substring uri i slash)
	      (split (+ slash 1)))))
     (else (list (substring uri i end))))))


;;; The elements of PLIST must be escaped in case they contain slashes.
;;; This procedure doesn't escape them for you; you must do that yourself:
;;;     (uri-path-list->path (map escape-uri pathlist))

(define (uri-path-list->path plist)
  (string-join plist "/"))		; Insert slashes between elts of PLIST.


;;; Remove . and <segment>/.. elements from path.  The result is a
;;; (maybe empty) list representing a path that does not contain "." 
;;; and ".." elements neither at the beginning nor somewhere else. I
;;; tried to follow RFC2396 here. The procedure returns #f if the path
;;; tries to back up past root (like "//.." or "/foo/../.."). "//" may
;;; occur somewhere in the path but not being backed up. Usually,
;;; relative paths are intended to be used with a base
;;; url. Accordingly to RFC2396 (as I hope) relative paths are
;;; considered not to start with "/". They are appended to a base
;;; URL-path and then simplified. So before you start to simplify a
;;; URL try to find out if it is a relative path (i.e. it does not
;;; start with a "/").

(define (simplify-uri-path p)    
  (if (null? p) #f                               ; P must be non-null
      (let lp ((path-list (cdr p))
	       (stack (list (car p))))
	(if (null? path-list)                   ; we're done
	  (reverse stack)
	  (cond
	   ((string=? (car path-list) "..")     ; back up
					     ; neither the empty path nor root
	    (if (not (or (null? stack) (string=? (car stack) "")))  
		(lp (cdr path-list) (cdr stack))
		#f))
	   ((string=? (car path-list) ".")      ; leave this
	    (lp (cdr path-list) stack))
	   ((string=? (car path-list) "")       ; back to root
	    (lp (cdr path-list) '("")))
	   (else                                ; usual segment
	    (lp (cdr path-list) (cons (car path-list) stack))))))))
Moved from top-level directory. 2002-06-08 11:07:01 -04:00			`;;; -- Scheme --`
			`;;; Copyright (c) 1995 by Olin Shivers.`

			`;;; URI syntax -- [scheme] : path [? search ] [# fragmentid]`

			`;;; References:`
Remove Olin's import notes which are bound to be inaccurate by now. Fix a few RFC URLs. 2002-08-26 07:18:44 -04:00			`;;; - http://www.w3.org/Addressing/rfc1630.txt`
Moved from top-level directory. 2002-06-08 11:07:01 -04:00			`;;; Original RFC`
			`;;; - http://www.w3.org/hypertext/WWW/Addressing/URL/URI_Overview.html`
			`;;; General Web page of URI pointers.`

			`;;; I wrote a URI parser that slavishly obeyed Tim Berners-Lee's`
			`;;; spec (rfc 1630). This was a waste of time, as most URL's do not`
			`;;; obey his spec, which is incomplete and inconsistent with the URL spec`
			`;;; in any event. This parser is much simpler. It parses a URI into four`
			`;;; fields:`
			`;;; [ <scheme> ] : <path> [ ? <search> ] [ # fragid ]`
			`;;; The returned fields are not unescaped, as the rules for parsing the`
			`;;; <path> component in particular need unescaped text, and are dependent`
			`;;; on <scheme>. The URL parser is responsible for doing this.`
			`;;; If the <scheme>, <search> or <fragid> portions are not specified,`
			`;;; they are #f. Otherwise, <scheme>, <search>, and <fragid> are strings;`
			`;;; <path> is a non-empty string list.`

			`;;; The parsing technique is inwards from both ends.`
			`;;; - First we search forwards for the first reserved char (= ; / # ? : space)`
			`;;; If it's a colon, then that's the <scheme> part, otw no <scheme> part.`
			`;;; Remove it.`
			`;;; - Then we search backwards from the end for the last reserved char.`
			`;;; If it's a sharp, then that's the <fragment-id> part -- remove it.`
			`;;; - Then we search backwards from the end for the last reserved char.`
			`;;; If it's a question-mark, then that's the <search> part -- remove it.`
			`;;; - What's left is the path. Split at slashes. "" -> ("")`
			`;;;`
			`;;; This scheme is tolerant of the various ways people build broken`
			`;;; URI's out there on the Net , p.e. \#= is a reserved character, but`
			`;;; used unescaped in the search-part. It was given to me by Dan`
			`;;; Connolly of the W3C and slightly modified.`

			`;;; Returns four values: scheme, path, search, frag-id. Each value is`
			`;;; either #f or a string except of the path, which is a nonempty list`
			`;;; of string (as mentioned above).`


			`(define uri-reserved (string->char-set ";/#?: ="))`

			`(define (parse-uri s)`
			`(let* ((slen (string-length s))`
			`;; Search forwards for colon (or intervening reserved char).`
			`(rs1 (string-index s uri-reserved)) ; 1st reserved char`
			`(colon (and rs1 (char=? (string-ref s rs1) #\:) rs1))`
			`(path-start (if colon (+ colon 1) 0))`

			`;; Search backwards for # (or intervening reserved char).`
			`(rs-last (string-index-right s uri-reserved))`
			`(sharp (and rs-last (char=? (string-ref s rs-last) #\#) rs-last))`

			`;; Search backwards for ? (or intervening reserved char).`
			`;; (NB: #\= may be after #\? and before #\#)`
			`(rs-penult (string-index-right`
			`s`
			`(char-set-delete uri-reserved #\=)`
			`(or sharp slen)))`
			`(ques (and rs-penult (char=? (string-ref s rs-penult) #\?) rs-penult))`

			`(path-end (or ques sharp slen)))`
			`(values (and colon (substring s 0 colon))`
			`(split-uri-path s path-start path-end)`
			`(and ques (substring s (+ ques 1) (or sharp slen)))`
			`(and sharp (substring s (+ sharp 1) slen)))))`

			`;;; Caution:`
			`;;; Don't use this proc until after you've parsed the URL -- unescaping`
			`;;; might introduce reserved chars (like slashes and colons) that could`
			`;;; blow your parse.`

			`(define (unescape-uri s . maybe-start/end)`
			`(let-optionals maybe-start/end ((start 0)`
			`(end (string-length s)))`
			`(let* ((esc-seq? (lambda (i) (and (< (+ i 2) end)`
			`(char=? (string-ref s i) #\%)`
			`(hex-digit? (string-ref s (+ i 1)))`
			`(hex-digit? (string-ref s (+ i 2))))))`
			`(hits (let lp ((i start) (hits 0)) ; count # of esc seqs.`
			`(if (< i end)`
			`(if (esc-seq? i)`
			`(lp (+ i 3) (+ hits 1))`
			`(lp (+ i 1) hits))`
			`hits))))`

			`(if (and (zero? hits) (zero? start) (= end (string-length s))) s`

			`(let* ((nlen (- (- end start) (* hits 2))) ; the new`
			`; length of the`
			`; unescaped`
			`; string`
			`(ns (make-string nlen))) ; stores the result`

			`(let lp ((i start) (j 0)) ; sweap over the string`
			`(if (< j nlen)`
			`(lp (cond`
			`((esc-seq? i) ; unescape`
			`; escape-sequence`
			`(string-set! ns j`
			`(let ((d1 (string-ref s (+ i 1)))`
			`(d2 (string-ref s (+ i 2))))`
			`(ascii->char (+ (* 16 (hexchar->int d1))`
			`(hexchar->int d2)))))`
			`(+ i 3))`
			`(else (string-set! ns j (string-ref s i))`
			`(+ i 1)))`
			`(+ j 1))))`
			`ns)))))`

			`(define hex-digit?`
			`(let ((hex-digits (string->char-set "0123456789abcdefABCDEF")))`
			`(lambda (c) (char-set-contains? hex-digits c))))`

			`; make use of the fact that numbers and characters are in order in the ascii table`
			`(define (hexchar->int c)`
			`(- (char->ascii c)`
			`(if (char-numeric? c)`
			`(char->ascii #\0)`
			`(- (if (char-upper-case? c)`
			`(char->ascii #\A)`
			`(char->ascii #\a))`
			`10))))`

			`(define int->hexchar`
			`(let ((table '#(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9`
			`#\A #\B #\C #\D #\E #\F)))`
			`(lambda (i) (vector-ref table i))))`


			`;;; Caution:`
			`;;; All reserved chars (e.g., slash, sharp, colon) get escaped: "=;/#?: "`
			`;;; So don't apply this proc to chunks of text with syntactically meaningful`
			`;;; reserved chars (e.g., paths with URI slashes or colons) -- they'll be`
			`;;; escaped, and lose their special meaning. E.g. it would be a mistake`
			`;;; to apply ESCAPE-URI to "//lcs.mit.edu:8001/foo/bar.html" because the`
			`;;; slashes and colons would be escaped.`

			`(define uri-escaped-chars`
			`(char-set-complement (char-set-union char-set:letter+digit`
			`(string->char-set "$-_@.&!*\"'(),+"))))`

			`;;; Takes a set of chars to escape. This is because we sometimes need to`
			`;;; escape larger sets of chars for different parts of a URI.`

			`(define (escape-uri s . maybe-escaped-chars)`
			`(let-optionals maybe-escaped-chars ((escaped-chars uri-escaped-chars))`
			`(let ((nlen (string-fold`
			`(lambda (c i)`
			`(+ i`
			`(if (char-set-contains? escaped-chars c)`
			`3 1)))`
			`0`
			`s))) ; new length of escaped string`
			`(if (= nlen (string-length s)) s`
			`(let ((ns (make-string nlen)))`
			`(string-fold`
			`(lambda (c i) ; replace each occurance of an`
			`; character to escape with %ff where ff`
			`; is the ascii-code in hexadecimal`
			`; notation`
			`(+ i (cond`
			`((char-set-contains? escaped-chars c)`
			`(string-set! ns i #\%)`
			`(let* ((d (char->ascii c))`
			`(dhi (bitwise-and (arithmetic-shift d -4) #xF))`
			`(dlo (bitwise-and d #xF)))`
			`(string-set! ns (+ i 1)`
			`(int->hexchar dhi))`
			`(string-set! ns (+ i 2)`
			`(int->hexchar dlo)))`
			`3)`
			`(else (string-set! ns i c)`
			`1))))`
			`0`
			`s)`
			`ns)))))`


			`;;; Four args: context URI's <scheme> : <path> values, and`
			`;;; main URI's <scheme> : <path> values.`
			`;;; If the path cannot be resolved, return #f #f (this occurs if <path>`
			`;;; begins with n sequential slashes, and <context-path> doesn't`
			`;;; have that many sequential slashes anywhere). All paths are`
			`;;; represented as non-empty lists.`

			`(define (resolve-uri cscheme cp scheme p)`
			`(if scheme (values scheme p) ; If URI has own <scheme>, it is absolute.`

			`(if (and (pair? p) (string=? (car p) "")) ; Path P begins with a slash.`

			`(receive (numsl p) ; Count and strip off initial`
			`(do ((i 1 (+ i 1)) ; slashes (i.e., initial ""'s)`
			`(q (cdr p) (cdr q)))`
			`((or (null? q) (not (string=? (car q) "")))`
			`(values i q)))`

			`;; Skip through CP until we find that many sequential /'s.`
			`(let lp ((cp-tail cp)`
			`(rhead '()) ; CP prefix, reversed.`
			`(j 0)) ; J counts sequential /`

			`(cond`
			`((and (pair? cp-tail) (string=? (car cp-tail) "")) ; More ""'s`
			`(lp (cdr cp-tail)`
			`(cons (car cp-tail) rhead)`
			`(+ j 0)))`

			`((= j numsl) ; Win`
			`(values cscheme (simplify-uri-path (rev-append rhead p))))`

			`((pair? cp-tail) ; Keep looking.`
			`(lp (cdr cp-tail)`
			`(cons (car cp-tail) rhead)`
			`1))`

			`(else (values #f #f))))) ; Lose.`


			`;; P doesn't begin with a slash.`
			`(values cscheme (simplify-uri-path`
			`(rev-append (cdr (reverse cp)) ; Drop non-dir part`
			`p)))))) ; and append P.`


			`(define (rev-append a b) ; (append (reverse a) b)`
			`(let rev-app ((a a) (b b)) ; Should be defined in a list-proc`
			`(if (pair? a) ; package, not here.`
			`(rev-app (cdr a) (cons (car a) b))`
			`b)))`

			`;;; Cribbed from scsh's fname.scm`

			`(define (split-uri-path uri start end) ; Split at /'s (infix grammar).`
			`(let split ((i start)) ; "" -> ("")`
			`(cond`
			`((>= i end) '(""))`
			`((string-index uri #\/ i) =>`
			`(lambda (slash)`
			`(cons (substring uri i slash)`
			`(split (+ slash 1)))))`
			`(else (list (substring uri i end))))))`


			`;;; The elements of PLIST must be escaped in case they contain slashes.`
			`;;; This procedure doesn't escape them for you; you must do that yourself:`
			`;;; (uri-path-list->path (map escape-uri pathlist))`

			`(define (uri-path-list->path plist)`
			`(string-join plist "/")) ; Insert slashes between elts of PLIST.`


			`;;; Remove . and <segment>/.. elements from path. The result is a`
			`;;; (maybe empty) list representing a path that does not contain "."`
			`;;; and ".." elements neither at the beginning nor somewhere else. I`
			`;;; tried to follow RFC2396 here. The procedure returns #f if the path`
			`;;; tries to back up past root (like "//.." or "/foo/../.."). "//" may`
			`;;; occur somewhere in the path but not being backed up. Usually,`
			`;;; relative paths are intended to be used with a base`
			`;;; url. Accordingly to RFC2396 (as I hope) relative paths are`
			`;;; considered not to start with "/". They are appended to a base`
			`;;; URL-path and then simplified. So before you start to simplify a`
			`;;; URL try to find out if it is a relative path (i.e. it does not`
			`;;; start with a "/").`

			`(define (simplify-uri-path p)`
			`(if (null? p) #f ; P must be non-null`
			`(let lp ((path-list (cdr p))`
			`(stack (list (car p))))`
			`(if (null? path-list) ; we're done`
			`(reverse stack)`
			`(cond`
			`((string=? (car path-list) "..") ; back up`
			`; neither the empty path nor root`
			`(if (not (or (null? stack) (string=? (car stack) "")))`
			`(lp (cdr path-list) (cdr stack))`
			`#f))`
			`((string=? (car path-list) ".") ; leave this`
			`(lp (cdr path-list) stack))`
			`((string=? (car path-list) "") ; back to root`
			`(lp (cdr path-list) '("")))`
			`(else ; usual segment`
			`(lp (cdr path-list) (cons (car path-list) stack))))))))`