sunet/uri.scm

;;; -*- Scheme -*-
;;; Copyright (c) 1995 by Olin Shivers.

;;; URI syntax -- [scheme] : path [? search ] [# fragmentid]

;;; Imports and non-R4RS'isms
;;;	let-optionals
;;;	receive values				(MV return)
;;;     ascii->char char->ascii
;;;	index rindex
;;;	char-set-index char-set-rindex
;;;	string-reduce
;;;	char-set package
;;;	bitwise logical funs and arithmetic-shift
;;; 	join-strings				(scsh field-reader code.)


;;; References:
;;; - ftp://ftp.internic.net/rfc/rfc1630.txt
;;;   Original RFC
;;; - http://www.w3.org/hypertext/WWW/Addressing/URL/URI_Overview.html
;;;   General Web page of URI pointers.

;;; I wrote a URI parser that slavishly obeyed Tim Berners-Lee's
;;; spec (rfc 1630). This was a waste of time, as most URL's do not
;;; obey his spec, which is incomplete and inconsistent with the URL spec
;;; in any event. This parser is much simpler. It parses a URI into four
;;; fields:
;;;     [ <scheme> ] : <path> [ ? <search> ] [ # fragid ]
;;; The returned fields are *not* unescaped, as the rules for parsing the
;;; <path> component in particular need unescaped text, and are dependent
;;; on <scheme>. The URL parser is responsible for doing this.
;;; If the <scheme>, <search> or <fragid> portions are not specified,
;;; they are #f. Otherwise, <scheme>, <search>, and <fragid> are strings;
;;; <path> is a non-empty string list.

;;; The parsing technique is inwards from both ends.
;;; - First we search forwards for the first reserved char (= ; / # ? : space)
;;;   If it's a colon, then that's the <scheme> part, otw no <scheme> part.
;;;   Remove it.
;;; - Then we search backwards from the end for the last reserved char.
;;;   If it's a sharp, then that's the <fragment-id> part -- remove it.
;;; - Then we search backwards from the end for the last reserved char.
;;;   If it's a question-mark, then that's the <search> part -- remove it.
;;; - What's left is the path. Split at slashes. "" -> ("")
;;;
;;; This scheme is tolerant of the various ways people build broken URI's
;;; out there on the Net. It was given to me by Dan Connolly of the W3C.

;;; Returns four values: scheme, path, search, frag-id.
;;; Each value is either #f or a string.


;;; MG: I think including = here will break up things, since it may be
;;; part of the search string, preventing the ? to be found (+ and &
;;; are excluded anyway).
(define uri-reserved (string->char-set ";/#?: "))

(define (parse-uri s)
  (let* ((slen (string-length s))
	 ;; Search forwards for colon (or intervening reserved char).
	 (rs1 (char-set-index s uri-reserved))	; 1st reserved char
	 (colon (and rs1 (char=? (string-ref s rs1) #\:) rs1))
	 (path-start (if colon (+ colon 1) 0))

	 ;; Search backwards for # (or intervening reserved char).
	 (rs-last (char-set-rindex s uri-reserved))
	 (sharp (and rs-last (char=? (string-ref s rs-last) #\#) rs-last))

	 ;; Search backwards for ? (or intervening reserved char).
	 (rs-penult (if sharp (char-set-rindex s uri-reserved sharp) rs-last))
	 (ques (and rs-penult (char=? (string-ref s rs-penult) #\?) rs-penult))

	 (path-end (or ques sharp slen)))
    (values (and colon (substring s 0 colon))
	    (split-uri-path s path-start path-end)
	    (and ques (substring s (+ ques 1) (or sharp slen)))
	    (and sharp (substring s (+ sharp 1) slen)))))

;;; Caution:
;;; Don't use this proc until *after* you've parsed the URL -- unescaping
;;; might introduce reserved chars (like slashes and colons) that could
;;; blow your parse.

(define (unescape-uri s . maybe-start/end)
  (let-optionals maybe-start/end ((start 0)
				  (end (string-length s)))
    (let* ((esc-seq? (lambda (i) (and (< (+ i 2) end)
				      (char=? (string-ref s i) #\%)
				      (hex-digit? (string-ref s (+ i 1)))
				      (hex-digit? (string-ref s (+ i 2))))))
	   (hits (let lp ((i start) (hits 0)) ; count # of esc seqs.
		   (if (< i end)
		       (if (esc-seq? i)
			   (lp (+ i 3) (+ hits 1))
			   (lp (+ i 1) hits))
		       hits))))

      (if (and (zero? hits) (zero? start) (= end (string-length s))) s

	  (let* ((nlen (- (- end start) (* hits 2)))
		 (ns (make-string nlen)))

	    (let lp ((i start) (j 0))
	      (if (< j nlen)
		  (lp (? ((esc-seq? i)
			  (string-set! ns j
				       (let ((d1 (string-ref s (+ i 1)))
					     (d2 (string-ref s (+ i 2))))
					 (ascii->char (+ (* 16 (hexchar->int d1))
							 (hexchar->int d2)))))
			  (+ i 3))
			 (else (string-set! ns j (string-ref s i))
			       (+ i 1)))
		      (+ j 1))))
	    ns)))))

(define hex-digit?
  (let ((hex-digits (string->char-set "0123456789abcdefABCDEF")))
    (lambda (c) (char-set-contains? hex-digits c))))

(define (hexchar->int c)
  (- (char->ascii c)
     (if (char-numeric? c)
	 (char->ascii #\0)
	 (- (if (char-upper-case? c)
		(char->ascii #\A)
		(char->ascii #\a))
	    10))))

(define int->hexchar
  (let ((table '#(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9
		  #\A #\B #\C #\D #\E #\F)))
    (lambda (i) (vector-ref table i))))


;;; Caution:
;;; All reserved chars (e.g., slash, sharp, colon) get escaped: "=;/#?: "
;;; So don't apply this proc to chunks of text with syntactically meaningful
;;; reserved chars (e.g., paths with URI slashes or colons) -- they'll be
;;; escaped, and lose their special meaning. E.g. it would be a mistake
;;; to apply ESCAPE-URI to "//lcs.mit.edu:8001/foo/bar.html" because the
;;; slashes and colons would be escaped.

(define uri-escaped-chars
  (char-set-invert (char-set-union char-set:alphanumeric
				   (string->char-set "$-_@.&!*\"'(),+"))))

;;; Takes a set of chars to escape. This is because we sometimes need to
;;; escape larger sets of chars for different parts of a URI.

(define (escape-uri s . maybe-escaped-chars)
  (let-optionals maybe-escaped-chars ((escaped-chars uri-escaped-chars))
    (let ((nlen (string-reduce 0
			       (lambda (c i)
				 (+ i
				    (if (char-set-contains? uri-escaped-chars c)
					3 1)))
			       s)))
      (if (= nlen (string-length s)) s
	  (let ((ns (make-string nlen)))
	    (string-reduce
	     0
	     (lambda (c i)
	       (+ i (? ((char-set-contains? uri-escaped-chars c)
			(string-set! ns i #\%)
			(let* ((d (char->ascii c))
			       (dhi (bitwise-and (arithmetic-shift d -4) #xF))
			       (dlo (bitwise-and d #xF)))
			  (string-set! ns (+ i 1)
				       (int->hexchar dhi))
			  (string-set! ns (+ i 2)
				       (int->hexchar dlo)))
			3)
		       (else (string-set! ns i c)
			     1))))
	     s)
	    ns)))))


;;; Four args: context URI's  <scheme> : <path> values, and
;;;            main URI's     <scheme> : <path> values.
;;; If the path cannot be resolved, return #f #f (this occurs if <path>
;;; begins with n sequential slashes, and <context-path> doesn't
;;; have that many sequential slashes anywhere). All paths are
;;; represented as non-empty lists.

(define (resolve-uri cscheme cp  scheme p)
  (if scheme (values scheme p)	; If URI has own <scheme>, it is absolute.

      (if (and (pair? p) (string=? (car p) ""))	  ; Path P begins with a slash.

	  (receive (numsl p)			; Count and strip off initial
	      (do ((i 1       (+ i 1))  	; slashes (i.e., initial ""'s)
		   (q (cdr p) (cdr q)))
		  ((or (null? q) (not (string=? (car q) "")))
		   (values i q)))

	    ;; Skip through CP until we find that many sequential /'s.
	    (let lp ((cp-tail cp)
		     (rhead '())			; CP prefix, reversed.
		     (j 0))				; J counts sequential /

	      (? ((and (pair? cp-tail) (string=? (car cp-tail) "")) ; More ""'s
		  (lp (cdr cp-tail)
		      (cons (car cp-tail) rhead)
		      (+ j 0)))

		 ((= j numsl)				; Win
		  (values cscheme (simplify-uri-path (rev-append rhead p))))

		 ((pair? cp-tail)				; Keep looking.
		  (lp (cdr cp-tail)
		      (cons (car cp-tail) rhead)
		      1))

		 (else (values #f #f)))))		; Lose.


	  ;; P doesn't begin with a slash.
	  (values cscheme (simplify-uri-path
			    (rev-append (cdr (reverse cp)) ; Drop non-dir part
					p))))))		   ; and append P.


(define (rev-append a b)		; (append (reverse a) b)
  (let rev-app ((a a) (b b))		; Should be defined in a list-proc
    (if (pair? a)			; package, not here.
	(rev-app (cdr a) (cons (car a) b))
	b)))

;;; Cribbed from scsh's fname.scm

(define (split-uri-path uri start end)		; Split at /'s (infix grammar).
  (let split ((i start))			; "" -> ("")
    (? ((>= i end) '(""))
       ((string-index uri #\/ i) =>
	(lambda (slash)
	  (cons (substring uri i slash)
		(split (+ slash 1)))))
       (else (list (substring uri i end))))))


;;; The elements of PLIST must be escaped in case they contain slashes.
;;; This procedure doesn't escape them for you; you must do that yourself:
;;;     (uri-path-list->path (map escape-uri pathlist))

(define (uri-path-list->path plist)
  (join-strings plist "/"))		; Insert slashes between elts of PLIST.


;;; Remove . and foo/.. elts from path. After simplification, there are no
;;; . elements, and the only .. elements occur at the beginning of the path
;;; (i.e., they attempt to back up past root). One could argue that this is
;;; illegal, and we should error out in this case, reporting an unresolvable
;;; URL. The URI "spec" is not even slightly clear on this issue.
;;;
;;; URI's are pathetic. The case of /a/b//../c is ambiguous. Do we
;;; 	1) not simplify across multi-slashes?
;;; 	2) Flush the "empty" dir, giving /a/b//c
;;;	3) Flush across multi-slashes, giving /a/c
;;; What is the meaning of //../a ? /../b ? /../../c ?

(define (simplify-uri-path p)	; P must be non-null.
  (reverse (let lp ((path-list p)
		    (ans '()))
	     (let ((elt (car path-list))
		   (path-list (cdr path-list)))
	       (? ((pair? path-list)
		   (? ((string=? "." elt) ; Kill .
		       (lp path-list ans))
		      ((string=? ".." elt)
		       (if (pair? ans)
			   (lp path-list (cddr ans))
			   (lp path-list (cons elt ans))))
		      (else
		       (lp path-list (cons elt ans)))))
		  ;; Last element of list.
		  ((string=? ".." elt)
		   (if (null? ans) '("..") (cddr ans)))
		  (else (cons elt ans)))))))