;;; URL parsing and unparsing -*- Scheme -*-
;;; Copyright (c) 1995 by Olin Shivers.

;;; I'm only implementing http URL's right now.

;;; References:
;;; - ftp://ftp.internic.net/rfc/rfc1738.txt 
;;;   Original RFC
;;; - http://www.w3.org/hypertext/WWW/Addressing/URL/Overview.html
;;;   General Web page of URI pointers.


;;; Unresolved issues:
;;; - The userhost parser shouldn't substitute default values --
;;;   that should happen in a separate step.

;;; Imports and non-R4RS'isms
;;; 	define-record		Record structures
;;; 	receive values		MV return
;;; 	URI support
;;; 	index

;;; The steps in hacking a URL are:
;;; - Take the UID, parse it, and resolve it with the context UID, if any.
;;; - Consult the UID's <scheme>. Pick the appropriate URL parser and parse.


;;; Userhost strings: //<user>:<password>@<host>:<port>/
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; A USERHOST record describes path-prefixes of the form
;;;     //<user>:<password>@<host>:<port>/
;;; These are frequently used as the initial prefix of URL's describing
;;; Internet resources.

(define-record userhost		; Each slot is a decoded string or #f.
  user
  password
  host
  port)

;;; Parse a URI path (a list representing a path, not a string!) into
;;; a userhost record. Default values are taken from the userhost
;;; record DEFAULT except for the host. Returns a userhost record if
;;; it wins, and #f if it cannot parse the path. CADDR drops the
;;; userhost portion of the path. In fact, fatal-syntax-error is
;;; called, if the path doesn't start with '//'.

(define (parse-userhost path default)
  (if (and (pair? path)				; The thing better begin
	   (string=? (car path) "")		; with // (i.e., have two
	   (pair? (cdr path))			; initial "" elements).
	   (string=? (cadr path) ""))
	 
      (let* ((uhs (caddr path))			; Userhost string.
	     (uhs-len (string-length uhs))
							; Usr:passwd at-sign,
	     (at (index uhs #\@))			;     if any.
	       
	     (colon1 (and at (index uhs #\:)))		    ; Usr:passwd colon,
	     (colon1 (and colon1 (< colon1 at) colon1))	    ;     if any.

	     (colon2 (index uhs #\: (or at 0))))	; Host:port colon,
	      						;     if any.
	(make-userhost (if at
			   (unescape-uri uhs 0 (or colon1 at))
			   (userhost:user default))
		       (if colon1
			   (unescape-uri uhs (+ colon1 1) at)
			   (userhost:password default))
		       (unescape-uri uhs (if at (+ at 1) 0)
				     (or colon2 uhs-len))
		       (if colon2
			   (unescape-uri uhs (+ colon2 1) uhs-len)
			   (userhost:port default))))

      (fatal-syntax-error "URL must begin with //..." path)))

;;; Unparser

(define userhost-escaped-chars
  (char-set-union uri-escaped-chars		; @ and : are also special
		  (string->char-set "@:")))	; in UH strings.

(define (userhost->string uh)
  (let* ((us (userhost:user uh))
	 (pw (userhost:password uh))
	 (ho (userhost:host uh))
	 (po (userhost:port uh))

	 ;; Encode before assembly in case pieces contain colons or at-signs.
	 (e (lambda (s) (escape-uri s userhost-escaped-chars)))

	 (user/passwd (if us `(,(e us) . ,(if pw `(":" ,(e pw) "@") '("@")))
			  '()))
	 (host/port   (if ho `(,(e ho) . ,(if po `(":" ,(e po)) '()))
			  '())))

    (apply string-append (append user/passwd host/port))))


;;; HTTP URL parsing
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; The PATH slot of this record is the URL's path split at slashes,
;;; e.g., "foo/bar//baz/" => ("foo" "bar" "" "baz" "")
;;; These elements are in raw, unescaped format. To convert back to
;;; a string, use (uri-path-list->path (map escape-uri pathlist)).

(define-record http-url
  userhost			; Initial //anonymous@clark.lcs.mit.edu:80/
  path				; Rest of path, split at slashes & decoded.
  search
  frag-id)

;;; The URI parser (parse-uri in uri.scm) maps a string to four parts:
;;;     <scheme> : <path> ? <search> # <frag-id>
;;; <scheme>, <search>, and <frag-id> are strings; <path> is a non-empty
;;; string list -- the URI's path split at slashes. Optional parts of the
;;; URI, when missing, are specified as #f. If <scheme> is "http", then the
;;; other three parts can be passed to PARSE-HTTP-URL, which parses them
;;; into a HTTP-URL record (or #f if the string cannot be parsed). All strings
;;; come back from the URI parser encoded. SEARCH and FRAG-ID are left
;;; that way; this parser decodes the path elements.
;;;
;;; Return #f if the URL could not be parsed.

(define (parse-http-url path search frag-id)
  (let ((uh (parse-userhost path default-http-userhost)))
    (if (or (userhost:user uh) (userhost:password uh))
	(fatal-syntax-error
	    "HTTP URL's may not specify a user or password field" path))

	(make-http-url uh (map unescape-uri (cdddr path)) search frag-id)))


;;; Default http port is 80.
(define default-http-userhost (make-userhost #f #f #f "80"))


;;; Unparse.

(define (http-url->string url)
  (string-append "http://"
		 (userhost->string (http-url:userhost url))
		 "/"
		 (uri-path-list->path (map escape-uri (http-url:path url)))
		 (? ((http-url:search url) =>
                     (lambda (s) (string-append "?" s)))
		    (else ""))
		 (? ((http-url:frag-id url) =>
                     (lambda (fi) (string-append "#" fi)))
		    (else ""))))