446 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
			
		
		
	
	
			446 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
| ;;; www-url.stk - URL Parsing library (used by the www package).
 | |
| ;;;           Version 0.5.
 | |
| ;;;
 | |
| ;;; Copyright (c) 1995 Harvey J. Stein (hjstein@math.huji.ac.il)
 | |
| ;;; This code is freely usable and distributable as long as this
 | |
| ;;; heading remains.
 | |
| 
 | |
| ;;; slightly modified by Erick Gallesio (changes are noted with [eg])
 | |
| ;;;      (Last file update: 16-Sep-1999 15:24 (eg)
 | |
| 
 | |
| 
 | |
| ;;; Usage:
 | |
| ;;; (url:parse-url url)
 | |
| ;;;   Takes a string (url), and returns a parsed url.  One may apply
 | |
| ;;;   The general url form handled is:
 | |
| ;;;
 | |
| ;;;      service://user:password@host:port/path;parameters?query#anchor
 | |
| ;;;
 | |
| ;;;   This function takes a second optional argument - the parent url
 | |
| ;;;   (a parsed url).  When the 2nd argument is given, the url is
 | |
| ;;;   interpreted relative to the parent url.
 | |
| ;;;
 | |
| ;;;   The following functions may be applied to a parsed url:
 | |
| ;;;
 | |
| ;;;   (url:unparse-url url)
 | |
| ;;;         Returns a string (a fully qualified url) which would parse
 | |
| ;;;         into url.
 | |
| ;;;   (url:service  parsed-url)
 | |
| ;;;         Returns the service (i.e. - protocol) (as a symbol), or #f
 | |
| ;;;         if none was given.
 | |
| ;;;   (define url:user parsed-url)
 | |
| ;;;         Returns user name, or #f if none was supplied.
 | |
| ;;;   (url:password parsed-url)
 | |
| ;;;         Returns the password, or #f if none was supplied.
 | |
| ;;;   (url:host parsed-url)
 | |
| ;;;         Returns the host name, or #f if none was supplied.
 | |
| ;;;         Note - the host name can be "" (as in file:///foo/bar).
 | |
| ;;;   (url:port-number parsed-url)
 | |
| ;;;         The port number in the url, or #f if none was supplied.
 | |
| ;;;   (url:filename parsed-url)
 | |
| ;;;         The file name (i.e. - path name) of the url, or #f if
 | |
| ;;;         none was supplied.
 | |
| ;;;   (url:anchor parsed-url)
 | |
| ;;;         The anchor in the url, or #f if none was supplied.  It can
 | |
| ;;;         be "" - as in http://foo.bar.com/file#
 | |
| ;;;   (url:parameters parsed-url)
 | |
| ;;;         The parameters (#f if not supplied, empty string if
 | |
| ;;;         parameter delimiter is supplied, but no parameters are
 | |
| ;;;         supplied.
 | |
| ;;;   (url:query parsed-url)
 | |
| ;;;         The query parameters (#f if not supplied, empty string if
 | |
| ;;;         query delimiter is supplied, but no query parameters are
 | |
| ;;;         supplied.
 | |
| ;;;   (url:through-proxy? parsed-url)
 | |
| ;;;         #t iff url has been parsed to be passed through a proxy.
 | |
| ;;;         Proxying is handled by parsing the proxy address, and
 | |
| ;;;         passing the unparsed url through as the file name.
 | |
| ;;;   (url:pretty-url url . parent)				[eg]
 | |
| ;;;  	    returns a pretty string denoting the URL. url can be parsed or not,
 | |
| ;;;	    parent can be present or not.
 | |
| 
 | |
| ;;; Overview:
 | |
| 
 | |
| ;;; Internet RFC 1808 discusses how to interpret relative URLs.  In
 | |
| ;;; doing so, it gives algorithms both for parsing URLs and for
 | |
| ;;; computing relative URLs.
 | |
| ;;;
 | |
| ;;; To parse a URL, they say to follow the following procedure:
 | |
| ;;;
 | |
| ;;;   1. Everything incl & after 1st "#" is the anchor, of the rest,
 | |
| ;;;   2. Everything incl & before 1st ":" is the scheme, assuming at
 | |
| ;;;      least 1 char before ":" & all chars are scheme allowable
 | |
| ;;;      [a-zA-Z0-9+.-], of the rest,
 | |
| ;;;   3. If it starts with "//", everything up to (but not incl) next
 | |
| ;;;      ocurrence of "/" or until end is network location.  Of the rest,
 | |
| ;;;   4. everything from 1st "?" until end is the query info.  of the
 | |
| ;;;      rest,
 | |
| ;;;   5. everything from 1st ";" until end is the parameters field.  Of
 | |
| ;;;      the rest,
 | |
| ;;;   6. everything remaining is the path.
 | |
| ;;;
 | |
| ;;; After all this is done, one must remove the "#" from the beginning
 | |
| ;;; of the anchor, the ":" from the end of the scheme, the "//" from
 | |
| ;;; the beginning of the net location, the "?" from the beginning of
 | |
| ;;; the query part, and the ";" from the beginning of the params part.
 | |
| ;;; One typically leaves the "/" on the beginning of the path part,
 | |
| ;;; because it shows up iff the URL is relative.
 | |
| ;;;
 | |
| ;;; This means that the following regexp should be able to separate
 | |
| ;;; out these 6 basic parts of the url:
 | |
| ;;;
 | |
| ;;; "^([a-zA-Z0-9+.-][a-zA-Z0-9+.-]+:|)(//[^#/]*|)([^#?;]*|)(;[^?#]*|)(\\?[^#]*|)(#.*|)$"
 | |
| ;;;
 | |
| ;;; Once this is done, the net address must be parsed.  I believe this
 | |
| ;;; can be done as follows:
 | |
| ;;;
 | |
| ;;;   1. Everything before & including the 1st "@" is the
 | |
| ;;;      username/password part.  Of the rest,
 | |
| ;;;   2. Everything after & including the last ":" is the port number,
 | |
| ;;;      provided the ":" is only followed by digits
 | |
| ;;;   3. What's left is the host name.
 | |
| ;;;
 | |
| ;;; Bugs
 | |
| ;;;   -The parsing of http://foo.bar.com is problematic.  On the one
 | |
| ;;;    hand, there's no file name 
 | |
| ;;;    become /.			[eg] corrected
 | |
| ;;;   -Proxy junk barely tested...
 | |
| 
 | |
| 
 | |
| (require "fs")
 | |
| 
 | |
| (define-module URL
 | |
|    (export url:parse-url url:unparse-url url:service url:user       
 | |
|            url:password url:host url:port-number url:filename   
 | |
|            url:anchor url:parameters url:query url:through-proxy?
 | |
|            url:pretty-url))
 | |
| 
 | |
| (select-module URL)
 | |
| 
 | |
| (define url:*proxy-env-vars*
 | |
|   '((http   "HTTP_PROXY")
 | |
|     (ftp    "FTP_PROXY")
 | |
|     (wais   "WAIS_PROXY")
 | |
|     (gopher "GOPHER_PROXY")))
 | |
| 
 | |
| (define *proxy-servers* ())
 | |
| ;;; Takes a URL as an argument and returns a list containing the 
 | |
| ;;; protocol, the host name, and the file name.
 | |
| 
 | |
| (define (parse-url url . parent)
 | |
|   (proxitize (apply relativize (basic-parse-url url)
 | |
| 		    (cond 
 | |
| 		       ((null? parent) '())
 | |
| 		       ((through-proxy? (car parent))
 | |
| 		          (list (basic-parse-url (apply unparse-url parent))))
 | |
| 		       (else parent)))))
 | |
| 
 | |
| (define (basic-parse-url url)
 | |
|   (let* ((base (split url-regexp url))
 | |
| 	 (up-hp (split up-hp-regexp (safe-list-ref base 1)))
 | |
| 	 (u-p   (split u-p-regexp (safe-list-ref up-hp 0)))
 | |
| 	 (h-p   (split-hp-part (safe-list-ref up-hp 1))))
 | |
| ;;    (format #t "basic-parse-url:\n   base=~s\n   up-hp=~s\n   u-p=~s\n   h-p=~s\n"
 | |
| ;;	    base up-hp u-p h-p)
 | |
|     (if (not (and base up-hp u-p h-p))
 | |
| 	#f
 | |
|       (let* ((dirty-url (map (lambda (x) (if (string=? x "") #f x))
 | |
| 			     (append (list (car base))
 | |
| 				     u-p
 | |
| 				     h-p
 | |
| 				     (cddr base))))
 | |
| 	     (srv  (maybe-chop-end (list-ref dirty-url 0)))
 | |
| 	     (user (maybe-chop-end (list-ref dirty-url 1)))
 | |
| 	     (pass (maybe-chop-end (list-ref dirty-url 2)))
 | |
| 	     (host (list-ref dirty-url 3))
 | |
| 	     (port (maybe-chop-beg (list-ref dirty-url 4)))
 | |
| 	     (path (list-ref dirty-url 5))
 | |
| 	     (parm (maybe-chop-beg (list-ref dirty-url 6)))
 | |
| 	     (quer (maybe-chop-beg (list-ref dirty-url 7)))
 | |
| 	     (anch (maybe-chop-beg (list-ref dirty-url 8))))
 | |
| 	(if (and (not host)
 | |
| 		 (string? (safe-list-ref base 1))
 | |
| 		 (> (string-length (safe-list-ref base 1)) 0))
 | |
| 	    (set! host ""))
 | |
| 	(list (if srv (string->symbol (string-lower srv))
 | |
| 		#f)
 | |
| 	      user
 | |
| 	      pass
 | |
| 	      host
 | |
| 	      (if port (string->number port)
 | |
| 		#f)
 | |
| 	      (or path "/")
 | |
| 	      parm
 | |
| 	      quer
 | |
| 	      anch
 | |
| 	      #f)))))
 | |
| 
 | |
| (define (relativize url . parent)
 | |
|   (define (inherit-service)
 | |
|     (set! url (smerge-lists url parent 0)))
 | |
| 
 | |
|   (define (inherit-netloc)
 | |
|     (set! url (smerge-lists url parent 1 2 3 4)))
 | |
| 
 | |
|   (define (inherit-path)
 | |
|     (set! url (smerge-lists url parent 5)))
 | |
| 
 | |
|   (define (inherit-parameters)
 | |
|     (set! url (smerge-lists url parent 6)))
 | |
| 
 | |
|   (define (inherit-query)
 | |
|     (set! url (smerge-lists url parent 7)))
 | |
| 
 | |
|   (define (smerge-lists url parent . positions)
 | |
|     (define (smerge-aux url parent positions ref)
 | |
|       (cond ((null? positions) url)
 | |
| 	    ((null? url) parent)
 | |
| 	    ((null? parent) url)
 | |
| 	    ((= (car positions) ref)
 | |
| 	     (cons (car parent)
 | |
| 		   (smerge-aux (cdr url) (cdr parent) (cdr positions) (+ 1 ref))))
 | |
| 	    (else
 | |
| 	     (cons (car url)
 | |
| 		   (smerge-aux (cdr url) (cdr parent) positions (+ 1 ref))))))
 | |
|     (smerge-aux url parent positions 0))
 | |
| 
 | |
| ;  (define (dirname f)			;; [eg] deleted to use fs.stk instead
 | |
| ;    (define r (string->regexp "^(.*/|)([^/]*)$"))
 | |
| ;    (car (split r f)))
 | |
| 
 | |
|   (define (merge-paths)
 | |
|     (let* ((base (string-append (dirname (filename parent))
 | |
| 				(if (eq? (os-kind) 'Unix) "/" "\\")     ; [eg]
 | |
| 				(filename url)))
 | |
| 	   (slb  (string-length base))
 | |
| 	   (newpath 
 | |
| 	    (cond ((and (> slb 0)
 | |
| 			 (char=? (string-ref base (- slb 1)) #\/))
 | |
| 		   (string-append (expand-file-name base) "/"))
 | |
| 		  ((and (= slb 1)
 | |
| 			(char=? (string-ref base 0) #\.))
 | |
| 		   (string-append (expand-file-name base) "/"))
 | |
| 		  ((and (= slb 2)
 | |
| 			(or (string=? base "..")
 | |
| 			    (string=? base "/.")))
 | |
| 		   (string-append (expand-file-name base) "/"))
 | |
| 		  ((and (> slb 2)
 | |
| 			(char=? (string-ref base (- slb 2)) #\/)
 | |
| 			(char=? (string-ref base (- slb 1)) #\.))
 | |
| 		   (string-append (expand-file-name base) "/"))
 | |
| 		  ((and (> slb 2)
 | |
| 			(char=? (string-ref base (- slb 3)) #\/)
 | |
| 			(char=? (string-ref base (- slb 2)) #\.)
 | |
| 			(char=? (string-ref base (- slb 1)) #\.))
 | |
| 		   (string-append (expand-file-name base) "/"))
 | |
| 		  (else
 | |
| 		   (expand-file-name base)))))
 | |
|       (set! slb (string-length newpath))
 | |
|       (if (and (> slb 1)
 | |
| 	       (char=? (string-ref newpath (- slb 1)) #\/)
 | |
| 	       (char=? (string-ref newpath (- slb 2)) #\/))
 | |
| 	  (set! newpath (substring newpath 0 (- slb 1))))
 | |
| ;;      (format #t "merge-paths: base=~s, newpath=~s\n" base newpath)
 | |
|       (set! url (smerge-lists url `(serv user pass host port ,newpath) 5))))
 | |
|       
 | |
|   (if (not (null? parent)) (set! parent (car parent)))
 | |
|   (cond ((null? parent) url)
 | |
| 	((string=? "" (unparse-url parent)) url)
 | |
| 	((string=? "" (unparse-url url)) parent)
 | |
| 	((service url) url)
 | |
| 	((host url) (inherit-service) url)
 | |
| 	((and (filename url)
 | |
| 	      (> (string-length (filename url)) 0)
 | |
| 	      (char=? (string-ref (filename url) 0) #\/))
 | |
| 	 (inherit-service) (inherit-netloc) url)
 | |
| 	((and (not (filename url))
 | |
| 	      (parameters url))
 | |
| 	 (inherit-service) (inherit-netloc) (inherit-path) url)
 | |
| 	((and (not (filename url))
 | |
| 	      (query url))
 | |
| 	 (inherit-service) (inherit-netloc) (inherit-path) (inherit-parameters)
 | |
| 	 url)
 | |
| 	((not (filename url))
 | |
| 	 (inherit-service) (inherit-netloc) (inherit-path) (inherit-parameters)
 | |
| 	 (inherit-query)
 | |
| 	 url)
 | |
| 	(else
 | |
| 	 (inherit-service) (inherit-netloc) (merge-paths) url)))
 | |
| 
 | |
| (define (proxitize parsed-url)
 | |
|   (let ((proxy (assoc (service parsed-url) *proxy-servers*)))
 | |
|     (cond (proxy
 | |
| 	   (set! proxy (cadr proxy))
 | |
| 	   (list (service proxy)
 | |
| 		 (user proxy)
 | |
| 		 (password proxy)
 | |
| 		 (host proxy)
 | |
| 		 (port-number proxy)
 | |
| 		 (unparse-url parsed-url)
 | |
| 		 (parameters proxy)
 | |
| 		 (query proxy)
 | |
| 		 (anchor proxy)
 | |
| 		 #t))			; Is through proxy.
 | |
| 	  (else
 | |
| 	   parsed-url))))
 | |
| 	
 | |
| (define (unparse-url url)
 | |
|   (cond ((through-proxy? url)
 | |
| 	 (filename url))
 | |
| 	(else
 | |
| 	 (let ((srv (service url)))
 | |
| 	   ; (if srv (set! srv (symbol->string srv))) [eg]
 | |
| 	   (set! srv  (if srv (symbol->string srv) "file"))
 | |
| 	   (string-append (maybe-append srv ":")
 | |
| 			  (string-netaddr url)
 | |
| 			  (if (filename url) (filename url) "")
 | |
| 			  (maybe-append ";" (parameters url))
 | |
| 			  (maybe-append "?" (query url))
 | |
| 			  (maybe-append "#" (anchor url)))))))
 | |
| 
 | |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 | |
| (define (pretty-url url . parent)    ;; [eg] added for pretty mesages
 | |
|   (unless (string? url)
 | |
|     (set! url (url:unparse-url url)))
 | |
|   (url:unparse-url (apply url:parse-url url parent)))
 | |
| 
 | |
| 
 | |
| ;;;  --------- Regexps for parsing ---------------
 | |
| 
 | |
| (define url-regexp 
 | |
|   (string->regexp
 | |
|    "^([[a-zA-Z0-9+.-][a-zA-Z0-9+.-]+:|)(//[^#/]*|)([^#?;]*|)(;[^?#]*|)(\\?[^#]*|)(#.*|)$"))
 | |
| 
 | |
| (define up-hp-regexp
 | |
|   (string->regexp
 | |
|    "^/?/?([^@]*@|)(.*)$"))		; Strips off // automatically.
 | |
| 
 | |
| (define u-p-regexp
 | |
|   (string->regexp
 | |
|    "^([^@:]*[:@]|)(.*)$"))
 | |
| 
 | |
| (define h-p-test-regexp
 | |
|   (string->regexp
 | |
|    "^(.*)(:[0-9]+)$"))
 | |
| 
 | |
| ;;; ----------------- Support routines for basic-parse-url ---------------
 | |
| (define (string-netaddr url)
 | |
|   (let ((prt (port-number url)))
 | |
|     (if prt (set! prt (number->string prt)))
 | |
|     (cond ((password url)
 | |
| 	   (string-append "//" (user url) ":" (password url)
 | |
| 			  "@" (host url)
 | |
| 			  (maybe-append ":" prt)))
 | |
| 	   ((user url)
 | |
| 	    (string-append "//" (user url) "@" (host url)
 | |
| 			   (maybe-append ":" prt)))
 | |
| 	   ((host url)
 | |
| 	    (string-append "//" (host url)
 | |
| 			   (maybe-append ":" prt)))
 | |
| 	   (else ""))))
 | |
| 
 | |
| (define (split-hp-part h-p)
 | |
|   (or (split h-p-test-regexp h-p)
 | |
|       (list h-p "")))
 | |
| 
 | |
| (define (maybe-append s1 s2)
 | |
|   (if (and s1 s2) (string-append s1 s2)
 | |
|     ""))
 | |
| 
 | |
| (define (split regexp maybe-str)
 | |
|   (if (string? maybe-str)
 | |
|       (apply-matches (regexp maybe-str) maybe-str)
 | |
|     #f))
 | |
| 
 | |
| (define (safe-list-ref maybe-list index)
 | |
|   (if (and (list? maybe-list)
 | |
| 	   (< index (length maybe-list)))
 | |
|       (list-ref maybe-list index)
 | |
|     #f))
 | |
| 
 | |
| (define (maybe-chop-end maybe-string)
 | |
|   (if (and (string? maybe-string)
 | |
| 	   (> (string-length maybe-string) 0))
 | |
|       (substring maybe-string 0 (- (string-length maybe-string) 1))
 | |
|     #f))
 | |
| 
 | |
| (define (maybe-chop-beg maybe-string)
 | |
|   (if (and (string? maybe-string)
 | |
| 	   (> (string-length maybe-string) 0))
 | |
|       (substring maybe-string 1 (string-length maybe-string))
 | |
|     #f))
 | |
| 
 | |
| 
 | |
| (define (apply-matches matches string)
 | |
|   (if matches
 | |
|       (map (lambda (m) (apply substring string m))
 | |
| 	   (cdr matches))
 | |
|     #f))
 | |
| 
 | |
| ;;; ---------- Url access routines -------------
 | |
| 
 | |
| (define (service url)
 | |
|   (list-ref url 0))
 | |
| 
 | |
| (define (user url)
 | |
|   (list-ref url 1))
 | |
| 
 | |
| (define (password url)
 | |
|   (list-ref url 2))
 | |
| 
 | |
| (define (host url)
 | |
|   (list-ref url 3))
 | |
| 
 | |
| (define (port-number url)
 | |
|   (list-ref url 4))
 | |
| 
 | |
| (define (filename url)
 | |
|   (list-ref url 5))
 | |
| 
 | |
| (define (parameters url)
 | |
|   (list-ref url 6))
 | |
| 
 | |
| (define (query url)
 | |
|   (list-ref url 7))
 | |
| 
 | |
| (define (anchor url)
 | |
|   (list-ref url 8))
 | |
| 
 | |
| (define (through-proxy? url)
 | |
|   (list-ref url 9))
 | |
| 
 | |
| ;;; ------------ Exports -------------------------
 | |
| (define url:parse-url      parse-url)
 | |
| (define url:unparse-url    unparse-url)
 | |
| ;(define url:service       service)	[eg] to default to file if nothing else
 | |
| (define url:service        (lambda (url) (or (service url) 'file)))
 | |
| (define url:user           user)
 | |
| (define url:password       password)
 | |
| (define url:host           host)
 | |
| (define url:port-number    port-number)
 | |
| (define url:filename       filename)
 | |
| (define url:anchor         anchor)
 | |
| (define url:through-proxy? through-proxy?)
 | |
| (define url:pretty-url     pretty-url)
 | |
| 
 | |
| ;;; -------------- Set up proxy list -----------------
 | |
| (define (get-proxy-evar evar)
 | |
|   (let ((e (getenv evar)))
 | |
|     (if e
 | |
| 	(basic-parse-url e)
 | |
|       #f)))
 | |
| 
 | |
| 
 | |
| (define *proxy-servers*
 | |
|   (let loop ((l url:*proxy-env-vars*))
 | |
|        (cond ((null? l) ())
 | |
| 	     (else
 | |
| 	      (let ((p (get-proxy-evar (cadar l))))
 | |
| 		(if p
 | |
| 		    (cons (list (caar l) p)
 | |
| 			  (loop (cdr l)))
 | |
| 		  (loop (cdr l))))))))
 | |
| 
 | |
| 
 | |
| (provide "www-url")
 |