stk/Lib/www-url.stk

446 lines
14 KiB
Plaintext

;;; www-url.stk - URL Parsing library (used by the www package).
;;; Version 0.5.
;;;
;;; Copyright (c) 1995 Harvey J. Stein (hjstein@math.huji.ac.il)
;;; This code is freely usable and distributable as long as this
;;; heading remains.
;;; slightly modified by Erick Gallesio (changes are noted with [eg])
;;; (Last file update: 16-Sep-1999 15:24 (eg)
;;; Usage:
;;; (url:parse-url url)
;;; Takes a string (url), and returns a parsed url. One may apply
;;; The general url form handled is:
;;;
;;; service://user:password@host:port/path;parameters?query#anchor
;;;
;;; This function takes a second optional argument - the parent url
;;; (a parsed url). When the 2nd argument is given, the url is
;;; interpreted relative to the parent url.
;;;
;;; The following functions may be applied to a parsed url:
;;;
;;; (url:unparse-url url)
;;; Returns a string (a fully qualified url) which would parse
;;; into url.
;;; (url:service parsed-url)
;;; Returns the service (i.e. - protocol) (as a symbol), or #f
;;; if none was given.
;;; (define url:user parsed-url)
;;; Returns user name, or #f if none was supplied.
;;; (url:password parsed-url)
;;; Returns the password, or #f if none was supplied.
;;; (url:host parsed-url)
;;; Returns the host name, or #f if none was supplied.
;;; Note - the host name can be "" (as in file:///foo/bar).
;;; (url:port-number parsed-url)
;;; The port number in the url, or #f if none was supplied.
;;; (url:filename parsed-url)
;;; The file name (i.e. - path name) of the url, or #f if
;;; none was supplied.
;;; (url:anchor parsed-url)
;;; The anchor in the url, or #f if none was supplied. It can
;;; be "" - as in http://foo.bar.com/file#
;;; (url:parameters parsed-url)
;;; The parameters (#f if not supplied, empty string if
;;; parameter delimiter is supplied, but no parameters are
;;; supplied.
;;; (url:query parsed-url)
;;; The query parameters (#f if not supplied, empty string if
;;; query delimiter is supplied, but no query parameters are
;;; supplied.
;;; (url:through-proxy? parsed-url)
;;; #t iff url has been parsed to be passed through a proxy.
;;; Proxying is handled by parsing the proxy address, and
;;; passing the unparsed url through as the file name.
;;; (url:pretty-url url . parent) [eg]
;;; returns a pretty string denoting the URL. url can be parsed or not,
;;; parent can be present or not.
;;; Overview:
;;; Internet RFC 1808 discusses how to interpret relative URLs. In
;;; doing so, it gives algorithms both for parsing URLs and for
;;; computing relative URLs.
;;;
;;; To parse a URL, they say to follow the following procedure:
;;;
;;; 1. Everything incl & after 1st "#" is the anchor, of the rest,
;;; 2. Everything incl & before 1st ":" is the scheme, assuming at
;;; least 1 char before ":" & all chars are scheme allowable
;;; [a-zA-Z0-9+.-], of the rest,
;;; 3. If it starts with "//", everything up to (but not incl) next
;;; ocurrence of "/" or until end is network location. Of the rest,
;;; 4. everything from 1st "?" until end is the query info. of the
;;; rest,
;;; 5. everything from 1st ";" until end is the parameters field. Of
;;; the rest,
;;; 6. everything remaining is the path.
;;;
;;; After all this is done, one must remove the "#" from the beginning
;;; of the anchor, the ":" from the end of the scheme, the "//" from
;;; the beginning of the net location, the "?" from the beginning of
;;; the query part, and the ";" from the beginning of the params part.
;;; One typically leaves the "/" on the beginning of the path part,
;;; because it shows up iff the URL is relative.
;;;
;;; This means that the following regexp should be able to separate
;;; out these 6 basic parts of the url:
;;;
;;; "^([a-zA-Z0-9+.-][a-zA-Z0-9+.-]+:|)(//[^#/]*|)([^#?;]*|)(;[^?#]*|)(\\?[^#]*|)(#.*|)$"
;;;
;;; Once this is done, the net address must be parsed. I believe this
;;; can be done as follows:
;;;
;;; 1. Everything before & including the 1st "@" is the
;;; username/password part. Of the rest,
;;; 2. Everything after & including the last ":" is the port number,
;;; provided the ":" is only followed by digits
;;; 3. What's left is the host name.
;;;
;;; Bugs
;;; -The parsing of http://foo.bar.com is problematic. On the one
;;; hand, there's no file name
;;; become /. [eg] corrected
;;; -Proxy junk barely tested...
(require "fs")
(define-module URL
(export url:parse-url url:unparse-url url:service url:user
url:password url:host url:port-number url:filename
url:anchor url:parameters url:query url:through-proxy?
url:pretty-url))
(select-module URL)
(define url:*proxy-env-vars*
'((http "HTTP_PROXY")
(ftp "FTP_PROXY")
(wais "WAIS_PROXY")
(gopher "GOPHER_PROXY")))
(define *proxy-servers* ())
;;; Takes a URL as an argument and returns a list containing the
;;; protocol, the host name, and the file name.
(define (parse-url url . parent)
(proxitize (apply relativize (basic-parse-url url)
(cond
((null? parent) '())
((through-proxy? (car parent))
(list (basic-parse-url (apply unparse-url parent))))
(else parent)))))
(define (basic-parse-url url)
(let* ((base (split url-regexp url))
(up-hp (split up-hp-regexp (safe-list-ref base 1)))
(u-p (split u-p-regexp (safe-list-ref up-hp 0)))
(h-p (split-hp-part (safe-list-ref up-hp 1))))
;; (format #t "basic-parse-url:\n base=~s\n up-hp=~s\n u-p=~s\n h-p=~s\n"
;; base up-hp u-p h-p)
(if (not (and base up-hp u-p h-p))
#f
(let* ((dirty-url (map (lambda (x) (if (string=? x "") #f x))
(append (list (car base))
u-p
h-p
(cddr base))))
(srv (maybe-chop-end (list-ref dirty-url 0)))
(user (maybe-chop-end (list-ref dirty-url 1)))
(pass (maybe-chop-end (list-ref dirty-url 2)))
(host (list-ref dirty-url 3))
(port (maybe-chop-beg (list-ref dirty-url 4)))
(path (list-ref dirty-url 5))
(parm (maybe-chop-beg (list-ref dirty-url 6)))
(quer (maybe-chop-beg (list-ref dirty-url 7)))
(anch (maybe-chop-beg (list-ref dirty-url 8))))
(if (and (not host)
(string? (safe-list-ref base 1))
(> (string-length (safe-list-ref base 1)) 0))
(set! host ""))
(list (if srv (string->symbol (string-lower srv))
#f)
user
pass
host
(if port (string->number port)
#f)
(or path "/")
parm
quer
anch
#f)))))
(define (relativize url . parent)
(define (inherit-service)
(set! url (smerge-lists url parent 0)))
(define (inherit-netloc)
(set! url (smerge-lists url parent 1 2 3 4)))
(define (inherit-path)
(set! url (smerge-lists url parent 5)))
(define (inherit-parameters)
(set! url (smerge-lists url parent 6)))
(define (inherit-query)
(set! url (smerge-lists url parent 7)))
(define (smerge-lists url parent . positions)
(define (smerge-aux url parent positions ref)
(cond ((null? positions) url)
((null? url) parent)
((null? parent) url)
((= (car positions) ref)
(cons (car parent)
(smerge-aux (cdr url) (cdr parent) (cdr positions) (+ 1 ref))))
(else
(cons (car url)
(smerge-aux (cdr url) (cdr parent) positions (+ 1 ref))))))
(smerge-aux url parent positions 0))
; (define (dirname f) ;; [eg] deleted to use fs.stk instead
; (define r (string->regexp "^(.*/|)([^/]*)$"))
; (car (split r f)))
(define (merge-paths)
(let* ((base (string-append (dirname (filename parent))
(if (eq? (os-kind) 'Unix) "/" "\\") ; [eg]
(filename url)))
(slb (string-length base))
(newpath
(cond ((and (> slb 0)
(char=? (string-ref base (- slb 1)) #\/))
(string-append (expand-file-name base) "/"))
((and (= slb 1)
(char=? (string-ref base 0) #\.))
(string-append (expand-file-name base) "/"))
((and (= slb 2)
(or (string=? base "..")
(string=? base "/.")))
(string-append (expand-file-name base) "/"))
((and (> slb 2)
(char=? (string-ref base (- slb 2)) #\/)
(char=? (string-ref base (- slb 1)) #\.))
(string-append (expand-file-name base) "/"))
((and (> slb 2)
(char=? (string-ref base (- slb 3)) #\/)
(char=? (string-ref base (- slb 2)) #\.)
(char=? (string-ref base (- slb 1)) #\.))
(string-append (expand-file-name base) "/"))
(else
(expand-file-name base)))))
(set! slb (string-length newpath))
(if (and (> slb 1)
(char=? (string-ref newpath (- slb 1)) #\/)
(char=? (string-ref newpath (- slb 2)) #\/))
(set! newpath (substring newpath 0 (- slb 1))))
;; (format #t "merge-paths: base=~s, newpath=~s\n" base newpath)
(set! url (smerge-lists url `(serv user pass host port ,newpath) 5))))
(if (not (null? parent)) (set! parent (car parent)))
(cond ((null? parent) url)
((string=? "" (unparse-url parent)) url)
((string=? "" (unparse-url url)) parent)
((service url) url)
((host url) (inherit-service) url)
((and (filename url)
(> (string-length (filename url)) 0)
(char=? (string-ref (filename url) 0) #\/))
(inherit-service) (inherit-netloc) url)
((and (not (filename url))
(parameters url))
(inherit-service) (inherit-netloc) (inherit-path) url)
((and (not (filename url))
(query url))
(inherit-service) (inherit-netloc) (inherit-path) (inherit-parameters)
url)
((not (filename url))
(inherit-service) (inherit-netloc) (inherit-path) (inherit-parameters)
(inherit-query)
url)
(else
(inherit-service) (inherit-netloc) (merge-paths) url)))
(define (proxitize parsed-url)
(let ((proxy (assoc (service parsed-url) *proxy-servers*)))
(cond (proxy
(set! proxy (cadr proxy))
(list (service proxy)
(user proxy)
(password proxy)
(host proxy)
(port-number proxy)
(unparse-url parsed-url)
(parameters proxy)
(query proxy)
(anchor proxy)
#t)) ; Is through proxy.
(else
parsed-url))))
(define (unparse-url url)
(cond ((through-proxy? url)
(filename url))
(else
(let ((srv (service url)))
; (if srv (set! srv (symbol->string srv))) [eg]
(set! srv (if srv (symbol->string srv) "file"))
(string-append (maybe-append srv ":")
(string-netaddr url)
(if (filename url) (filename url) "")
(maybe-append ";" (parameters url))
(maybe-append "?" (query url))
(maybe-append "#" (anchor url)))))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (pretty-url url . parent) ;; [eg] added for pretty mesages
(unless (string? url)
(set! url (url:unparse-url url)))
(url:unparse-url (apply url:parse-url url parent)))
;;; --------- Regexps for parsing ---------------
(define url-regexp
(string->regexp
"^([[a-zA-Z0-9+.-][a-zA-Z0-9+.-]+:|)(//[^#/]*|)([^#?;]*|)(;[^?#]*|)(\\?[^#]*|)(#.*|)$"))
(define up-hp-regexp
(string->regexp
"^/?/?([^@]*@|)(.*)$")) ; Strips off // automatically.
(define u-p-regexp
(string->regexp
"^([^@:]*[:@]|)(.*)$"))
(define h-p-test-regexp
(string->regexp
"^(.*)(:[0-9]+)$"))
;;; ----------------- Support routines for basic-parse-url ---------------
(define (string-netaddr url)
(let ((prt (port-number url)))
(if prt (set! prt (number->string prt)))
(cond ((password url)
(string-append "//" (user url) ":" (password url)
"@" (host url)
(maybe-append ":" prt)))
((user url)
(string-append "//" (user url) "@" (host url)
(maybe-append ":" prt)))
((host url)
(string-append "//" (host url)
(maybe-append ":" prt)))
(else ""))))
(define (split-hp-part h-p)
(or (split h-p-test-regexp h-p)
(list h-p "")))
(define (maybe-append s1 s2)
(if (and s1 s2) (string-append s1 s2)
""))
(define (split regexp maybe-str)
(if (string? maybe-str)
(apply-matches (regexp maybe-str) maybe-str)
#f))
(define (safe-list-ref maybe-list index)
(if (and (list? maybe-list)
(< index (length maybe-list)))
(list-ref maybe-list index)
#f))
(define (maybe-chop-end maybe-string)
(if (and (string? maybe-string)
(> (string-length maybe-string) 0))
(substring maybe-string 0 (- (string-length maybe-string) 1))
#f))
(define (maybe-chop-beg maybe-string)
(if (and (string? maybe-string)
(> (string-length maybe-string) 0))
(substring maybe-string 1 (string-length maybe-string))
#f))
(define (apply-matches matches string)
(if matches
(map (lambda (m) (apply substring string m))
(cdr matches))
#f))
;;; ---------- Url access routines -------------
(define (service url)
(list-ref url 0))
(define (user url)
(list-ref url 1))
(define (password url)
(list-ref url 2))
(define (host url)
(list-ref url 3))
(define (port-number url)
(list-ref url 4))
(define (filename url)
(list-ref url 5))
(define (parameters url)
(list-ref url 6))
(define (query url)
(list-ref url 7))
(define (anchor url)
(list-ref url 8))
(define (through-proxy? url)
(list-ref url 9))
;;; ------------ Exports -------------------------
(define url:parse-url parse-url)
(define url:unparse-url unparse-url)
;(define url:service service) [eg] to default to file if nothing else
(define url:service (lambda (url) (or (service url) 'file)))
(define url:user user)
(define url:password password)
(define url:host host)
(define url:port-number port-number)
(define url:filename filename)
(define url:anchor anchor)
(define url:through-proxy? through-proxy?)
(define url:pretty-url pretty-url)
;;; -------------- Set up proxy list -----------------
(define (get-proxy-evar evar)
(let ((e (getenv evar)))
(if e
(basic-parse-url e)
#f)))
(define *proxy-servers*
(let loop ((l url:*proxy-env-vars*))
(cond ((null? l) ())
(else
(let ((p (get-proxy-evar (cadar l))))
(if p
(cons (list (caar l) p)
(loop (cdr l)))
(loop (cdr l))))))))
(provide "www-url")