446 lines
14 KiB
Plaintext
446 lines
14 KiB
Plaintext
;;; www-url.stk - URL Parsing library (used by the www package).
|
|
;;; Version 0.5.
|
|
;;;
|
|
;;; Copyright (c) 1995 Harvey J. Stein (hjstein@math.huji.ac.il)
|
|
;;; This code is freely usable and distributable as long as this
|
|
;;; heading remains.
|
|
|
|
;;; slightly modified by Erick Gallesio (changes are noted with [eg])
|
|
;;; (Last file update: 16-Sep-1999 15:24 (eg)
|
|
|
|
|
|
;;; Usage:
|
|
;;; (url:parse-url url)
|
|
;;; Takes a string (url), and returns a parsed url. One may apply
|
|
;;; The general url form handled is:
|
|
;;;
|
|
;;; service://user:password@host:port/path;parameters?query#anchor
|
|
;;;
|
|
;;; This function takes a second optional argument - the parent url
|
|
;;; (a parsed url). When the 2nd argument is given, the url is
|
|
;;; interpreted relative to the parent url.
|
|
;;;
|
|
;;; The following functions may be applied to a parsed url:
|
|
;;;
|
|
;;; (url:unparse-url url)
|
|
;;; Returns a string (a fully qualified url) which would parse
|
|
;;; into url.
|
|
;;; (url:service parsed-url)
|
|
;;; Returns the service (i.e. - protocol) (as a symbol), or #f
|
|
;;; if none was given.
|
|
;;; (define url:user parsed-url)
|
|
;;; Returns user name, or #f if none was supplied.
|
|
;;; (url:password parsed-url)
|
|
;;; Returns the password, or #f if none was supplied.
|
|
;;; (url:host parsed-url)
|
|
;;; Returns the host name, or #f if none was supplied.
|
|
;;; Note - the host name can be "" (as in file:///foo/bar).
|
|
;;; (url:port-number parsed-url)
|
|
;;; The port number in the url, or #f if none was supplied.
|
|
;;; (url:filename parsed-url)
|
|
;;; The file name (i.e. - path name) of the url, or #f if
|
|
;;; none was supplied.
|
|
;;; (url:anchor parsed-url)
|
|
;;; The anchor in the url, or #f if none was supplied. It can
|
|
;;; be "" - as in http://foo.bar.com/file#
|
|
;;; (url:parameters parsed-url)
|
|
;;; The parameters (#f if not supplied, empty string if
|
|
;;; parameter delimiter is supplied, but no parameters are
|
|
;;; supplied.
|
|
;;; (url:query parsed-url)
|
|
;;; The query parameters (#f if not supplied, empty string if
|
|
;;; query delimiter is supplied, but no query parameters are
|
|
;;; supplied.
|
|
;;; (url:through-proxy? parsed-url)
|
|
;;; #t iff url has been parsed to be passed through a proxy.
|
|
;;; Proxying is handled by parsing the proxy address, and
|
|
;;; passing the unparsed url through as the file name.
|
|
;;; (url:pretty-url url . parent) [eg]
|
|
;;; returns a pretty string denoting the URL. url can be parsed or not,
|
|
;;; parent can be present or not.
|
|
|
|
;;; Overview:
|
|
|
|
;;; Internet RFC 1808 discusses how to interpret relative URLs. In
|
|
;;; doing so, it gives algorithms both for parsing URLs and for
|
|
;;; computing relative URLs.
|
|
;;;
|
|
;;; To parse a URL, they say to follow the following procedure:
|
|
;;;
|
|
;;; 1. Everything incl & after 1st "#" is the anchor, of the rest,
|
|
;;; 2. Everything incl & before 1st ":" is the scheme, assuming at
|
|
;;; least 1 char before ":" & all chars are scheme allowable
|
|
;;; [a-zA-Z0-9+.-], of the rest,
|
|
;;; 3. If it starts with "//", everything up to (but not incl) next
|
|
;;; ocurrence of "/" or until end is network location. Of the rest,
|
|
;;; 4. everything from 1st "?" until end is the query info. of the
|
|
;;; rest,
|
|
;;; 5. everything from 1st ";" until end is the parameters field. Of
|
|
;;; the rest,
|
|
;;; 6. everything remaining is the path.
|
|
;;;
|
|
;;; After all this is done, one must remove the "#" from the beginning
|
|
;;; of the anchor, the ":" from the end of the scheme, the "//" from
|
|
;;; the beginning of the net location, the "?" from the beginning of
|
|
;;; the query part, and the ";" from the beginning of the params part.
|
|
;;; One typically leaves the "/" on the beginning of the path part,
|
|
;;; because it shows up iff the URL is relative.
|
|
;;;
|
|
;;; This means that the following regexp should be able to separate
|
|
;;; out these 6 basic parts of the url:
|
|
;;;
|
|
;;; "^([a-zA-Z0-9+.-][a-zA-Z0-9+.-]+:|)(//[^#/]*|)([^#?;]*|)(;[^?#]*|)(\\?[^#]*|)(#.*|)$"
|
|
;;;
|
|
;;; Once this is done, the net address must be parsed. I believe this
|
|
;;; can be done as follows:
|
|
;;;
|
|
;;; 1. Everything before & including the 1st "@" is the
|
|
;;; username/password part. Of the rest,
|
|
;;; 2. Everything after & including the last ":" is the port number,
|
|
;;; provided the ":" is only followed by digits
|
|
;;; 3. What's left is the host name.
|
|
;;;
|
|
;;; Bugs
|
|
;;; -The parsing of http://foo.bar.com is problematic. On the one
|
|
;;; hand, there's no file name
|
|
;;; become /. [eg] corrected
|
|
;;; -Proxy junk barely tested...
|
|
|
|
|
|
(require "fs")
|
|
|
|
(define-module URL
|
|
(export url:parse-url url:unparse-url url:service url:user
|
|
url:password url:host url:port-number url:filename
|
|
url:anchor url:parameters url:query url:through-proxy?
|
|
url:pretty-url))
|
|
|
|
(select-module URL)
|
|
|
|
(define url:*proxy-env-vars*
|
|
'((http "HTTP_PROXY")
|
|
(ftp "FTP_PROXY")
|
|
(wais "WAIS_PROXY")
|
|
(gopher "GOPHER_PROXY")))
|
|
|
|
(define *proxy-servers* ())
|
|
;;; Takes a URL as an argument and returns a list containing the
|
|
;;; protocol, the host name, and the file name.
|
|
|
|
(define (parse-url url . parent)
|
|
(proxitize (apply relativize (basic-parse-url url)
|
|
(cond
|
|
((null? parent) '())
|
|
((through-proxy? (car parent))
|
|
(list (basic-parse-url (apply unparse-url parent))))
|
|
(else parent)))))
|
|
|
|
(define (basic-parse-url url)
|
|
(let* ((base (split url-regexp url))
|
|
(up-hp (split up-hp-regexp (safe-list-ref base 1)))
|
|
(u-p (split u-p-regexp (safe-list-ref up-hp 0)))
|
|
(h-p (split-hp-part (safe-list-ref up-hp 1))))
|
|
;; (format #t "basic-parse-url:\n base=~s\n up-hp=~s\n u-p=~s\n h-p=~s\n"
|
|
;; base up-hp u-p h-p)
|
|
(if (not (and base up-hp u-p h-p))
|
|
#f
|
|
(let* ((dirty-url (map (lambda (x) (if (string=? x "") #f x))
|
|
(append (list (car base))
|
|
u-p
|
|
h-p
|
|
(cddr base))))
|
|
(srv (maybe-chop-end (list-ref dirty-url 0)))
|
|
(user (maybe-chop-end (list-ref dirty-url 1)))
|
|
(pass (maybe-chop-end (list-ref dirty-url 2)))
|
|
(host (list-ref dirty-url 3))
|
|
(port (maybe-chop-beg (list-ref dirty-url 4)))
|
|
(path (list-ref dirty-url 5))
|
|
(parm (maybe-chop-beg (list-ref dirty-url 6)))
|
|
(quer (maybe-chop-beg (list-ref dirty-url 7)))
|
|
(anch (maybe-chop-beg (list-ref dirty-url 8))))
|
|
(if (and (not host)
|
|
(string? (safe-list-ref base 1))
|
|
(> (string-length (safe-list-ref base 1)) 0))
|
|
(set! host ""))
|
|
(list (if srv (string->symbol (string-lower srv))
|
|
#f)
|
|
user
|
|
pass
|
|
host
|
|
(if port (string->number port)
|
|
#f)
|
|
(or path "/")
|
|
parm
|
|
quer
|
|
anch
|
|
#f)))))
|
|
|
|
(define (relativize url . parent)
|
|
(define (inherit-service)
|
|
(set! url (smerge-lists url parent 0)))
|
|
|
|
(define (inherit-netloc)
|
|
(set! url (smerge-lists url parent 1 2 3 4)))
|
|
|
|
(define (inherit-path)
|
|
(set! url (smerge-lists url parent 5)))
|
|
|
|
(define (inherit-parameters)
|
|
(set! url (smerge-lists url parent 6)))
|
|
|
|
(define (inherit-query)
|
|
(set! url (smerge-lists url parent 7)))
|
|
|
|
(define (smerge-lists url parent . positions)
|
|
(define (smerge-aux url parent positions ref)
|
|
(cond ((null? positions) url)
|
|
((null? url) parent)
|
|
((null? parent) url)
|
|
((= (car positions) ref)
|
|
(cons (car parent)
|
|
(smerge-aux (cdr url) (cdr parent) (cdr positions) (+ 1 ref))))
|
|
(else
|
|
(cons (car url)
|
|
(smerge-aux (cdr url) (cdr parent) positions (+ 1 ref))))))
|
|
(smerge-aux url parent positions 0))
|
|
|
|
; (define (dirname f) ;; [eg] deleted to use fs.stk instead
|
|
; (define r (string->regexp "^(.*/|)([^/]*)$"))
|
|
; (car (split r f)))
|
|
|
|
(define (merge-paths)
|
|
(let* ((base (string-append (dirname (filename parent))
|
|
(if (eq? (os-kind) 'Unix) "/" "\\") ; [eg]
|
|
(filename url)))
|
|
(slb (string-length base))
|
|
(newpath
|
|
(cond ((and (> slb 0)
|
|
(char=? (string-ref base (- slb 1)) #\/))
|
|
(string-append (expand-file-name base) "/"))
|
|
((and (= slb 1)
|
|
(char=? (string-ref base 0) #\.))
|
|
(string-append (expand-file-name base) "/"))
|
|
((and (= slb 2)
|
|
(or (string=? base "..")
|
|
(string=? base "/.")))
|
|
(string-append (expand-file-name base) "/"))
|
|
((and (> slb 2)
|
|
(char=? (string-ref base (- slb 2)) #\/)
|
|
(char=? (string-ref base (- slb 1)) #\.))
|
|
(string-append (expand-file-name base) "/"))
|
|
((and (> slb 2)
|
|
(char=? (string-ref base (- slb 3)) #\/)
|
|
(char=? (string-ref base (- slb 2)) #\.)
|
|
(char=? (string-ref base (- slb 1)) #\.))
|
|
(string-append (expand-file-name base) "/"))
|
|
(else
|
|
(expand-file-name base)))))
|
|
(set! slb (string-length newpath))
|
|
(if (and (> slb 1)
|
|
(char=? (string-ref newpath (- slb 1)) #\/)
|
|
(char=? (string-ref newpath (- slb 2)) #\/))
|
|
(set! newpath (substring newpath 0 (- slb 1))))
|
|
;; (format #t "merge-paths: base=~s, newpath=~s\n" base newpath)
|
|
(set! url (smerge-lists url `(serv user pass host port ,newpath) 5))))
|
|
|
|
(if (not (null? parent)) (set! parent (car parent)))
|
|
(cond ((null? parent) url)
|
|
((string=? "" (unparse-url parent)) url)
|
|
((string=? "" (unparse-url url)) parent)
|
|
((service url) url)
|
|
((host url) (inherit-service) url)
|
|
((and (filename url)
|
|
(> (string-length (filename url)) 0)
|
|
(char=? (string-ref (filename url) 0) #\/))
|
|
(inherit-service) (inherit-netloc) url)
|
|
((and (not (filename url))
|
|
(parameters url))
|
|
(inherit-service) (inherit-netloc) (inherit-path) url)
|
|
((and (not (filename url))
|
|
(query url))
|
|
(inherit-service) (inherit-netloc) (inherit-path) (inherit-parameters)
|
|
url)
|
|
((not (filename url))
|
|
(inherit-service) (inherit-netloc) (inherit-path) (inherit-parameters)
|
|
(inherit-query)
|
|
url)
|
|
(else
|
|
(inherit-service) (inherit-netloc) (merge-paths) url)))
|
|
|
|
(define (proxitize parsed-url)
|
|
(let ((proxy (assoc (service parsed-url) *proxy-servers*)))
|
|
(cond (proxy
|
|
(set! proxy (cadr proxy))
|
|
(list (service proxy)
|
|
(user proxy)
|
|
(password proxy)
|
|
(host proxy)
|
|
(port-number proxy)
|
|
(unparse-url parsed-url)
|
|
(parameters proxy)
|
|
(query proxy)
|
|
(anchor proxy)
|
|
#t)) ; Is through proxy.
|
|
(else
|
|
parsed-url))))
|
|
|
|
(define (unparse-url url)
|
|
(cond ((through-proxy? url)
|
|
(filename url))
|
|
(else
|
|
(let ((srv (service url)))
|
|
; (if srv (set! srv (symbol->string srv))) [eg]
|
|
(set! srv (if srv (symbol->string srv) "file"))
|
|
(string-append (maybe-append srv ":")
|
|
(string-netaddr url)
|
|
(if (filename url) (filename url) "")
|
|
(maybe-append ";" (parameters url))
|
|
(maybe-append "?" (query url))
|
|
(maybe-append "#" (anchor url)))))))
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
(define (pretty-url url . parent) ;; [eg] added for pretty mesages
|
|
(unless (string? url)
|
|
(set! url (url:unparse-url url)))
|
|
(url:unparse-url (apply url:parse-url url parent)))
|
|
|
|
|
|
;;; --------- Regexps for parsing ---------------
|
|
|
|
(define url-regexp
|
|
(string->regexp
|
|
"^([[a-zA-Z0-9+.-][a-zA-Z0-9+.-]+:|)(//[^#/]*|)([^#?;]*|)(;[^?#]*|)(\\?[^#]*|)(#.*|)$"))
|
|
|
|
(define up-hp-regexp
|
|
(string->regexp
|
|
"^/?/?([^@]*@|)(.*)$")) ; Strips off // automatically.
|
|
|
|
(define u-p-regexp
|
|
(string->regexp
|
|
"^([^@:]*[:@]|)(.*)$"))
|
|
|
|
(define h-p-test-regexp
|
|
(string->regexp
|
|
"^(.*)(:[0-9]+)$"))
|
|
|
|
;;; ----------------- Support routines for basic-parse-url ---------------
|
|
(define (string-netaddr url)
|
|
(let ((prt (port-number url)))
|
|
(if prt (set! prt (number->string prt)))
|
|
(cond ((password url)
|
|
(string-append "//" (user url) ":" (password url)
|
|
"@" (host url)
|
|
(maybe-append ":" prt)))
|
|
((user url)
|
|
(string-append "//" (user url) "@" (host url)
|
|
(maybe-append ":" prt)))
|
|
((host url)
|
|
(string-append "//" (host url)
|
|
(maybe-append ":" prt)))
|
|
(else ""))))
|
|
|
|
(define (split-hp-part h-p)
|
|
(or (split h-p-test-regexp h-p)
|
|
(list h-p "")))
|
|
|
|
(define (maybe-append s1 s2)
|
|
(if (and s1 s2) (string-append s1 s2)
|
|
""))
|
|
|
|
(define (split regexp maybe-str)
|
|
(if (string? maybe-str)
|
|
(apply-matches (regexp maybe-str) maybe-str)
|
|
#f))
|
|
|
|
(define (safe-list-ref maybe-list index)
|
|
(if (and (list? maybe-list)
|
|
(< index (length maybe-list)))
|
|
(list-ref maybe-list index)
|
|
#f))
|
|
|
|
(define (maybe-chop-end maybe-string)
|
|
(if (and (string? maybe-string)
|
|
(> (string-length maybe-string) 0))
|
|
(substring maybe-string 0 (- (string-length maybe-string) 1))
|
|
#f))
|
|
|
|
(define (maybe-chop-beg maybe-string)
|
|
(if (and (string? maybe-string)
|
|
(> (string-length maybe-string) 0))
|
|
(substring maybe-string 1 (string-length maybe-string))
|
|
#f))
|
|
|
|
|
|
(define (apply-matches matches string)
|
|
(if matches
|
|
(map (lambda (m) (apply substring string m))
|
|
(cdr matches))
|
|
#f))
|
|
|
|
;;; ---------- Url access routines -------------
|
|
|
|
(define (service url)
|
|
(list-ref url 0))
|
|
|
|
(define (user url)
|
|
(list-ref url 1))
|
|
|
|
(define (password url)
|
|
(list-ref url 2))
|
|
|
|
(define (host url)
|
|
(list-ref url 3))
|
|
|
|
(define (port-number url)
|
|
(list-ref url 4))
|
|
|
|
(define (filename url)
|
|
(list-ref url 5))
|
|
|
|
(define (parameters url)
|
|
(list-ref url 6))
|
|
|
|
(define (query url)
|
|
(list-ref url 7))
|
|
|
|
(define (anchor url)
|
|
(list-ref url 8))
|
|
|
|
(define (through-proxy? url)
|
|
(list-ref url 9))
|
|
|
|
;;; ------------ Exports -------------------------
|
|
(define url:parse-url parse-url)
|
|
(define url:unparse-url unparse-url)
|
|
;(define url:service service) [eg] to default to file if nothing else
|
|
(define url:service (lambda (url) (or (service url) 'file)))
|
|
(define url:user user)
|
|
(define url:password password)
|
|
(define url:host host)
|
|
(define url:port-number port-number)
|
|
(define url:filename filename)
|
|
(define url:anchor anchor)
|
|
(define url:through-proxy? through-proxy?)
|
|
(define url:pretty-url pretty-url)
|
|
|
|
;;; -------------- Set up proxy list -----------------
|
|
(define (get-proxy-evar evar)
|
|
(let ((e (getenv evar)))
|
|
(if e
|
|
(basic-parse-url e)
|
|
#f)))
|
|
|
|
|
|
(define *proxy-servers*
|
|
(let loop ((l url:*proxy-env-vars*))
|
|
(cond ((null? l) ())
|
|
(else
|
|
(let ((p (get-proxy-evar (cadar l))))
|
|
(if p
|
|
(cons (list (caar l) p)
|
|
(loop (cdr l)))
|
|
(loop (cdr l))))))))
|
|
|
|
|
|
(provide "www-url")
|