;;; www-url.stk - URL Parsing library (used by the www package). ;;; Version 0.5. ;;; ;;; Copyright (c) 1995 Harvey J. Stein (hjstein@math.huji.ac.il) ;;; This code is freely usable and distributable as long as this ;;; heading remains. ;;; slightly modified by Erick Gallesio (changes are noted with [eg]) ;;; (Last file update: 16-Sep-1999 15:24 (eg) ;;; Usage: ;;; (url:parse-url url) ;;; Takes a string (url), and returns a parsed url. One may apply ;;; The general url form handled is: ;;; ;;; service://user:password@host:port/path;parameters?query#anchor ;;; ;;; This function takes a second optional argument - the parent url ;;; (a parsed url). When the 2nd argument is given, the url is ;;; interpreted relative to the parent url. ;;; ;;; The following functions may be applied to a parsed url: ;;; ;;; (url:unparse-url url) ;;; Returns a string (a fully qualified url) which would parse ;;; into url. ;;; (url:service parsed-url) ;;; Returns the service (i.e. - protocol) (as a symbol), or #f ;;; if none was given. ;;; (define url:user parsed-url) ;;; Returns user name, or #f if none was supplied. ;;; (url:password parsed-url) ;;; Returns the password, or #f if none was supplied. ;;; (url:host parsed-url) ;;; Returns the host name, or #f if none was supplied. ;;; Note - the host name can be "" (as in file:///foo/bar). ;;; (url:port-number parsed-url) ;;; The port number in the url, or #f if none was supplied. ;;; (url:filename parsed-url) ;;; The file name (i.e. - path name) of the url, or #f if ;;; none was supplied. ;;; (url:anchor parsed-url) ;;; The anchor in the url, or #f if none was supplied. It can ;;; be "" - as in http://foo.bar.com/file# ;;; (url:parameters parsed-url) ;;; The parameters (#f if not supplied, empty string if ;;; parameter delimiter is supplied, but no parameters are ;;; supplied. ;;; (url:query parsed-url) ;;; The query parameters (#f if not supplied, empty string if ;;; query delimiter is supplied, but no query parameters are ;;; supplied. ;;; (url:through-proxy? parsed-url) ;;; #t iff url has been parsed to be passed through a proxy. ;;; Proxying is handled by parsing the proxy address, and ;;; passing the unparsed url through as the file name. ;;; (url:pretty-url url . parent) [eg] ;;; returns a pretty string denoting the URL. url can be parsed or not, ;;; parent can be present or not. ;;; Overview: ;;; Internet RFC 1808 discusses how to interpret relative URLs. In ;;; doing so, it gives algorithms both for parsing URLs and for ;;; computing relative URLs. ;;; ;;; To parse a URL, they say to follow the following procedure: ;;; ;;; 1. Everything incl & after 1st "#" is the anchor, of the rest, ;;; 2. Everything incl & before 1st ":" is the scheme, assuming at ;;; least 1 char before ":" & all chars are scheme allowable ;;; [a-zA-Z0-9+.-], of the rest, ;;; 3. If it starts with "//", everything up to (but not incl) next ;;; ocurrence of "/" or until end is network location. Of the rest, ;;; 4. everything from 1st "?" until end is the query info. of the ;;; rest, ;;; 5. everything from 1st ";" until end is the parameters field. Of ;;; the rest, ;;; 6. everything remaining is the path. ;;; ;;; After all this is done, one must remove the "#" from the beginning ;;; of the anchor, the ":" from the end of the scheme, the "//" from ;;; the beginning of the net location, the "?" from the beginning of ;;; the query part, and the ";" from the beginning of the params part. ;;; One typically leaves the "/" on the beginning of the path part, ;;; because it shows up iff the URL is relative. ;;; ;;; This means that the following regexp should be able to separate ;;; out these 6 basic parts of the url: ;;; ;;; "^([a-zA-Z0-9+.-][a-zA-Z0-9+.-]+:|)(//[^#/]*|)([^#?;]*|)(;[^?#]*|)(\\?[^#]*|)(#.*|)$" ;;; ;;; Once this is done, the net address must be parsed. I believe this ;;; can be done as follows: ;;; ;;; 1. Everything before & including the 1st "@" is the ;;; username/password part. Of the rest, ;;; 2. Everything after & including the last ":" is the port number, ;;; provided the ":" is only followed by digits ;;; 3. What's left is the host name. ;;; ;;; Bugs ;;; -The parsing of http://foo.bar.com is problematic. On the one ;;; hand, there's no file name ;;; become /. [eg] corrected ;;; -Proxy junk barely tested... (require "fs") (define-module URL (export url:parse-url url:unparse-url url:service url:user url:password url:host url:port-number url:filename url:anchor url:parameters url:query url:through-proxy? url:pretty-url)) (select-module URL) (define url:*proxy-env-vars* '((http "HTTP_PROXY") (ftp "FTP_PROXY") (wais "WAIS_PROXY") (gopher "GOPHER_PROXY"))) (define *proxy-servers* ()) ;;; Takes a URL as an argument and returns a list containing the ;;; protocol, the host name, and the file name. (define (parse-url url . parent) (proxitize (apply relativize (basic-parse-url url) (cond ((null? parent) '()) ((through-proxy? (car parent)) (list (basic-parse-url (apply unparse-url parent)))) (else parent))))) (define (basic-parse-url url) (let* ((base (split url-regexp url)) (up-hp (split up-hp-regexp (safe-list-ref base 1))) (u-p (split u-p-regexp (safe-list-ref up-hp 0))) (h-p (split-hp-part (safe-list-ref up-hp 1)))) ;; (format #t "basic-parse-url:\n base=~s\n up-hp=~s\n u-p=~s\n h-p=~s\n" ;; base up-hp u-p h-p) (if (not (and base up-hp u-p h-p)) #f (let* ((dirty-url (map (lambda (x) (if (string=? x "") #f x)) (append (list (car base)) u-p h-p (cddr base)))) (srv (maybe-chop-end (list-ref dirty-url 0))) (user (maybe-chop-end (list-ref dirty-url 1))) (pass (maybe-chop-end (list-ref dirty-url 2))) (host (list-ref dirty-url 3)) (port (maybe-chop-beg (list-ref dirty-url 4))) (path (list-ref dirty-url 5)) (parm (maybe-chop-beg (list-ref dirty-url 6))) (quer (maybe-chop-beg (list-ref dirty-url 7))) (anch (maybe-chop-beg (list-ref dirty-url 8)))) (if (and (not host) (string? (safe-list-ref base 1)) (> (string-length (safe-list-ref base 1)) 0)) (set! host "")) (list (if srv (string->symbol (string-lower srv)) #f) user pass host (if port (string->number port) #f) (or path "/") parm quer anch #f))))) (define (relativize url . parent) (define (inherit-service) (set! url (smerge-lists url parent 0))) (define (inherit-netloc) (set! url (smerge-lists url parent 1 2 3 4))) (define (inherit-path) (set! url (smerge-lists url parent 5))) (define (inherit-parameters) (set! url (smerge-lists url parent 6))) (define (inherit-query) (set! url (smerge-lists url parent 7))) (define (smerge-lists url parent . positions) (define (smerge-aux url parent positions ref) (cond ((null? positions) url) ((null? url) parent) ((null? parent) url) ((= (car positions) ref) (cons (car parent) (smerge-aux (cdr url) (cdr parent) (cdr positions) (+ 1 ref)))) (else (cons (car url) (smerge-aux (cdr url) (cdr parent) positions (+ 1 ref)))))) (smerge-aux url parent positions 0)) ; (define (dirname f) ;; [eg] deleted to use fs.stk instead ; (define r (string->regexp "^(.*/|)([^/]*)$")) ; (car (split r f))) (define (merge-paths) (let* ((base (string-append (dirname (filename parent)) (if (eq? (os-kind) 'Unix) "/" "\\") ; [eg] (filename url))) (slb (string-length base)) (newpath (cond ((and (> slb 0) (char=? (string-ref base (- slb 1)) #\/)) (string-append (expand-file-name base) "/")) ((and (= slb 1) (char=? (string-ref base 0) #\.)) (string-append (expand-file-name base) "/")) ((and (= slb 2) (or (string=? base "..") (string=? base "/."))) (string-append (expand-file-name base) "/")) ((and (> slb 2) (char=? (string-ref base (- slb 2)) #\/) (char=? (string-ref base (- slb 1)) #\.)) (string-append (expand-file-name base) "/")) ((and (> slb 2) (char=? (string-ref base (- slb 3)) #\/) (char=? (string-ref base (- slb 2)) #\.) (char=? (string-ref base (- slb 1)) #\.)) (string-append (expand-file-name base) "/")) (else (expand-file-name base))))) (set! slb (string-length newpath)) (if (and (> slb 1) (char=? (string-ref newpath (- slb 1)) #\/) (char=? (string-ref newpath (- slb 2)) #\/)) (set! newpath (substring newpath 0 (- slb 1)))) ;; (format #t "merge-paths: base=~s, newpath=~s\n" base newpath) (set! url (smerge-lists url `(serv user pass host port ,newpath) 5)))) (if (not (null? parent)) (set! parent (car parent))) (cond ((null? parent) url) ((string=? "" (unparse-url parent)) url) ((string=? "" (unparse-url url)) parent) ((service url) url) ((host url) (inherit-service) url) ((and (filename url) (> (string-length (filename url)) 0) (char=? (string-ref (filename url) 0) #\/)) (inherit-service) (inherit-netloc) url) ((and (not (filename url)) (parameters url)) (inherit-service) (inherit-netloc) (inherit-path) url) ((and (not (filename url)) (query url)) (inherit-service) (inherit-netloc) (inherit-path) (inherit-parameters) url) ((not (filename url)) (inherit-service) (inherit-netloc) (inherit-path) (inherit-parameters) (inherit-query) url) (else (inherit-service) (inherit-netloc) (merge-paths) url))) (define (proxitize parsed-url) (let ((proxy (assoc (service parsed-url) *proxy-servers*))) (cond (proxy (set! proxy (cadr proxy)) (list (service proxy) (user proxy) (password proxy) (host proxy) (port-number proxy) (unparse-url parsed-url) (parameters proxy) (query proxy) (anchor proxy) #t)) ; Is through proxy. (else parsed-url)))) (define (unparse-url url) (cond ((through-proxy? url) (filename url)) (else (let ((srv (service url))) ; (if srv (set! srv (symbol->string srv))) [eg] (set! srv (if srv (symbol->string srv) "file")) (string-append (maybe-append srv ":") (string-netaddr url) (if (filename url) (filename url) "") (maybe-append ";" (parameters url)) (maybe-append "?" (query url)) (maybe-append "#" (anchor url))))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (pretty-url url . parent) ;; [eg] added for pretty mesages (unless (string? url) (set! url (url:unparse-url url))) (url:unparse-url (apply url:parse-url url parent))) ;;; --------- Regexps for parsing --------------- (define url-regexp (string->regexp "^([[a-zA-Z0-9+.-][a-zA-Z0-9+.-]+:|)(//[^#/]*|)([^#?;]*|)(;[^?#]*|)(\\?[^#]*|)(#.*|)$")) (define up-hp-regexp (string->regexp "^/?/?([^@]*@|)(.*)$")) ; Strips off // automatically. (define u-p-regexp (string->regexp "^([^@:]*[:@]|)(.*)$")) (define h-p-test-regexp (string->regexp "^(.*)(:[0-9]+)$")) ;;; ----------------- Support routines for basic-parse-url --------------- (define (string-netaddr url) (let ((prt (port-number url))) (if prt (set! prt (number->string prt))) (cond ((password url) (string-append "//" (user url) ":" (password url) "@" (host url) (maybe-append ":" prt))) ((user url) (string-append "//" (user url) "@" (host url) (maybe-append ":" prt))) ((host url) (string-append "//" (host url) (maybe-append ":" prt))) (else "")))) (define (split-hp-part h-p) (or (split h-p-test-regexp h-p) (list h-p ""))) (define (maybe-append s1 s2) (if (and s1 s2) (string-append s1 s2) "")) (define (split regexp maybe-str) (if (string? maybe-str) (apply-matches (regexp maybe-str) maybe-str) #f)) (define (safe-list-ref maybe-list index) (if (and (list? maybe-list) (< index (length maybe-list))) (list-ref maybe-list index) #f)) (define (maybe-chop-end maybe-string) (if (and (string? maybe-string) (> (string-length maybe-string) 0)) (substring maybe-string 0 (- (string-length maybe-string) 1)) #f)) (define (maybe-chop-beg maybe-string) (if (and (string? maybe-string) (> (string-length maybe-string) 0)) (substring maybe-string 1 (string-length maybe-string)) #f)) (define (apply-matches matches string) (if matches (map (lambda (m) (apply substring string m)) (cdr matches)) #f)) ;;; ---------- Url access routines ------------- (define (service url) (list-ref url 0)) (define (user url) (list-ref url 1)) (define (password url) (list-ref url 2)) (define (host url) (list-ref url 3)) (define (port-number url) (list-ref url 4)) (define (filename url) (list-ref url 5)) (define (parameters url) (list-ref url 6)) (define (query url) (list-ref url 7)) (define (anchor url) (list-ref url 8)) (define (through-proxy? url) (list-ref url 9)) ;;; ------------ Exports ------------------------- (define url:parse-url parse-url) (define url:unparse-url unparse-url) ;(define url:service service) [eg] to default to file if nothing else (define url:service (lambda (url) (or (service url) 'file))) (define url:user user) (define url:password password) (define url:host host) (define url:port-number port-number) (define url:filename filename) (define url:anchor anchor) (define url:through-proxy? through-proxy?) (define url:pretty-url pretty-url) ;;; -------------- Set up proxy list ----------------- (define (get-proxy-evar evar) (let ((e (getenv evar))) (if e (basic-parse-url e) #f))) (define *proxy-servers* (let loop ((l url:*proxy-env-vars*)) (cond ((null? l) ()) (else (let ((p (get-proxy-evar (cadar l)))) (if p (cons (list (caar l) p) (loop (cdr l))) (loop (cdr l)))))))) (provide "www-url")