Reasonably complete and up-to-date docs.

2003-01-14 15:02:44 +00:00 · 2003-01-14 15:02:44 +00:00 · ff8061c4ea
parent 8ed2a48176
commit ff8061c4ea
1 changed files with 138 additions and 190 deletions
--- a/doc/latex/uri.tex
+++ b/doc/latex/uri.tex
@ -1,218 +1,166 @@
-\chapter{Handle URIs}\label{cha:uri}
-%
-\begin{description}
-\item[Used files:] uri.scm
-\item[Name of the package:] uri 
-\end{description}
-%
+\chapter{Parsing and Processing URIs}\label{cha:uri}
+
+The \ex{uri} structure contains a library for dealing with URIs.
+
+\section{Notes on URI Syntax}

-\section{Overview}
 A URI (Uniform Resource Identifier) is of following syntax:
+%
 \begin{inset}
-[scheme] : \semvar{path} [{\normalfont?\/} search] [{\normalfont\#} fragmentid]
+[scheme] : \var{path} [{\normalfont?\/} search] [{\normalfont\#} fragid]
 \end{inset}
-Parts in brackets may be ommitted. The last part is usually referred
-to as fragid in this document.
+%
+Parts in brackets may be omitted.

-As you see, the URI contains characters like \verb|:| to indicate its
-different parts. But what, if the \semvar{scheme} contains \verb|:| as
-part of its name? For this purpose, some special characters are
-\emph{escaped} if they are a regular part of a name and not indicators
-for the structure of a URI. Escape-sequences are of following scheme:
-\verb|\%hh| where \verb|h| is a hexadecimal digit. The hexadecimal
-number refers to the (US) ASCII code of the escaped character, e.g.\ 
-\ex{\%20} is space (ASCII character 32) and \ex{\%61} is `a' (ASCII
-character 97). This module provides procedures to escape and unescape
-strings that are meant to be used in a URI.
+The URI contains characters like \verb|:| to indicate its different
+parts.  Some special characters are \emph{escaped} if they are a
+regular part of a name and not indicators for the structure of a URI.
+Escape sequences have the following form: \verb|%hh| where \verb|h|
+is a hexadecimal digit.  The hexadecimal number refers to the
+ASCII code of the escaped character, e.g.\ \ex{\%20} is space (ASCII
+32) and \ex{\%61} is `a' (ASCII 97). This module
+provides procedures to escape and unescape strings that are meant to
+be used in a URI.

 \section{Procedures}

-\begin{defundesc}{parse-uri} {uri-string } {scheme path search
+\defun{parse-uri} {uri-string } {scheme path search
    frag-id} \label{proc:parse-uri}
-  
-  Parses an \semvar{uri\=string} in the possible four fields, as
-  mentioned above in \emph{Overview}. These four fields are returned
-  as a multiple value. They are \emph{not} unescaped, as the rules for
-  parsing the \semvar{path} component in particular need unescaped
-  text, and are dependent on \semvar{scheme}. The URL parser is
-  responsible for doing this. If the \semvar{scheme}, \semvar{search}\ 
-  or \semvar{fragid} portions are not specified, they are \sharpf.
-  Otherwise, \semvar{scheme}, \semvar{search}, and \semvar{fragid} are
-  strings. \semvar{path} is a non-empty string list -- the path split
+\begin{desc}
+  Parses a \var{uri\=string} into its four fields.
+  The fields are \emph{not} unescaped, as the rules for
+  parsing the \var{path} component in particular need text that has
+  not yet been unescaped, and are dependent on \var{scheme}. The URL
+  parser is responsible for unescaping.  If the \var{scheme}, \var{search}
+  or \var{fragid} portions are not specified, they are \sharpf.
+  Otherwise, \var{scheme}, \var{search}, and \var{fragid} are
+  strings. \var{path} is a non-empty list of strings---the path split
  at slashes.
-  
-  For those of you who are interested, here is a description of the
-  parsing technique. It is inwards from both ends.
-  \begin{itemize}
-  \item First we search forwards for the first reserved character
-    (\verb|=|, \verb|;|, \verb|/|, \verb|#|, \verb|?|, \verb|:| or
-    \verb|space|). If it's a colon, then that's the \semvar{scheme}
-    part, otherwise we have no \semvar{scheme} part. At all events we
-    remove it.
-  \item Then we search backwards from the end for the last reserved
-    char.  If it's a sharp, then that's the \semvar{fragid} part --
-    remove it.
-  \item Then we search backwards from the end for the last reserved
-    char.  If it's a question-mark, then that's the \semvar{search}
-    part -- remove it.
-  \item What's left is the path. We split at slashes. The empty string
-    becomes a list containing the empty string.
-  \end{itemize}
-  
-  This scheme is tolerant of the various ways people build broken
-  URI's out there on the Net\footnote{So it is not absolutely conform
-    with RFC~1630}, e.g. \verb|=| is a reserved character, but used
-  unescaped in the search-part. It was given to me\footnote{That's
-    Olin Shivers.} by Dan Connolly of the W3C and slightly modified.
-\end{defundesc}
+\end{desc}

-\begin{defundesc}{unescape-uri} {string [start] [end]} {string}
-  Unescapes a string. This procedure should only be used \emph{after}
-  the URL (!) was parsed, since unescaping may introduce characters
-  that blow up the parse (that's why escape sequences are used in URIs
-  ;-). Escape sequences are of the scheme as described in ``Overview''.
-\end{defundesc}
+Here is a description of the parsing technique. It works inwards from
+both ends:
+\begin{itemize}
+\item First, the code searches forwards for the first reserved
+  character (\verb|=|, \verb|;|, \verb|/|, \verb|#|, \verb|?|,
+  \verb|:| or \verb|space|).  If it's a colon, then that's the
+  \var{scheme} part, otherwise there is no \var{scheme} part. At
+  all events, it is removed.
+\item Then the code searches backwards from the end for the last reserved
+  char.  If it's a sharp, then that's the \var{fragid} part---remove it.
+\item Then the code searches backwards from the end for the last reserved
+  char.  If it's a question-mark, then that's the \var{search}
+  part---remove it.
+\item What's left is the path.  The code splits it at slashes. The
+  empty string becomes a list containing the empty string.
+\end{itemize}
+%  
+This scheme is tolerant of the various ways people build broken
+URIs out there on the Net\footnote{So it does not absolutely conform
+  to RFC~1630.}, e.g.\ \verb|=| is a reserved character, but used
+unescaped in the search-part. It was given to me\footnote{That's
+  Olin Shivers.} by Dan Connolly of the W3C and slightly modified.

+\defun{unescape-uri}{string [start] [end]}{string}
+\begin{desc}
+  \ex{Unescape-uri} unescapes a string. If \var{start} and/or \var{end} are
+  specified, they delimit the part of \var{string} that
+  should be unescaped.
+\end{desc}
+%
+This procedure should only be used \emph{after} the URI was parsed,
+since unescaping may introduce characters that blow up the
+parse---that's why escape sequences are used in URIs.

-%\texttt{uri-escaped-chars} \hfill
-%\texttt{char-set}\index{\texttt{uri-escaped-chars}}
 \defvar{uri-escaped-chars}{char-set}
 \begin{desc}
-  A set of characters that are escaped in URIs. These are the
-  following characters: dollar (\verb|$|), minus (\verb|-|),%fool Xemacs$
-  underscore (\verb|_|), at (\verb|@|), dot (\verb|.|), and-sign
-  (\verb|&|), exclamation mark (\verb|!|), asterisk (\verb|*|),
-  backslash (\verb|\|), double quote (\verb|"|), single quote
-  (\verb|'|), open brace (\verb|(|), close brace (\verb|)|), comma
-  (\verb|,|) plus (\verb|+|) and all other characters that are neither
-  letters nor digits (such as space and control characters).
+  This is a set of characters (in the sense of SRFI~14) which are
+  escaped in URIs.  These are the
+  following characters: \verb|$|, \verb|-|, \verb|_|, \verb|@|, %$
+  \verb|.|, \verb|&|, \verb|!|, \verb|*|, \verb|\|, \verb|"|,
+  \verb|'|, \verb|(|, \verb|)|, \verb|,|, \verb|+|, and all other
+  characters that are neither letters nor digits (such as space and
+  control characters).
 \end{desc}

-\begin{defundesc}{escape-uri} {string [escaped-chars]} {string}
-  Escapes characters of \semvar{string} that are given with
-  \semvar{escaped\=chars}. \semvar{escaped\=chars} default to
-  \ex{uri\=escaped\=chars}. Be careful with using this procedure to
-  chunks of text with syntactically meaningful reserved characters
-  (e.g., paths with URI slashes or colons) -- they'll be escaped, and
-  lose their special meaning. E.g.\ it would be a mistake to apply
-  \ex{escape-uri} to
-  ``\ex{//lcs.\ob{}mit.\ob{}edu:8001\ob/foo\ob/bar.html}'' because the
-  sla\-shes and co\-lons would be escaped. Note that \ex{esacpe-uri}
-  doesn't check this as it would lose his meaning.
-\end{defundesc}
-
-\begin{defundesc}{resolve-uri} {cscheme cp scheme p} {scheme path}
-%FIXME{Sorry, I can't figure out what resolve-uri is inteded to do.
-%Perhaps I find it out later.}
-%There is a paragraph in the spec, that describes someting like
-%resolve-uri does. We have to check this.
-  To be done.
-\end{defundesc}
-
-\begin{defundesc}{split-uri-path} {uri start end} {list}
-  Splits uri at slashes. Only the substring given with \semvar{start}
-  (inclusive) and \semvar{end} (exclusive) as indices is considered.
-  \semvar{start} and $\semvar{end} - 1$ have to be within the range of
-  \semvar{uri}.  Otherwise an index-out-of-range exception will be
-  raised. Example: \codex{(split-uri-path "foo/bar/colon" 4 11)}
-  results to \codex{'("bar" "col")}
-\end{defundesc}
-
-\begin{defundesc}{uri-path-list->path} {plist} {string}
-  Generates a path out of an uri-path-list by inserting slashes
-  between the elements of \semvar{plist}. If you want to use the
-  resulting string for further operation, you should escape the
-  elements of \semvar{plist} in case the contain slashes. This doesn't
-  escape them for you, you must do that yourself like
-  \ex{(uri-path-list->path (map escape-uri pathlist))}.
-\end{defundesc}
-
-\begin{defundesc}{simplify-uri-path} {path} {list}
-  Removes `\ex{.}' and `\ex{..}' entries from path. The result is
-  a (maybe empty) list representing a path that does not contain any
-  `\ex{.}' or `\ex{..}'\,. The list can only be empty if the path
-  did not start with a slash (for the rare occasion someone wants to
-  simplify a relative path). The result is \sharpf{} if the path tries
-  to back up past root, for example by `\ex{/..}' or
-  `\ex{/foo\ob/..\ob/..}' or just `\ex{..}'\,. `\ex{//}' may occur
-  somewhere in the path referring to root but not being backed up.
-  Examples: 
-%FIXME: Can't we have a better environment for examples like these?
-\begin{alltt}
-(simplify-uri-path
- (split-uri-path  "/foo/bar/baz/.."  0 15))
-\end{alltt} 
-  results to 
-  \codex{'("" "foo" "bar")}
-
-\begin{alltt}
-(simplify-uri-path 
- (split-uri-path "foo/bar/baz/../../.." 0 20))
-\end{alltt}
-  results to
-  \codex{'()}
-
-\begin{alltt}
-(simplify-uri-path 
- (split-uri-path "/foo/../.." 0 10))
-\end{alltt}
-  results to
-  \codex{\sharpf          ; tried to back up root}
-
-\begin{alltt}
-(simplify-uri-path 
- (split-uri-path "foo/bar//" 0 9))
-\end{alltt}
-  results to
-  \codex{'("")       ; "//" refers to root}
-
-\begin{alltt}
-(simplify-uri-path 
- (split-uri-path "foo/bar/" 0 8))
-\end{alltt}
-  results to 
-  \codex{'("")       ; last "/" also refers to root}
-
-\begin{alltt}
-(simplify-uri-path 
- (split-uri-path "/foo/bar//baz/../.." 0 19))
-\end{alltt}
-  results to
-  \codex{\sharpf          ; tries to back up root}
-\end{defundesc}
-
-\section{Unexported names}
-
-\defvar{uri-reserved}{char-set}
+\defun{escape-uri} {string [escaped-chars]} {string}
 \begin{desc}
-  A list of reserved characters (semicolon, slash, hash, question
-  mark, double colon and space).
+  This procedure escapes characters of \var{string} that are in
+  \var{escaped\=chars}. \var{Escaped\=chars} defaults to
+  \ex{uri\=escaped\=chars}.  
+\end{desc}
+%
+Be careful when applying this procedure to chunks of text with
+syntactically meaningful reserved characters (e.g., paths with URI
+slashes or colons)---they'll be escaped, and lose their special
+meaning. E.g.\ it would be a mistake to apply \ex{escape-uri} to
+\begin{verbatim}
+//lcs.mit.edu:8001/foo/bar.html
+\end{verbatim}
+%
+because the sla\-shes and co\-lons would be escaped.
+
+\defun{split-uri}{uri start end} {list}
+\begin{desc}
+  This procedure splits \var{uri} at slashes. Only the substring given
+  with \var{start} (inclusive) and \var{end} (exclusive) as indices is
+  considered.  \var{start} and $\var{end} - 1$ have to be within the
+  range of \var{uri}.  Otherwise an \ex{index-out-of-range} exception
+  will be raised.
+  
+  Example: \codex{(split-uri "foo/bar/colon" 4 11)} returns
+  \codex{("bar" "col")}
 \end{desc}

-\begin{defundesc}{hex-digit?} {character} {boolean}
-  Returns \sharpt{} if character is a hexadecimal digit (i.e., one of 1--9,
-  a--f, A--F), \sharpf{} otherwise.
-\end{defundesc}
+\defun{uri-path->uri}{plist}{string}
+\begin{desc}
+  This procedure generates a path out of a URI path list by inserting
+  slashes between the elements of \var{plist}.
+\end{desc}
+%
+If you want to use the resulting string for further operations, you
+should escape the elements of \var{plist} in case they contain
+slashes, like so:
+%
+\begin{verbatim}
+(uri-path->uri (map escape-uri pathlist))
+\end{verbatim}

+\defun{simplify-uri-path}{path}{list}
+\begin{desc}
+  This procedure simplifies a URI path.  It removes \verb|"."| and
+  \verb|"/.."| entries from path, and removes parts before a root.
+  The result is a list, or \sharpf{} if the path tries to back up past
+  root.
+\end{desc}
+%
+According to RFC~2396, relative paths are considered not to start with
+\verb|/|.  They are appended to a base URL path and then simplified.
+So before you start to simplify a URL, try to find out if it is a
+relative path (i.e.\ it does not start with a \verb|/|).

-\begin{defundesc}{hexchar->int} {character} {number}
-  Translates the given character to an integer, e.g. \ex{(hexchar->int
-  \#a)} results to 10.
-\end{defundesc}
+Examples:
+%
+\begin{alltt}
+(simplify-uri-path (split-uri  "/foo/bar/baz/.."  0 15))
+\(\Rightarrow\) ("" "foo" "bar")

-\begin{defundesc}{int->hexchar} {integer} {character}
-  Translates the given integer from range 1--15 into an hexadecimal
-  character (uses uppercase letters), e.g. \ex{(int->hexchar 14)}
-  results to `E'.
-\end{defundesc}
+(simplify-uri-path (split-uri "foo/bar/baz/../../.." 0 20))
+\(\Rightarrow\) ()

-\begin{defundesc}{rev-append} {list-a list-b} {list}
-  Performs a \ex{(append (reverse list-a) list-b)}. The comment says it
-  should be defined in a list package but I am wondering how often
-  this will be used.
-\end{defundesc}
+(simplify-uri-path (split-uri "/foo/../.." 0 10))
+\(\Rightarrow\) #f
+
+(simplify-uri-path (split-uri "foo/bar//" 0 9))
+\(\Rightarrow\) ("")     
+
+(simplify-uri-path (split-uri "foo/bar/" 0 8))
+\(\Rightarrow\) ("")
+
+(simplify-uri-path (split-uri "/foo/bar//baz/../.." 0 19))
+\(\Rightarrow\) #f
+\end{alltt}

-%EOF

 %%% Local Variables: 
 %%% mode: latex