% sunet/doc/latex/uri.tex

\section{Handle URIs}\label{sec:uri}
%
\begin{description}
\item[Used files:] uri.scm
\item[Name of the package:] uri
\end{description}
%

\subsection{Overview}
A URI (Uniform Resource Identifier) has the following syntax:
\begin{inset}
\ovar{scheme\/} : \semvar{path} \ovar{{\normalfont?\/} search} \ovar{{\normalfont\#} fragmentid}
\end{inset}
Parts in brackets may be omitted. The last part is usually referred
to as \semvar{fragid} in this document.

As you can see, a URI contains characters like \verb|:| to separate its
different parts. But what if the \semvar{scheme} contains \verb|:| as
part of its name? For this purpose, some special characters are
\emph{escaped} if they are a regular part of a name and not indicators
for the structure of a URI. Escape sequences have the following form:
\ex{\%hh}, where each \verb|h| is a hexadecimal digit. The hexadecimal
number refers to the (US) ASCII code of the escaped character, e.g.\
\ex{\%20} is space (ASCII character 32) and \ex{\%61} is `a' (ASCII
character 97). This module provides procedures to escape and unescape
strings that are meant to be used in a URI.

\subsection{Procedures}

\begin{defundesc}{parse-uri} {uri-string } {scheme path search
    frag-id} \label{proc:parse-uri}

  Parses \semvar{uri\=string} into its four possible fields, as
  described above in \emph{Overview}. These four fields are returned
  as multiple values. They are \emph{not} unescaped, because the rules
  for parsing the \semvar{path} component in particular need the raw
  (still escaped) text and depend on the \semvar{scheme}. The URL
  parser is responsible for the unescaping. If the \semvar{scheme},
  \semvar{search}, or \semvar{fragid} portions are not specified, they
  are \sharpf. Otherwise, \semvar{scheme}, \semvar{search}, and
  \semvar{fragid} are strings. \semvar{path} is a non-empty list of
  strings -- the path split at slashes.

  For those of you who are interested, here is a description of the
  parsing technique. It works inwards from both ends.
  \begin{itemize}
  \item First we search forwards for the first reserved character
    (\verb|=|, \verb|;|, \verb|/|, \verb|#|, \verb|?|, \verb|:| or
    \verb|space|). If it's a colon, then that's the \semvar{scheme}
    part, otherwise we have no \semvar{scheme} part. At all events we
    remove it.
  \item Then we search backwards from the end for the last reserved
    char.  If it's a sharp, then that's the \semvar{fragid} part --
    remove it.
  \item Then we search backwards from the end for the last reserved
    char.  If it's a question-mark, then that's the \semvar{search}
    part -- remove it.
  \item What's left is the path. We split at slashes. The empty string
    becomes a list containing the empty string.
  \end{itemize}

  This scheme is tolerant of the various ways people build broken
  URIs out there on the Net\footnote{So it does not fully conform
    to RFC~1630.}, e.g.\ \verb|=| is a reserved character, but is used
  unescaped in the search part. It was given to me\footnote{That's
    Olin Shivers.} by Dan Connolly of the W3C and slightly modified.
\end{defundesc}

\begin{defundesc}{unescape-uri} {string \ovar{start \ovar{end}}} {string}
  Unescapes a string. This procedure should only be used \emph{after}
  the URL (!) has been parsed, since unescaping may introduce characters
  that blow up the parse (that's why escape sequences are used in URIs
  ;-). Escape sequences have the form described in ``Overview''.
\end{defundesc}


%\texttt{uri-escaped-chars} \hfill
%\texttt{char-set}\index{\texttt{uri-escaped-chars}}
\defvar{uri-escaped-chars}{char-set}
\begin{desc}
  %NOTE(review): please verify against uri.scm whether the punctuation
  %characters listed below are escaped or are the exceptions that are
  %left unescaped.
  A set of characters that are escaped in URIs. These are the
  following characters: dollar (\verb|$|), minus (\verb|-|), %fool Xemacs$
  underscore (\verb|_|), at (\verb|@|), dot (\verb|.|), ampersand
  (\verb|&|), exclamation mark (\verb|!|), asterisk (\verb|*|),
  backslash (\verb|\|), double quote (\verb|"|), single quote
  (\verb|'|), open parenthesis (\verb|(|), close parenthesis
  (\verb|)|), comma (\verb|,|), plus (\verb|+|), and all other
  characters that are neither letters nor digits (such as space and
  control characters).
\end{desc}

\begin{defundesc}{escape-uri} {string \ovar{escaped-chars}} {string}
  Escapes the characters of \semvar{string} that are contained in
  \semvar{escaped\=chars}. \semvar{escaped\=chars} defaults to
  \ex{uri\=escaped\=chars}. Be careful when applying this procedure to
  chunks of text with syntactically meaningful reserved characters
  (e.g., paths with URI slashes or colons) -- they'll be escaped, and
  lose their special meaning. E.g.\ it would be a mistake to apply
  \ex{escape-uri} to
  ``\ex{//lcs.\ob{}mit.\ob{}edu:8001\ob/foo\ob/bar.html}'' because the
  sla\-shes and co\-lons would be escaped. Note that \ex{escape-uri}
  doesn't check for this, as such a check would defeat its purpose.
\end{defundesc}

\begin{defundesc}{resolve-uri} {cscheme cp scheme p} {scheme path}
\FIXME{Sorry, I can't figure out what resolve-uri is intended to do.
Perhaps I will find out later.}

  Nobody really knows what this procedure was intended to do.

  The code seems to have a bug: In the body of \ex{receive}, there's a
  loop. According to the comment, \ex{j} should count sequential
  \verb|/|. But \ex{j} counts nothing in the body. Either zero is added
  \ex{((lp (cdr cp-tail) (cons (car cp-tail) rhead) (+ j 0)))} or
  \ex{j} is set to 1 \ex{((lp (cdr cp-tail) (cons (car cp-tail) rhead)
    1))}. Nevertheless, \ex{j} is expected to reach the value
  \ex{numsl}, which can be larger than one. So what?  I am confused.
\end{defundesc}

\begin{defundesc}{split-uri-path} {uri start end} {list}
  Splits \semvar{uri} at slashes. Only the substring given with
  \semvar{start} (inclusive) and \semvar{end} (exclusive) as indices is
  considered. \semvar{start} and $\semvar{end} - 1$ have to be within
  the range of \semvar{uri}.  Otherwise an index-out-of-range exception
  will be raised. Example: \codex{(split-uri-path "foo/bar/colon" 4 11)}
  results in \codex{'("bar" "col")}
\end{defundesc}

\begin{defundesc}{uri-path-list->path} {plist} {string}
  Generates a path out of a URI path list by inserting slashes
  between the elements of \semvar{plist}. If you want to use the
  resulting string for further operations, you should escape the
  elements of \semvar{plist} in case they contain slashes. This
  procedure doesn't escape them for you; you must do that yourself, like
  \ex{(uri-path-list->path (map escape-uri pathlist))}.
\end{defundesc}

\begin{defundesc}{simplify-uri-path} {path} {list}
  Removes `\ex{.}' and `\ex{..}' entries from \semvar{path}. The result
  is a (possibly empty) list representing a path that does not contain
  any `\ex{.}' or `\ex{..}'\,. The list can only be empty if the path
  did not start with a slash (for the rare occasion someone wants to
  simplify a relative path). The result is \sharpf{} if the path tries
  to back up past root, for example by `\ex{/..}' or
  `\ex{/foo\ob/..\ob/..}' or just `\ex{..}'\,. `\ex{//}' may occur
  somewhere in the path; it refers to root, which cannot be backed up
  past.
  Examples:
%FIXME: Can't we have a better environment for examples like these?
  \begin{code}
(simplify-uri-path
 (split-uri-path  "/foo/bar/baz/.."  0 15))\end{code}
  results in
  \codex{'("" "foo" "bar")}

  \begin{code}
(simplify-uri-path
 (split-uri-path "foo/bar/baz/../../.." 0 20))\end{code}
  results in
  \codex{'()}

  \begin{code}
(simplify-uri-path
 (split-uri-path "/foo/../.." 0 10))\end{code}
  results in
  \codex{\sharpf          ; tried to back up past root}

  \begin{code}
(simplify-uri-path
 (split-uri-path "foo/bar//" 0 9))\end{code}
  results in
  \codex{'("")       ; "//" refers to root}

\begin{code}
(simplify-uri-path
 (split-uri-path "foo/bar/" 0 8))\end{code}
  results in
  \codex{'("")       ; last "/" also refers to root}

  \begin{code}
(simplify-uri-path
 (split-uri-path "/foo/bar//baz/../.." 0 19))\end{code}
  results in
  \codex{\sharpf          ; tries to back up past root}
\end{defundesc}

\subsection*{Unexported names}

\defvar{uri-reserved}{char-set}
\begin{desc}
  A set of reserved characters (equals sign, semicolon, slash, hash,
  question mark, colon and space).
\end{desc}

\begin{defundesc}{hex-digit?} {character} {boolean}
  Returns \sharpt{} if \semvar{character} is a hexadecimal digit (i.e.,
  one of 0--9, a--f, A--F), \sharpf{} otherwise.
\end{defundesc}


\begin{defundesc}{hexchar->int} {character} {number}
  Translates the given character to an integer, e.g.\
  \verb|(hexchar->int #\a)| results in 10.
\end{defundesc}

\begin{defundesc}{int->hexchar} {integer} {character}
  Translates the given integer from the range 0--15 into a hexadecimal
  character (uses uppercase letters), e.g.\ \ex{(int->hexchar 14)}
  results in `E'.
\end{defundesc}

\begin{defundesc}{rev-append} {list-a list-b} {list}
  Performs a \ex{(append (reverse list-a) list-b)}. The comment says it
  should be defined in a list package but I am wondering how often
  this will be used.
\end{defundesc}

%EOF


%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End: