diff --git a/doc/scsh-manual/Makefile b/doc/scsh-manual/Makefile new file mode 100644 index 0000000..c0ce8cb --- /dev/null +++ b/doc/scsh-manual/Makefile @@ -0,0 +1,40 @@ +.SUFFIXES: .idx .ind .tex .dvi .ps $(.SUFFIXES) + +TEX= front.tex intro.tex procnotation.tex syscalls.tex network.tex \ + strings.tex awk.tex miscprocs.tex running.tex todo.tex + +.PHONY: clean tar html install # command targets, not files; clean removes a directory named html +man.dvi: $(TEX) man.ind +man.ind: man.idx + +.dvi.ps: + dvips -o $@ $< + +.tex.dvi: + latex $< + rm $*.log + +.idx.ind: + makeindex $< + +clean: + -rm *.log + rm -rf html + +INSTALL_DATA= install -c -m 644 + +tar: + tar cf - *.tex sty | gzip > man.tar.gz + +html: + tex2page man + +install: man.ps + @echo WARNING: + @echo WARNING: this depends on /u/su/scsh/scsh + @echo WARNING: pointing to the current release + @echo WARNING: + $(INSTALL_DATA) cheat.txt /u/su/scsh/scsh/doc/ + $(INSTALL_DATA) man.ps /u/su/scsh/scsh/doc/scsh-manual.ps + $(INSTALL_DATA) $(TEX) /u/su/scsh/scsh/doc/scsh-manual/ + $(INSTALL_DATA) sty/* /u/su/scsh/scsh/doc/scsh-manual/sty/ diff --git a/doc/scsh-manual/THANKS b/doc/scsh-manual/THANKS new file mode 100644 index 0000000..6ddc496 --- /dev/null +++ b/doc/scsh-manual/THANKS @@ -0,0 +1,35 @@ +Michel.Schinz@studi.epfl.ch + Documentation error in STRING-OUTPUT-PORT-OUTPUT. + Reported 12/19. + +Victor Zandy + character-gobbling in (record-reader) caused by 'trim / 'peek + default misunderstanding in delimited readers. Fixed 4/5/96 + +Michael Becker + reap-policy = early can still lose if you loop and fork. + fork now reaps & retries if it loses and the policy is early reap. + This is a kludge until I have sigchld handlers. + Fixed 4/5/96 + +Tod Olson + Reported painfully slow delimited-reader I/O in November. + +Michel.Schinz@studi.epfl.ch + Reported some picky little typos in the manual. + +Shriram + Doc bugs in defrec.scm + +euler@lavielle.COM (Lutz Euler) 2/24/97 + Manual bugs and a bug in stdio->stdports. + +Alan Bawden 4/97 + Lots of good bug reports and fixes. 
+ +Jim Blandy 4/97 + Fixes for meta.scm + +Kevin Esler 4/97 + Updated Irix port + diff --git a/doc/scsh-manual/ack.txt b/doc/scsh-manual/ack.txt new file mode 100644 index 0000000..a4cf870 --- /dev/null +++ b/doc/scsh-manual/ack.txt @@ -0,0 +1,32 @@ +Acknowledgements + +Who should I thank? My so-called "colleagues," who laugh at me behind my +back, all the while becoming famous on *my* work? My worthless graduate +students, whose computer skills appear to be limited to downloading bitmaps +off of netnews? My parents, who are still waiting for me to quit "fooling +around with computers," go to med school, and become a radiologist? My +department chairman, a manager who gives one new insight into and sympathy for +disgruntled postal workers? + +My God, no one could blame me--no one!--if I went off the edge and just lost +it completely one day. I couldn't get through the day as it is without the +Prozac and Jack Daniels I keep on the shelf, behind my Tops-20 JSYS manuals. +I start getting the shakes real bad around 10am, right before my advisor +meetings. A 10 oz. Jack 'n Zac helps me get through the meetings without one +of my students winding up with his severed head in a bowling-ball bag. They +look at me funny; they think I twitch a lot. I'm not twitching. I'm +controlling my impulse to snag my 9mm Sig-Sauer out from my day-pack and make +a few strong points about the quality of undergraduate education in Amerika. + +If I thought anyone cared, if I thought anyone would even be reading this, I'd +probably make an effort to keep up appearances until the last possible +moment. But no one does, and no one will. So I can pretty much say exactly +what I think. + +Oh, yes, the *acknowledgements.* I think not. I did it. I did it all, +by myself. 
+ +Olin Shivers +Cambridge +September 4, 1994 + diff --git a/doc/scsh-manual/array.sty b/doc/scsh-manual/array.sty new file mode 100644 index 0000000..88b9b7d --- /dev/null +++ b/doc/scsh-manual/array.sty @@ -0,0 +1,252 @@ +%% +%% This is file `/usr2/distrib/latex209/nfss/array.sty' generated +%% on <1991/11/22> with the docstrip utility (v1.1k). +%% +%% The original source files were: +%% +%% /usr2/users/latex3/source/array/array.doc + +%% +%% Copyright (C) 1989,1990,1991 by Frank Mittelbach, Rainer Schoepf. +%% All rights reserved. +%% +%% This file is part of the NFSS (New Font Selection Scheme) package. +%% +%% IMPORTANT NOTICE: +%% +%% You are not allowed to change this file. You may however copy this file +%% to a file with a different name and then change the copy if you obey +%% the restrictions on file changes described in readme.mz. +%% +%% You are allowed to distribute this file under the condition that it is +%% distributed together with all files mentioned in readme.mz3. If you +%% receive only some of these files from someone, complain! +%% +%% You are NOT ALLOWED to distribute this file alone. You are NOT ALLOWED +%% to take money for the distribution or use of either this file or a +%% changed version, except for a nominal charge for copying etc. +%% +%% For error reports in case of UNCHANGED versions see readme files. +%% +%% Please do not request updates from us directly. Distribution is done +%% through Mail-Servers and TeX organizations. +%% + +\def\fileversion{v2.0e} +\def\filedate{91/02/07} +\def\docdate {90/08/20} + +%% \CheckSum{681} +%% \CharacterTable +%% {Upper-case \A\B\C\D\E\F\G\H\I\J\K\L\M\N\O\P\Q\R\S\T\U\V\W\X\Y\Z +%% Lower-case \a\b\c\d\e\f\g\h\i\j\k\l\m\n\o\p\q\r\s\t\u\v\w\x\y\z +%% Digits \0\1\2\3\4\5\6\7\8\9 +%% Exclamation \! Double quote \" Hash (number) \# +%% Dollar \$ Percent \% Ampersand \& +%% Acute accent \' Left paren \( Right paren \) +%% Asterisk \* Plus \+ Comma \, +%% Minus \- Point \. 
Solidus \/ +%% Colon \: Semicolon \; Less than \< +%% Equals \= Greater than \> Question mark \? +%% Commercial at \@ Left bracket \[ Backslash \\ +%% Right bracket \] Circumflex \^ Underscore \_ +%% Grave accent \` Left brace \{ Vertical bar \| +%% Right brace \} Tilde \~} +%% +\@ifundefined{d@llar}{}{\endinput} +\typeout{Style-Option: `array' \fileversion + \space\space <\filedate> (F.M.)} +\typeout{English documentation dated \space <\docdate> (F.M.)} +\def\@addtopreamble#1{\xdef\@preamble{\@preamble #1}} +\def\@testpach#1{\@chclass + \ifnum \@lastchclass=6 \@ne \@chnum \@ne \else + \ifnum \@lastchclass=7 5 \else + \ifnum \@lastchclass=8 \tw@ \else + \ifnum \@lastchclass=9 \thr@@ + \else \z@ + \ifnum \@lastchclass = 10 \else + \@chnum + \if #1c\z@ \else + \if #1l\@ne \else + \if #1r\tw@ \else + \z@ \@chclass + \if#1|\@ne \else + \if #1!6 \else + \if #1@7 \else + \if #1<8 \else + \if #1>9 \else + 10 + \@chnum + \if #1m\thr@@\else + \if #1p4 \else + \if #1b5 \else + \z@ \@chclass \z@ \@preamerr \z@ \fi \fi \fi \fi + \fi \fi \fi \fi \fi \fi \fi \fi \fi \fi \fi \fi} +\def\@xexpast#1*#2#3#4\@@{% + \@tempcnta #2 + \toks@={#1}\@temptokena={#3}% + \let\the@toksz\relax \let\the@toks\relax + \def\@tempa{\the@toksz}% + \ifnum\@tempcnta >0 \@whilenum\@tempcnta >0\do + {\edef\@tempa{\@tempa\the@toks}\advance \@tempcnta \m@ne}% + \let \@tempb \@xexpast \else + \let \@tempb \@xexnoop \fi + \def\the@toksz{\the\toks@}\def\the@toks{\the\@temptokena}% + \edef\@tempa{\@tempa}% + \expandafter \@tempb \@tempa #4\@@} +\def\prepnext@tok{\advance \count@ \@ne + \toks\count@={}} +\def\save@decl{\toks\count@ \expandafter{\@nextchar}} +\def\insert@column{% + \the@toks \the \@tempcnta + {\ignorespaces \@sharp \unskip}% + \the@toks \the \count@ \relax} +\newdimen\col@sep +\def\@acol{\@addtopreamble{\hskip\col@sep}} +\def\@mkpream#1{\gdef\@preamble{}\@lastchclass 4 \@firstamptrue + \let\@sharp\relax \let\@startpbox\relax \let\@endpbox\relax + \@xexpast #1*0x\@@ + \count@\m@ne + 
\let\the@toks\relax + \prepnext@tok + \expandafter \@tfor \expandafter \@nextchar + \expandafter :\expandafter =\@tempa \do + {\@testpach\@nextchar + \ifcase \@chclass \@classz \or \@classi \or \@classii + \or \save@decl \or \or \@classv \or \@classvi + \or \@classvii \or \@classviii \or \@classix + \or \@classx \fi + \@lastchclass\@chclass}% + \ifcase\@lastchclass + \@acol \or + \or + \@acol \or + \@preamerr \thr@@ \or + \@preamerr \tw@ \@addtopreamble\@sharp \or + \or + \else \@preamerr \@ne \fi + \def\the@toks{\the\toks}} +\def\@classx{% + \ifcase \@lastchclass + \@acolampacol \or + \@addamp \@acol \or + \@acolampacol \or + \or + \@acol \@firstampfalse \or + \@addamp + \fi} +\def\@classz{\@classx + \@tempcnta \count@ + \prepnext@tok + \@addtopreamble{\ifcase \@chnum + \hfil + \d@llar + \insert@column + \d@llar \hfil \or + \d@llar \insert@column \d@llar \hfil \or + \hfil\kern\z@ \d@llar \insert@column \d@llar \or + $\vcenter + \@startpbox{\@nextchar}\insert@column \@endpbox $\or + \vtop \@startpbox{\@nextchar}\insert@column \@endpbox \or + \vbox \@startpbox{\@nextchar}\insert@column \@endpbox + \fi}\prepnext@tok} +\def\@classix{\ifnum \@lastchclass = \thr@@ + \@preamerr \thr@@ \fi + \@classx} +\def\@classviii{\ifnum \@lastchclass >\z@ + \@preamerr 4\@chclass 6 \@classvi \fi} +\def\@arrayrule{\@addtopreamble \vline} +\def\@classvii{\ifnum \@lastchclass = \thr@@ + \@preamerr \thr@@ \fi} +\def\@classvi{\ifcase \@lastchclass + \@acol \or + \@addtopreamble{\hskip \doublerulesep}\or + \@acol \or + \@classvii + \fi} +\def\@classii{\advance \count@ \m@ne + \save@decl\prepnext@tok} +\def\@classv{\save@decl + \@addtopreamble{\d@llar\the@toks\the\count@\relax\d@llar}% + \prepnext@tok} +\def\@classi{\@classvi + \ifcase \@chnum \@arrayrule \or + \@classv \fi} +\def\@startpbox#1{\bgroup + \hsize #1 \@arrayparboxrestore + \vrule \@height \ht\@arstrutbox \@width \z@} +\def\@endpbox{\vrule \@width \z@ \@depth \dp \@arstrutbox \egroup} +\def\@array[#1]#2{% + \@tempdima \ht 
\strutbox + \advance \@tempdima by\extrarowheight + \setbox \@arstrutbox \hbox{\vrule + \@height \arraystretch \@tempdima + \@depth \arraystretch \dp \strutbox + \@width \z@}% + \begingroup + \@mkpream{#2}% + \xdef\@preamble{\ialign \@halignto + \bgroup \@arstrut \@preamble + \tabskip \z@ \cr}% + \endgroup + \if #1t\vtop \else \if#1b\vbox \else \vcenter \fi \fi + \bgroup + \let \@sharp ##\let \protect \relax + \lineskip \z@ + \baselineskip \z@ + \m@th + \let\\ \@arraycr \let\par\@empty \@preamble} +\newdimen \extrarowheight +\extrarowheight=0pt +\def\@arstrut{\unhcopy\@arstrutbox} +\def\@arraycr{{\ifnum 0=`}\fi + \@ifstar \@xarraycr \@xarraycr} +\def\@xarraycr{\@ifnextchar [% + \@argarraycr {\ifnum 0=`{\fi}\cr}} +\def\@argarraycr[#1]{\ifnum0=`{\fi}\ifdim #1>\z@ + \@xargarraycr{#1}\else \@yargarraycr{#1}\fi} +\def\@xargarraycr#1{\unskip + \@tempdima #1\advance\@tempdima \dp\@arstrutbox + \vrule \@depth\@tempdima \@width\z@ \cr} +\def\@yargarraycr#1{\cr\noalign{\vskip #1}} +\def\multicolumn#1#2#3{% + \multispan{#1}\begingroup + \def\@addamp{\if@firstamp \@firstampfalse \else + \@preamerr 5\fi}% + \@mkpream{#2}\@addtopreamble\@empty + \endgroup + \def\@sharp{#3}% + \@arstrut \@preamble \ignorespaces} +\def\array{\col@sep\arraycolsep + \def\d@llar{$}\gdef\@halignto{}% + \@tabarray} +\def\@tabarray{\@ifnextchar[{\@array}{\@array[c]}} +\def\tabular{\gdef\@halignto{}\@tabular} +\expandafter\def\csname tabular*\endcsname#1{% + \gdef\@halignto{to#1}\@tabular} +\def\@tabular{% + \leavevmode + \hbox \bgroup $\col@sep\tabcolsep \let\d@llar\@empty + \@tabarray} +\def\endarray{\crcr \egroup \egroup \gdef\@preamble{}} +\def\endtabular{\endarray $\egroup} +\expandafter\let\csname endtabular*\endcsname=\endtabular +\let\@ampacol=\relax \let\@expast=\relax +\let\@arrayclassiv=\relax \let\@arrayclassz=\relax +\let\@tabclassiv=\relax \let\@tabclassz=\relax +\let\@arrayacol=\relax \let\@tabacol=\relax +\let\@tabularcr=\relax \let\@@endpbox=\relax +\let\@argtabularcr=\relax 
\let\@xtabularcr=\relax +\def\@preamerr#1{\def\@tempd{{..} at wrong position: }% + \@latexerr{% + \ifcase #1 Illegal pream-token (\@nextchar): `c' used\or %0 + Missing arg: token ignored\or %1 + Empty preamble: `l' used\or %2 + >\@tempd token ignored\or %3 + <\@tempd changed to !{..}\or %4 + Only one colum-spec. allowed.\fi}\@ehc} %5 +\def\@tfor#1:=#2\do#3{\def\@fortmp{#2}\ifx\@fortmp\@empty + \else\@tforloop#2\@nil\@nil\@@#1{#3}\fi} +\endinput +%% +%% End of file `/usr2/distrib/latex209/nfss/array.sty'. diff --git a/doc/scsh-manual/awk.tex b/doc/scsh-manual/awk.tex new file mode 100644 index 0000000..e8ec358 --- /dev/null +++ b/doc/scsh-manual/awk.tex @@ -0,0 +1,672 @@ +%&latex -*- latex -*- + +\chapter{Awk, record I/O, and field parsing} +\label{chapt:fr-awk} + +{\Unix} programs frequently process streams of records, +where each record is delimited by a newline, +and records are broken into fields with other delimiters +(for example, the colon character in \ex{/etc/passwd}). +Scsh has procedures that allow the programmer to easily +do this kind of processing. +Scsh's field parsers can also be used to parse other kinds +of delimited strings, such as colon-separated \verb|$PATH| lists. +These routines can be used with scsh's \ex{awk} loop construct +to conveniently perform pattern-directed computation over streams +of records. + + +\section{Record I/O and field parsing} +\label{sec:field-reader} + +The procedures in this section are used to read records from +I/O streams and parse them into fields. +A record is defined as text terminated by some delimiter (usually a newline). +A record can be split into fields by using regular expressions in +one of several ways: to \emph{match} fields, to \emph{separate} fields, +or to \emph{terminate} fields. +The field parsers can be applied to arbitrary strings (one common use is +splitting environment variables such as \ex{\$PATH} at colons into its +component elements). 
+ +The general delimited-input procedures described in +chapter~\ref{chapt:rdelim} are also useful for reading simple records, +such as single lines, paragraphs of text, or strings terminated by specific +characters. + +\subsection{Reading records} + +\defun{record-reader} {[delims elide-delims? handle-delim]} {\proc} +\begin{desc} + Returns a procedure that reads records from a port. The + procedure is invoked as follows: + % + \codex{(\var{reader} \var{[port]}) $\longrightarrow$ + \textrm{\textit{{\str} or eof}}} + % + A record is a sequence of characters terminated by one of the characters + in \var{delims} or eof. If \var{elide-delims?} is true, then a contiguous + sequence of delimiter chars are taken as a single record delimiter. If + \var{elide-delims?} is false, then a delimiter char coming immediately + after a delimiter char produces an empty-string record. The reader + consumes the delimiting char(s) before returning from a read. + + The \var{delims} set defaults to the set $\{\mbox{newline}\}$. + It may be a charset, string, character, or character predicate, + and is coerced to a charset. + The \var{elide-delims?} flag defaults to \ex{\#f}. + + The \var{handle-delim} argument controls what is done with the record's + terminating delimiter. + \begin{inset} + \begin{tabular}{lp{0.6\linewidth}} + \ex{'trim} & Delimiters are trimmed. (The default)\\ + \ex{'split}& Reader returns delimiter string as a second argument. + If record is terminated by EOF, then the eof object is + returned as this second argument. \\ + \ex{'concat} & The record and its delimiter are returned as + a single string. + \end{tabular} + \end{inset} + + The reader procedure returned takes one optional argument, the port + from which to read, which defaults to the current input port. It returns + a string or eof. 
+\end{desc} + + +\subsection{Parsing fields} +\label{sec:field-splitter} + +\defun {field-splitter} {[field num-fields]} \proc +\defunx {infix-splitter} {[delim num-fields handle-delim]} \proc +\defunx {suffix-splitter} {[delim num-fields handle-delim]} \proc +\defunx {sloppy-suffix-splitter} {[delim num-fields handle-delim]} \proc +\begin{desc} +These functions return a parser function that can be used as follows: +\codex{(\var{parser} \var{string} \var{[start]}) $\longrightarrow$ + \var{string-list}} + + The returned parsers split strings into fields defined + by regular expressions. You can parse by specifying a pattern that + \emph{separates} fields, a pattern that \emph{terminates} fields, or + a pattern that \emph{matches} fields: + \begin{inset} + \begin{tabular}{l@{\qquad}l} + Procedure & Pattern \\ \hline + \ex{field-splitter} & matches fields \\ + \ex{infix-splitter} & separates fields \\ + \ex{suffix-splitter}& terminates fields \\ + \ex{sloppy-suffix-splitter} & terminates fields + \end{tabular} + \end{inset} + + These parser generators are controlled by a range of options, so that you + can precisely specify what kind of parsing you want. However, these + options default to reasonable values for general use. + + Defaults: +\begin{tightinset} +\begin{tabular}{l@{\quad=\quad }ll} + \var{delim} & \ex{(rx (| (+ white) eos))} & (suffix delimiter: white space or eos) \\ + \multicolumn{1}{l}{} & \ex{(rx (+ white))} & (infix delimiter: white space) \\ + + \var{field} & \verb|(rx (+ (~ white)))| & (non-white-space) \\ + + \var{num-fields} & \verb|#f| & (as many fields as possible) \\ + + \var{handle-delim} & \verb|'trim| & (discard delimiter chars) +\end{tabular} +\end{tightinset} + {\ldots}which means: break the string at white space, discarding the + white space, and parse as many fields as possible. + + The \var{delim} parameter is a regular expression matching the text + that occurs between fields. 
+ See chapter~\ref{chapt:sre} for information on regular expressions, + and the \ex{rx} form used to specify them. + In the separator case, + it defaults to a pattern matching white space; + in the terminator case, + it defaults to white space or end-of-string. + + The \var{field} parameter is a regular expression used + to match fields. It defaults to non-white-space. + + The \var{delim} patterns may also be given as a string, + character, or char-set, which are coerced to regular expressions. + So the following expressions are all equivalent, + each producing a function that splits strings apart at colons: +\begin{inset} +\begin{verbatim} +(infix-splitter (rx ":")) +(infix-splitter ":") +(infix-splitter #\:) +(infix-splitter (char-set #\:))\end{verbatim} +\end{inset} + + The \var{handle-delim} argument determines what to do with delimiters. + \begin{tightinset}\begin{tabular}{ll} + \ex{'trim} & Delimiters are thrown away after parsing. (default) \\ + \ex{'concat} & Delimiters are appended to the field preceding them. \\ + \ex{'split} & Delimiters are returned as separate elements in + the field list. + \end{tabular} + \end{tightinset} + + The \var{num-fields} argument used to create the parser specifies how many + fields to parse. If \ex{\#f} (the default), the procedure parses them all. + If a positive integer $n$, exactly that many fields are parsed; it is an + error if there are more or fewer than $n$ fields in the record. If + \var{num-fields} is a negative integer or zero, then $|n|$ fields + are parsed, and the remainder of the string is returned in the last + element of the field list; it is an error if fewer than $|n|$ fields + can be parsed. + + The field parser produced is a procedure that can be employed as + follows: + \codex{(\var{parse} \var{string} \var{[start]}) \evalto \var{string-list}} + The optional \var{start} argument (default 0) specifies where in the string + to begin the parse. 
It is an error if + $\var{start} > \ex{(string-length \var{string})}$. + + The parsers returned by the four parser generators implement different + kinds of field parsing: + \begin{description} + \item[\ex{field-splitter}] + The regular expression specifies the actual field. + + + \item[\ex{suffix-splitter}] + Delimiters are interpreted as element \emph{terminators}. + If vertical-bar is the delimiter, then the string \ex{""} + is the empty record \ex{()}, \ex{"foo|"} produces a one-field record + \ex{("foo")}, and \ex{"foo"} is an error. + + The syntax of suffix-delimited records is: + \begin{inset} + \begin{tabular}{lcll} + \synvar{record} & ::= & \ex{""} \qquad (Empty record) \\ + & $|$ & \synvar{element} \synvar{delim} + \synvar{record} + \end{tabular} + \end{inset} + + It is an error if a non-empty record does not end with a delimiter. + To make the last delimiter optional, make sure the delimiter regexp + matches the end-of-string (sre \ex{eos}). + + \item [\ex{infix-splitter}] + Delimiters are interpreted as element \emph{separators}. If comma is the + delimiter, then the string \ex{"foo,"} produces a two-field + record \ex{("foo" "")}. + + The syntax of infix-delimited records is: + \begin{inset} + \begin{tabular}{lcll} + \synvar{record} & ::= & \ex{""} \qquad (Forced to be empty record) \\ + & $|$ & \synvar{real-infix-record} \\ + \\ + \synvar{real-infix-record} & ::= & \synvar{element} \synvar{delim} + \synvar{real-infix-record} \\ + & $|$ & \synvar{element} + \end{tabular} + \end{inset} + + Note that separator semantics doesn't really allow for empty + records---the straightforward grammar (\ie, \synvar{real-infix-record}) + parses an empty string as a singleton list whose one field is the empty + string, \ex{("")}, not as the empty record \ex{()}. This is unfortunate, + since it means that infix string parsing doesn't make \ex{string-append} + and \ex{append} isomorphic. 
For example, + \codex{((infix-splitter ":") (string-append \var{x} ":" \var{y}))} + doesn't always equal +\begin{code} +(append ((infix-splitter ":") \var{x}) + ((infix-splitter ":") \var{y}))\end{code} + It fails when \var{x} or \var{y} are the empty string. + Terminator semantics \emph{does} preserve a similar isomorphism. + + However, separator semantics is frequently what other Unix software + uses, so to parse their strings, we need to use it. For example, + Unix \verb|$PATH| lists have separator semantics. The path list + \ex{"/bin:"} is broken up into \ex{("/bin" "")}, not \ex{("/bin")}. + Comma-separated lists should also be parsed this way. + + \item[\ex{sloppy-suffix-splitter}] + The same as the \ex{suffix-splitter} case, except that the parser will skip an + initial delimiter string if the string begins with one instead of parsing + an initial empty field. This can be used, for example, to field-split a + sequence of English text at white-space boundaries, where the string may + begin or end with white space, by using regex +\begin{code}{(rx (| (+ white) eos))}\end{code} + (But you would be better off using \ex{field-splitter} in this case.) + \end{description} + \end{desc} + + Figure~\ref{fig:splitters} shows how the different parser grammars + split apart the same strings. +% +\begin{boxedfigure}{tbp} +\begin{center}\small +\begin{tabular}{lllll} +Record & : suffix & \verb!:|$! 
suffix & : infix & non-: field \\ +\hline +\ex{""} & \ex{()} & \ex{()} & \ex{()} & \ex{()} \\ +\ex{":"} & \ex{("")} & \ex{("")} & \ex{("" "")} & \ex{()} \\ +\ex{"foo:"} & \ex{("foo")} & \ex{("foo")} & \ex{("foo" "")} & \ex{("foo")} \\ +\ex{":foo"}& \emph{error} & \ex{("" "foo")}& \ex{("" "foo")}& \ex{("foo")} \\ +\ex{"foo:bar"} & \emph{error} & \ex{("foo" "bar")} & \ex{("foo" "bar")} & \ex{("foo" "bar")} +\end{tabular} +\end{center} +\caption{Using different grammars to split records into fields.} +\label{fig:splitters} +\end{boxedfigure} +% + Having to choose between the different grammars requires you to decide + what you want, but at least you can be precise about what you are parsing. + Take fifteen seconds and think it out. Say what you mean; mean what you + say. + + +\defun{join-strings} {string-list [delimiter grammar]} \str +\begin{desc} + This procedure is a simple unparser---it pastes strings together using + the delimiter string. + + The \var{grammar} argument is one of the symbols \ex{infix} (the default) + or \ex{suffix}; it determines whether the + delimiter string is used as a separator or as a terminator. + + The delimiter is the string used to delimit elements; it defaults to + a single space \ex{" "}. + + Example: +\begin{code} +(join-strings '("foo" "bar" "baz") ":") +\qquad{\evalto} "foo:bar:baz"\end{code} +\end{desc} + +\subsection{Field readers} + +\defun{field-reader} {[field-parser rec-reader]} \proc +\begin{desc} + This utility returns a procedure that reads records with field structure + from a port. + The reader's interface is designed to make it useful in the \ex{awk} + loop macro (section~\ref{sec:awk}). + The reader is used as follows: + \codex{(\var{reader} \var{[port]}) {\evalto} \var{[raw-record parsed-record]} or \var{[eof ()]}} + + When the reader is applied to an input port (default: the current + input port), it reads a record using \var{rec-reader}. If this record isn't + the eof object, it is parsed with \var{field-parser}. 
These two + values---the record, and its parsed representation---are returned + as multiple values from the reader. + + When called at eof, the reader returns [eof-object \ex{()}]. + + Although the record reader typically returns a string, and + the field-parser typically takes a string argument, this is not + required. The record reader can produce, and the field-parser consume, + values of any type. However, the empty list returned as the + parsed value on eof is hardwired into the field reader. + + For example, if port \ex{p} is open on \ex{/etc/passwd}, then + \codex{((field-reader (infix-splitter ":" 7)) p)} + returns two values: +{\small +\begin{widecode} +"dalbertz:mx3Uaqq0:107:22:David Albertz:/users/dalbertz:/bin/csh" +("dalbertz" "mx3Uaqq0" "107" "22" "David Albertz" "/users/dalbertz" + "/bin/csh")\end{widecode}} + The \var{field-parser} defaults to the value of \ex{(field-splitter)}, + a parser that picks out sequences of non-white-space strings. + + The \var{rec-reader} defaults to \ex{read-line}. + + Figure~\ref{fig:field-readers} shows \ex{field-reader} being + used to read different kinds of Unix records. + +\begin{boxedfigure}{tbhp} +\begin{centercode} +;;; /etc/passwd reader +(field-reader (infix-splitter ":" 7)) + ; wandy:3xuncWdpKhR.:73:22:Wandy Saetan:/usr/wandy:/bin/csh + +;;; Two ls -l output readers +(field-reader (infix-splitter (rx (+ white)) 8)) +(field-reader (infix-splitter (rx (+ white)) -7)) + ; -rw-r--r-- 1 shivers 22880 Sep 24 12:45 scsh.scm + +;;; Internet hostname reader +(field-reader (field-splitter (rx (+ (~ "."))))) + ; stat.sinica.edu.tw + +;;; Internet IP address reader +(field-reader (field-splitter (rx (+ (~ "."))) 4)) + ; 18.24.0.241 + +;;; Line of integers +(let ((parser (field-splitter (rx (? ("+-")) (+ digit))))) + (field-reader (\l{s} (map string->number (parser s))))) + ; 18 24 0 241 + +;;; Same as above. +(let ((reader (field-reader (field-splitter (rx (? 
("+-")) + (+ digit)))))) + (\lx{maybe-port} (map string->number (apply reader maybe-port)))) + ; Yale beat harvard 26 to 7.\end{centercode} +\caption{Some examples of \protect\ex{field-reader}} +\label{fig:field-readers} +\end{boxedfigure} + +\end{desc} + + +\subsection{Forward-progress guarantees and empty-string matches} +A loop that pulls text off a string by repeatedly matching a regexp +against that string can conceivably get stuck in an infinite loop if +the regexp matches the empty string. For example, the SREs +\ex{bos}, \ex{eos}, \ex{(* any)}, and \ex{(| "foo" (* (~ "f")))} +can all match the empty string. + +The routines in this package that iterate through strings with regular +expressions are careful to handle this empty-string case. +If a regexp matches the empty string, the next search starts, not from +the end of the match (which in the empty string case is also the +beginning---that's the problem), but from the next character over. +This is the correct behaviour. Regexps match the longest possible +string at a given location, so if the regexp matched the empty string +at location $i$, then it is guaranteed it could not have matched +a longer pattern starting with character $i$. So we can safely begin +our search for the next match at char $i+1$. + +With this provision, every iteration through the loop makes some forward +progress, and the loop is guaranteed to terminate. + +This has the effect you want with field parsing. For example, if you split +a string with the empty pattern, you will explode the string into its +individual characters: + \codex{((suffix-splitter (rx)) "foo") {\evalto} ("" "f" "o" "o")} +However, even though this boundary case is handled correctly, we don't +recommend using it. 
Say what you mean---just use a field splitter: + \codex{((field-splitter (rx any)) "foo") {\evalto} ("f" "o" "o")} +Or, more efficiently, + \codex{((\l{s} (map string (string->list s))) "foo")} + + +\subsection{Reader limitations} +Since all of the readers in this package require the ability to peek +ahead one char in the input stream, they cannot be applied to raw +integer file descriptors, only Scheme input ports. This is because +Unix doesn't support peeking ahead into input streams. + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Awk} +\label{sec:awk} + +Scsh provides a loop macro and a set of field parsers that can +be used to perform text processing very similar to the Awk programming +language. +The basic functionality of Awk is factored in scsh into its component +parts. +The control structure is provided by the \ex{awk} loop macro; +the text I/O and parsers are provided by the field-reader subroutine library +(section~\ref{sec:field-reader}). +This factoring allows the programmer to compose the basic loop structure +with any parser or input mechanism at all. +If the parsers provided by the field-reader package are insufficient, +the programmer can write a custom parser in Scheme and use it with +equal ease in the awk framework. + +Awk-in-scheme is given by a loop macro called \ex{awk}. It looks like +this: +\begin{code}\cdmath +(awk \synvar{next-record} \synvar{record\&field-vars} + {\rm[\synvar{counter}]} \synvar{state-var-decls} + \synvar{clause$_1$} \ldots)\index{awk}\end{code} + +The body of the loop is a series of clauses, each one representing +a kind of condition/action pair. The loop repeatedly reads a record, +and then executes each clause whose condition is satisfied by the record. 
+ +Here's an example that reads lines from port \ex{p} +and prints the line number and line of every line containing the +string ``\ex{Church-Rosser}'': +\begin{code} +(awk (read-line p) (ln) lineno () + ("Church-Rosser" (format #t "~d: ~s~%" lineno ln)))\end{code} +This example has just one clause in the loop body, the one that +tests for matches against the regular expression ``\ex{Church-Rosser}''. + +The \synvar{next-record} form is an expression that is evaluated each time +through the loop to produce a record to process. +This expression can return multiple values; +these values are bound to the variables given in the +\synvar{record\&field-vars} list of variables. +The first value returned is assumed to be the record; +when it is the end-of-file object, the loop terminates. + +For example, let's suppose we want to read items from \ex{/etc/passwd}, +and we use the \ex{field-reader} procedure to define a record parser for +\ex{/etc/passwd} entries: + \codex{(define read-passwd (field-reader (infix-splitter ":" 7)))} +binds \ex{read-passwd} to a procedure that reads in a line of text when +it is called, and splits the text at colons. It returns two values: +the entire line read, and a seven-element list of the split-out fields. +(See section~\ref{sec:field-reader} for more on \ex{field-reader} and +\ex{infix-splitter}.) + +So if the \synvar{next-record} form in an \ex{awk} expression is +\ex{(read-passwd)}, then \synvar{record\&field-vars} must be a list of +two variables, \eg, + \codex{(record field-vec)} +since \ex{read-passwd} returns two values. + +Note that \ex{awk} allows us to use \emph{any} record reader we want in the +loop, returning whatever number of values we like. These values +don't have to be strings or string lists. The only requirement +is that the record reader return the eof object as its first value +when the loop should terminate. + +The \ex{awk} loop allows the programmer to have loop variables. 
These are +declared and initialised by the \synvar{state-var-decls} form, a + \codex{((\var{var} \var{init-exp}) (\var{var} \var{init-exp}) \ldots)} +list rather like the \ex{let} form. Whenever a clause in the loop body +executes, it evaluates to as many values as there are state variables, +updating them. + +The optional \synvar{counter} variable is an iteration counter. +It is bound to 0 when the loop starts. +The counter is incremented each time a non-eof record is read. + +There are several kinds of loop clause. When evaluating the body of the +loop, \ex{awk} evaluates \emph{all} the clauses sequentially. +Unlike \ex{cond}, it does not stop after the first clause is satisfied; +it checks them all. + +\begin{itemize} + + \itum{\ex{(\var{test} \vari{body}1 \vari{body}2 \ldots)}} + If \var{test} is true, execute the body forms. The last body form + is the value of the clause. The test and body forms are evaluated + in the scope of the record and state variables. + + The \var{test} form can be one of: + \begin{inset} + \begin{tabular}{lp{0.6\linewidth}} + \var{integer}: & The test is true for that iteration of the loop. + The first iteration is \#1. \\ + + \var{sre}: & A regular expression, in SRE notation + (see chapter~\ref{chapt:sre}) can be used as + a test. The test is successful if the pattern + matches the record. + In particular, note that any string is an SRE. \\ + + \ex{(when \var{expr})}: & + The body of a \ex{when} test is evaluated as a + Scheme boolean expression in the inner scope of the + \ex{awk} form. \\ + + \var{expr}: & If the form is none of the above, it is treated as + a Scheme expression---in practice, the \ex{when} + keyword is only needed in cases where SRE/Scheme + expression ambiguity might occur. 
+ \end{tabular} + \end{inset} + + + \itum{\begin{tabular}[t]{l} +\ex{(range\ \ \ \var{start-test} \var{stop-test} \vari{body}1 \ldots)} \\ +\ex{(:range\ \ \var{start-test} \var{stop-test} \vari{body}1 \ldots)} \\ +\ex{(range:\ \ \var{start-test} \var{stop-test} \vari{body}1 \ldots)} \\ +\ex{(:range:\ \var{start-test} \var{stop-test} \vari{body}1 \ldots)} + \end{tabular}} +% + These clauses become activated when \var{start-test} is true; + they stay active on all further iterations until \var{stop-test} + is true. + + So, to print out the first ten lines of a file, we use the clause: + \codex{(:range: 1 10 (display record))} + + The colons control whether or not the start and stop lines + are processed by the clause. For example: + \begin{inset}\begin{tabular}{l@{\qquad}l} + \ex{(range\ \ \ 1 5\ \ \ldots)} & Lines \phantom{1} 2 3 4 \\ + \ex{(:range\ \ 1 5\ \ \ldots)} & Lines 1 2 3 4 \\ + \ex{(range:\ \ 1 5\ \ \ldots)} & Lines \phantom{1} 2 3 4 5 \\ + \ex{(:range: 1 5\ \ \ldots)} & Lines 1 2 3 4 5 + \end{tabular} + \end{inset} + + A line can trigger both tests, either simultaneously starting and + stopping an active region, or simultaneously stopping one and starting + a new one, so ranges can abut seamlessly. + + \itum{\ex{(else \vari{body}1 \vari{body}2 \ldots)}} + If no other clause has executed since the top of the loop, or + since the last \ex{else} clause, this clause executes. + + \itum{\ex{(\var{test} => \var{exp})}} + If evaluating \ex{test} produces a true value, + apply \var{exp} to that value. + If \var{test} is a regular expression, then \var{exp} is applied + to the match data structure returned by the regexp match routine. + + \itum{\ex{(after \vari{body}1 \ldots)}} + This clause executes when the loop encounters EOF. The body forms + execute in the scope of the state vars and the record-count var, + if there are any. The value of the last body form is the value + of the entire awk form. 
+ + If there is no \ex{after} clause, \ex{awk} returns the loop's state + variables as multiple values. +\end{itemize} + +\subsection{Examples} +Here are some examples of \ex{awk} being used to process various types +of input stream. + +\begin{code} +(define $ nth) ; Saves typing. + +;;; Print out the name and home-directory of everyone in /etc/passwd: +(let ((read-passwd (field-reader (infix-splitter ":" 7)))) + (call-with-input-file "/etc/passwd" + (lambda (port) + (awk (read-passwd port) (record fields) () + (#t (format #t "~a's home directory is ~a~%" + ($ fields 0) + ($ fields 5)))))))\end{code} + +\begin{code} +;;; Print out the user-name and home-directory of everyone whose +;;; name begins with "S" +(let ((read-passwd (field-reader (infix-splitter ":" 7)))) + (call-with-input-file "/etc/passwd" + (lambda (port) + (awk (read-passwd port) (record fields) () + ((: bos "S") + (format #t "~a's home directory is ~a~%" + ($ fields 0) + ($ fields 5)))))))\end{code} + +\begin{code} +;;; Read a series of integers from stdin. This expression evaluates +;;; to the number of positive numbers that were read. Note our +;;; "record-reader" is the standard Scheme READ procedure. +(awk (read) (i) ((npos 0)) + ((> i 0) (+ npos 1)))\end{code} + +\begin{code} +;;; Filter -- pass only lines containing my name. +(awk (read-line) (line) () + ("Olin" (display line) (newline)))\end{code} + +\begin{code} +;;; Count the number of non-comment lines of code in my Scheme source. +(awk (read-line) (line) ((nlines 0)) + ((: bos (* white) ";") nlines) ; A comment line. + (else (+ nlines 1))) ; Not a comment line.\end{code} + +\begin{code} +;;; Read numbers, counting the evens and odds. +(awk (read) (val) ((evens 0) (odds 0)) + ((> val 0) (display "pos ") (values evens odds)) ; Tell me about + ((< val 0) (display "neg ") (values evens odds)) ; sign, too. + (else (display "zero ") (values evens odds)) + + ((even? 
val) (values (+ evens 1) odds)) + (else (values evens (+ odds 1))))\end{code} + +\begin{code} +;;; Determine the max length of all the lines in the file. +(awk (read-line) (line) ((max-len 0)) + (#t (max max-len (string-length line))))\end{code} + +\begin{code} +;;; (This could also be done with PORT-FOLD:) +(port-fold (current-input-port) read-line + (lambda (line maxlen) (max (string-length line) maxlen)) + 0)\end{code} + +\begin{code} +;;; Print every line longer than 80 chars. +;;; Prefix each line with its line #. +(awk (read-line) (line) lineno () + ((> (string-length line) 80) + (format #t "~d: ~s~%" lineno line)))\end{code} + +\begin{code} +;;; Strip blank lines from input. +(awk (read-line) (line) () + ((~ white) (display line) (newline)))\end{code} + +\begin{code} +;;; Sort the entries in /etc/passwd by login name. +(for-each (lambda (entry) (display (cdr entry)) (newline)) ; Out + (sort (lambda (x y) (string] option, then the following newline will +% be read *after* ^M is bound to \cr, so we're cool. If there isn't +% an option given (i.e., default to [c]), then the @\ifnextchar will +% gobble up the newline as it gobbles whitespace. So we insert the +% \cr explicitly. Isn't TeX fun? +\def\codebox{\leavevmode\@ifnextchar[{\@codebox}{\@codebox[c]\cr}} %] + +\def\@codebox[#1]% + {\hbox\bgroup$\if #1t\vtop \else \if#1b\vbox \else \vcenter \fi\fi\bgroup% + \tabskip\z@\setupcode\cd@obeycr% just before cd@obey + \halign\bgroup##\hfil\span} + +\def\endcodebox{\crcr\egroup\egroup\m@th$\egroup} + +% Center the box on the page: +\newenvironment{centercode}% + {\begin{center}\begin{codebox}[c]}% + {\end{codebox}\end{center}} + + +%% code, codeaux, tightcode +%%============================================================================= +%% Code environment as described above. Lines are kept on one page. +%% This actually works by setting a huge penalty for breaking +%% between lines of code. Code is indented same as other displayed paras. 
+%% Note: to increase left margin, use \begin{codeaux}{\leftmargin=1in}. + +% To allow pagebreaks, say \codeallowbreaks immediately inside the env. +% You can allow breaks at specific lines with a \pagebreak form. + +%% N.B.: The \global\@ignoretrue command must be performed just inside +%% the *last* \end{...} before the following text. If not, you will +%% get an extra space on the following line. Blech. + +%% This environment takes two arguments. +%% The second, required argument is the \list parameters to override the +%% \@listi... defaults. +%% - Usefully set by clients: \topsep \leftmargin +%% - Possible, but less useful: \partopsep +%% The first, optional argument is the extra \parskip glue that you get around +%% \list environments. It defaults to the value of \parskip. +\def\codeaux{\@ifnextchar[{\@codeaux}{\@codeaux[\parskip]}} %] +\def\@codeaux[#1]#2{% + \bgroup\parskip#1% + \begin{list}{}% + {\parsep\z@\rightskip\z@\listparindent\z@\itemindent\z@#2}% + \item[]\setupcode\cd@obeylines}% +\def\endcodeaux{\end{list}\leavevmode\egroup\ignorespaces\global\@ignoretrue} + +%% Code env is codeaux with the default margin and spacing \list params: +\def\code{\codeaux{}} \let\endcode=\endcodeaux + +%% Like code, but with no extra vertical space above and below. +\def\tightcode{\codeaux[=0pt]{\topsep\z@}}% +\let\endtightcode\endcodeaux +% {\vspace{-1\parskip}\begin{codeaux}{\partopsep\z@\topsep\z@}}% +% {\end{codeaux}\vspace{-1\parskip}} + + + +% Reasonable separation between lines of code +\newcommand{\codeskip}{\penalty0\vspace{2ex}} + + +% \cd is used to build a code environment in the middle of text. +% Note: only difference from display code is that cr's are taken +% as unbreakable spaces instead of linebreaks. 
+ +\def\cd{\leavevmode\begingroup\ifmmode\let\startcode=\startmcode\else% + \let\startcode\starttcode\fi% + \setupcode\cd@obeycrsp\startcode} + +\def\starttcode#1{#1\endgroup} +\def\startmcode#1{\hbox{#1}\endgroup} + + +% Restore $&#^_~% to their normal catcodes +% Define \^ to give the ^ char. +% \dcd points to this guy inside a code env. +\def\cd@dcd{\catcode`\$=3\catcode`\&=4\catcode`\#=6\catcode`\^=7% + \catcode`\_=8\catcode`\~=13\catcode`\%=14\def\^{\char`\^}} + +% Selectively enable $, and $^_ as special. +% \cd@mathspecial also defines \^ to give the ^ char. +% \cddollar and \cdmath point to these guys inside a code env. +\def\cd@dollarspecial{\catcode`\$=3} +\def\cd@mathspecial{\catcode`\$=3\catcode`\^=7\catcode`\_=8% + \def\^{\char`\^}} + + +% Change log: +% Started off as some macros found in C. Rich's library. +% Olin 1/90: +% Removed \makeatletter, \makeatother's -- they shouldn't be there, +% because style option files are read with makeatletter. The terminal +% makeatother screwed things up for the following style options. +% Olin 3/91: +% Rewritten. +% - Changed things so blank lines don't get compressed out (the \leavevmode +% in \cd@cr and \cd@crwb). +% - Changed names to somewhat less horrible choices. +% - Added lots of doc, so casual hackers can more easily mess with all this. +% - Removed `'"@ from the set of hacked chars, since they are already +% non-special. +% - Removed the bigcode env, which effect can be had with the \codeallowbreaks +% command. +% - Removed the \@noligs command, since it's already defined in latex.tex. +% - Win big with the new \dcd, \cddollar, and \cdmath commands. +% - Now, *only* the chars \{} are special inside the code env. If you need +% more, use the \dcd command inside a group. +% - \cd now works inside math mode. (But if you use it in a superscript, +% it still comes out full size. You must explicitly put a \scriptsize\tt +% inside the \cd: $x^{\cd{\scriptsize\tt...}}$.
A \leavevmode was added +% so that if you begin a paragraph with a \cd{...}, TeX realises you +% are starting a paragraph. +% - Added the codebox env. Tricky bit involving the first line hacked +% with help from David Long. +% Olin 8/94 +% Changed the font commands for LaTeX2e. diff --git a/doc/scsh-manual/css.t2p b/doc/scsh-manual/css.t2p new file mode 100644 index 0000000..7c1fcee --- /dev/null +++ b/doc/scsh-manual/css.t2p @@ -0,0 +1,105 @@ +% css.t2p +% Dorai Sitaram +% 19 Jan 2001 +% A basic style for HTML documents generated +% with tex2page. + +\cssblock + +body { + color: black; + background-color: #e5e5e5; +/*background-color: beige;*/ + margin-top: 2em; + margin-left: 8%; + margin-right: 8%; +} + +h1,h2,h3,h4,h5,h6 { + margin-top: .5em; +} + +.partheading { + font-size: 70%; +} + +.chapterheading { + font-size: 70%; +} + +pre { + margin-left: 2em; +} + +ol { + list-style-type: decimal; +} + +ol ol { + list-style-type: lower-alpha; +} + +ol ol ol { + list-style-type: lower-roman; +} + +ol ol ol ol { + list-style-type: upper-alpha; +} + +.scheme { + color: brown; +} + +.scheme .keyword { + color: #990000; + font-weight: bold; +} + +.scheme .builtin { + color: #990000; +} + +.scheme .variable { + color: navy; +} + +.scheme .global { + color: purple; +} + +.scheme .selfeval { + color: green; +} + +.scheme .comment { + color: teal; +} + +.navigation { + color: red; + text-align: right; + font-style: italic; +} + +.disable { + /* color: #e5e5e5; */ +color: gray; +} + +.smallcaps { +font-size: 75%; +} + +.smallprint { + color: gray; + font-size: 75%; + text-align: right; +} + +.smallprint hr { + text-align: left; + width: 40%; +} + +\endcssblock \ No newline at end of file diff --git a/doc/scsh-manual/ct.sty b/doc/scsh-manual/ct.sty new file mode 100644 index 0000000..1edfbc0 --- /dev/null +++ b/doc/scsh-manual/ct.sty @@ -0,0 +1,6 @@ +% Loads cmtt fonts in on \tt. -*- latex -*- +% I prefer these to the Courier fonts that latex gives you w/postscript styles. 
+% Courier is too spidery and too wide -- it's hard to get 80 chars on a line. +% -Olin + +\renewcommand{\ttdefault}{cmtt} diff --git a/doc/scsh-manual/decls.tex b/doc/scsh-manual/decls.tex new file mode 100644 index 0000000..f927101 --- /dev/null +++ b/doc/scsh-manual/decls.tex @@ -0,0 +1,273 @@ +\makeatletter +\def\ie{\mbox{\emph{i.e.}}} % \mbox keeps the last period from +\def\Ie{\mbox{\emph{I.e.}}} % looking like an end-of-sentence. +\def\eg{\mbox{\emph{e.g.}}} +\def\Eg{\mbox{\emph{E.g.}}} +\def\etc{{\em etc.}} + +\def\Lisp{\textsc{Lisp}} +\def\CommonLisp{\textsc{Common Lisp}} +\def\Ascii{\textsc{Ascii}} +\def\Ansi{\textsc{Ansi}} +\def\Unix{{Unix}} % Not smallcaps, according to Bart. +\def\Scheme{{Scheme}} +\def\scm{{Scheme 48}} +\def\R4RS{R4RS} +\def\Posix{\textsc{Posix}} + +\def\sharpf{\normalfont\texttt{\#f}} +\def\sharpt{\normalfont\texttt{\#t}} +\newcommand{\synteq}{\textnormal{::=}} + +\def\maketildeother{\catcode`\~=12} +\def\maketildeactive{\catcode`\~=13} +\def\~{\char`\~} + +\newcommand{\evalsto}{\ensuremath{\Rightarrow}} + +% One-line code examples +%\newcommand{\codex}[1]% One line, centred. Tight spacing. +% {$$\abovedisplayskip=.75ex plus 1ex minus .5ex% +% \belowdisplayskip=\abovedisplayskip% +% \abovedisplayshortskip=0ex plus .5ex% +% \belowdisplayshortskip=\abovedisplayshortskip% +% \hbox{\ttt #1}$$} +%\newcommand{\codex}[1]{\begin{tightinset}\ex{#1}\end{tightinset}\ignorespaces} +\newcommand{\codex}[1]{\begin{leftinset}\ex{#1}\end{leftinset}\ignorespaces} + +\def\widecode{\codeaux{\leftmargin=0pt\topsep=0pt}} +\def\endwidecode{\endcodeaux} + +% For multiletter vars in math mode: +\newcommand{\var}[1]{\mbox{\frenchspacing\it{#1}}} +\newcommand{\vari}[2]{\ensuremath{\mbox{\it{#1}}_{#2}}} + +%% What you frequently want when you say \tt: +\def\ttchars{\catcode``=13\@noligs\frenchspacing} +\def\ttt{\normalfont\ttfamily\ttchars} + +% Works in math mode; all special chars remain special; cheaper than \cd. 
+% Will not be correct size in super and subscripts, though. +\newcommand{\ex}[1]{{\normalfont\texttt{\ttchars #1}}} + +\newenvironment{inset} + {\bgroup\parskip=1ex plus 1ex\begin{list}{}% + {\topsep=0pt\rightmargin\leftmargin}% + \item[]}% + {\end{list}\leavevmode\egroup\global\@ignoretrue} + +\newenvironment{leftinset} + {\bgroup\parskip=1ex plus 1ex\begin{list}{}% + {\topsep=0pt}% + \item[]}% + {\end{list}\leavevmode\egroup\global\@ignoretrue} + +\newenvironment{tightinset} + {\bgroup\parskip=0pt\begin{list}{}% + {\topsep=0pt\rightmargin\leftmargin}% + \item[]}% + {\end{list}\leavevmode\egroup\global\@ignoretrue} + +\newenvironment{tightleftinset} + {\bgroup\parskip=0pt\begin{list}{}% + {\topsep=0pt}% + \item[]}% + {\end{list}\leavevmode\egroup\global\@ignoretrue} + +\long\def\remark#1{\bgroup\small\begin{quote}\textsl{Remark: } #1\end{quote}\egroup} +\newenvironment{remarkenv}{\bgroup\small\begin{quote}\textsl{Remark: }}% + {\end{quote}\egroup} +\newcommand{\oops}[1]{\bgroup\small\begin{quote}\textsl{Oops: } #1\end{quote}\egroup} + +\newcommand{\note}[1]{\{Note #1\}} + +\newcommand{\itum}[1]{\item{\bf #1}\\*} + +% For use in code. The \llap magicness makes the lambda exactly as wide as +% the other chars in \tt; the \hskip shifts it right a bit so it doesn't +% crowd the left paren -- which is necessary if \tt is cmtt. +% Note that (\l{x y} (+ x y)) uses the same number of columns in TeX form +% as it produces when typeset. This makes it easy to line up the columns +% in your input. \l is bound to some useless command in LaTeX, so we have to +% define it w/renewcommand. +\let\oldl\l %Save the old \l on \oldl +\renewcommand{\l}[1]{\ \llap{$\lambda$\hskip-.05em}\ (#1)} + +% This one is for the rare (lambda x ...) case -- it doesn't have the +% column-invariant property. Oh, well. 
+\newcommand{\lx}[1]{\ \llap{$\lambda$\hskip-.05em}\ {#1}} + +% For subcaptions +\newcommand{\subcaption}[1] +{\unskip\vspace{-2mm}\begin{center}\unskip\em#1\end{center}} + +%%% T release notes stuff +\newlength{\notewidth} +\setlength{\notewidth}{\textwidth} +\addtolength{\notewidth}{-1.25in} + +%\newcommand{\remark} [1] +% {\par\vspace{\parskip} +% \parbox[t]{.75in}{\sc Remark:} +% \parbox[t]{\notewidth}{\em #1} +% \vspace{\parskip} +% } + +\newenvironment{optiontable}% + {\begin{tightinset}\renewcommand{\arraystretch}{1.5}% + \begin{tabular}{@{}>{\ttt}ll@{}}}% + {\end{tabular}\end{tightinset}}% + +\newenvironment{desctable}[1]% + {\begin{inset}\renewcommand{\arraystretch}{1.5}% + \begin{tabular}{lp{#1}}}% + {\end{tabular}\end{inset}} + +\def\*{{\ttt *}} + +% Names of things + +\newcommand{\keyword} [1]{\index{#1}{\normalfont\textsf{#1}}} + +\newcommand{\evalto}{$\Longrightarrow$\ } +\renewcommand{\star}{$^*$\/} +\newcommand{\+}{$^+$} + +% Semantic domains, used to indicate the type of a value + +\newcommand{\sem}{\normalfont\itshape} %semantic font +\newcommand{\semvar}[1]{\textit{#1}} %semantic font +\newcommand{\synvar}[1]{\textrm{\textit{$<$#1$>$}}} %syntactic font +\newcommand{\type}{\sem} +\newcommand{\zeroormore}[1]{{\sem #1$_1$ \ldots #1$_n$}} +\newcommand{\oneormore}[1]{{\sem #1$_1$ #1$_2$ \ldots #1$_n$}} + +\newcommand{\proc} {{\sem procedure}} +\newcommand{\boolean} {{\sem boolean}} +\newcommand{\true} {{\sem true}} +\newcommand{\false} {{\sem false}} + +\newcommand{\num} {{\sem number}} +\newcommand{\fixnum} {{\sem fixnum}} +\newcommand{\integer} {{\sem integer}} +\newcommand{\real} {{\sem real}} + +\newcommand{\character} {{\sem character}} +\newcommand{\str} {{\sem string}} +\newcommand{\sym} {{\sem symbol}} + +\newcommand{\location} {{\sem location}} +\newcommand{\object} {{\sem object}} + +\newcommand{\error} {{\sem error}} +\newcommand{\syntaxerror} {{\sem syntax error}} +\newcommand{\readerror} {{\sem read error}} +\newcommand{\undefined} {{\sem 
undefined}} +\newcommand{\noreturn} {{\sem no return value}} + +\newcommand{\port} {{\sem port}} + +% semantic variables + +\newcommand{\identifier} {{\sem identifier}} +\newcommand{\identifiers} {\zeroormore{\}} +\newcommand{\expr} {{\sem expression}} +\newcommand{\body} {{\sem body}} +\newcommand{\valueofbody} {{\sem value~of~body}} +\newcommand{\emptylist} {{\sem empty~list}} +\newcommand{\car} {\keyword{car}} +\newcommand{\cdr} {\keyword{cdr}} + + +% generally useful things + +% For line-breaking \tt stuff. +\renewcommand{\=}{\discretionary{-}{}{-}} +\newcommand{\ob}{\discretionary{}{}{}} % Optional break. + +\newcommand{\indx}[1]{#1 \index{ #1 }} +%\newcommand{\gloss}[1]{#1 \glossary{ #1 }} + +% This lossage produces #2 if #1 is zero length, otw #3. +% We use it to conditionally add a space between the procedure and +% the args in procedure prototypes, but only if there are any args-- +% we want to produce "(read)", not "(read )". +\newlength{\voidlen} +\newcommand{\testvoid}[3]{\settowidth\voidlen{#1}\ifdim\voidlen>0in{#3}\else{#2}\fi} + + +% Typeset a definition prototype line, e.g.: +% (cons ) -> pair procedure +% +% Five args are: proc-name args ret-value(s) type index-entry +\newcommand{\dfnix}[4]% FIVE args, really. + {\hbox to \linewidth{\ttchars% + {\ttt(#1\testvoid{#2}{}{\ }{\sem{#2}}\testvoid{#2}{}{\/})\hskip 1em minus +0.5em$\longrightarrow$\hskip 1em minus 0.5em{\sem{#3}}\hfill\quad\textnormal{#4}}}\index} + +\newcommand{\dfnx}[4] {\dfnix{#1}{#2}{#3}{#4}{#1@\texttt{#1}}} + +\newcommand{\dfn} {\par\medskip\dfnx} % Takes 4 args, actually. +\newcommand{\dfni} {\par\medskip\dfnix} % Takes 5 args, actually. + +\newcommand{\defvar} {\par\medskip\defvarx} % Takes 4 args, actually. +\newcommand{\defvarx}[2]% + {\index{#1} + \hbox to \linewidth{\ttchars{{\ttt{#1}} \hfill #2}}}% + +% Typeset the protocol line, then do the following descriptive text indented. 
+% If you want to group two procs together, do the first one with a \dfn, +% then the second one, and the documentation, with a \dfndescx. + +% This one doesn't put whitespace above. Use it immediately after a \dfn +% to group two prototype lines together. +\newenvironment{dfndescx}[4]% + {\dfnx{#1}{#2}{#3}{#4}\begin{desc}}{\end{desc}} + +\newenvironment{dfndesc}[4] % This one puts whitespace above. + {\par\medskip\begin{dfndescx}{#1}{#2}{#3}{#4}} + {\end{dfndescx}} + +\newenvironment{desc}% + {\nopagebreak[2]% + \smallskip + \bgroup\begin{list}{}{\topsep=0pt\parskip=0pt}\item[]} + {\end{list}\leavevmode\egroup\global\@ignoretrue} + +\newcommand{\defun} [3] {\dfn{#1}{#2}{#3}{procedure}} % preskip +\newcommand{\defunx}[3]{\dfnx{#1}{#2}{#3}{procedure}} % no skip + +\newenvironment{defundescx}[3]% + {\begin{dfndescx}{#1}{#2}{#3}{procedure}} + {\end{dfndescx}} + +\newenvironment{defundesc}[3]% + {\begin{dfndesc}{#1}{#2}{#3}{procedure}} + {\end{dfndesc}} + + +\newenvironment{column}{\begin{tabular}[t]{@{}l@{}}}{\end{tabular}} + +\newenvironment{exampletable}% + {\begin{leftinset}% + \newcommand{\header}[1]{\multicolumn{2}{@{}l@{}}{##1}\\}% + \newcommand{\splitline}[2]% + {\multicolumn{2}{@{}l@{}}{##1}\\\multicolumn{2}{@{}l@{}}{\qquad\evalto\quad{##2}}} + \begin{tabular}{@{}l@{\quad\evalto\quad}l@{}}}% + {\end{tabular}\end{leftinset}} + +% Put on blank lines in a code env to allow a pagebreak. +\newcommand{\cb}{\pagebreak[0]} + +\newenvironment{boxedcode} + {\begin{inset}\tabular{|l|}\hline} + {\\ \hline \end{tabular}\end{inset}} + +% A ragged-right decl that doesn't redefine \\ -- for use in tables.
+\newcommand{\raggedrightparbox}{\let\temp=\\\raggedright\let\\=\temp} + +\newenvironment{boxedfigure}[1]% + {\begin{figure}[#1]\begin{boxedminipage}{\linewidth}\vskip 1.5ex} + {\end{boxedminipage}\end{figure}} + +\makeatother diff --git a/doc/scsh-manual/draftfooters.sty b/doc/scsh-manual/draftfooters.sty new file mode 100644 index 0000000..862436d --- /dev/null +++ b/doc/scsh-manual/draftfooters.sty @@ -0,0 +1,76 @@ +% Document style option "draftfooter" +% -- usage: \documentstyle[...,draftfooter,...]{...} +% -- puts "DRAFT" with date and time in page footer +% +% Olin Shivers 1/17/94 +% - Hacked from code I used in my dissertation and from code in a +% drafthead.sty package written by Stephen Page sdpage@uk.ac.oxford.prg. +%---------------------------------------------------------------------------- + +% +% compute the time in hours and minutes; make new variables \timehh and \timemm +% +\newcount\timehh\newcount\timemm +\timehh=\time +\divide\timehh by 60 \timemm=\time +\count255=\timehh\multiply\count255 by -60 \advance\timemm by \count255 +% + +\def\draftbox{{\protect\small\bf \fbox{DRAFT}}} +\def\drafttime{% + {\protect\small\sl\today\ -- \ifnum\timehh<10 0\fi% + \number\timehh\,:\,\ifnum\timemm<10 0\fi\number\timemm}} +\def\drafttimer{\protect\makebox[0pt][r]{\drafttime}} +\def\drafttimel{\protect\makebox[0pt][l]{\drafttime}} + +\def\thepagel{\protect\makebox[0pt][l]{\rm\thepage}} +\def\thepager{\protect\makebox[0pt][r]{\rm\thepage}} + +% Header is empty. +% Footer is "date DRAFT pageno" +\def\ps@plain{ + \let\@mkboth\@gobbletwo + \let\@oddhead\@empty \let\@evenhead\@empty + + \def\@oddfoot{\reset@font\rm\drafttimel\hfil\draftbox\hfil\thepager} + \if@twoside + \def\@evenfoot{\reset@font\rm\thepagel\hfil\draftbox\hfil\drafttimer} + \else \let\@evenfoot\@oddfoot + \fi +} + +% Aux macro -- sets footer to be "date DRAFT". 
+\def\@draftfooters{ + \def\@oddfoot{\reset@font\rm\drafttimel\hfil\draftbox} + \if@twoside + \def\@evenfoot{\reset@font\rm\draftbox\hfil\drafttimer} + \else \let\@evenfoot\@oddfoot + \fi + } + +% Header is empty. +% Footer is "date DRAFT". +\def\ps@empty{ + \let\@mkboth\@gobbletwo + \let\@oddhead\@empty \let\@evenhead\@empty + \@draftfooters + } + +% Header is defined by the document style (article, book, etc.). +% Footer is "date DRAFT". +\let\@draftoldhead\ps@headings +\def\ps@headings{ + \@draftoldhead % Do the default \pagestyle{headings} stuff. + \@draftfooters % Then define the draft footers: + } + +% Header is defined by the document style (article, book, etc.), +% and filled in by user's \markboth and \markright commands. +% Footer is "date DRAFT". +\let\@draftoldmyhead\ps@myheadings +\def\ps@myheadings{ + \@draftoldmyhead % Do the default \pagestyle{myheadings} stuff. + \@draftfooters % Then define the draft footers: + } + +\ps@plain diff --git a/doc/scsh-manual/front.tex b/doc/scsh-manual/front.tex new file mode 100644 index 0000000..a5a8cf6 --- /dev/null +++ b/doc/scsh-manual/front.tex @@ -0,0 +1,56 @@ +%&latex -*- latex -*- + +\title{Scsh Reference Manual} +\subtitle{For scsh release 0.5.3} +\author{Olin Shivers and Brian D.~Carlstrom} +\date{June 2001} + +\maketitle +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Some code-changes for tex2page and latex output. NF +\texonly +\chapter*{Acknowledgements} +\endtexonly +\htmlonly +\\ \ex{Acknowledgements} \\ \\ +\endhtmlonly + +Who should I thank? +My so-called ``colleagues,'' who laugh at me behind my back, + all the while becoming famous on {\em my\/} work? +My worthless graduate students, whose computer skills appear to + be limited to downloading bitmaps off of netnews? +My parents, who are still waiting for me to quit ``fooling around with + computers,'' go to med school, and become a radiologist? 
+My department chairman, a manager who gives one new insight into + and sympathy for disgruntled postal workers? + +My God, no one could blame me---no one!---if I went off the edge and just + lost it completely one day. +I couldn't get through the day as it is without the Prozac and Jack Daniels + I keep on the shelf, behind my Tops-20 JSYS manuals. + I start getting the shakes real bad around 10am, right before my + advisor meetings. A 10 oz.\ Jack 'n Zac helps me get through the + meetings without one of my students winding up with his severed head + in a bowling-ball bag. They look at me funny; they think I twitch a + lot. I'm not twitching. I'm controlling my impulse to snag my 9mm + Sig-Sauer out from my day-pack and make a few strong points about + the quality of undergraduate education in Amerika. + +If I thought anyone cared, if I thought anyone would even be reading this, +I'd probably make an effort to keep up appearances until the last possible +moment. But no one does, and no one will. So I can pretty much say exactly +what I think. + +Oh, yes, the {\em acknowledgements.} +I think not. I did it. I did it all, by myself. +\begin{flushright} +\begin{tabular}{l} +Olin Shivers \\ +Cambridge \\ +September 4, 1994 +\end{tabular} +\end{flushright} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\tableofcontents diff --git a/doc/scsh-manual/headings.sty b/doc/scsh-manual/headings.sty new file mode 100644 index 0000000..c928f58 --- /dev/null +++ b/doc/scsh-manual/headings.sty @@ -0,0 +1,16 @@ +% headings.tex -*- latex -*- +% Quieter headings than the ones used in article.sty. +% This is not a style option. Don't say [headings]. +% Instead, say \input{headings} after the \documentstyle.
+% -Olin 7/91 + +\makeatletter + +\def\section{\@startsection {section}{1}{\z@}{-3.5ex plus -1ex minus + -.2ex}{2.3ex plus .2ex}{\large\normalfont\bfseries}} +\def\subsection{\@startsection{subsection}{2}{\z@}{-3.25ex plus -1ex minus + -.2ex}{1.5ex plus .2ex}{\normalsize\normalfont\bfseries}} +\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-3.25ex plus +-1ex minus -.2ex}{1.5ex plus .2ex}{\normalsize\normalfont\bfseries}} + +\makeatother diff --git a/doc/scsh-manual/intro.tex b/doc/scsh-manual/intro.tex new file mode 100644 index 0000000..02b6379 --- /dev/null +++ b/doc/scsh-manual/intro.tex @@ -0,0 +1,454 @@ +%&latex -*- latex -*- + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\chapter{Introduction} + +This is the reference manual for scsh, +a {\Unix} shell that is embedded within {\Scheme}. +Scsh is a Scheme system designed for writing useful standalone Unix +programs and shell scripts---it spans a wide range of application, +from ``script'' applications usually handled with perl or sh, +to more standard systems applications usually written in C. + +Scsh comes built on top of {\scm}, and has two components: +a process notation for running programs and setting up pipelines +and redirections, +and a complete syscall library for low-level access to the operating system. +This manual gives a complete description of scsh. +A general discussion of the design principles behind scsh can be found +in a companion paper, ``A Scheme Shell.'' + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Copyright \& source-code license} +Scsh is open source. The complete sources come with the standard +distribution, which can be downloaded off the net. + +For years, scsh's underlying Scheme implementation, Scheme 48, did not have an +open-source copyright. However, around 1999/2000, the Scheme 48 authors +graciously retrofitted a BSD-style open-source copyright onto the system. 
+Swept up by the fervor, we tacked an ideologically hip license onto scsh +source, ourselves (BSD-style, as well). Not that we ever cared before what you +did with the system. + +As a result, the whole system is now open source, top-to-bottom. + +We note that the code is a rich source for other Scheme implementations +to mine. Not only the \emph{code}, but the \emph{APIs} are available +for implementors working on Scheme environments for systems programming. +These APIs represent years of work, and should provide a big head-start +on any related effort. (Just don't call it ``scsh,'' unless it's +\emph{exactly} compliant with the scsh interfaces.) + +Take all the code you like; we'll just write more. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Obtaining scsh} +Scsh is distributed via net publication. +We place new releases at well-known network sites, +and allow them to propagate from there. +We currently release scsh to the following Internet sites: +\begin{inset}\begin{flushleft} +\ex{\urlh{ftp://ftp-swiss.ai.mit.edu/pub/su/}{ftp://ftp-swiss.ai.mit.edu/pub/su/}} \\ +\ex{\urlh{http://www-swiss.ai.mit.edu/scsh/scsh.html}{http://www-swiss.ai.mit.edu/scsh/scsh.html}} \\ +\ex{\urlh{http://www.cs.indiana.edu/scheme-repository/}{http://www.cs.indiana.edu/scheme-repository/}} \\ +\end{flushleft} +\end{inset} +These sites are + the MIT Project Mac ftp server, + the Scheme Shell home page, and + the Indiana Scheme Repository home page, +respectively. +Each should have a compressed tar file of the entire scsh release, +which includes all the source code and the manual, +and a separate file containing just this manual in Postscript form, +for those who simply wish to read about the system. + +However, nothing is certain for long on the Net. +Probably the best way to get a copy of scsh is to use a network +resource-discovery tool, such as archie, +to find ftp servers storing scsh tar files. 
+Take the set of sites storing the most recent release of scsh, +choose one close to your site, and download the tar file. + +\section{Building scsh} +Scsh currently runs on a fairly large set of Unix systems, including +Linux, NetBSD, SunOS, Solaris, AIX, NeXTSTEP, Irix, and HP-UX. +We use the Gnu project's autoconf tool to generate self-configuring +shell scripts that customise the scsh Makefile for different OS variants. +This means that if you use one of the common Unix implementations, +building scsh should require exactly the following steps: +\begin{inset} +\begin{tabular}{l@{\qquad}l} +\ex{gunzip scsh.tar.gz} & \emph{Uncompress the release tar file.} \\ +\ex{tar xfv scsh.tar} & \emph{Unpack the source code.} \\ +\ex{cd scsh-0.5} & \emph{Move to the source directory.} \\ +\ex{./configure} & \emph{Examine host; build Makefile.} \\ +\ex{make} & \emph{Build system.} +\end{tabular} +\end{inset} +When you are done, you should have a virtual machine compiled in +file \ex{scshvm}, and a heap image in file \ex{scsh/scsh.image}. +Typing +\begin{code} +make install +\end{code} +will install these programs in your installation directory +(by default, \ex{/usr/local}), along with a small stub startup +binary, \ex{scsh}. + +If you don't have the patience to do this, you can start up +a Scheme shell immediately after the initial make by simply +saying +\codex{./scshvm -o ./scshvm -i scsh/scsh.image} +See chapter~\ref{chapt:running} for full details on installation +locations and startup options. + +It is not too difficult to port scsh to another Unix platform if your +OS is not supported by the current release. +See the release notes for more details on how to do this. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Caveats} + +It is important to note what scsh is \emph{not}, as well as what it is. +Scsh, in the current release, is primarily designed for the writing of +shell scripts---programming.
+It is not a very comfortable system for interactive command use: +the current release lacks job control, command-line editing, a terse, +convenient command syntax, and it does not read in an initialisation +file analogous to \ex{.login} or \ex{.profile}. +We hope to address all of these issues in future releases; +we even have designs for several of these features; +but the system as-released does not currently provide these features. + +In the current release, the system has some rough edges. +It is quite slow to start up---loading the initial image into the +{\scm} virtual machine induces a noticeable delay. +This can be fixed with the static heap linker provided with this release. + +We welcome parties interested in porting the manual to a more portable +XML or SGML format; please contact us if you are interested in doing so. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Naming conventions} +Scsh follows a general naming scheme that consistently employs a set of +abbreviations. +This is intended to make it easier to remember the names of things. +Some of the common ones are: +\begin{description} +\item [\ex{fdes}] + Means ``file descriptor,'' a small integer used in {\Unix} + to represent I/O channels. + +\item [\ex{\ldots*}] + A given bit of functionality sometimes comes in two related forms, + the first being a \emph{special form} that contains a body of + {\Scheme} code to be executed in some context, + and the other being a \emph{procedure} that takes a procedural + argument (a ``thunk'') to be called in the same context. + The procedure variant is named by taking the name of the special form, + and appending an asterisk. 
For example: +\begin{code} +;;; Special form: +(with-cwd "/etc" + (for-each print-file (directory-files)) + (display "All done")) + +;;; Procedure: +(with-cwd* "/etc" + (lambda () + (for-each print-file (directory-files)) + (display "All done")))\end{code} + +\item [\ex{\var{action}/\var{modifier}}] + The infix ``\ex{/}'' is pronounced ``with,'' as in + \ex{exec/env}---``exec with environment.'' + +\item [\ex{call/\ldots}] + Procedures that call their argument on some computed value + are usually named ``\ex{call/\ldots},'' \eg, + \ex{(call/fdes \var{port} \var{proc})}, which calls \var{proc} + on \var{port}'s file descriptor, returning whatever \var{proc} + returns. The abbreviated name means ``call with file descriptor.'' + +\item [\ex{with-\ldots}] + Procedures that call their argument, and special forms that execute + their bodies in some special dynamic context frequently have + names of the form \ex{with-\ldots}. For example, + \ex{(with-env \var{env} \vari{body}1 \ldots)} and + \ex{(with-env* \var{env} \var{thunk})}. These forms set + the process environment body, execute their body or thunk, + and then return after resetting the environment to its original + state. + +\item[\ex{create-}] + Procedures that create objects in the file system (files, directories, + temp files, fifos, \etc), begin with \ex{create-\ldots}. + +\item [\ex{delete-}] + Procedures that delete objects from the file system (files, + directories, temp files, fifos, \etc), begin with \ex{delete-\ldots}. + +\item[ \ex{\var{record}:\var{field}} ] + Procedures that access fields of a record are usually written + with a colon between the name of the record and the name of the + field, as in \ex{user-info:home-dir}. + +\item[\ex{\%\ldots}] + A percent sign is used to prefix lower-level scsh primitives + that are not commonly used. + +\item[\ex{-info}] + Data structures packaging up information about various OS + entities frequently end in \ldots\ex{-info}. 
Examples: + \ex{user-info}, \ex{file-info}, \ex{group-info}, and \ex{host-info}. + +\end{description} +% +Enumerated constants from some set \var{s} are usually named +\ex{\var{s}/\vari{const}1}, \ex{\var{s}/\vari{const}2}, \ldots. +For example, the various {\Unix} signal integers have the names +\ex{signal/cont}, \ex{signal/kill}, \ex{signal/int}, \ex{signal/hup}, +and so forth. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Lexical issues} +Scsh's lexical syntax is just {\R4RS} {\Scheme}, with the following +exceptions. + +\subsection{Extended symbol syntax} +Scsh's symbol syntax differs from {\R4RS} {\Scheme} in the following ways: +\begin{itemize} +\item In scsh, symbol case is preserved by \ex{read} and is significant on + symbol comparison. This means + \codex{(run (less Readme))} + displays the right file. + +\item ``\ex{-}'' and ``\ex{+}'' are allowed to begin symbols. + So the following are legitimate symbols: + \codex{-O2 -geometry +Wn} + +\item ``\ex{|}'' and ``\ex{.}'' are symbol constituents. + This allows \ex{|} for the pipe symbol, and \ex{..} for the parent-directory + symbol. (Of course, ``\ex{.}'' alone is not a symbol, but a + dotted-pair marker.) + +\item A symbol may begin with a digit. + So the following are legitimate symbols: +\codex{9x15 80x36-3+440} +\end{itemize} + +\subsection{Extended string syntax} +Scsh strings are allowed to contain the {\Ansi} C escape sequences + such as \verb|\n| and \verb|\161|. + +\subsection{Block comments and executable interpreter-triggers} +Scsh allows source files to begin with a header of the form +\codex{\#!/usr/local/bin/scsh -s} +The Unix operating system treats source files beginning with the headers +of this form specially; +they can be directly executed by the operating system +(see chapter~\ref{chapt:running} for information on how to use this feature). 
+The scsh interpreter ignores this special header by treating \ex{\#!} as a +comment marker similar to \ex{;}. +When the scsh reader encounters \ex{\#!}, it skips characters until it finds +the closing sequence +new\-line/{\ob}ex\-cla\-ma\-tion-{\ob}point/{\ob}sharp-{\ob}sign/{\ob}new\-line. + +Although the form of the \ex{\#!} read-macro was chosen to support +interpreter-triggers for executable Unix scripts, +it is a general block-comment sequence and can be used as such +anywhere in a scsh program. + +\subsection{Here-strings} +The read macro \ex{\#<} is used to introduce ``here-strings'' +in programs, similar to the \ex{<<} ``here document'' redirections +provided by sh and csh. +There are two kinds of here-string, character-delimited and line-delimited; +they are both introduced by the \ex{\#<} sequence. + +\subsubsection{Character-delimited here-strings} +A \emph{character-delimited} here-string has the form +\codex{\#<\emph{x}...stuff...\emph{x}} +where \emph{x} is any single character +(except \ex{<}, see below), +which is used to delimit the string bounds. +Some examples: +\begin{inset} +\begin{tabular}{ll} + Here-string syntax & Ordinary string syntax \\ \hline + \verb:#<|Hello, world.|: & \verb:"Hello, world.": \\ + \verb:#list s)))) + +} + + +\def\spaceifnotempty{\evalh{ + +(let ((x (ungroup (get-token)))) + (unless (all-blanks? 
x) + (emit #\space))) + +}} + +\def\dfnix#1#2#3#4{\leftline{{\tt(#1\spaceifnotempty{#2}{\it#2})} \quad $\longrightarrow$ \quad {\it #3} \qquad (#4)} \index} + +\def\ex#1{{\tt #1}} +\def\l#1{lambda (#1)} +\def\lx#1{lambda {#1}} +%\def\notenum#1{} +%\def\project#1{} +\def\var#1{{\it #1\/}} +\def\vari#1#2{\mbox{{\it #1\/}\undefcsactive\$$_{#2}$}} + +\renewenvironment{boxedfigure}{\def\srecomment#1{\\#1\\}% +\begin{figure}\pagestyle}{\end{figure}} + +\newenvironment{centercode}{\begin{code}}{\end{code}} + +\def\setupcode{\tt% +\def\\{\char`\\}% +\defcsactive\${\$}% +\def\evalto{==> }% +\defcsactive\%{\%}\obeywhitespace} + +\newenvironment{code}{\begin{quote}\bgroup\setupcode\GOBBLEOPTARG} +{\egroup\end{quote}} + +\newenvironment{codebox}{\begin{tableplain}\bgroup\setupcode\GOBBLEOPTARG} +{\egroup\end{tableplain}} + +\renewenvironment{desc}{\begin{quote}}{\end{quote}} + +\renewenvironment{exampletable}{% +\def\header#1{\\\leftline{#1}\\}% +\def\splitline#1#2{\\\leftline{#1}\\\leftline{#2}}% +\begin{tabular}{}}{\end{tabular}} + +\newenvironment{tightcode}{\begin{code}}{\end{code}} +\renewenvironment{widecode}{\begin{code}}{\end{code}} + +\renewenvironment{inset}{\begin{quote}}{\end{quote}} +\renewenvironment{leftinset}{\begin{quote}}{\end{quote}} +\renewenvironment{tightinset}{\begin{quote}}{\end{quote}} +\renewenvironment{tightleftinset}{\begin{quote}}{\end{quote}} +} \ No newline at end of file diff --git a/doc/scsh-manual/man.tex b/doc/scsh-manual/man.tex new file mode 100644 index 0000000..aa5418b --- /dev/null +++ b/doc/scsh-manual/man.tex @@ -0,0 +1,65 @@ +% -*- latex -*- + +% This is the reference manual for the Scheme Shell. 
+ +\documentclass[twoside]{report} +\usepackage{code,boxedminipage,makeidx,palatino,ct, + headings,mantitle,array,matter,mysize10,tex2page} + +\texonly +\usepackage[dvipdfm,hyperindex,hypertex, + pdftitle={scsh manual, release 0.5.3}, + pdfauthor={Olin Shivers and Brian D.~Carlstrom}, + colorlinks=true,linkcolor=blue,pagecolor=blue,urlcolor=blue, + pdfstartview=FitH,pdfview=FitH]{hyperref} +\endtexonly + +% These fonts are good choices for screen-readable pdf, but the man needs +% a pass over the layout, since this tt font will blow out the width +% of some examples, making them wrap and generally screwing them up. Maybe this +% should also be a LaTeX option, so we can get palatino on the hardcopy +% runs and these fonts on pdf runs... +%\renewcommand{\rmdefault}{phv} +%\renewcommand{\sfdefault}{phv} +%\renewcommand{\ttdefault}{pcr} + +% Style issues +\parskip = 3pt plus 3pt +\sloppy + +%\includeonly{syscalls} + +\input{decls} +\makeindex +%%% End preamble + +\begin{document} + +\frontmatter +\include{front} + +\mainmatter +\include{intro} +\include{procnotation} +\include{syscalls} +\include{network} +\include{strings} +\include{sre} +\include{rdelim} +\include{awk} +\include{miscprocs} +\include{running} +\include{todo} + +\backmatter +\printindex + +\end{document} + +% General things to do when converting ASCII text to LaTeX: +% Build a set of \breakondash, \breakondot, \breakonslash commands +% that will enable breaking in \tt. This is better than \=, etc. +% +% Check for ..., quote char, double-dashes -- +% Double-word check +% lambda -> \l diff --git a/doc/scsh-manual/mantitle.sty b/doc/scsh-manual/mantitle.sty new file mode 100644 index 0000000..b17f5b5 --- /dev/null +++ b/doc/scsh-manual/mantitle.sty @@ -0,0 +1,76 @@ +% This is the title page style stolen from the Texinfo design, +% and expressed as a LaTeX style option. It is useful for manuals.
+% +% Note that I play some *really* revolting games here to override +% the vertical and horizontal margins temporarily for the title page. +% The layout assumes you have 8.5" x 11" paper. You'd have to redo this +% for A4 or another size. +% -Olin 7/94 + + +% Fonts for title page: +\DeclareFixedFont{\titlefont}% + {\encodingdefault}{\familydefault}{bx}{\shapedefault}{20.5pt} +\DeclareFixedFont{\authorfnt}% + {\encodingdefault}{\familydefault}{bx}{\shapedefault}{14.4pt} +\DeclareFixedFont{\subtitlefnt}% + {\encodingdefault}{\familydefault}{m}{\shapedefault}{11} + +%\def\authorrm{\normalfont\selectfont\fontseries{bx}\fontsize{14.4}{14.4}} +%\def\subtitlefnt{\normalfont\selectfont\fontsize{11}{11}} + +\newskip\titlepagetopglue \titlepagetopglue = 2.5in + + +\newlength{\widewidth} +\setlength{\widewidth}{6.5in} +\newlength{\negwidemargin} +\setlength{\negwidemargin}{-\oddsidemargin} % Reset the margin +\addtolength{\negwidemargin}{-1in} % to edge of page +\addtolength{\negwidemargin}{1in} % Then move right one inch. + +%\def\wideline#1{\hbox to 0pt{\hspace\negwidemargin\hbox to\widewidth{#1}}} +\def\wideline#1{\hbox{\makebox[0pt][l]{\hspace\negwidemargin\hbox to\widewidth{#1}}}} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\def\maketitle{\begin{titlepage} + \thispagestyle{empty} + \let\footnotesize\small \let\footnoterule\relax + \null + \parindent=0pt + \def\subtitlefont{\normalbaselineskip = 13pt \normalbaselines \subtitlefnt}% + \def\authorfont{\normalbaselineskip = 16pt \normalbaselines \authorfnt}% +% + % Leave some space at the very top of the page. 
+ \vspace*{-1in}\vspace*{-\topmargin}\vspace*{-\headheight}\vspace*{-\headsep} + \vglue\titlepagetopglue +% + \wideline{\titlefont \@title \hfill} % title +% \vskip4pt + \vskip -0.3\baselineskip + \wideline{\leaders\hrule height 4pt\hfill} + \wideline{\hfill\subtitlefont\begin{tabular}[t]{@{}r@{}}\@subtitle% + \\\@date% + \end{tabular}} % subtitle +% + % author + \vskip 0pt plus 1filll + \wideline{\authorfont \begin{tabular}[t]{@{}c@{}}\@author + \end{tabular}\hfill} +% +% \vskip4pt + \vskip -0.3\baselineskip + \wideline{\leaders\hrule height 2pt\hfill} + + % This weirdness puts the bottom line 2.75 in from the bottom of + % an 11in page. + \vskip \textheight \vskip \headsep \vskip \headheight + \vskip \topmargin \vskip 1in \vskip -11in \vskip 2.75in + + \gdef\@author{}\gdef\@title{}\gdef\@subtitle{}\let\maketitle\relax + \end{titlepage} + \setcounter{page}{2} + } + +\def\subtitle#1{\gdef\@subtitle{#1}} +\def\@subtitle{} diff --git a/doc/scsh-manual/matter.sty b/doc/scsh-manual/matter.sty new file mode 100644 index 0000000..f0c4fda --- /dev/null +++ b/doc/scsh-manual/matter.sty @@ -0,0 +1,16 @@ +%&latex -*- latex -*- +% Implement the \frontmatter, \mainmatter, and \backmatter macros, +% so I can use them in reports, not just books. 
+ +\newif\if@mainmatter \@mainmattertrue + +\newcommand\frontmatter{% + \cleardoublepage\@mainmatterfalse\pagenumbering{roman}} + +\newcommand\mainmatter{% + \cleardoublepage\@mainmattertrue% + \pagenumbering{arabic}\setcounter{page}{1}} + +\newcommand\backmatter{% + \if@openright\cleardoublepage\else\clearpage\fi% + \@mainmatterfalse} diff --git a/doc/scsh-manual/miscprocs.tex b/doc/scsh-manual/miscprocs.tex new file mode 100644 index 0000000..f6d6987 --- /dev/null +++ b/doc/scsh-manual/miscprocs.tex @@ -0,0 +1,45 @@ +%&latex -*- latex -*- + +\chapter{Miscellaneous routines} + +\section{Integer bitwise ops} +\label{sec:bitwise} +\defun{arithmetic-shift} {i j} \integer +\defunx {bitwise-and} {i j} \integer +\defunx {bitwise-ior} {i j} \integer +\defunx {bitwise-not} {i} \integer +\defunx {bitwise-xor} {i j} \integer +\begin{desc} + These operations operate on integers representing semi-infinite + bit strings, using a 2's-complement encoding. + + \ex{arithmetic-shift} shifts \var{i} by \var{j} bits. + A left shift is $j > 0$; a right shift is $j < 0$. +\end{desc} + +\section{List procedures} +\defun{nth}{list i}\object +\begin{desc} +Returns the $i^{\mathrm{th}}$ element of \var{list}. +The first element (the car) is \ex{(nth \var{list} 0)}, +the second element is \ex{(nth \var{list} 1)}, and so on. + +This procedure is provided as it is useful for accessing elements +from the lists returned by the field-readers (chapter~\ref{chapt:fr-awk}). +\end{desc} + + +\section{Top level} +\defun{repl}{}\undefined +\begin{desc} + This runs a {\scm} read-eval-print loop, + reading forms from the current input port, + and writing their values to the current output port.
+ + If you wish to try something dangerous, + and want to be able to recover your shell state, you can + fork off a subshell with the following form: + \codex{(run (begin (repl)))} + {\ldots}or, rephrased for the proceduralists: + \codex{(wait (fork repl))} +\end{desc} diff --git a/doc/scsh-manual/mysize10.sty b/doc/scsh-manual/mysize10.sty new file mode 100644 index 0000000..94c52c0 --- /dev/null +++ b/doc/scsh-manual/mysize10.sty @@ -0,0 +1,22 @@ +%&latex -*- latex -*- +\if@twoside + \oddsidemargin 44pt + \evensidemargin 82pt + \marginparwidth 107pt +\else + \oddsidemargin 63pt + \evensidemargin 63pt + \marginparwidth 90pt +\fi +\marginparsep 11pt + +\topmargin 27pt +\headheight 12pt +\headsep 25pt +\topskip = 10pt +\footskip 30pt + +\textheight = 43\baselineskip +\advance\textheight by \topskip +\textwidth 345pt +\endinput diff --git a/doc/scsh-manual/network.tex b/doc/scsh-manual/network.tex new file mode 100644 index 0000000..3651903 --- /dev/null +++ b/doc/scsh-manual/network.tex @@ -0,0 +1,426 @@ +%&latex -*- latex -*- + +\chapter{Networking} + +The Scheme Shell provides a BSD-style sockets interface. +There is not an official standard for a network interface +for scsh to adopt (this is the subject of the forthcoming Posix.8 +standard). +However, Berkeley sockets are a \emph{de facto} standard, +being found on most Unix workstations and PC operating systems. + +It is fairly straightforward to add higher-level network protocols +such as smtp, telnet, or http on top of the basic socket-level +support scsh provides. +The Scheme Underground has also released a network library with +many of these protocols as a companion to the current release of scsh. +See this code for examples showing the use of the sockets interface. + +\section{High-level interface} + +For convenience, and to avoid some of the messy details of the socket +interface, we provide a high level socket interface.
These routines +attempt to make it easy to write simple clients and servers without +having to think of many of the details of initiating socket connections. +We welcome suggested improvements to this interface, including better +names, which right now are solely descriptions of the procedure's action. +This might be fine for people who already understand sockets, +but does not help the new networking programmer. + +\defun {socket-connect} {protocol-family socket-type . args} {socket} +\begin{desc} +\ex{socket-connect} is intended for creating client applications. +\var{protocol-family} is specified as either the +\ex{protocol-family/internet} or \ex{protocol-family/unix}. +\var{socket-type} is specified as either \ex{socket-type/stream} or +\ex{socket-type/datagram}. See \ex{socket} for a more complete +description of these terms. + +The variable \var{args} list is meant to specify protocol family +specific information. For Internet sockets, this consists of two +arguments: a host name and a port number. For {\Unix} sockets, this +consists of a pathname. + +\ex{socket-connect} returns a \ex{socket} which can be used for input +and output from a remote server. See \ex{socket} for a description of +the \emph{socket record}. +\end{desc} + +\defun {bind-listen-accept-loop} {protocol-family proc arg} {does-not-return} +\begin{desc} +\ex{bind-listen-accept-loop} is intended for creating server +applications. \var{protocol-family} is specified as either the +\ex{protocol-family/internet} or \ex{protocol-family/unix}. +\var{proc} is a procedure of two arguments: a \ex{socket} and a +{socket-address}. \var{arg} specifies a port number for Internet sockets +or a pathname for {\Unix} sockets. See \ex{socket} for a more complete +description of these terms. + +\var{proc} is called with a socket and a socket address each time there +is a connection from a client application. The socket allows +communications with the client. 
The socket address specifies the +address of the remote client. + +This procedure does not return, but loops indefinitely accepting +connections from client programs. +\end{desc} + +\section{Sockets} + +\defun {create-socket} {protocol-family type [protocol]} {socket} +\defunx {create-socket-pair} {type} {[socket$_{1}$ socket$_{2}$]} +\defunx {close-socket} {socket} \undefined +\begin{desc} + +A socket is one end of a network connection. Three specific properties +of sockets are specified at creation time: the protocol-family, type, +and protocol. + +The \var{protocol-family} specifies the protocol family to be used with +the socket. This also determines the address family of socket addresses, +which are described in more detail below. Scsh currently supports the +{\Unix} internal protocols and the Internet protocols using the +following constants: +\begin{code}\codeallowbreaks +protocol-family/unspecified +protocol-family/unix +protocol-family/internet\end{code} + +The \var{type} specifies the style of communication. Examples that your +operating system probably provides are stream and datagram sockets. +Others may be available depending on your system. Typical values are: +\begin{code}\codeallowbreaks +socket-type/stream +socket-type/datagram +socket-type/raw\end{code} + +The \var{protocol} specifies a particular protocol to use within a +protocol family and type. Usually only one choice exists, but it's +probably safest to set this explicitly. See the protocol database +routines for information on looking up protocol constants. + +New sockets are typically created with \ex{create-socket}. However, +\ex{create-socket-pair} can also be used to create a pair of connected +sockets in the \ex{protocol-family/unix} protocol-family.
The value of a +returned socket is a \emph{socket record}, defined to have the following +structure: +\begin{code} +(define-record socket + family ; protocol family + inport ; input-port + outport) ; output-port\end{code} + +The \ex{family} specifies the protocol family of the socket. The +\ex{inport} and \ex{outport} fields are ports that can be used for input +and output, respectively. For a stream socket, they are only usable +after a connection has been established via \ex{connect-socket} or +\ex{accept-connection}. For a datagram socket, \var{outport} can be +used immediately with \ex{send-message}, and \var{inport} can be used after +\ex{bind} has created a local address. + +\ex{close-socket} provides a convenient way to close a socket's port. It +is preferred to explicitly closing the inport and outport because using +\ex{close} on sockets is not currently portable across operating systems. + +\end{desc} + +\section{Socket addresses} + +The format of a socket-address depends on the address family of the +socket. Address-family-specific routines are provided to convert +protocol-specific addresses to socket addresses. The value returned by +these routines is a \emph{socket-address record}, defined to have the +following visible structure: +\begin{code} +(define-record socket-address + family) ; address family\end{code} + +The \ex{family} is one of the following constants: +\begin{code}\codeallowbreaks +address-family/unspecified +address-family/unix +address-family/internet\end{code} + +\defun {unix-address->socket-address} {pathname} {socket-address} +\begin{desc} +\ex{unix-address->socket-address} returns a \var{socket-address} based +on the string \var{pathname}. There is a system dependent limit on the +length of \var{pathname}.
+\end{desc} + +\defun {internet-address->socket-address} {host-address service-port} {socket-address} +\begin{desc} +\ex{internet-address->socket-address} returns a \var{socket-address} based +on an integer \var{host-address} and an integer \var{service-port}. +Besides being a 32-bit host address, an Internet host address can also +be one of the following constants: +\begin{code}\codeallowbreaks +internet-address/any +internet-address/loopback +internet-address/broadcast\end{code} + +The use of \ex{internet-address/any} is described below in +\ex{bind-socket}. \ex{internet-address/loopback} is an address that +always specifies the local machine. \ex{internet-address/broadcast} is +used for network broadcast communications. + +For information on obtaining a host's address, see the \ex{host-info} +function. +\end{desc} + +\defun {socket-address->unix-address} {socket-address} {pathname} +\defunx {socket-address->internet-address} {socket-address} {[host-address service-port]} +\begin{desc} + +The routines \ex{socket-address->internet-address} and +\ex{socket-address->unix-address} return the address-family-specific addresses. +Be aware that most implementations don't correctly return anything more +than an empty string for addresses in the {\Unix} address-family. +\end{desc} + +\section{Socket primitives} + +The procedures in this section are presented in the order in which a +typical program will use them. Consult a text on network systems +programming for more information on sockets.\footnote{ +Some recommended ones are: + +\begin{itemize} + +\item ``Unix Network Programming'' by W. Richard Stevens + +\item ``An Introductory 4.3BSD Interprocess Communication Tutorial.'' +(reprinted in UNIX Programmer's Supplementary Documents Volume 1, PS1:7) + +\item ``An Advanced 4.3BSD Interprocess Communication Tutorial.'' +(reprinted in UNIX Programmer's Supplementary Documents Volume 1, PS1:8) + +\end{itemize} +} +The last two tutorials are freely available as part of BSD. 
In the +absence of these, your {\Unix} manual pages for socket might be a good +starting point for information. + +\defun {connect-socket} {socket socket-address} \undefined +\begin{desc} +\ex{connect-socket} sets up a connection from a \var{socket} +to a remote \var{socket-address}. A connection has different meanings +depending on the socket type. A stream socket must be connected before +use. A datagram socket can be connected multiple times, but need not be +connected at all if the remote address is specified with each +\ex{send-message}, described below. Also, datagram sockets +may be disassociated from a remote address by connecting to a null +remote address. +\end{desc} + +\defun {bind-socket} {socket socket-address} \undefined +\begin{desc} +\ex{bind-socket} assigns a certain local \var{socket-address} to a +\var{socket}. Binding a socket reserves the local address. To receive +connections after binding the socket, use \ex{listen-socket} for stream +sockets and \ex{receive-message} for datagram sockets. + +Binding an Internet socket with a host address of +\ex{internet-address/any} indicates that the caller does +not care to specify from which local network interface connections are +received. Binding an Internet socket with a service port number of zero +indicates that the caller has no preference as to the port number +assigned. + +Binding a socket in the {\Unix} address family creates a socket special +file in the file system that must be deleted before the address can be +reused. See \ex{delete-file}. +\end{desc} + +\defun {listen-socket} {socket backlog} \undefined +\begin{desc} +\ex{listen-socket} allows a stream \var{socket} to start receiving connections, +allowing a queue of up to \var{backlog} connection requests. Queued +connections may be accepted by \ex{accept-connection}. 
+\end{desc} + +\defun {accept-connection} {socket} {[new-socket socket-address]} +\begin{desc} +\ex{accept-connection} receives a connection on a \var{socket}, returning +a new socket that can be used for this connection and the remote socket +address associated with the connection. +\end{desc} + +\defun {socket-local-address} {socket} {socket-address} +\defunx {socket-remote-address} {socket} {socket-address} +\begin{desc} +Sockets can be associated with a local address or a remote address or +both. \ex{socket-local-address} returns the local \var{socket-address} +record associated with \var{socket}. \ex{socket-remote-address} returns +the remote \var{socket-address} record associated with \var{socket}. +\end{desc} + +\defun {shutdown-socket} {socket how-to} \undefined +\begin{desc} + +\ex{shutdown-socket} shuts down part of a full-duplex socket. +The method of shutting down is specified by the \var{how-to} argument, +one of: +\begin{code}\codeallowbreaks +shutdown/receives +shutdown/sends +shutdown/sends+receives\end{code} +\end{desc} + +\section{Performing input and output on sockets} + +\defun {receive-message} {socket length [flags]} {[string-or-\sharpf socket-address]} +\dfnix {receive-message!} {socket string [start] [end] [flags]} + {[count-or-\sharpf socket-address]}{procedure} + {receive-message"!@\texttt{receive-message"!}} +\defunx {receive-message/partial} {socket length [flags]} + {[string-or-\sharpf socket-address]} +\dfnix {receive-message!/partial} {socket string [start] [end] [flags]} + {[count-or-\sharpf socket-address]}{procedure} + {receive-message"!/partial@\texttt{receive-message"!/partial}} +\defun {send-message} {socket string [start] [end] [flags] [socket-address]} + \undefined +\defunx {send-message/partial} + {socket string [start] [end] [flags] [socket-address]} {count} + +\begin{desc} +For most uses, standard input and output routines such as +\ex{read-string} and \ex{write-string} should suffice.
However, in some +cases an extended interface is required. The \ex{receive-message} and +\ex{send-message} calls parallel the \ex{read-string} and +\ex{write-string} calls with a similar naming scheme. + +One additional feature of these routines is that \ex{receive-message} +returns the remote \var{socket-address} and \var{send-message} takes an +optional remote +\ex{socket-address}. This allows a program to know the source of input +from a datagram socket and to use a datagram socket for output without +first connecting it. + +All of these procedures take an optional \var{flags} field. This +argument is an integer bit-mask, composed by or'ing together the +following constants: +\begin{code}\codeallowbreaks +message/out-of-band +message/peek +message/dont-route\end{code} + +See \ex{read-string} and \ex{write-string} for a more detailed +description of the arguments and return values. +\end{desc} + +\section{Socket options} + +\defun {socket-option} {socket level option} {value} +\defunx {set-socket-option} {socket level option value} \undefined + +\begin{desc} +\ex{socket-option} and \ex{set-socket-option} allow the inspection and +modification, respectively, of several options available on sockets. The +\var{level} argument specifies what protocol level is to be examined or +affected. A level of \ex{level/socket} specifies the highest possible +level that is available on all socket types. A specific protocol number +can also be used as provided by \ex{protocol-info}, described below. + +There are several different classes of socket options. The first class +consists of boolean options which can be either true or false. Examples +of this option type are: +\begin{code}\codeallowbreaks +socket/debug +socket/accept-connect +socket/reuse-address +socket/keep-alive +socket/dont-route +socket/broadcast +socket/use-loop-back +socket/oob-inline +socket/use-privileged +socket/cant-signal +tcp/no-delay\end{code} + +Value options are another category of socket options. 
Options of this +type take an integer value. Examples of this option type are: +\begin{code}\codeallowbreaks +socket/send-buffer +socket/receive-buffer +socket/send-low-water +socket/receive-low-water +socket/error +socket/type +ip/time-to-live +tcp/max-segment\end{code} + +A third option type specifies how long data should linger after a socket +has been closed. There is only one option of this type: +\ex{socket/linger}. It is set with either \sharpf to disable it or an +integer number of seconds to linger and returns a value of the same type +upon inspection. + +The fourth and final option type is a timeout option. There +are two examples of this option type: \ex{socket/send-timeout} and +\ex{socket/receive-timeout}. These are set with a real number of +microseconds resolution and return a value of the same type upon +inspection. + +\end{desc} + +\section{Database-information entries} + +\defun {host-info} {name-or-socket-address} {host-info} +\defunx {network-info} {name-or-socket-address} {network-info} +\defunx {service-info} {name-or-number [protocol-name]} {service-info} +\defunx {protocol-info} {name-or-number} {protocol-info} + +\begin{desc} + +\ex{host-info} allows a program to look up a host entry based on either +its string \var{name} or \var{socket-address}. The value returned by this +routine is a \emph{host-info record}, defined to have the following +structure: +\begin{code} +(define-record host-info + name ; Host name + aliases ; Alternative names + addresses) ; Host addresses\end{code} + +\ex{host-info} could fail and raise an error for one of the following +reasons: +\begin{code}\codeallowbreaks +herror/host-not-found +herror/try-again +herror/no-recovery +herror/no-data +herror/no-address\end{code} + +\ex{network-info} allows a program to look up a network entry based on either +its string \var{name} or \var{socket-address}.
The value returned by this +routine is a \emph{network-info record}, defined to have the following +structure: +\begin{code} +(define-record network-info + name ; Network name + aliases ; Alternative names + net) ; Network number\end{code} + +\ex{service-info} allows a program to look up a service entry based +on either its string \var{name} or integer \var{port}. The value returned +by this routine is a \emph{service-info record}, defined to have the +following structure: +\begin{code} +(define-record service-info + name ; Service name + aliases ; Alternative names + port ; Port number + protocol) ; Protocol name\end{code} + +\ex{protocol-info} allows a program to look up a protocol entry based +on either its string \var{name} or integer \var{number}. The value returned +by this routine is a \emph{protocol-info record}, defined to have the +following structure: +\begin{code} +(define-record protocol-info + name ; Protocol name + aliases ; Alternative names + number) ; Protocol number)\end{code} + +\end{desc} diff --git a/doc/scsh-manual/procnotation.tex b/doc/scsh-manual/procnotation.tex new file mode 100644 index 0000000..42d7e76 --- /dev/null +++ b/doc/scsh-manual/procnotation.tex @@ -0,0 +1,543 @@ +%&latex -*- latex -*- + +\chapter{Process notation} +\label{sec:proc-forms} +Scsh has a notation for controlling {\Unix} processes that takes the +form of s-expressions; this notation can then be embedded inside of +standard {\Scheme} code. +The basic elements of this notation are \emph{process forms}, +\emph{extended process forms}, and \emph{redirections}. + +\section{Extended process forms and i/o redirections} +An \emph{extended process form} is a specification of a {\Unix} process to +run, in a particular I/O environment: + \codex{\var{epf} {\synteq} (\var{pf} $ \var{redir}_1$ {\ldots} $ \var{redir}_n $)} +where \var{pf} is a process form and the $\var{redir}_i$ are redirection specs. 
+A \emph{redirection spec} is one of: +\begin{inset} +\begin{tabular}{@{}l@{\qquad{\tt; }}l@{}} + \ex{(< \var{[fdes]} \var{file-name})} & \ex{Open file for read.} +\\\ex{(> \var{[fdes]} \var{file-name})} & \ex{Open file create/truncate.} +\\\ex{(<< \var{[fdes]} \var{object})} & \ex{Use \var{object}'s printed rep.} +\\\ex{(>> \var{[fdes]} \var{file-name})} & \ex{Open file for append.} +\\\ex{(= \var{fdes} \var{fdes/port})} & \ex{Dup2} +\\\ex{(- \var{fdes/port})} & \ex{Close \var{fdes/port}.} +\\\ex{stdports} & \ex{0,1,2 dup'd from standard ports.} +\end{tabular} +\end{inset} +The input redirections default to file descriptor 0; +the output redirections default to file descriptor 1. + +The subforms of a redirection are implicitly backquoted, +and symbols stand for their print-names. +So \ex{(> ,x)} means +``output to the file named by {\Scheme} variable \ex{x},'' +and \ex{(< /usr/shivers/.login)} means ``read from \ex{/usr/shivers/.login}.'' + +\pagebreak +Here are two more examples of i/o redirection: +% +\begin{center} +\begin{codebox} +(< ,(vector-ref fv i)) +(>> 2 /tmp/buf)\end{codebox} +\end{center} +% +These two redirections cause the file \ex{fv[i]} to be opened on stdin, and +\ex{/tmp/buf} to be opened for append writes on stderr. + +The redirection \ex{(<< \var{object})} causes input to come from the +printed representation of \var{object}. +For example, + \codex{(<< "The quick brown fox jumped over the lazy dog.")} +causes reads from stdin to produce the characters of the above string. +The object is converted to its printed representation using the \ex{display} +procedure, so + \codex{(<< (A five element list))} +is the same as + \codex{(<< "(A five element list)")} +is the same as + \codex{(<< ,(reverse '(list element five A))){\rm.}} +(Here we use the implicit backquoting feature to compute the list to +be printed.) + +The redirection \ex{(= \var{fdes} \var{fdes/port})} causes \var{fdes/port} +to be dup'd into file descriptor \var{fdes}. 
+For example, the redirection + \codex{(= 2 1)} +causes stderr to be the same as stdout. +\var{fdes/port} can also be a port, for example: + \codex{(= 2 ,(current-output-port))} +causes stderr to be dup'd from the current output port. +In this case, it is an error if the port is not a file port +(\eg, a string port). +More complex redirections can be accomplished using the \ex{begin} +process form, discussed below, which gives the programmer full control +of i/o redirection from {\Scheme}. + +\subsection{Port and file descriptor sync} +\begin{sloppypar} +It's important to remember that rebinding Scheme's current I/O ports +(\eg, using \ex{call-with-input-file} to rebind the value of +\ex{(current-input-port)}) +does \emph{not} automatically ``rebind'' the file referenced by the +{\Unix} stdio file descriptors 0, 1, and 2. +This is impossible to do in general, since some {\Scheme} ports are +not representable as {\Unix} file descriptors. +For example, many {\Scheme} implementations provide ``string ports,'' +that is, ports that collect characters sent to them into memory buffers. +The accumulated string can later be retrieved from the port as a string. +If a user were to bind \ex{(current-output-port)} to such a port, it would +be impossible to associate file descriptor 1 with this port, as it +cannot be represented in {\Unix}. +So, if the user subsequently forked off some other program as a subprocess, +that program would of course not see the {\Scheme} string port as its standard +output. +\end{sloppypar} + +To keep stdio synced with the values of {\Scheme}'s current i/o ports, +use the special redirection \ex{stdports}. +This causes 0, 1, 2 to be redirected from the current {\Scheme} standard ports. +It is equivalent to the three redirections: +\begin{code} +(= 0 ,(current-input-port)) +(= 1 ,(current-output-port)) +(= 2 ,(error-output-port))\end{code} +% +The redirections are done in the indicated order. 
This will cause an error if +one of the current i/o ports isn't a {\Unix} port (\eg, if one is a string +port). +This {\Scheme}/{\Unix} i/o synchronisation can also be had in {\Scheme} code +(as opposed to a redirection spec) with the \ex{(stdports->stdio)} +procedure. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Process forms} +A \emph{process form} specifies a computation to perform as an independent +{\Unix} process. It can be one of the following: +% +\begin{leftinset} +\begin{codebox} +(begin . \var{scheme-code}) +(| \vari{pf}{\!1} {\ldots} \vari{pf}{\!n}) +(|+ \var{connect-list} \vari{pf}{\!1} {\ldots} \vari{pf}{\!n}) +(epf . \var{epf}) +(\var{prog} \vari{arg}{1} {\ldots} \vari{arg}{n}) +\end{codebox} +\qquad +\begin{codebox} +; Run \var{scheme-code} in a fork. +; Simple pipeline +; Complex pipeline +; An extended process form. +; Default: exec the program. +\end{codebox} +\end{leftinset} +% +The default case \ex{(\var{prog} \vari{arg}1 {\ldots} \vari{arg}n)} +is also implicitly backquoted. +That is, it is equivalent to: +% +\codex{(begin (apply exec-path `(\var{prog} \vari{arg}1 {\ldots} \vari{arg}n)))} +% +\ex{Exec-path} is the version of the \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=exec&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{exec()}} system call that +uses scsh's path list to search for an executable. +The program and the arguments must be either strings, symbols, or integers. +Symbols and integers are coerced to strings. +A symbol's print-name is used. +Integers are converted to strings in base 10. +Using symbols instead of strings is convenient, since it suppresses the +clutter of the surrounding \ex{"\ldots"} quotation marks. +To aid this purpose, scsh reads symbols in a case-sensitive manner, +so that you can say +\codex{(more Readme)} +and get the right file. + +A \var{connect-list} is a specification of how two processes are to be wired +together by pipes. 
+It has the form \ex{((\vari{from}1 \vari{from}2 {\ldots} \var{to}) \ldots)} +and is implicitly backquoted. +For example, +% +\codex{(|+ ((1 2 0) (3 1)) \vari{pf}{\!1} \vari{pf}{\!2})} +% +runs \vari{pf}{\!1} and \vari{pf}{\!2}. +The first clause \ex{(1 2 0)} causes \vari{pf}{\!1}'s +stdout (1) and stderr (2) to be connected via pipe +to \vari{pf}{\!2}'s stdin (0). +The second clause \ex{(3 1)} causes \vari{pf}{\!1}'s file descriptor 3 to be +connected to \vari{pf}{\!2}'s file descriptor 1. +%---this is unusual, and not expected to occur very often. + +The \ex{begin} process form does a \ex{stdio->stdports} synchronisation +in the child process before executing the body of the form. +This guarantees that the \ex{begin} form, like all other process forms, +``sees'' the effects of any associated I/O redirections. + +Note that {\R4RS} does not specify whether or not \ex{|} and \ex{|+} +are readable symbols. Scsh does. + +\section{Using extended process forms in \Scheme} +Process forms and extended process forms are \emph{not} {\Scheme}. +They are a different notation for expressing computation that, like {\Scheme}, +is based upon s-expressions. +Extended process forms are used in {\Scheme} programs by embedding them inside +special {\Scheme} forms. +There are three basic {\Scheme} forms that use extended process forms: +\ex{exec-epf}, \cd{&}, and \ex{run}. + +\dfn {exec-epf} {. \var{epf}} {\noreturn} {syntax} +\dfnx {\&} {. \var{epf}} {proc} {syntax} +\dfnx {run} {. \var{epf}} {proc} {syntax} +\begin{desc} +\index{exec-epf} \index{\&} \index{run} +The \ex{(exec-epf . \var{epf})} form nukes the current process: it establishes +the i/o redirections and then overlays the current process with the requested +computation. + +The \ex{(\& . \var{epf})} form is similar, except that the process is forked +off in background. The form returns the subprocess' process object. + +The \ex{(run . 
\var{epf})} form runs the process in foreground: +after forking off the computation, it waits for the subprocess to exit, +and returns its exit status. + +These special forms are macros that expand into the equivalent +series of system calls. +The definition of the \ex{exec-epf} macro is non-trivial, +as it produces the code to handle i/o redirections and set up pipelines. +However, the definitions of the \cd{&} and \ex{run} macros are very simple: +\begin{leftinset} +\begin{tabular}{@{}l@{\quad$\equiv$\quad}l@{}} +\cd{(& . \var{epf})} & \ex{(fork (\l{} (exec-epf . \var{epf})))} \\ +\ex{(run . \var{epf})} & \cd{(wait (& . \var{epf}))} +\end{tabular} +\end{leftinset} +\end{desc} + +\subsection{Procedures and special forms} +It is a general design principle in scsh that all functionality +made available through special syntax is also available in a +straightforward procedural form. +So there are procedural equivalents for all of the process notation. +In this way, the programmer is not restricted by the particular details of +the syntax. +Here are some of the syntax/procedure equivalents: +\begin{inset} +\begin{tabular}{@{}|ll|@{}} +\hline +Notation & Procedure \\ \hline \hline +\ex{|} & \ex{fork/pipe} \\ +\ex{|+} & \ex{fork/pipe+} \\ +\ex{exec-epf} & \ex{exec-path} \\ +redirection & \ex{open}, \ex{dup} \\ +\cd{&} & \ex{fork} \\ +\ex{run} & $\ex{wait} + \ex{fork}$ \\ +\hline +\end{tabular} +\end{inset} +% +Having a solid procedural foundation also allows for general notational +experimentation using {\Scheme}'s macros. +For example, the programmer can build his own pipeline notation on top of the +\ex{fork} and \ex{fork/pipe} procedures. +Chapter~\ref{chapt:syscalls} gives the full story on all the procedures +in the syscall library. + +\subsection{Interfacing process output to {\Scheme}} +\label{sec:io-interface} +There is a family of procedures and special forms that can be used +to capture the output of processes as {\Scheme} data. +% +\dfn {run/port} {. 
\var{epf}} {port} {syntax} +\dfnx{run/file} {. \var{epf}} {\str} {syntax} +\dfnx{run/string} {. \var{epf}} {\str} {syntax} +\dfnx{run/strings} {. \var{epf}} {{\str} list} {syntax} +\dfnx{run/sexp} {. \var{epf}} {object} {syntax} +\dfnx{run/sexps} {. \var{epf}} {list} {syntax} +\begin{desc} +These forms all fork off subprocesses, collecting the process' output +to stdout in some form or another. +The subprocess runs with file descriptor 1 and the current output port +bound to a pipe. +\begin{desctable}{0.7\linewidth} +\ex{run/port} & Value is a port open on process's stdout. + Returns immediately after forking child. \\ +\ex{run/file} & Value is name of a temp file containing process's output. + Returns when process exits. \\ +\ex{run/string} & Value is a string containing process' output. + Returns when eof read. \\ +\ex{run/strings}& Splits process' output into a list of + newline-delimited strings. Returns when eof read. \\ +\ex{run/sexp} & Reads a single object from process' stdout with \ex{read}. + Returns as soon as the read completes. \\ +\ex{run/sexps} & Repeatedly reads objects from process' stdout with \ex{read}. + Returns accumulated list upon eof. +\end{desctable} +The delimiting newlines are not included in the strings returned by +\ex{run/strings}. + +These special forms just expand into calls to the following analogous +procedures. +\end{desc} + +\defun {run/port*} {thunk} {port} +\defunx {run/file*} {thunk} {\str} +\defunx {run/string*} {thunk} {\str} +\defunx {run/strings*} {thunk} {{\str} list} +\defunx {run/sexp*} {thunk} {object} +\defunx {run/sexps*} {thunk} {object list} +\begin{desc} +For example, \ex{(run/port . \var{epf})} expands into +\codex{(run/port* (\l{} (exec-epf . 
\var{epf}))).} +\end{desc} + +The following procedures are also of utility for generally parsing +input streams in scsh: +\defun {port->string} {port} {\str} +\defunx {port->sexp-list} {port} {list} +\defunx {port->string-list} {port} {{\str} list} +\defunx {port->list} {reader port} {list} +\begin{desc} +\ex{Port->string} reads the port until eof, +then returns the accumulated string. +\ex{Port->sexp-list} repeatedly reads data from the port until eof, +then returns the accumulated list of items. +\ex{Port->string-list} repeatedly reads newline-terminated strings from the +port until eof, then returns the accumulated list of strings. +The delimiting newlines are not part of the returned strings. +\ex{Port->list} generalises these two procedures. +It uses \var{reader} to repeatedly read objects from a port. +It accumulates these objects into a list, which is returned upon eof. +The \ex{port->string-list} and \ex{port->sexp-list} procedures +are trivial to define, being merely \ex{port->list} curried with +the appropriate parsers: +\begin{code}\cddollar +(port->string-list \var{port}) $\equiv$ (port->list read-line \var{port}) +(port->sexp-list \var{port}) $\equiv$ (port->list read \var{port})\end{code} +% +The following compositions also hold: +\begin{code}\cddollar +run/string* $\equiv$ port->string $\circ$ run/port* +run/strings* $\equiv$ port->string-list $\circ$ run/port* +run/sexp* $\equiv$ read $\circ$ run/port* +run/sexps* $\equiv$ port->sexp-list $\circ$ run/port*\end{code} +\end{desc} + +\defun{port-fold}{port reader op . seeds} {\object\star} +\begin{desc} +This procedure can be used to perform a variety of iterative operations +over an input stream. +It repeatedly uses \var{reader} to read an object from \var{port}. +If the first read returns eof, then the entire \ex{port-fold} +operation returns the seeds as multiple values. 
+If the first read operation returns some other value $v$, then +\var{op} is applied to $v$ and the seeds: +\ex{(\var{op} \var{v} . \var{seeds})}. +This should return a new set of seed values, and the reduction then loops, +reading a new value from the port, and so forth. +(If multiple seed values are used, then \var{op} must return multiple values.) + +For example, \ex{(port->list \var{reader} \var{port})} +could be defined as + \codex{(reverse (port-fold \var{port} \var{reader} cons '()))} + +An imperative way to look at \ex{port-fold} is to say that it +abstracts the idea of a loop over a stream of values read from +some port, where the seed values express the loop state. + +\remark{This procedure was formerly named \texttt{\indx{reduce-port}}. + The old binding is still provided, but is deprecated and will + probably vanish in a future release.} +\end{desc} + + +\section{More complex process operations} +The procedures and special forms in the previous section provide for the +common case, where the programmer is only interested in the output of the +process. +These special forms and procedures provide more complicated facilities +for manipulating processes. + + +\subsection{Pids and ports together} +\dfn {run/port+proc} {. \var{epf}} {[port proc]} {syntax} +\defunx {run/port+proc*} {thunk} {[port proc]} +\begin{desc} +This special form and its analogous procedure can be used +if the programmer also wishes access to the process' pid, exit status, +or other information. +They both fork off a subprocess, returning two values: +a port open on the process' stdout (and current output port), +and the subprocess's process object. +A process object encapsulates the subprocess' process id and exit code; +it is the value passed to the \ex{wait} system call. 
+ + +For example, to uncompress a tech report, reading the uncompressed +data into scsh, and also be able to track the exit status of +the decompression process, use the following: +\begin{code} +(receive (port child) (run/port+proc (zcat tr91-145.tex.Z)) + (let* ((paper (port->string port)) + (status (wait child))) + {\rm\ldots{}use \ex{paper}, \ex{status}, and \ex{child} here\ldots}))\end{code} +% +Note that you must \emph{first} do the \ex{port->string} and +\emph{then} do the wait---the other way around may lock up when the +zcat fills up its output pipe buffer. +\end{desc} + + +\subsection{Multiple stream capture} + +Occasionally, the programmer may want to capture multiple distinct output +streams from a process. For instance, he may wish to read the stdout and +stderr streams into two distinct strings. This is accomplished with the +\ex{run/collecting} form and its analogous procedure, \ex{run/collecting*}. +% +\dfn {run/collecting} {fds . epf} {[port\ldots]} {syntax} +\defunx {run/collecting*} {fds thunk} {[port\ldots]} +\begin{desc} +\ex{Run/collecting} and \ex{run/collecting*} run processes that produce +multiple output streams and return ports open on these streams. To avoid +issues of deadlock, \ex{run/collecting} doesn't use pipes. Instead, it first +runs the process with output to temp files, then returns ports open on the +temp files. For example, +% +\codex{(run/collecting (1 2) (ls))} +% +runs \ex{ls} with stdout (fd 1) and stderr (fd 2) redirected to temporary +files. +When the \ex{ls} is done, \ex{run/collecting} returns three values: the +\ex{ls} process' exit status, and two ports open on the temporary files. The +files are deleted before \ex{run/collecting} returns, so when the ports are +closed, they vanish. The \ex{fds} list of file descriptors is implicitly +backquoted by the special-form version. 
+ +For example, if Kaiming has his mailbox protected, then +\begin{code} +(receive (status out err) + (run/collecting (1 2) (cat /usr/kmshea/mbox)) + (list status (port->string out) (port->string err)))\end{code} +% +might produce the list +\codex{(256 "" "cat: /usr/kmshea/mbox: Permission denied")} + +What is the deadlock hazard that causes \ex{run/collecting} to use temp files? +Processes with multiple output streams can lock up if they use pipes +to communicate with {\Scheme} i/o readers. For example, suppose +some {\Unix} program \ex{myprog} does the following: +\begin{enumerate} + \item First, outputs a single ``\ex{(}'' to stderr. + \item Then, outputs a megabyte of data to stdout. + \item Finally, outputs a single ``\ex{)}'' to stderr, and exits. +\end{enumerate} + +Our scsh programmer decides to run \ex{myprog} with stdout and stderr redirected +\emph{via {\Unix} pipes} to the ports \ex{port1} and \ex{port2}, respectively. +He gets into trouble when he subsequently says \ex{(read port2)}. +The {\Scheme} \ex{read} routine reads the open paren, and then hangs in a +\ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=read&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{read()}} system call trying to read a matching close paren. +But before \ex{myprog} sends the close paren down the stderr +pipe, it first tries to write a megabyte of data to the stdout pipe. +However, {\Scheme} is not reading that pipe---it's stuck waiting for input on +stderr. +So the stdout pipe quickly fills up, and \ex{myprog} hangs, waiting for the +pipe to drain. +The \ex{myprog} child is stuck in a stdout/\ex{port1} write; +the {\Scheme} parent is stuck in a stderr/\ex{port2} read. +Deadlock. + +Here's a concrete example that does exactly the above: +\begin{code} +(receive (status port1 port2) + (run/collecting (1 2) + (begin + ;; Write an open paren to stderr. + (run (echo "(") (= 1 2)) + ;; Copy a lot of stuff to stdout. 
+ (run (cat /usr/dict/words)) + ;; Write a close paren to stderr. + (run (echo ")") (= 1 2)))) + + ;; OK. Here, I have a port PORT1 built over a pipe + ;; connected to the BEGIN subproc's stdout, and + ;; PORT2 built over a pipe connected to the BEGIN + ;; subproc's stderr. + (read port2) ; Should return the empty list. + (port->string port1)) ; Should return a big string.\end{code} +% +In order to avoid this problem, \ex{run/collecting} and \ex{run/collecting*} +first run the child process to completion, buffering all the output +streams in temp files (using the \ex{temp-file-channel} procedure, see below). +When the child process exits, ports open on the buffered output are returned. +This approach has two disadvantages over using pipes: +\begin{itemize} + \item The total output from the child process is temporarily written + to the disk before returning from \ex{run/collecting}. If this output + is some large intermediate result, the disk could fill up. + + \item The child producer and {\Scheme} consumer are serialised; there is + no concurrency overlap in their execution. +\end{itemize} +% +However, it remains a simple solution that avoids deadlock. More +sophisticated solutions can easily be programmed up as +needed---\ex{run/collecting*} itself is only 12 lines of simple code. + +See \ex{temp-file-channel} for more information on creating temp files +as communication channels. +\end{desc} + +\section{Conditional process sequencing forms} +These forms allow conditional execution of a sequence of processes. + +\dfn{||} {\vari{pf}1 \ldots \vari{pf}n} {\boolean} {syntax} +\begin{desc} + Run each proc until one completes successfully (\ie, exit status zero). + Return true if some proc completes successfully; otherwise \sharpf. +\end{desc} + +\dfn{\&\&} {\vari{pf}1 \ldots \vari{pf}n} {\boolean} {syntax} +\begin{desc} + Run each proc until one fails (\ie, exit status non-zero). + Return true if all procs complete successfully; otherwise \sharpf.
+\end{desc} + +\section{Process filters} + +These procedures are useful for forking off processes to filter +text streams. + +\begin{defundesc}{char-filter}{filter}{\proc} + The \var{filter} argument is a character$\rightarrow$character procedure. + Returns a procedure that when called, repeatedly reads a character + from the current input port, applies \var{filter} to the character, + and writes the result to the current output port. + The procedure returns upon reaching eof on the input port. + + For example, to downcase a stream of text in a spell-checking pipeline, + instead of using the {\Unix} \ex{tr A-Z a-z} command, we can say: +\begin{code} +(run (| (delatex) + (begin ((char-filter char-downcase))) ; tr A-Z a-z + (spell) + (sort) + (uniq)) + (< scsh.tex) + (> spell-errors.txt))\end{code} +\end{defundesc} + +\begin{defundesc}{string-filter}{filter [buflen]}{\proc} + The \var{filter} argument is a string$\rightarrow$string procedure. + Returns a procedure that when called, repeatedly reads a string + from the current input port, applies \var{filter} to the string, + and writes the result to the current output port. + The procedure returns upon reaching eof on the input port. + + The optional \var{buflen} argument controls the number of characters + each internal read operation requests; this means that \var{filter} + will never be applied to a string longer than \var{buflen} chars. + The default \var{buflen} value is 1024. +\end{defundesc} diff --git a/doc/scsh-manual/rdelim.tex b/doc/scsh-manual/rdelim.tex new file mode 100644 index 0000000..97ee3e0 --- /dev/null +++ b/doc/scsh-manual/rdelim.tex @@ -0,0 +1,148 @@ +%&latex -*- latex -*- + +\chapter{Reading delimited strings} +\label{chapt:rdelim} + +Scsh provides a set of procedures that read delimited strings from +input ports. 
+There are procedures to read a single line of text +(terminated by a newline character), +a single paragraph (terminated by a blank line), +and general delimited strings +(terminated by a character belonging to an arbitrary character set). + +These procedures can be applied to any Scheme input port. +However, the scsh virtual machine has native-code support for performing +delimited reads on Unix ports, and these input operations should be +particularly fast---much faster than doing the equivalent character-at-a-time +operation from Scheme code. + +All of the delimited input operations described below take a \ex{handle-delim} +parameter, which determines what the procedure does with the terminating +delimiter character. +There are four possible choices for a \ex{handle-delim} parameter: +\begin{inset} +\begin{tabular}{|l|l|} \hline + \ex{handle-delim} & Meaning \\ \hline\hline + \ex{'trim} & Ignore delimiter character. \\ + \ex{'peek} & Leave delimiter character in input stream. \\ + \ex{'concat} & Append delimiter character to returned value. \\ + \ex{'split} & Return delimiter as second value. \\ + \hline +\end{tabular} +\end{inset} +The first case, \ex{'trim}, is the standard default for all the routines +described in this section. +The last three cases allow the programmer to distinguish between strings +that are terminated by a delimiter character, and strings that are +terminated by an end-of-file. + + +\begin{defundesc} {read-line} {[port handle-newline]} {{\str} or eof-object} + Reads and returns one line of text; on eof, returns the eof object. + A line is terminated by newline or eof. + + \var{handle-newline} determines what \ex{read-line} does with the + newline or EOF that terminates the line; it takes the general set + of values described for the general \ex{handle-delim} case above, + and defaults to \ex{'trim} (discard the newline). + Using this argument allows one to tell whether or not the last line of + input in a file is newline terminated. 
+\end{defundesc} + +\defun{read-paragraph} {[port handle-delim]} {{\str} or eof} +\begin{desc} + This procedure skips blank lines, + then reads text from a port until a blank line or eof is found. + A ``blank line'' is a (possibly empty) line composed only of white space. + The \var{handle-delim} parameter determines how the terminating + blank line is handled. + It is described above, and defaults to \ex{'trim}. + The \ex{'peek} option is not available. +\end{desc} + + +The following procedures read in strings from ports delimited by characters +belonging to a specific set. +See section~\ref{sec:char-sets} for information on character set manipulation. + +\defun{read-delimited}{char-set [port handle-delim]} {{\str} or eof} +\begin{desc} + Read until we encounter one of the chars in \var{char-set} or eof. + The \var{handle-delim} parameter determines how the terminating character + is handled. It is described above, and defaults to \ex{'trim}. + + The \var{char-set} argument may be a charset, a string, a character, or a + character predicate; it is coerced to a charset. +\end{desc} + +\dfni{read-delimited!} {char-set buf [port handle-delim start end]} + {nchars or eof or \#f}{procedure} + {read-delimited"!@\texttt{read-delimited"!}} +\begin{desc} + A side-effecting variant of \ex{read-delimited}. + + The data is written into the string \var{buf} at the indices in the + half-open interval $[\var{start},\var{end})$; the default interval is the + whole string: $\var{start}=0$ and $\var{end}=\ex{(string-length + \var{buf})}$. The values of \var{start} and \var{end} must specify a + well-defined interval in \var{buf}, \ie, $0 \le \var{start} \le \var{end} + \le \ex{(string-length \var{buf})}$. + + It returns \var{nchars}, the number of characters read. If the buffer filled up + without a delimiter character being found, \ex{\#f} is returned. If + the port is at eof when the read starts, the eof object is returned.
+ + If an integer is returned (\ie, the read is successfully terminated by + reading a delimiter character), then the \var{handle-delim} parameter + determines how the terminating character is handled. + It is described above, and defaults to \ex{'trim}. +\end{desc} + + + +\dfni{\%read-delimited!} {char-set buf gobble? [port start end]} + {[char-or-eof-or-\#f \integer]}{procedure} + {"%read-delimited"!@\verb:"%read-delimited"!:} +\begin{desc} +This low-level delimited reader uses an alternate interface. +It returns two values: \var{terminator} and \var{num-read}. +\begin{description} +\item [terminator] + A value describing why the read was terminated: + \begin{flushleft} + \begin{tabular}{l@{\qquad$\Rightarrow$\qquad}l} + Character or eof-object & Read terminated by this value. \\ + \ex{\#f} & Filled buffer without finding a delimiter. + \end{tabular} + \end{flushleft} + +\item [num-read] + Number of characters read into \var{buf}. +\end{description} + +If the read is successfully terminated by reading a delimiter character, +then the \var{gobble?} parameter determines what to do with the terminating +character. +If true, the character is removed from the input stream; +if false, the character is left in the input stream where a subsequent +read operation will retrieve it. +In either case, the character is also the first value returned by +the procedure call. +\end{desc} + +%Note: +%- Invariant: TERMINATOR = #f => NUM-READ = END - START. +%- Invariant: TERMINATOR = eof-object and NUM-READ = 0 => at EOF. +%- When determining the TERMINATOR return value, ties are broken +% favoring character or the eof-object over #f. That is, if the buffer +% fills up, %READ-DELIMITED! will peek at one more character from the +% input stream to determine if it terminates the input. If so, that +% is returned, not #f. + +\begin{defundesc} {skip-char-set} {skip-chars [port]} {\integer} + Skip characters occurring in the set \var{skip-chars}; + return the number of characters skipped. 
+ The \var{skip-chars} argument may be a charset, a string, a character, or a + character predicate; it is coerced to a charset. +\end{defundesc} diff --git a/doc/scsh-manual/running.tex b/doc/scsh-manual/running.tex new file mode 100644 index 0000000..78b9096 --- /dev/null +++ b/doc/scsh-manual/running.tex @@ -0,0 +1,934 @@ +%&latex -*- latex -*- + +\chapter{Running scsh} +\label{chapt:running} + +Scsh is currently implemented on top of {\scm}, a freely-available +{\Scheme} implementation written by Jonathan Rees and Richard Kelsey. +{\scm} uses a byte-code interpreter for good code density, portability +and medium efficiency. It is {\R4RS}. +It also has a module system designed by Jonathan Rees. + +Scsh's design is not {\scm} specific, although the current implementation +is necessarily so. +Scsh is intended to be implementable in other {\Scheme} implementations. +The {\scm} virtual machine that scsh uses is a specially modified version; +standard {\scm} virtual machines cannot be used with the scsh heap image. + +There are several different ways to invoke scsh. +You can run it as an interactive Scheme system, with a standard +read-eval-print interaction loop. +Scsh can also be invoked as the interpreter for a shell script by putting +a ``\verb|#!/usr/local/bin/scsh -s|'' line at the top of the shell script. + +Descending a level, it is also possible to invoke the underlying virtual +machine byte-code interpreter directly on dumped heap images. +Scsh programs can be pre-compiled to byte-codes and dumped as raw, +binary heap images. +Writing heap images strips out unused portions of the scsh runtime +(such as the compiler, the debugger, and other complex subsystems), +reducing memory demands and saving loading and compilation times. +The heap image format allows for an initial \verb|#!/usr/local/lib/scsh/scshvm| trigger +on the first line of the image, making heap images directly executable as +another kind of shell script. 
+ +Finally, scsh's static linker system allows dumped heap images to be compiled +to a raw Unix a.out(5) format, which can be linked into the text section +of the vm binary. +This produces a true Unix executable binary file. +Since the byte codes comprising the program are in the file's text section, +they are not traced or copied by the garbage collector, do not occupy space +in the vm's heap, and do not need to be loaded and linked at startup time. +This reduces the program's startup time, memory requirements, +and paging overhead. + +This chapter will cover these various ways of invoking scsh programs. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Scsh command-line switches} + +When the scsh top-level starts up, it scans the command line +for switches that control its behaviour. +These arguments are removed from the command line; +the remaining arguments can be accessed as the value of +the scsh variable \ex{command-line-arguments}. + +\subsection{Scripts and programs} + +The scsh command-line switches provide sophisticated support for +the authors of shell scripts and programs; +they also allow the programmer to write programs +that use the {\scm} module system. + +There is a difference between a \emph{script}, which performs its action +\emph{as it is loaded}, and a \emph{program}, which is loaded/linked, +and then performs its action by having control transferred to an entry point +(\eg, the \ex{main()} function in C programs) that was defined by the +load/link operation. + +A \emph{script}, by the above definition, cannot be compiled by the simple +mechanism of loading it into a scsh process and dumping out a heap image---it +executes as it loads. It does not have a top-level \ex{main()}-type entry +point. + +It is more flexible and useful to implement a system +as a program than as a script. 
+Programs can be compiled straightforwardly; +they can also export procedural interfaces for use by other Scheme packages. +However, scsh supports both the script and the program style of programming. + +\subsection{Inserting interpreter triggers into scsh programs} +When Unix tries to execute an executable file whose first 16 bits are +the character pair ``\ex{\#!}'', it treats the file not as machine-code +to be directly executed by the native processor, but as source code to +be executed by some interpreter. +The interpreter to use is specified immediately after the ``\ex{\#!}'' +sequence on the first line of the source file +(along with one optional initial argument). +The kernel reads in the name of the interpreter, and executes that instead. +The interpreter is passed the source filename as its first argument, with +the original arguments following. +Consult the Unix man page for the \ex{exec} system call for more information. + +Scsh allows Scheme programs to have these triggers placed on +their first line. +Scsh treats the character sequence ``\ex{\#!}'' as a block-comment sequence,% +\footnote{Why a block-comment instead of an end-of-line delimited comment? + See the section on meta-args.} +and skips all following characters until it reads the comment-terminating +sequence newline/exclamation-point/sharp-sign/newline (\ie, the +sequence ``\ex{!\#}'' occurring on its own line). + +In this way, the programmer can arrange for an initial +\begin{code} +#!/usr/local/bin/scsh -s +!#\end{code} +header appearing in a Scheme program +to be ignored when the program is loaded into scsh. + +\subsection{Module system} +Scsh uses the {\scm} module system, which defines +\emph{packages}, \emph{structures}, and \emph{interfaces}. +% +\begin{description} + +\item [Package] A package is an environment---that is, a set of +variable/value bindings. +You can evaluate Scheme forms inside a package, or load a file into a package. 
+Packages export sets of bindings; these sets are called \emph{structures}. + +\item [Structure] A structure is a named view on a package---a set of + bindings. Other packages can \emph{open} the structure, importing its + bindings into their environment. Packages can provide more than one + structure, revealing different portions of the package's environment. + +\item [Interface] An interface is the ``type'' of a structure. An + interface is the set of names exported by a structure. These names + can also be marked with other static information (\eg, advisory type + declarations, or syntax information). +\end{description} +More information on the {\scm} module system can be found in the +file \ex{module.ps} in the \ex{doc} directory of the {\scm} and scsh releases. + +Programming Scheme with a module system is different from programming +in older Scheme implementations, +and the associated development problems are consequently different. +In Schemes that lack modular abstraction mechanisms, +everything is accessible; the major problem is preventing name-space conflicts. +In Scheme 48, name-space conflicts vanish; the major problem is that not +all bindings are accessible from every place. +It takes a little extra work to specify what packages export which values. + +It may take you a little while to get used to the new style of program +development. +Although scsh can be used without referring to the module system at +all, we recommend taking the time to learn and use it. +The effort will pay off in the construction of modular, factorable programs. + +\subsubsection{Module warning} +Programmers who open both the \ex{scheme} and \ex{scsh} structures in their +own packages should make sure to always put the \ex{scsh} reference first. 
+\begin{center} +\begin{tabular}{l@{\qquad}l} +Do this: & Not this: \strut \\ +\quad{\begin{codebox}[b] +(define-structure web-server + (open scsh + scheme + net-hax + \vdots) + (file web))\end{codebox}} +& +\quad{\begin{codebox}[b] +(define-structure web-server + (open scheme + scsh + net-hax + \vdots) + (file web))\end{codebox}}\\ +% +Open \ex{scsh} before \ex{scheme}. & +Not \ex{scsh} after \ex{scheme}. +\end{tabular} +\end{center} +Ordering the two packages like this is necessary because scsh overrides +some of the standard R4RS Scheme definitions exported by the \ex{scheme} +package with its own definitions. +For example, scsh's versions of the R4RS I/O functions such as \ex{display} +and \ex{write} take integer file descriptors as arguments, as well as Scheme +ports. +If you open the \ex{scheme} structure before the \ex{scsh} structure, +you'll get the standard {\scm} definitions, which is not what you want. + + +\subsection{Switches} +\label{sec:scsh-switches} +The scsh top-level takes command-line switches in the following format: +% +\codex{scsh [\var{meta-arg}] [\vari{switch}i {\ldots}] + [\var{end-option} \vari{arg}1 {\ldots} \vari{arg}n]} +where +\begin{inset} +\begin{flushleft} +\begin{tabular}{ll@{\qquad}l} +\var{meta-arg:} & \verb|\| \var{script-file-name} \\ +\\ +\var{switch:} & \ex{-e} \var{entry-point} + & Specify top-level entry-point. \\ + + & \ex{-o} \var{structure} + & Open structure in current package. \\ + + & \ex{-m} \var{structure} + & Switch to package. \\ + + & \ex{-n} \var{new-package} + & Switch to new package. \\ \\ + + + & \ex{-lm} \var{module-file-name} + & Load module into config package. \\ + + & \ex{-l} \var{file-name} + & Load file into current package. \\ + + + & \ex{-dm} & Do script module. \\ + & \ex{-ds} & Do script. 
\\ +\\ +\var{end-option:} & \ex{-s} \var{script} \\ + & \ex{-sfd} \var{num} \\ + & \ex{-c} \var{exp} \\ + & \ex{--} +\end{tabular} +\end{flushleft} +\end{inset} +% +These command-line switches +essentially provide a little linker language for linking a shell script or a +program together with {\scm} modules. +The command-line processor serially opens structures and loads code into a +given package. +Switches that side-effect a package operate on a particular ``current'' +package; there are switches to change this package. +(These switches provide functionality equivalent to the interactive + \ex{,open} \ex{,load} \ex{,in} and \ex{,new} commands.) +Except where indicated, switches specify actions that are executed in a +left-to-right order. +The initial current package is the user package, which is completely +empty and opens (imports the bindings of) the R4RS and scsh structures. + +If the Scheme process is started up in an interactive mode, then the current +package in force at the end of switch scanning is the one inside which +the interactive read-eval-print loop is started. + +The command-line switch processor works in two passes: +it first parses the switches, building a list of actions to perform, +then the actions are performed serially. +The switch list is terminated by one of the \var{end-option} switches. +The \vari{arg}{i} arguments occurring after an end-option switch are +passed to the scsh program as the value of \ex{command-line-arguments} +and the tail of the list returned by \ex{(command-line)}. +That is, an \var{end-option} switch separates switches that control +the scsh ``machine'' from the actual arguments being passed to the scsh +program that runs on that machine. + +The following switches and end options are defined: +\begin{itemize} +\def\Item#1{\item{\ex{#1}}\\} + +\Item{-o \var{struct}} + Open the structure in the current package. + +\Item{-n \var{package}} + Make and enter a new package. 
The package has an associated structure + named \var{package} with an empty export list. + If \var{package} is the string ``\ex{\#f}'', + the new package is anonymous, with no associated named structure. + + The new package initially opens no other structures, + not even the R4RS bindings. You must follow a ``\ex{-n foo}'' + switch with ``\ex{-o scheme}'' to access the standard identifiers such + as \ex{car} and \ex{define}. + +\Item{-m \var{struct}} + Change the current package to the package underlying + structure \var{struct}. + (The \ex{-m} stands for ``module.'') + +\Item{-lm \var{module-file-name}} + Load the specified file into scsh's config package --- the file + must contain source written in the Scheme 48 module language + (``load module''). Does not alter the current package. + +\Item{-l \var{file-name}} + Load the specified file into the current package. + +\Item{-c \var{exp}} + Evaluate expression \var{exp} in the current package and exit. + This is called \ex{-c} after a common shell convention (see sh and csh). + The expression is evaluated in the current package (and hence is + affected by \ex{-m}'s and \ex{-n}'s.) + + When the scsh top-level constructs the scsh command-line in this case, + it takes \ex{"scsh"} to be the program name. + This switch terminates argument scanning; following args become + the tail of the command-line list. + +\Item{-e \var{entry-point}} + Specify an entry point for a program. The \var{entry-point} is + a variable that is taken from the current package in force at the end + of switch evaluation. The entry point does not have to be exported + by the package in a structure; it can be internal to the package. + The top level passes control to the entry point by applying it to + the command-line list (so programs executing in private + packages can reference their command-line arguments without opening + the \ex{scsh} package to access the \ex{(command-line)} procedure). 
+ Note that, like the list returned by the \ex{(command-line)} procedure, + the list passed to the entry point includes the name + of the program being executed (as the first element of the list), + not just the arguments to the program. + + A \ex{-e} switch can occur anywhere in the switch list, but it is the + \emph{last} action performed by switch scanning if it occurs. + (We violate ordering here as the shell-script \ex{\#!} mechanism + prevents you from putting the \emph{-e} switch last, where it belongs.) + +\Item{-s \var{script}} + Specify a file to load. + A \ex{-ds} (do-script) or \ex{-dm} (do-module) switch occurring earlier in + the switch list gives the place where the script should be loaded. If + there is no \ex{-ds} or \ex{-dm} switch, then the script is loaded at the + end of switch scanning, into the module that is current at the end of + switch scanning. + + We use the \ex{-ds} switch to violate left-to-right switch execution order + as the \ex{-s} switch is \emph{required} to be last + (because of the \ex{\#!} machinery), + independent of when/where in the switch-processing order + it should be loaded. + + When the scsh top-level constructs the scsh command-line in this case, + it takes \var{script} to be the program name. + This switch terminates switch parsing; following args are ignored + by the switch-scanner and are passed through to the program as + the tail of the command-line list. + +\Item{-sfd \var{num}} + Loads the script from file descriptor \var{num}. + This switch is like the \ex{-s} switch, + except that the script is loaded from one of the process' open input + file descriptors. + For example, to have the script loaded from standard input, specify + \ex{-sfd 0}. + +\Item{--} + Terminate argument scanning and start up scsh in interactive mode. 
+ If the argument list just runs out, without either a terminating + \ex{-s} or \ex{--} arg, then scsh also starts up in interactive mode, + with an empty \ex{command-line-arguments} list + (for example, simply entering \ex{scsh} at a shell prompt with no + args at all). + + When the scsh top-level constructs the scsh command-line in this case, + it takes \ex{"scsh"} to be the program name. + This switch terminates switch parsing; following args are ignored + by the switch-scanner and are passed through to the program as + the tail of the command-line list. + +\Item{-ds} + Specify when to load the script (``do-script''). If this switch occurs, + the switch list \emph{must} be terminated by a \ex{-s \var{script}} + switch. The script is loaded into the package that is current at the + \ex{-ds} switch. + +\Item{-dm} + As above, but the current module is ignored. The script is loaded into the + \ex{config} package (``do-module''), and hence must be written in the + {\scm} module language. + This switch doesn't affect the current module---after executing this + switch, the current module is the same as it was before. + + This switch is provided to make it easy to write shell scripts in the + {\scm} module language. +\end{itemize} + +\subsection{The meta argument} +\label{sec:meta-arg} +The scsh switch parser takes a special command-line switch, +a single backslash called the ``meta-argument,'' which is useful for +shell scripts. +If the initial command-line argument is a ``\verb|\|'' +argument, followed by a filename argument \var{fname}, scsh will open the file +\var{fname} and read more arguments from the second line of this file. +This list of arguments will then replace the ``\verb|\|'' argument---\ie, +the new arguments are inserted in front of \var{fname}, +and the argument parser resumes argument scanning. +This is used to overcome a limitation of the \ex{\#!} feature: +the \ex{\#!} line can only specify a single argument after the interpreter. 
+For example, we might hope the following scsh script, \ex{ekko}, +would implement a simple-minded version of the Unix \ex{echo} program: +\begin{code} +#!/usr/local/bin/scsh -e main -s +!# +(define (main args) + (map (\l{arg} (display arg) (display " ")) + (cdr args)) + (newline))\end{code} +% +The idea would be that the command + \codex{ekko Hi there.} +would be expanded by the \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=exec&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{exec(2)}} kernel call into +% +\begin{code} +/usr/local/bin/scsh -e main -s ekko Hi there.\end{code} +% +In theory, this would cause scsh to start up, load in file \ex{ekko}, +call the entry point on the command-line list +\codex{(main '("ekko" "Hi" "there."))} +and exit. + +Unfortunately, the {\Unix} \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=exec&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{exec(2)}} syscall's support for scripts is +not very general or well-designed. +It will not handle multiple arguments; +the \ex{\#!} line is usually required to contain no more than 32 characters; +it is not recursive. +If these restrictions are violated, most Unix systems will not provide accurate +error reporting, but either fail silently, or simply incorrectly implement +the desired functionality. +These are the facts of Unix life. + +In the \ex{ekko} example above, our \ex{\#!} trigger line has three +arguments (``\ex{-e}'', ``\ex{main}'', and ``\ex{-s}''), so it will not +work. +The meta-argument is how we work around this problem. +We must instead invoke the scsh interpreter with the single \cd{\\} argument, +and put the rest of the arguments on line two of the program. 
+Here's the correct program: +% +\begin{code} +#!/usr/local/bin/scsh \\ +-e main -s +!# +(define (main args) + (map (\l{arg} (display arg) (display " ")) + (cdr args)) + (newline))\end{code} +% +Now, the invocation starts as + \codex{ekko Hi there.} +and is expanded by exec(2) into +\begin{code} +/usr/local/bin/scsh \\ ekko Hi there.\end{code} +When scsh starts up, it expands the ``\cd{\\}'' argument into the arguments +read from line two of \ex{ekko}, producing this argument list: +\begin{code}\cddollar +\underline{-e main -s ekko} Hi there. + $\uparrow$ +{\rm{}Expanded from} \cd{\\} ekko\end{code} +% +With this argument list, processing proceeds as we intended. + +\subsubsection{Secondary argument syntax} +Scsh uses a very simple grammar to encode the extra arguments on +the second line of the scsh script. +The only special characters are space, tab, newline, and backslash. +\begin{itemize} +\item Each space character terminates an argument. + This means that two spaces in a row introduce an empty-string argument. + +\item The tab character is not permitted + (unless you quote it with the backslash character described below). + This is to prevent the insidious bug where you believe you have + six space characters, but you really have a tab character, + and \emph{vice-versa}. + +\item The newline character terminates an argument, like the space character, + and also terminates the argument sequence. + This means that an empty line parses to the singleton list whose one + element is the empty string: \ex{("")}. + The grammar doesn't admit the empty list. + +\item The backslash character is the escape character. + It escapes backslash, space, tab, and newline, turning off their + special functions, and allowing them to be included in arguments. + The {\Ansi} C escape sequences (\verb|\b|, \verb|\n|, \verb|\r| + and \verb|\t|) are also supported; + these also produce argument-constituents---\verb|\n| doesn't act + like a terminating newline. 
+ The escape sequence \verb|\|\emph{nnn} for \emph{exactly} three + octal digits reads as the character whose {\Ascii} code is \emph{nnn}. + It is an error if backslash is followed by just one or two octal digits: + \verb|\3Q| is an error. + Octal escapes are always constituent chars. + Backslash followed by other chars is not allowed + (so we can extend the escape-code space later if we like). +\end{itemize} + +You have to construct these line-two argument lines carefully. +In particular, beware of trailing spaces at the end of the line---they'll +give you extra trailing empty-string arguments. +Here's an example: +% +\begin{inset} +\begin{verbatim} +#!/bin/interpreter \ +foo bar quux\ yow\end{verbatim} +\end{inset} +% +would produce the arguments +% +\codex{("foo" "bar" "" "quux yow")} + +\subsection{Examples} + +\begin{itemize} +\def\Item#1{\item{\ex{#1}}\\} +\def\progItem#1{\item{Program \ex{#1}}\\} + +\Item{scsh -dm -m myprog -e top -s myprog.scm} + Load \ex{myprog.scm} into the \ex{config} package, then shift to the + \ex{myprog} package and call \ex{(top '("myprog.scm"))}, then exit. + This sort of invocation is typically used in \ex{\#!} script lines + (see below). + +\Item{scsh -c '(display "Hello, world.")'} + A simple program. + +\Item{scsh -o bigscheme} + Start up interactively in the user package after opening + structure \ex{bigscheme}. + +\Item{scsh -o bigscheme -- Three args passed} + Start up interactively in the user package after opening \ex{bigscheme}. + The \ex{command-line-args} variable in the scsh package is bound to the + list \ex{("Three" "args" "passed")}, and the \ex{(command-line)} + procedure returns the list \ex{("scsh" "Three" "args" "passed")}. 
+ + +\progItem{ekko} +This shell script, called \ex{ekko}, implements a version of +the Unix \ex{echo} program: +\begin{code} +#!/usr/local/bin/scsh -s +!# +(for-each (\l{arg} (display arg) (display " ")) + command-line-args)\end{code} + +Note this short program is an example of a \emph{script}---it +executes as it loads. +The Unix rule for executing \ex{\#!} shell scripts causes +\codex{ekko Hello, world.} +to expand as +\codex{/usr/local/bin/scsh -s ekko Hello, world.} + +\progItem{ekko} +This is the same program, \emph{not} as a script. +Writing it this way makes it possible to compile the program +(and then, for instance, dump it out as a heap image). +% +\begin{code} +#!/usr/local/bin/scsh \\ +-e top -s +!# +(define (top args) + (for-each (\l{arg} (display arg) (display " ")) + (cdr args)))\end{code} +% +The \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=exec&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{exec(2)}} expansion of the \ex{\#!} line together with +the scsh expansion of the ``\verb|\ ekko|'' meta-argument +(see section~\ref{sec:meta-arg}) gives the following command-line expansion: +\begin{code} +ekko Hello, world. + {\evalto} /usr/local/bin/scsh \\ ekko Hello, world. + {\evalto} /usr/local/bin/scsh -e top -s ekko Hello, world.\end{code} + +\progItem{sort} +This is a program to replace the Unix \ex{sort} utility---sorting lines +read from stdin, and printing the results on stdout. +Note that the source code defines a general sorting package, +which is useful (1) as a Scheme module exporting sort procedures +to other Scheme code, and (2) as a standalone program invoked from +the \ex{top} procedure. +\begin{code} +#!/usr/local/bin/scsh \\ +-dm -m sort-toplevel -e top -s +!# + +;;; This is a sorting module. TOP procedure exports +;;; the functionality as a Unix program akin to sort(1). 
+(define-structures ((sort-struct (export sort-list + sort-vector!)) + (sort-toplevel (export top))) + (open scheme) + + (begin (define (sort-list elts <=) {\ldots}) + (define (sort-vector! vec <=) {\ldots}) + + ;; Parse the command line and + ;; sort stdin to stdout. + (define (top args) + {\ldots})))\end{code} + +The expansion below shows how the command-line scanner +(1) loads the config file \ex{sort} (written in the {\scm} module language), +(2) switches to the package underlying the \ex{sort-toplevel} structure, +(3) calls \ex{(top '("sort" "foo" "bar"))} in the package, and finally +(4) exits. +% +{\small +\begin{centercode} +sort foo bar +{\evalto} /usr/local/bin/scsh \\ sort foo bar +{\evalto} /usr/local/bin/scsh -dm -m sort-toplevel -e top -s sort foo bar\end{centercode}} + +An alternate method would have used a +\begin{code} +-n #f -o sort-toplevel\end{code} +sequence of switches to specify a top-level package. + +\end{itemize} + +Note that the sort example can be compiled into a Unix program by +loading the file into an scsh process, and dumping a heap with top-level +\ex{top}. Even if we don't want to export the sort's functionality as a +subroutine library, it is still useful to write the sort program with the +module language. The command line design allows us to run this program as +either an interpreted script (given the \ex{\#!} args in the header) or as a +compiled heap image. + +\subsection{Process exit values} +Scsh ignores the value produced by its top-level computation when determining +its exit status code. +If the top-level computation completed with no errors, +scsh dies with exit code 0. +For example, a scsh process whose top-level is specified by a \ex{-c \var{exp}} +or a \ex{-e \var{entry}} entry point ignores the value produced +by evaluating \var{exp} and calling \var{entry}, respectively. +If these computations terminate with no errors, the scsh process +exits with an exit code of 0. 
+ +To return a specific exit status, use the \ex{exit} procedure explicitly, \eg, +\begin{tightcode} +scsh -c \\ + "(exit (status:exit-val (run (| (fmt) (mail shivers)))))"\end{tightcode} + +\section{The scsh virtual machine} +To run the {\scm} implementation of scsh, you run a specially modified +copy of the {\scm} virtual machine with a scsh heap image. +The scsh binary is actually nothing but a small cover program that invokes the +byte-code interpreter on the scsh heap image for you. +This allows you to simply start up an interactive scsh from a command +line, as well as write shell scripts that begin with the simple trigger +\codex{\#!/usr/local/bin/scsh -s} + +You can also directly execute the virtual machine, +which takes its own set of command-line switches. +For example, +this command starts the vm up with a 1Mword heap (split into two semispaces): + \codex{scshvm -o scshvm -h 1000000 -i scsh.image arg1 arg2 \ldots} +The vm peels off initial vm arguments +up to the \ex{-i} heap image argument, which terminates vm argument parsing. +The rest of the arguments are passed off to the scsh top-level. +Scsh's top-level removes scsh switches, as discussed in the previous section; +the rest show up as the value of \ex{command-line-arguments}. + +Directly executing the vm can be useful to specify non-standard switches, or +invoke the virtual machine on special heap images, which can contain +pre-compiled scsh programs with their own top-level procedures. 
+ +\subsection{VM arguments} +\label{sec:vm-args} + +The vm takes arguments in the following form: +\codex{scshvm [\var{meta-arg}] [\var{vm-options}\+] [\var{end-option} \var{scheme-args}]} +where +\begin{inset} +\begin{tabular}{ll} +\var{meta-arg:} & \verb|\ |\var{filename} \\ +\\ +\var{vm-option}: & \ex{-h }\var{heap-size-in-words} \\ + & \ex{-s }\var{stack-size-in-words} \\ + & \ex{-o }\var{object-file-name} \\ +\\ +\var{end-option:} & \ex{-i }\var{image-file-name} \\ + & \ex{--} +\end{tabular} +\end{inset} + +The vm's meta-switch ``\verb|\ |\var{filename}'' is handled the same +as scsh's meta-switch, and serves the same purpose. + +\subsubsection{VM options} +The \ex{-o \var{object-file-name}} switch tells the vm where to find +relocation information for its foreign-function calls. +Scsh will use a pre-compiled default if it is not specified. +Scsh \emph{must} have this information to run, +since scsh's syscall interfaces are done with foreign-function calls. + +The \ex{-h} and \ex{-s} options tell the vm how much space to allocate +for the heap and stack. +The heap size value is the total number of words allocated for the heap; +this space is then split into two semi-spaces for {\scm}'s stop-and-copy +collector. + +\subsubsection{End options} +End options terminate argument parsing. +The \ex{-i} switch is followed by the name of a heap image for the +vm to execute. +The \var{image-file-name} string is also taken to be the name of the program +being executed by the VM; this name becomes the head of the argument +list passed to the heap image's top-level entry point. +The tail of the argument list is constructed from all following arguments. + +The \ex{--} switch terminates argument parsing without giving +a specific heap image; the vm will start up using a default +heap (whose location is compiled into the vm). +All the following arguments comprise the tail of the list passed off to +the heap image's top-level procedure. 
+ +Notice that you are not allowed to pass arguments to the heap image's +top-level procedure (\eg, scsh) without delimiting them with \ex{-i} +or \ex{--} flags. + +\subsection{Inserting interpreter triggers into heap images} +{\scm}'s heap image format allows for an informational header: +when the vm loads in a heap image, it ignores all data occurring before +the first control-L character (\textsc{Ascii} 12). +This means that you can insert a ``\ex{\#!}'' trigger line into a +heap image, making it a form of executable ``shell script.'' +Since the vm requires multiple arguments to be given on the command +line, you must use the meta-switch. +Here's an example heap-image header: +\begin{code} +#!/usr/local/lib/scsh/scshvm \\ +-o /usr/local/lib/scsh/scshvm -i +{\ldots} \textnormal{\emph{Your heap image goes here}} \ldots\end{code} + +\subsection{Inserting a double-level trigger into Scheme programs} +If you're a nerd, you may enjoy doing a double-level machine shift +in the trigger line of your Scheme programs with the following magic: +\begin{code}\small +#!/usr/local/lib/scsh/scshvm \\ +-o /usr/local/lib/scsh/scshvm -i /usr/local/lib/scsh/scsh.image -s +!# +{\ldots} \textnormal{\emph{Your Scheme program goes here}} \ldots\end{code} + +\section{Compiling scsh programs} +Scsh allows you to create a heap image with your own top-level procedure. +Adding the pair of lines +\begin{code} +#!/usr/local/lib/scsh/scshvm \\ +-o /usr/local/lib/scsh/scshvm -i\end{code} +to the top of the heap image will turn it into an executable {\Unix} file. + +You can create heap images with the following two procedures. + +\defun{dump-scsh-program}{main fname}{\undefined} +\begin{desc} + This procedure writes out a scsh heap image. When the + heap image is executed by the {\scm} vm, it will call + the \var{main} procedure, passing it the vm's argument list. + When \ex{main} returns an integer value $i$, the vm exits with + exit status $i$. 
+ The {\Scheme} vm will parse command-line switches as + described in section~\ref{sec:vm-args}; remaining arguments + form the tail of the command-line list that is passed to \ex{main}. + (The head of the list is the name of the program being executed + by the vm.) + Further argument parsing + (as described for scsh in section~\ref{sec:scsh-switches}) + is not performed. + + The heap image created by \ex{dump-scsh-program} has unused + code and data pruned out, so small programs compile to much smaller + heap images. +\end{desc} + +\defun{dump-scsh}{fname}{\undefined} +\begin{desc} + This procedure writes out a heap image with the standard + scsh top-level. + When the image is resumed by the vm, it will parse and + execute scsh command-line switches as described in section + \ref{sec:scsh-switches}. + + You can use this procedure to write out custom scsh heap images + that have specific packages preloaded and start up in specific + packages. +\end{desc} + +Unfortunately, {\scm} does not support separate compilation of +Scheme files or Scheme modules. +The only way to compile is to load source and then dump out a +heap image. +One occasionally hears rumours that this is being addressed +by the {\scm} development team. + +\section{Statically linking heap images} +The static heap linker converts a {\scm} bytecode image contained +in a .image file to a C representation. This C code is then compiled and +linked in with a virtual machine, producing a single executable. +Some of the benefits are: +\begin{itemize} + \item Instantaneous start-up time. + \item Improved paging; scsh images can be shared between different + processes. + \item Vastly reduced GC copying---the whole initial image + is moved out of the heap, and neither traced nor copied. + \item Result program no longer depends on the filesystem for its + initial image. 
+\end{itemize} + +The static heap linker takes arguments in the following form: +\codex{scsh-hlink \var{image} \var{executable} [\var{option} \ldots]} +It reads in the heap image \var{image}, translates it into C code, +compiles the C code, and links it against the scsh vm, producing the +standalone binary file \var{executable}. + +Each C file represents part of the heap image as a constant C \ex{long} vector +that looks something like this: +{\small\begin{verbatim} +const long p116[]={0x882,0x24,0x19, + 0x882,(long)(&p19[785])+7,(long)(&p119[125])+7, + 0x882,(long)(&p119[128])+7,(long)(&p119[131])+7, + 0x882,(long)(&p102[348])+7,(long)(&p3[114])+7, + 0xfc2,0x2030200,0x7100209,0x1091002,0x1c075a, + 0x882,(long)(&p29[1562])+7,(long)(&p119[137])+7, + 0x882,(long)(&p78[692])+7,(long)(&p119[140])+7, + . + . + . + }; +\end{verbatim}}% +% +Translating to a C declaration gives us freedom from the various +object-file formats.\footnote{This idea is due to Jonathan Rees.} +Note that the const declaration allows the compiler to put this array in the +text pages of the executable. +The heap is split into parts because many C compilers cannot handle +multi-megabyte initialised vector declarations. + +The allowed options to the heap linker are: +\begin{itemize} +\def\Item#1{\item{\ex{#1}}\\} + +\Item{--temp \var{dir}} The temporary directory to hold .c and .o files. + The default is typically configured to be + \ex{/usr/tmp}, and can be overridden by the + environment variable \ex{TMPDIR}. + Example: + \codex{--temp /tmp} + +\Item{--cc \var{command}} The command to run the C compiler. + The default can be overridden by the environment + variable \ex{CC}. + Example: + \codex{--cc "gcc -g -O"} + +\Item{--ld \var{command}} The arguments to run the C compiler as a linker. + The default can be overridden by the + environment variable \ex{LDFLAGS}. + Example: + \codex{--ld "-Wl,-E"} + +\Item{--libs \var{libs}} The libraries needed to link the VM and heap. 
+ The default can be overridden by the + environment variable \ex{LIBS}. + Example: + \codex{--libs "-ldld -lld -lm"} +\end{itemize} + +Be warned that the current heap linker has many shortcomings. +\begin{itemize} +\item It is extremely slow. Really, really slow. Translating the standard + scsh heap image into a standalone binary takes well over an hour on a + 40MB/133MHz Pentium system. + A memory-starved 486 could take all night. + +\item It cannot be applied to itself. The current implementation + works by replacing some of the heap-dumping code. This means + you cannot load the heap-linker code into a scsh system and + subsequently use \ex{dump-scsh-program} to create a heap-linker + heap image. + +\item The interface leaves a lot to be desired. + \begin{itemize} + \item It requires the heap image to be referenced by a file-name; + the linker will not allow you to feed it the input heap image + on a port. + \item The heap-image is linked against the vm contained in +\begin{tightcode} +/usr/local/lib/scsh/libscshvm.a\end{tightcode} + This is wired in at the time scsh is installed on your system. + \item There is no Scheme procedural interface. + \end{itemize} + +\item The program produced uses the default VM argv parser \verb|process_args| + from the scsh source file \ex{main.c} to process the command line + before handing it off to the heap image's top-level procedure. + This is not what you want for many programs. + + The system needs to be changed to allow users to override this default + with their own VM argument parsers. + +\item A possible problem is the Unix limits on the number of command + line arguments. The heap-linker calls the C linker with a large number of + object files. It's conceivable that on some Unix systems this could fail + now or if scsh grows in the future. The solution could be to create + library archives of a few dozen files and then link the resulting few dozen + library archives to make the executable.
+\end{itemize} + +In spite of these many shortcomings, we are providing the static linker +as it stands in this release so that people may get some experience with +it. + +Here is an example of how one might use the heap linker: +\begin{code} + scsh-hlink scsh.image fastscsh\end{code} + +We'd love it if someone would dive into the source and improve it. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Standard file locations} +Because the scshvm binary is intended to be used for writing shell +scripts, it is important that the binary be installed in a standard +place, so that shell scripts can dependably refer to it. +The standard directory for the scsh tree should be \ex{/usr/local/lib/scsh/}. +Whenever possible, the vm should be located in + \codex{/usr/local/lib/scsh/scshvm} +and a scsh heap image should be located in + \codex{/usr/local/lib/scsh/scsh.image} +The top-level scsh program should be located in + \codex{/usr/local/lib/scsh/scsh} +with a symbolic link to it from + \codex{/usr/local/bin/scsh} + +The {\scm} image format allows heap images to have \ex{\#!} triggers, +so \ex{scsh.image} should have a \ex{\#!} trigger of the following form: +\begin{code} +#!/usr/local/lib/scsh/scshvm \\ +-o /usr/local/lib/scsh/scshvm -i +{\ldots} \textnormal{\emph{heap image goes here}} \ldots\end{code} + diff --git a/doc/scsh-manual/sre.tex b/doc/scsh-manual/sre.tex new file mode 100644 index 0000000..15b0c74 --- /dev/null +++ b/doc/scsh-manual/sre.tex @@ -0,0 +1,1477 @@ +%latex -*- latex -*- +% Many of the \object's should be \values or something. 
+% look for "...", *...*, hand-inset code blocks + +%\documentclass[twoside]{report} +%\usepackage{code,boxedminipage,makeidx,palatino,ct, +% headings,mantitle,array,matter,mysize10} + +\newcommand{\anglequote}[1]{{$<\!\!<$}#1$>\!\!>$} + +% Style issues +%\parskip = 3pt plus 3pt +%\sloppy + +%\input{decls} +%\begin{document} + +%\mainmatter + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\chapter{Pattern-matching strings with regular expressions} +\label{chapt:sre} + +Scsh provides a rich facility for matching regular-expression patterns +in strings. +The system is composed of several pieces: +\begin{itemize} + +\item An s-expression notation for writing down general regular expressions. + In most systems, regexp patterns are encoded as string literals, such + as \verb+"g(oo|ee)se"+. + In scsh, they are written using s-expressions, such as + \verb+(: "g" (| "oo" "ee") "se")+, and are called \emph{sre's}. + The sre notation has several + advantages over the traditional string-based notation. It's more expressive, + can be commented, and can be indented to expose the structure of the form. + +\item An abstract data type (ADT) representation for regexp values. + Traditional regular-expression systems compute regular expressions + from run-time values using strings. This can be awkward. Scsh, instead, + provides a separate data type for regexps, with a set of basic constructor + and accessor functions; regular expressions can be dynamically computed + and manipulated using these functions. + +\item Some tools that work on the regexp ADT: case-sensitive to case-insensitive + regexp transform, a regexp simplifier, and so forth. + +\item Parsers and unparsers that can convert between external representations + and the regexp ADT.
The supported external representations are + \begin{itemize} + \item Posix strings + \item S-expression notation (that is, sre's) + \end{itemize} + Being able to convert regexps to Posix strings allows implementations + to implement regexp matching using standard Posix C-based engines. + +\item Macro support for the s-expression notation. + The \ex{rx} macro provides a new special form that allows you to embed + regexps in the s-expression notation within a Scheme program. Evaluating + the macro form produces a regexp ADT value which can be used by + Scheme pattern-matching procedures and other regexp consumers. + +\item Pattern-matching and searching procedures. + Spencer's Posix regexp engine is linked in to the runtime; the + regexp code uses this engine to provide text matching. +\end{itemize} + +The regexp language supported is a complete superset of Posix functionality, +providing: +\begin{itemize} +\item sequencing and choice (\ex{|}) +\item repetition (\ex{*}, \ex{+}, \ex{?}, \ex{\{$m$,$n$\}}) +\item character classes (\eg, \ex{[aeiou]}) and wildcard (\ex{.}) +\item beginning/end of string anchors (\verb|^|, \verb|$|) +\item beginning/end of line anchors +\item beginning/end of word anchors +\item case-sensitivity control +\item submatch-marking +\end{itemize} + + +\section{Summary SRE syntax} +The following figures give a summary of the SRE syntax; +the next section is a friendlier tutorial introduction. + +\newlength{\foolength} +\def\srecomment#1{\multicolumn{2}{l}% + {\qquad\setlength{\foolength}{\textwidth}% + \addtolength{\textwidth}{-4em}\begin{tabular}{p{\textwidth}}#1\end{tabular}}} +\begin{boxedfigure}{tbhp} +\begin{tabular}{lp{3in}} +\var{string} & + Literal match---interpreted relative to + the current case-sensitivity lexical context + (default is case-sensitive) \\ +\\ +\ex{(\var{string1} \var{string2} {\ldots})} & + Set of chars, \eg, \ex{("abc" "XYZ")}. + Interpreted relative to the current + case-sensitivity lexical context. 
\\ +\\ +\ex{(* \var{sre} {\ldots})} & 0 or more matches \\ +\ex{(+ \var{sre} {\ldots})} & 1 or more matches \\ +\ex{(? \var{sre} {\ldots})} & 0 or 1 matches \\ +\ex{(= \var{n} \var{sre} {\ldots})} & \var{n} matches \\ +\ex{(>= \var{n} \var{sre} {\ldots})} & \var{n} or more matches \\ +\ex{(** \var{n} \var{m} \var{sre} {\ldots})} & \var{n} to \var{m} matches \\ +\srecomment{ + \var{N} and \var{m} are Scheme expressions producing non-negative + integers. \\ + \var{M} may also be \ex{\#f}, meaning ``infinity.''} \\ +\\ +\ex{(| \var{sre} {\ldots})} & Choice (\ex{or} is R5RS symbol; \\ +\ex{(or \var{sre} {\ldots})} & \ex{|} is not specified by R5RS.) \\ +\\ +\ex{(: \var{sre} {\ldots})} & Sequence (\ex{seq} is legal \\ +\ex{(seq \var{sre} {\ldots})} & Common Lisp symbol) \\ +\\ +\ex{(submatch \var{sre} {\ldots})} & Numbered submatch \\ +\\ +\ex{(dsm \var{pre} \var{post} \var{sre} {\ldots})} & Deleted submatches \\ + \srecomment{\var{Pre} and \var{post} are numerals.} \\ +\\ +\ex{(uncase \var{sre} {\ldots})} & Case-folded match \\ +\ex{(w/case \var{sre} {\ldots})} & Introduce a lexical case-sensitivity \\ +\ex{(w/nocase \var{sre} {\ldots})} & context. 
\\ +\\ +\ex{,@\var{exp}} & Dynamically computed regexp \\ +\ex{,\var{exp}} & Same as ,@\var{exp}, but no submatch info \\ + \srecomment{\var{Exp} must produce a character, string, + char-set, or regexp.} \\ +\\ +\ex{bos eos} & Beginning/end of string \\ +\ex{bol eol} & Beginning/end of line \\ +\ex{bow eow} & Beginning/end of word \\ +\end{tabular} +\caption{SRE syntax summary (part 1)} +\end{boxedfigure} + +\begin{boxedfigure}{tbhp} +\begin{tabular}{lp{3in}} +\ex{(word \var{sre} {\ldots})} & (: bow \var{sre} {\ldots} eow) \\ +\ex{(word+ \var{cset-sre} {\ldots})} + & \cd{(word (+ (& (| alphanumeric "_")} \\ + & \cd{ (| \var{cset-sre} {\ldots}))))} \\ +\ex{word} & \ex{(word+ any)} \\ +\\ +\ex{(posix-string \var{string})} & Escape for Posix string notation \\ +\\ +\ex{\var{char}} & Singleton char set \\ +\ex{\var{class-name}} & alphanumeric, whitespace, \etc \\ + \srecomment{These two forms are interpreted subject to + the lexical case-sensitivity context.} \\ +\\ +\cd{(~ \var{cset-sre} {\ldots})} & Complement-of-union (\cd{[^{\ldots}]}) \\ +\ex{(- \var{cset-sre} {\ldots})} & Difference \\ +\cd{(& \var{cset-sre} {\ldots})} & Intersection \\ +\\ +\ex{(/ \var{range-spec} {\ldots})} & Character range---interpreted + subject to + the lexical case-sensitivity context \\ +\end{tabular} +\caption{SRE syntax summary (part 2)} +\end{boxedfigure} + +\begin{boxedfigure}{tbhp} +{\tt +\begin{tabular}{l@{\quad\texttt{|}\quad}ll} +\multicolumn{1}{l}{\var{class-name}\quad ::=\quad} & any \\ + & nonl \\ + & lower-case & | lower \\ + & upper-case & | upper \\ + & alphabetic & | alpha \\ + & numeric & | digit | num \\ + & alphanumeric & | alnum \\ + & punctuation & | punct \\ + & graphic & | graph \\ + & whitespace & | space | white \\ + & printing & | print \\ + & control & | cntrl \\ + & hex-digit & | xdigit | hex \\ + & ascii +\end{tabular} +\\[2ex] +\ex{\var{range-spec} ::= \var{string} | \var{char}} \\ +} +The chars are taken in pairs to form inclusive ranges.
+ +\caption{SRE character-class names and range specs.} +\end{boxedfigure} + + +\begin{boxedfigure}{tbhp} +\begin{verbatim} + ::= (~ ...) Set complement-of-union + | (- ...) Set difference + | (& ...) Intersection + | (| ...) Set union + | (/ ...) Range + + | () Constant set + | Singleton constant set + | For 1-char string "c" + + | Constant set + + | , evals to a char-set, + | ,@ char, single-char string, + or re-char-set regexp. + + | (uncase ) Case-folding + | (w/case ) + | (w/nocase ) +\end{verbatim} +\caption{%The \cd{~}, \cd{-}, \cd{&}, and \cd{word+} operators may only be + applied to SRE's that specify character sets. + These are the ``type-checking'' rules for character-set SRE's.} +\end{boxedfigure} + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Examples} + +\begin{widecode} +(- alpha ("aeiouAEIOU")) ; Various forms of +(- alpha ("aeiou") ("AEIOU")) ; non-vowel letter +(w/nocase (- alpha ("aeiou"))) +(- (/"azAZ") ("aeiouAEIOU")) +(w/nocase (- (/"az") ("aeiou"))) + +;;; Upper-case letter, lower-case vowel, or digit +(| upper ("aeiou") digit) +(| (/"AZ09") ("aeiou")) + +;;; Not an SRE, but Scheme code containing some embedded SREs. +(let* ((ws (rx (+ whitespace))) ; Seq of whitespace + (date (rx (: (| "Jan" "Feb" "Mar" ...) ; A month/day date. + ,ws + (| ("123456789") ; 1-9 + (: ("12") digit) ; 10-29 + "30" "31"))))) ; 30-31 + + ;; Now we can use DATE several times: + (rx ... ,date ... (* ... ,date ...) + ... .... ,date)) + +;;; More Scheme code +(define (csl re) ; A comma-separated list of RE's is + (rx (| "" ; either zero of them (empty string), or + (: ,re ; one RE, followed by + (* ", " ,re))))) ; Zero or more comma-space-RE matches. + +(csl (rx (| "John" "Paul" "George" "Ringo")))\end{widecode} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{A short tutorial} + +S-expression regexps are called "SRE"s. 
Keep in mind that they are \emph{not} +Scheme expressions; they are another, separate notation that is expressed +using the underlying framework of s-expression list structure: lists, +symbols, {\etc} SRE's can be \emph{embedded} inside of Scheme expressions using +special forms that extend Scheme's syntax (such as the \ex{rx} macro); +there are places in the SRE +grammar where one may place a Scheme expression. +In these ways, SRE's and Scheme expressions can be intertwined. +But this isn't fundamental; +SRE's may be used in a completely Scheme-independent context. +By simply restricting the notation to eliminate two special +Scheme-embedding forms, they can be a completely independent notation. + +\paragraph{Constant strings} + +The simplest SRE is a string, denoting a constant regexp. For example, the SRE +\begin{code} + "Spot"\end{code} +% +matches only the string +\anglequote{capital-S, little-p, little-o, little-t}. +There is no interpretation of the characters in the string at all---the SRE +\begin{code} + ".*["\end{code} +% +matches the string \anglequote{period, asterisk, open-bracket}. + + +\paragraph{Simple character sets} + +To specify a set of characters, write a list whose single element is +a string containing the set's elements. So the SRE +\begin{code} + ("aeiou")\end{code} +% +only matches a vowel. One way to think of this, notationally, is that the +set brackets are \ex{("} and \ex{")}. + + +\paragraph{Wild card} + +Another simple SRE is the symbol \ex{any}, +which matches any single character---including newline and \textsc{Ascii} nul. + + +\paragraph{Sequences} + +We can form sequences of SRE's with the SRE \ex{(: \var{sre} \ldots)}. +So the SRE +\begin{code} + (: "x" any "z")\end{code} +% +matches any three-character string starting with ``x'' and ending with ``z''. 
+As we'll see shortly, many SRE forms have bodies that are implicit sequences of +other SRE's, analogous to the manner in which the body of a Scheme +\ex{lambda} or \ex{let} expression is an implicit \ex{begin} sequence. +The regexp \ex{(seq \var{sre} \ldots)} is +completely equivalent to \ex{(: \var{sre} \ldots)}; +it's included in order to have a syntax that doesn't require +\ex{:} to be a legal symbol.\footnote{That is, for use within s-expression +syntax frameworks that, unlike R5RS, don't allow for \ex{:} as a legal symbol. +A Common Lisp embedding of SREs, for example, would need to use +\ex{seq} instead of \ex{:}.} + + +\paragraph{Choices} + +The SRE \ex{(| \var{sre} \ldots)} is a regexp that matches anything any of the +\var{sre} regexps match. So the regular expression +\begin{code} + (| "sasha" "Pete")\end{code} +% +matches either the string ``sasha'' or the string ``Pete''. The regexp +\begin{code} + (| ("aeiou") ("0123456789"))\end{code} +% +is the same as +\begin{code} + ("aeiou0123456789") \end{code} +% +The regexp \ex{(or \var{sre} \ldots)} is completely equivalent to +\ex{(| \var{sre} \ldots)}; +it's included in order to have a syntax that doesn't require \ex{|} to be a +legal symbol. + + +\paragraph{Repetition} + +There are several SRE forms that match multiple occurrences of a regular +expression. For example, the SRE \ex{(* \var{sre} \ldots)} matches zero or more +occurrences of the sequence \ex{(: \var{sre} \ldots)}. Here is the complete list +of SRE repetition forms: +\begin{inset} +\begin{tabular}{llrr} +SRE & means & at least & no more than \\ \hline +\ex{(* \var{sre} \ldots)} &zero-or-more &0 &infinity \\ +\ex{(+ \var{sre} \ldots)} &one-or-more &1 &infinity \\ +\ex{(?
\var{sre} \ldots)} &zero-or-one &0 &1 \\ +\ex{(= \var{from} \var{sre} \ldots)} &exactly-n &\var{from} &\var{from} \\ +\ex{(>= \var{from} \var{sre} \ldots)} &n-or-more &\var{from} &infinity \\ +\ex{(** \var{from} \var{to} \var{sre} \ldots)} &n-to-m &\var{from} &\var{to} +\end{tabular} +\end{inset} + +A \var{from} field is a Scheme expression that produces an integer. +A \var{to} field is a Scheme expression that produces either an integer, +or false, meaning infinity. + +While it is illegal for the \var{from} or \var{to} fields to be negative, +it \emph{is} allowed for \var{from} to be greater than \var{to} in a +\ex{**} form---this simply produces a regexp that will never match anything. + +As an example, we can describe the names of car/cdr access functions +("car", "cdr", "cadr", "cdar", "caar" , "cddr", "caaadr", \etc) with +either of the SREs +\begin{code} + (: "c" (+ (| "a" "d")) "r") + (: "c" (+ ("ad")) "r")\end{code} +We can limit the a/d chains to 4 characters or less with the SRE +\begin{code} + (: "c" (** 1 4 ("ad")) "r")\end{code} + +Some boundary cases: +\begin{code} + (** 5 2 "foo") ; Will never match + (** 0 0 "foo") ; Matches the empty string\end{code} + +\paragraph{Character classes} + +There is a special set of SRE's that form ``character classes''---basically, +a regexp that matches one character from some specified set of characters. +There are operators to take the intersection, union, complement, and +difference of character classes to produce a new character class. (Except +for union, these capabilities are not provided for general regexps as they +are computationally intractable in the general case.) + +A single character is the simplest character class: \verb|#\x| is a character +class that matches only the character ``x''. A string that has only one +letter is also a character class: \ex{"x"} is the same SRE as \verb|#\x|. 
+ +The character-set notation \ex{(\var{string})} we've seen is a primitive character +class, as is the wildcard \ex{any}. +When arguments to the choice operator, \ex{|}, are +all character classes, then the choice form is itself a character-class. +So these SREs are all character-classes: +\begin{code} +("aeiou") +(| #\\a #\\e #\\i #\\o #\\u) +(| ("aeiou") ("1234567890"))\end{code} +However, these SRE's are \emph{not} character-classes: +\begin{code} +"aeiou" +(| "foo" #\\x)\end{code} + +The \cd{(~ \var{cset-sre} \ldots)} char class matches one character +not in the specified classes: +\begin{code} +(~ ("0248") ("1359"))\end{code} +% +matches any character that is not a digit. + +More compactly, we can use the \ex{/} operator to specify character sets by +giving the endpoints of contiguous ranges, where the endpoints are specified +by a sequence of strings and characters. +For example, any of these char classes +\begin{inset} +\begin{verbatim} +(/ #\A #\Z #\a #\z #\0 #\9) +(/ "AZ" #\a #\z "09") +(/ "AZ" #\a "z09") +(/"AZaz09") +\end{verbatim}\end{inset}% +% +matches a letter or a digit. The range endpoints are taken in pairs to +form inclusive ranges of characters. Note that the exact set of characters +included in a range is dependent on the underlying implementation's +character type, so ranges may not be portable across different implementations. + +There is a wide selection of predefined, named character classes that may be +used. One such SRE is the wildcard \ex{any}. +\ex{nonl} is a character class matching anything but newline; +it is equivalent to +\begin{inset} +\begin{verbatim} +(~ #\newline) +\end{verbatim}\end{inset}% +% +and is useful as a wildcard in line-oriented matching. 
+ +There are also predefined named char classes for the standard Posix and Gnu +character classes: +\begin{inset} +\begin{tabular}{llll} +scsh name & Posix/ctype & Alternate name & Comment \\ \hline +\ex{lower-case} & \ex{lower} \\ +\ex{upper-case} & \ex{upper} \\ +\ex{alphabetic} & \ex{alpha} \\ +\ex{numeric} & \ex{digit} & \ex{num} \\ +\ex{alphanumeric} & \ex{alnum} & \ex{alphanum} \\ +\ex{punctuation} & \ex{punct} \\ +\ex{graphic} & \ex{graph} \\ +\ex{blank} & (Gnu extension) \\ +\ex{whitespace} & \ex{space} & \ex{white} & {``\ex{space}'' is deprecated.}\\ +\ex{printing} & \ex{print} \\ +\ex{control} & \ex{cntrl} \\ +\ex{hex-digit} & \ex{xdigit} & \ex{hex} \\ +\ex{ascii} & (Gnu extension) \\ +\end{tabular} +\end{inset} +See the scsh character-set documentation or the Posix isalpha(3) man page +for the exact definitions of these sets. + +You can use either the long scsh name or the shorter Posix and alternate names +to refer to these char classes. +The standard Posix name ``\ex{space}'' is provided, +but deprecated, since it is ambiguous. It means ``whitespace,'' the set of +whitespace characters, not the singleton set of the \verb|#\space| character. +If you want a short name for the set of whitespace characters, use the +char-class name ``white'' instead. + +Char classes may be intersected with the operator +\cd{(& \var{cset-sre} \ldots)}, +and set-difference can be performed with +\ex{(- \var{cset-sre} \ldots)}. +These operators are +particularly useful when you want to specify a set by negation +\emph{with respect to a limited universe.} +For example, the set of all non-vowel letters is +\begin{code} +(- alpha ("aeiou") ("AEIOU"))\end{code}% +% +whereas writing a simple complement +\begin{code} +(~ ("aeiouAEIOU"))\end{code}% +% +gives a char class that will match any non-vowel---including punctuation, +digits, white space, control characters, and \textsc{Ascii} nul. 
+ +We can \emph{compute} a char class by writing the SRE +\begin{code} +,\var{cset-exp}\end{code}% +% +where \var{cset-exp} is a Scheme expression producing a value that can be +coerced to a character set: a character set, character, one-character +string, or char-class regexp value. This regexp matches one character +from the set. + +The char-class SRE \cd{,@\var{cset-exp}} is entirely equivalent to +\ex{,\var{cset-exp}} +when \var{cset-exp} produces a character set (but see below for the more +general non-char-class context, where there \emph{is} a distinction between +\cd{,\var{exp}} and \cd{,@\var{exp}}). + +As an example of character-class SREs, +an SRE that matches a lower-case vowel, upper-case letter, or digit is +\begin{code} +(| ("aeiou") (/"AZ09"))\end{code}% +% +or, equivalently +\begin{code} +(| ("aeiou") upper-case numeric)\end{code}% +% +Boundary cases: the empty-complement char class +\begin{code} +(~)\end{code}% +% +matches any character; it is equivalent to \ex{any}. +The empty-union char class +\begin{code} +(|)\end{code}% +% +never matches at all. This is rarely useful for human-written regexps, +but may be of occasional utility in machine-generated regexps, perhaps +produced by macros. + +The rules for determining if an SRE is a simple, char-class SRE or a +more complex SRE form a little ``type system'' for SRE's. See the summary +section preceding this one for a complete listing of these rules. + +\paragraph{Case sensitivity} + +There are three forms that control case sensitivity: +\begin{code} +(uncase \var{sre} \ldots) +(w/case \var{sre} \ldots) +(w/nocase \var{sre} \ldots)\end{code}% +% + +\ex{uncase} is a regexp operator producing a regexp that matches any +case permutation of any string that matches \ex{(: \var{sre} \ldots)}.
+For example, the regexp +\begin{code} +(uncase "foo")\end{code}% +% +matches the strings ``foo'', ``foO'', ``fOo'', ``fOO'', ``Foo'', \ldots + +Expressions in SRE notation are interpreted in a lexical case-sensitivity +context. The forms \ex{w/case} and \ex{w/nocase} are the scoping operators +for this context, which controls how constant strings and char-class forms are +interpreted in their bodies. So, for example, the regexp +\begin{code} +(w/nocase "abc" + (* "FOO" (w/case "Bar")) + ("aeiou"))\end{code}% +% +defines a case-insensitive match for all of its elements except for the +sub-element "Bar", which must match exactly capital-B, little-a, little-r. +The default, the outermost, top-level context is case sensitive. + +The lexical case-sensitivity context affects the interpretation of +\begin{itemize} + \item constant strings, such as \ex{"foo"}, + \item chars, such as \verb|#\x|, + \item char sets, such as \ex{("abc")}, and + \item ranges, such as \ex{(/"az")} +\end{itemize} +that appear within that context. It does not affect dynamically computed +regexps---ones that are introduced by ,\var{exp} and ,@\var{exp} forms. +It does not affect named char-classes---presumably, +if you wrote \ex{lower}, you didn't mean \ex{alpha}. + +\ex{uncase} is \emph{not} the same as \ex{w/nocase}. +To point up one distinction, consider the two regexps +\begin{code} +(uncase (~ "a")) +(w/nocase (~ "a"))\end{code}% +% + +The regexp \cd{(~ "a")} matches any character except ``a,'' +which means it \emph{does} match ``A.'' +Now, \ex{(uncase \var{re})} matches any case-permutation of a string that +\var{re} matches. +\cd{(~ "a")} matches ``A,'' +so \cd{(uncase (~ "a"))} matches ``A'' and ``a''---and, +for that matter, every other character. +So \cd{(uncase (~ "a"))} is equivalent to \ex{any}. + +In contrast, \cd{(w/nocase (~ "a"))} establishes a case-insensitive lexical +context in which the \cd{"a"} is interpreted, making the SRE equivalent to +\cd{(~ ("aA"))}.
+ + +\paragraph{Dynamic regexps} + +SRE notation allows you to compute parts of a regular expression +at run time. The SRE +\begin{code} +,\var{exp}\end{code}% +% +is a regexp whose body \var{exp} is a Scheme expression producing a +string, character, char-set, or regexp as its value. Strings and +characters are converted into constant regexps; char-sets are converted +into char-class regexps; and regexp values are substituted in place. +So we can write regexps like this +\begin{code} +(: "feeding the " + ,(if (> n 1) "geese" "goose"))\end{code}% +% +This is how you can drop computed strings, such as someone's name, +or the decimal numeral for a computed number, into a complex regexp. + +If we have a large, complex regular expression that is used multiple +times in some other, containing regular expression, we can name it, using +the binding forms of the embedding language (\eg, Scheme), and refer to +it by name in the containing expression. +For example, consider the Scheme expression +\begin{code} +(let* ((ws (rx (+ whitespace))) ; Seq of whitespace + ;; Something like "Mar 14" + (date (rx (: (| "Jan" "Feb" "Mar" {\ldots}) + ,ws + (| ("123456789") ; 1-9 + (: ("12") digit) ; 10-29 + "30" ; 30 + "31"))))) ; 31 + ;; Now we can use DATE several times: + (rx {\ldots} ,date {\ldots} (* {\ldots} ,date {\ldots}) + {\ldots} ,date {\ldots}))\end{code}% +% +where the \ex{(rx \var{sre} \ldots)} +macro is the Scheme special form that produces +a Scheme regexp value given a body in SRE notation. + +As we saw in the char-class section, if a dynamic regexp is used +in a char-class context (\eg, as an argument to a \verb|~| operation), +the expression must be coercible not merely to a general regexp, +but to a character sre---so it must be either a singleton string, +a character, a scsh char set, or a char-class regexp. + +We can also define and use functions on regexps in the host language.
+For example, consider the following Scheme expressions, containing +embedded SRE's (inside the \ex{rx} macro expressions) +which in turn contain embedded Scheme expressions computing dynamic regexps: +\begin{code} +(define (csl re) + ;; A comma-separated list of RE's is either + (rx (| "" ; zero of them (empty string), + (: ,re ; or RE followed by + (* ", " ,re))))); zero or more comma-space-RE matches. + +(rx ... ,date ... + ,(csl (rx (| "John" "Paul" "George" "Ringo"))) + ... + ,(csl date) + ...)\end{code}% +% +We leave the extension of \ex{csl} to allow for an optional ``and'' between +the last two matches as an exercise for the interested reader (\eg, to match +``John, Paul, George and Ringo''). + +Note, in passing, one of the nice features of SRE notation: they can +be commented, and indented in a fashion to show the lexical extent of +the subexpressions. + +When we embed a computed regexp inside another regular expression with +the ,\var{exp} form, we must specify how to account for the submatches that +may be in the computed part. For example, suppose we have the regexp +\begin{code} +(rx (submatch (* "foo")) + (submatch (? "bar")) + ,(f x) + (submatch "baz"))\end{code}% +% +It's clear that the submatch for the \ex{(* "foo")} part of the regexp is +submatch \#1, and the \ex{(? "bar")} part is submatch \#2. But what number +submatch is the \ex{"baz"} submatch? It's not clear. Suppose the Scheme +expression \ex{(f x)} produces a regular expression that itself has 3 +submatches. Are these counted (making the \ex{"baz"} submatch \#6), or not +counted (making the \ex{"baz"} submatch \#3)? + +SRE notation provides for both possibilities. The SRE +\begin{code} +,\var{exp}\end{code}% +% +does \emph{not} contribute its submatches to its containing regexp; it +has zero submatches. So one can reliably assign submatch indices to +forms appearing after a \ex{,\var{exp}} form in a regexp.
+ +On the other hand, the SRE +\begin{code} +,@\var{exp}\end{code}% +% +``splices'' its resulting regexp into place, \emph{exposing} its submatches +to the containing regexp. This is useful if the computed regexp is defined +to produce a certain number of submatches---if that is part of \var{exp}'s +``contract.'' + + +\paragraph{String, line, and word units} + +The regexps \ex{bos} and \ex{eos} match the empty string at the beginning and +end of the string, respectively. + +The regexps \ex{bol} and \ex{eol} match the empty string at the beginning and +end of a line, respectively. A line begins at the beginning of the string, and +just after every newline character. A line ends at the end of the string, and +just before every newline character. The char class \ex{nonl} matches any +character except newline, and is useful in conjunction with line-based pattern +matching. + +The regexps \ex{bow} and \ex{eow} match the empty string at the beginning and +end of a word, respectively. A word is a contiguous sequence of characters +that are either alphanumeric or the underscore character. + +The regexp \ex{(word \var{sre} \ldots)} surrounds the sequence +\ex{(: \var{sre} \ldots)} with bow/eow delimiters. It is equivalent to +\begin{code} +(: bow \var{sre} \ldots eow)\end{code}% +% + +The regexp \ex{(word+ \var{cset-sre} \ldots)} matches a word whose body is +one or more word characters matched by the char-set sre \var{cset-sre}. +It is equivalent to +\begin{code} +(word (+ (& (| alphanumeric "_") + (| \var{cset-sre} \ldots))))\end{code}% +% +For example, a word not containing x, y, or z is +\begin{code} +(word+ (~ ("xyz")))\end{code}% +% +The regexp \ex{word} matches one word; it is equivalent to +\begin{code} +(word+ any) +\end{code}% + +\note{\ex{bol} and \ex{eol} are not supported by scsh's current + regexp search engine, which is Spencer's Posix matcher.
This is the only + element of the notation that is not supported by the current scsh + reference implementation.} + +%\paragraph{Miscellaneous elements} + +\paragraph{Posix string notation} + +The SRE \ex{(posix-string \var{string})}, +where \var{string} is a string literal +(\emph{not} a general Scheme expression), allows one to use Posix string +notation for a regexp. It's intended as backwards compatibility and +is deprecated. +For example, \verb!(posix-string "[aeiou]+|x*|y{3,5}")! matches +a string of vowels, a possibly empty string of x's, or three to five +y's. + +Note that parentheses are used ambiguously in Posix notation---both for +grouping and submatch marking. +The \ex{(posix-string \var{string})} form makes the conservative assumption: +all parentheses introduce submatches. + +\paragraph{Deleted submatches} + +Deleted submatches, or ``DSM's,'' +are a subtle feature that are never required in expressions written +by humans. They can be introduced by the simplifier when reducing +regular expressions to simpler equivalents, and are included in the +syntax to give it expressibility spanning the full regexp ADT. They +may appear when unparsing simplified regular expressions that have +been run through the simplifier; otherwise you are not likely to see them. +Feel free to skip this section. + +The regexp simplifier can sometimes eliminate entire sub-expressions from a +regexp. For example, the regexp +\begin{code} +(: "foo" (** 0 0 "apple") "bar")\end{code}% +% +can be simplified to +\begin{code} +"foobar"\end{code}% +% +since \ex{(** 0 0 "apple")} will always match the empty string. The regexp +\begin{code} +(| "foo" + (: "Richard" (|) "Nixon") + "bar")\end{code}% +% +can be simplified to +\begin{code} +(| "foo" "bar")\end{code}% +% +The empty choice \ex{(|)} can't match anything, so the whole +\begin{code} +(: "Richard" (|) "Nixon")\end{code}% +% +sequence can't match, and we can remove it from the choice. 
+ +However, if deleting part of a regular expression removes a submatch +form, any following submatch forms will have their numbering changed, +which would be an error. For example, if we simplify +\begin{code} +(: (** 0 0 (submatch "apple")) + (submatch "bar"))\end{code}% +% +to +\begin{code} +(submatch "bar")\end{code}% +% +then the \ex{"bar"} submatch changes from submatch \#2 to submatch \#1---so +this is not a legal simplification. + +When the simplifier deletes a sub-regexp that contains submatches, +it introduces a special regexp form to account for the missing, +deleted submatches, thus keeping the submatch accounting correct. +\begin{code} +(dsm \var{pre} \var{post} \var{sre} \ldots)\end{code}% +% +is a regexp that matches the sequence \ex{(: \var{sre} \ldots)}. +\var{pre} and \var{post} are integer constants. +The DSM form introduces \var{pre} deleted +submatches before the body, and \var{post} deleted submatches after the +body. +If the body \var{(: \var{sre} \ldots)} itself has \var{body-sm} submatches, +then the total number of submatches for the DSM form is + $$\var{pre} + \var{body-sm} + \var{post}.$$ +These extra, deleted submatches are never assigned string indices in any +match values produced when matching the regexp against a string. + +As examples, +\begin{code} +(| (: (submatch "Richard") (|) "Nixon") + (submatch "bar"))\end{code}% +% +can be simplified to +\begin{code} +(dsm 1 0 (submatch "bar"))\end{code}% +% +The regexp +\begin{code} +(: (** 0 0 (submatch "apple")) + (submatch "bar"))\end{code}% +% +can be simplified to +\begin{code} +(dsm 1 0 (submatch "bar"))\end{code}% + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Embedding regexps within Scheme programs} + +SRE's can be placed in a Scheme program using the \ex{(rx \var{sre} \ldots) } +Scheme form, which evaluates to a Scheme regexp value. 
+ +\subsubsection{Static and dynamic regexps} + +We separate SRE expressions into two classes: static and dynamic +expressions. +A \emph{static} expression is one that has no run-time dependencies; +it is a complete, self-contained description of a regular set. +A \emph{dynamic} expression is one that requires run-time computation to +determine the particular regular set being described. +There are two places where one can +embed run-time computations in an SRE: +\begin{itemize} + \item The \var{from} or \var{to} repetition counts of + \ex{**}, \ex{=}, and \ex{>=} forms; + \item \ex{,\var{exp}} and \ex{,@\var{exp}} forms. +\end{itemize} + +A static SRE is one that does not contain any \ex{,\var{exp}} or +\ex{,@\var{exp}} forms, +and whose \ex{**}, \ex{=}, and \ex{>=} forms all contain constant +repetition counts. + +Scsh's \ex{rx} macro is able, at macro-expansion time, to completely parse, +simplify and translate any static SRE into the equivalent Posix string +which is used to drive the underlying C-based matching engine; there is +no run-time overhead. Dynamic SRE's are partially simplified and then expanded +into Scheme code that constructs the regexp at run-time. + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Regexp functions} + +\subsection{Obsolete, deprecated procedures} + +These two procedures are survivors from the previous, now-obsolete scsh regexp +interface. Old code must open the \ex{re-old-funs} package to access them. They +should not be used in new code. + + +\defun{string-match}{posix-re-string string [start]}{match or false} +\defunx{make-regexp}{posix-re-string}{regexp} +\begin{desc} + These are old functions included for backwards compatibility with + previous releases. They are deprecated and will go away at some point in + the future. 
+ + Note that the new release has no ``regexp compiling'' procedure at + all---regexp values are compiled for the matching engine on-demand, + and the necessary data structures are cached inside the ADT values. +\end{desc} + +\subsection{Standard procedures and syntax} + +\dfn{rx}{sre \ldots}{regexp}{Syntax} +\begin{desc} + This allows you to describe a regexp value with SRE notation. +\end{desc} + +\defun{regexp?}{x}{\boolean} +\begin{desc} + Returns true if the value is a regular expression. +\end{desc} + +\defun{regexp-search}{re string [start flags]}{match-data or false} +\defunx{regexp-search?}{re string [start flags]}{\boolean} +\begin{desc} + Search \var{string} starting at position \var{start}, looking for a match + for regexp \var{re}. If a match is found, return a match structure describing + the match, otherwise {\sharpf}. \var{Start} defaults to 0. + + \var{Flags} is the bitwise-or of \ex{regexp/bos-not-bol} and + \ex{regexp/eos-not-eol}. + \ex{regexp/bos-not-bol} means the beginning of the string isn't a + line-begin. \ex{regexp/eos-not-eol} is analogous. + \note{They're currently ignored because + beginning/end-of-line anchors aren't supported by the current + implementation.} + + Use \ex{regexp-search?} when you don't need submatch information, as + it has the potential to be \emph{significantly} faster on + submatch-containing regexps. + + There is no longer a separate regexp ``compilation'' function; regexp + values are compiled for the C engine on demand, and the resulting + C structures are cached in the regexp structure after the first use. +\end{desc} + +\defun {match:start}{m [i]}{{\integer} or false} +\defunx{match:end}{ m [i]}{{\integer} or false} +\defunx{match:substring}{m [i]}{{\str} or false} +\begin{desc} + \ex{match:start} returns the start position of the submatch denoted by + the index \var{i}. + The whole regexp is 0; positive integers index submatches in the + regexp, counting left-to-right. + The index \var{i} defaults to 0.
+ + If the regular expression matches as a whole, + but a particular sub-expression does not match, then + \ex{match:start} returns {\sharpf}. + + \ex{match:end} is analogous to \ex{match:start}, returning the end + position of the indexed submatch. + + \ex{match:substring} returns the substring matched by the indexed submatch. + If there was no match for the indexed submatch, it returns false. +\end{desc} + +\defun{regexp-substitute}{port-or-false match . items}{\object} +\begin{desc} +This procedure can be used to perform string substitutions based on +regular-expression matches. +The results of the substitution can be either output to a port or +returned as a string. + +The \var{match} argument is a regular-expression match structure +that controls the substitution. +If \var{port} is an output port, the \var{items} are written out to +the port: +\begin{itemize} + \item If an item is a string, it is copied directly to the port. + \item If an item is an integer, the corresponding submatch from \var{match} + is written to the port. + \item If an item is \ex{'pre}, + the prefix of the matched string (the text preceding the match) + is written to the port. + \item If an item is \ex{'post}, + the suffix of the matched string is written. +\end{itemize} + +If \var{port} is {\sharpf}, nothing is written, and a string is constructed +and returned instead. +\end{desc} + +% An item is a string (copied verbatim), integer (match index), +% \ex{'pre} (chars before the match), or \ex{'post} (chars after the match). +% Passing false for the port means return a string. + +\defun{regexp-substitute/global}{port-or-false re str . items}{\object} +\begin{desc} +% Same as above, except \ex{'post} item means recurse +% on post-match substring. +% If \var{re} doesn't match \var{str}, returns \var{str.} +This procedure is similar to \ex{regexp-substitute}, +but can be used to perform repeated match/substitute operations over +a string.
+It has the following differences with \ex{regexp-substitute}: +\begin{itemize} + \item It takes a regular expression and string to be matched as + parameters, instead of a completed match structure. + \item If the regular expression doesn't match the string, this + procedure is the identity transform---it returns or outputs the + string. + \item If an item is \ex{'post}, the procedure recurses on the suffix string + (the text from \var{string} following the match). + Including a \ex{'post} in the list of items is how one gets multiple + match/substitution operations. + \item If an item is a procedure, it is applied to the match structure for + a given match. + The procedure returns a string to be used in the result. + \end{itemize} +The \var{regexp} parameter can be either a compiled regular expression or +a string specifying a regular expression. + +Some examples: +{\small +\begin{widecode} +;;; Replace occurrences of "Cotton" with "Jin". +(regexp-substitute/global #f (rx "Cotton") s + 'pre "Jin" 'post) + +;;; mm/dd/yy -> dd/mm/yy date conversion. +(regexp-substitute/global #f (rx (submatch (+ digit)) "/" ; 1 = M + (submatch (+ digit)) "/" ; 2 = D + (submatch (+ digit))) ; 3 = Y + s ; Source string + 'pre 2 "/" 1 "/" 3 'post) + +;;; "9/29/61" -> "Sep 29, 1961" date conversion. +(regexp-substitute/global #f (rx (submatch (+ digit)) "/" ; 1 = M + (submatch (+ digit)) "/" ; 2 = D + (submatch (+ digit))) ; 3 = Y + s ; Source string + 'pre + ;; Sleazy converter -- ignores "year 2000" issue, + ;; and blows up if month is out of range. + (lambda (m) + (let ((mon (vector-ref '#("Jan" "Feb" "Mar" "Apr" "May" "Jun" + "Jul" "Aug" "Sep" "Oct" "Nov" "Dec") + (- (string->number (match:substring m 1)) 1))) + (day (match:substring m 2)) + (year (match:substring m 3))) + (string-append mon " " day ", 19" year))) + 'post) + +;;; Remove potentially offensive substrings from string S. 
+(define (kill-matches re s) + (regexp-substitute/global #f re s 'pre 'post)) + +(kill-matches (rx (| "Windows" "tcl" "Intel")) s) ; Protect the children.\end{widecode}} + +\end{desc} + +\defun{regexp-fold}{re kons knil s [finish start]}{\object} +\begin{desc} + The following definition is a bit unwieldy, but the intuition is + simple: + this procedure uses the regexp \var{re} to divide up string \var{s} into + non-matching/matching chunks, and then ``folds'' the procedure \var{kons} + across this sequence of chunks. It is useful when you wish to operate + on a string in sub-units defined by some regular expression, as are + the related \ex{regexp-fold-right} and \ex{regexp-for-each} procedures. + + Search from \var{start} (defaulting to 0) for a match to \var{re}; call + this match \var{m}. Let \var{i} be the index of the end of the match + (that is, \ex{(match:end \var{m} 0)}). Loop as follows: +\begin{tightcode} +(regexp-fold \var{re} \var{kons} (\var{kons} \var{start} \var{m} \var{knil}) \var{s} \var{finish} \var{i})\end{tightcode} +% + If there is no match, return instead +\begin{tightcode} +(\var{finish} \var{start} \var{knil})\end{tightcode} +% + \var{Finish} defaults to \ex{(lambda (i knil) knil)}. + + In other words, we divide up \var{s} into a sequence of + non-matching/matching chunks: + $$ \vari{NM}1 \; \vari{M}1 \; \vari{NM}2 \; \vari{M}2 \; {\ldots} \; + \vari{NM}{k-1} \; \vari{M}{k-1} \; \vari{NM}k $$ +% + where \vari{NM}1 is the initial part of \var{s} that isn't matched by + the regexp \var{re}, \vari{M}1 is the + first match, \vari{NM}2 is the following part of \var{s} that + isn't matched, \vari{M}2 is the second match, + and so forth---\vari{NM}k is the final non-matching chunk of + \var{s}.
+ We apply \var{kons} from left to right to build up a result, passing it one + non-matching/matching chunk each time: + on an application \ex{(\var{kons} \var{i} \var{m} \var{knil})}, + the non-matching chunk goes from \var{i} to \ex{(match:start \var{m} 0)}, + and the following matching chunk goes from \ex{(match:start \var{m} 0)} + to \ex{(match:end \var{m} 0)}. The last non-matching chunk \vari{NM}k + is processed by \var{finish}. So the computation we perform is +\begin{centercode} +(\var{finish} \var{Q} (\var{kons} \vari{J}{k-1} \vari{M}{k-1} {\ldots} (\var{kons} \vari{J}{1} \vari{M}{1} \var{knil}) \ldots))\end{centercode}% +% + where \vari{J}{i} is the index of the start of \vari{NM}{i}, + \vari{M}{i} is a match value describing the $i$th match, + and \var{Q} is the index of the beginning of \vari{NM}k. + + Hint: The \ex{let-match} macro is frequently useful for operating on the + match value \var{M} passed to the \var{kons} function. +\end{desc} + +\defun{regexp-fold-right}{re kons knil s [finish start]}{\object} +\begin{desc} + The right-to-left variant of \ex{regexp-fold}. + + This procedure repeatedly matches regexp \var{re} across string \var{s}. + This divides \var{s} up into a sequence of matching/non-matching chunks: + $$ \vari{NM}1 \; \vari{M}1 \; \vari{NM}2 \; \vari{M}2 \; {\ldots} \; + \vari{NM}{k-1} \; \vari{M}{k-1} \; \vari{NM}k $$ +% + where \vari{NM}1 is the initial part of \var{s} that isn't matched by + the regexp \var{re}, \vari{M}1 is the + first match, \vari{NM}2 is the following part of \var{s} that + isn't matched, \vari{M}2 is the second match, + and so forth---\vari{NM}k is the final non-matching chunk of + \var{s}.
+ We apply \var{kons} from right to left to build up a result, passing it one + non-matching/matching chunk each time: +\begin{centercode} +(\var{finish} \var{Q} (\var{kons} \vari{M}{1} \vari{J}{1} {\ldots} (\var{kons} \vari{M}{k-1} \vari{J}{k-1} \var{knil}) \ldots))\end{centercode}% +% + where \vari{M}{i} is a match value describing the $i$th match, \vari{J}{i} is the index of the end of + the non-matching chunk that follows it (or, equivalently, the beginning of the next match, if there is one), and \var{Q} is the index of the + beginning of \vari{M}1. In other words, \var{kons} is passed a match, an index + describing the following non-matching text, and the value produced by + folding the following text. The \var{finish} function ``polishes off'' the fold + operation by handling the initial chunk of non-matching text (\vari{NM}1, above). + \var{Finish} defaults to \ex{(lambda (i knil) knil)}. + + Example: To pick out all the matches to \var{re} in \var{s}, say +\begin{code} +(regexp-fold-right re + (\l{m i lis} + (cons (match:substring m 0) lis)) + '() s)\end{code}% +% + Hint: The \ex{let-match} macro is frequently useful for operating on the + match value \var{m} passed to the \ex{kons} function. +\end{desc} + +\defun{regexp-for-each}{re proc s [start]}{\undefined} +\begin{desc} + Repeatedly match regexp \var{re} against string \var{s}. + Apply \var{proc} to each match that is produced. + Matches do not overlap. + + Hint: The \ex{let-match} macro is frequently useful for operating on the + match value \var{m} passed to \var{proc}. +\end{desc} + +\dfn{let-match}{match-exp mvars body \ldots}{\object}{Syntax} +\dfnx{if-match}{match-exp mvars on-match no-match}{\object}{Syntax} +\begin{desc} + \var{Mvars} is a list of vars that is bound to the match and submatches + of the string; \verb|#F| is allowed as a don't-care element.
For example, +\begin{code} +(let-match (regexp-search date s) (whole-date month day year) + {\ldots} \var{body} {\ldots})\end{code}% +% + matches the regexp against string \ex{s}, then evaluates the body of the + \ex{let-match} in a scope where \ex{whole-date} is bound to the matched + string, and \ex{month}, \ex{day} and \ex{year} are bound to the first, + second and third submatches. + + \ex{if-match} is similar, but if the match expression is false, + then the \var{no-match} expression is evaluated; this would be an + error in \ex{let-match}. +\end{desc} + +\dfn{match-cond}{clause \ldots}{\object}{Syntax} +\begin{desc} +This macro allows one to conditionally attempt a sequence of pattern +matches, interspersed with other, general conditional tests. +There are four kinds of \ex{match-cond} clause, one introducing a pattern +match, and the other three simply being regular \ex{cond}-style clauses, +marked by the \ex{test} and \ex{else} keywords: +\begin{code} +(match-cond (\var{match-exp} \var{match-vars} \var{body} \ldots) ; As in if-match + (test \var{exp} \var{body} \ldots) ; As in cond + (test \var{exp} => \var{proc}) ; As in cond + (else \var{body} \ldots)) ; As in cond\end{code}% +\end{desc} + +\defun {flush-submatches}{re}{re} +\defunx{uncase}{re}{re} +\defunx{simplify-regexp}{re}{re} +\defunx{uncase-char-set}{cset}{re} +\defunx{uncase-string}{str}{re} +\begin{desc} +These functions map regexps and char sets to other regexps. +\ex{flush-submatches} returns a regexp which matches exactly what +its argument matches, but contains no submatches. + +\ex{uncase} returns a regexp that matches any case-permutation of +its argument regexp. + +\ex{simplify-regexp} applies the simplifier to its argument. +This is done automatically when compiling regular expressions, +so this is only useful for programmers that are directly examining +the ADT value with lower-level accessors. 
+ +\ex{uncase-char-set} maps a char set to a regular expression that +matches any character from that set, regardless of case. +Similarly, \ex{uncase-string} returns a regexp that matches any +case-permutation of the string. For example, +\ex{(uncase-string "Knight")} returns the same value as +\ex{(rx ("kK") ("nN") ("iI") ("gG") ("hH") ("tT"))} +or \ex{(rx (w/nocase "Knight"))}. +\end{desc} + + +\defun {sre->regexp}{sre}{re} +\defunx{regexp->sre}{re}{sre} +\begin{desc} +These are the SRE parser and unparser. +That is, \ex{sre->regexp} maps an SRE to a regexp value, and +\ex{regexp->sre} does the inverse. +The latter function can be useful for printing out regexps in a +readable format. + +\begin{widecode} +(sre->regexp '(: "Olin " (? "G. ") "Shivers")) {\evalto} \var{regexp} +(define re (re-seq (re-string "Pete ") + (re-repeat 1 #f (re-string "Sz")) + (re-string "ilagyi"))) +(regexp->sre (re-repeat 0 1 re)) + {\evalto} '(? "Pete " (+ "Sz") "ilagyi")\end{widecode} + +\end{desc} + +\defun {posix-string->regexp}{string}{re} +\defunx{regexp->posix-string}{re}{string} +\begin{desc} +These two functions are the Posix notation parser and unparser. +That is, \ex{posix-string->regexp} maps a Posix-notation regular +expression, such as \ex{"g(ee|oo)se"}, to a regexp value, and +\ex{regexp->posix-string} does the inverse. + +You can use these tools to map between scsh regexps and Posix +regexp strings, which can be useful if you want to do conversion +between SRE's and Posix form. For example, you can write a particularly +complex regexp in SRE form, or compute it using the ADT constructors, +then convert to Posix form, print it out, cut and paste it into a +C or emacs lisp program. Or you can import an old regexp from some other +program, parse it into an ADT value, render it to an SRE, print it out, +then cut and paste it into a scsh program.
+ +Note:\begin{itemize} +\item The string parser doesn't handle the exotica of character class + names such as \verb|[[:alnum:]]|; the current implementation was written + in three hours. + +\item The unparser produces Spencer-specific strings for bow/eow + elements; otherwise, it's Posix all the way. +\end{itemize} +\end{desc} + +\section{The regexp ADT} +The following functions may be used to construct and examine scsh's +regexp abstract data type. They are in the following Scheme 48 packages: + re-adt-lib + re-lib + scsh + +Each basic class of regexp has a predicate, a basic constructor, +a ``smart'' constructor that performs limited ``peephole'' optimisation +on its arguments, and a set of accessors. +The \ex{\ldots:tsm} accessor returns the total number of submatches +contained in the regular expression. + +\dfn {re-seq?}{x}{boolean}{Type predicate} +\dfnx{make-re-seq}{re \ldots}{re}{Basic constructor} +\dfnx{re-seq}{re \ldots}{re}{Smart constructor} +\dfnx{re-seq:elts}{re}{re-list}{Accessor} +\dfnx{re-seq:tsm}{re}{integer}{Accessor} + +\dfn {re-choice?}{x}{boolean}{Type predicate} +\dfnx{make-re-choice}{re-list}{re}{Basic constructor} +\dfnx{re-choice}{re \ldots}{re}{Smart constructor} +\dfnx{re-choice:elts}{re}{re-list}{Accessor} +\dfnx{re-choice:tsm}{re}{integer}{Accessor} + +\dfn {re-repeat?}{x}{boolean}{Type predicate} +\dfnx{make-re-repeat}{from to body}{re}{Basic constructor} +\dfnx{re-repeat:from}{re}{integer}{Accessor} +\dfnx{re-repeat:to}{re}{integer}{Accessor} +\dfnx{re-repeat:tsm}{re}{integer}{Accessor} + +\dfn {re-submatch?}{x}{boolean}{Type predicate} +\dfnx{make-re-submatch}{body [pre-dsm post-dsm]}{re}{Basic constructor} +\dfnx{re-submatch:pre-dsm}{re}{integer}{Accessor} +\dfnx{re-submatch:post-dsm}{re}{integer}{Accessor} +\dfnx{re-submatch:tsm}{re}{integer}{Accessor} + +\dfn {re-string?}{x}{boolean}{Type predicate} +\dfnx{make-re-string}{chars}{re}{Basic constructor} +\dfnx{re-string}{chars}{re}{Basic constructor} +\dfnx{re-string:chars}{re}{string}{Accessor}
+ +\dfn {re-char-set?}{x}{boolean}{Type predicate} +\dfnx{make-re-char-set}{cset}{re}{Basic constructor} +\dfnx{re-char-set}{cset}{re}{Basic constructor} +\dfnx{re-char-set:cset}{re}{char-set}{Accessor} + +\dfn {re-dsm?}{x}{boolean}{Type predicate} +\dfnx{make-re-dsm}{body pre-dsm post-dsm}{re}{Basic constructor} +\dfnx{re-dsm}{body pre-dsm post-dsm}{re}{Smart constructor} +\dfnx{re-dsm:body}{re}{re}{Accessor} +\dfnx{re-dsm:pre-dsm}{re}{integer}{Accessor} +\dfnx{re-dsm:post-dsm}{re}{integer}{Accessor} +\dfnx{re-dsm:tsm}{re}{integer}{Accessor} + +\defvar {re-bos}{regexp} +\defvarx{re-eos}{regexp} +\defvarx{re-bol}{regexp} +\defvarx{re-eol}{regexp} +\defvarx{re-bow}{regexp} +\defvarx{re-eow}{regexp} +\begin{desc} +These variables are bound to the primitive anchor regexps. +\end{desc} + +\defun {re-bos?}{\object}{\boolean} +\defunx{re-eos?}{\object}{\boolean} +\defunx{re-bol?}{\object}{\boolean} +\defunx{re-eol?}{\object}{\boolean} +\defunx{re-bow?}{\object}{\boolean} +\defunx{re-eow?}{\object}{\boolean} +\begin{desc} +These predicates recognise the associated primitive anchor regexp. +\end{desc} + +\defvar{re-trivial}{regexp} +\defunx{re-trivial?}{re}{\boolean} +\begin{desc} +The variable \ex{re-trivial} is bound to a regular expression +that matches the empty string (corresponding to the SRE \ex{""} or \ex{(:)}); +it is recognised by the associated predicate. +Note that the predicate is only guaranteed to recognise +this particular trivial regexp; other trivial regexps built using +other constructors may or may not produce a true value. +\end{desc} + +\defvar{re-empty}{regexp} +\defunx{re-empty?}{re}{\boolean} +\begin{desc} +The variable \ex{re-empty} is bound to a regular expression +that never matches (corresponding to the SRE \ex{(|)}); +it is recognised by the associated predicate. +Note that the predicate is only guaranteed to recognise +this particular empty regexp; other empty regexps built using +other constructors may or may not produce a true value. 
+\end{desc} + +\defvar{re-any}{regexp} +\defunx{re-any?}{re}{\boolean} +\begin{desc} +The variable \ex{re-any} is bound to a regular expression +that matches any character (corresponding to the SRE \ex{any}); +it is recognised by the associated predicate. +Note that the predicate is only guaranteed to recognise +this particular any-character regexp value; other any-character +regexps built using other constructors may or may not produce a true value. +\end{desc} + +% These are non-primitive predefined regexps of general utility. + +\defvar {re-nonl}{regexp} +\defvarx{re-word}{regexp} +\begin{desc} +The variable \ex{re-nonl} is bound to a regular expression +that matches any non-newline character +(corresponding to the SRE \verb|(~ #\newline)|). + +Similarly, \ex{re-word} is bound to a regular expression +that matches any word (corresponding to the SRE \ex{word}). +\end{desc} + +\defun{regexp?}{\object}{\boolean} +\begin{desc} +Is the object a regexp? +\end{desc} + +\defun{re-tsm}{re}{\integer} +\begin{desc} +Return the total number of submatches contained in the regexp. +\end{desc} + +\defun{clean-up-cres}{}{\undefined} +\begin{desc} +The current scsh implementation should call this function periodically +to release C-heap storage associated with compiled regexps. +Hopefully, this procedure will be removed at a later date. +\end{desc} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Syntax-hacking tools} + +The Scheme 48 package \ex{sre-syntax-tools} exports several tools for macro +writers that want to use SREs in their macros. In the functions defined +below, \var{compare} and \var{rename} parameters are as passed to Clinger-Rees +explicit-renaming low-level macros. + +\dfn{if-sre-form}{form conseq-form alt-form}{form}{Syntax} +\begin{desc} +If \var{form} is a legal SRE, this is equivalent to the expression +\var{conseq-form}, otherwise it expands to \var{alt-form}. 
+ +This is useful for high-level macro authors who want to write a macro +where one field in the macro can be an SRE or possibly something +else. \Eg, we might have a conditional form wherein if the +test part of one arm is an SRE, it expands to a regexp match +on some implied value, otherwise the form is evaluated as a boolean +Scheme expression. +For example, a conditional macro might expand into code containing +the following form, which in turn would have one of two possible +expansions: +\begin{centercode} +(if-sre-form test-exp ; If TEST-EXP is SRE, + (regexp-search? (rx test-exp) line) ; match it w/the line, + test-exp) ; otw it's a test exp.\end{centercode}% +\end{desc} + + +\defun{sre-form?}{form rename compare}{\boolean} +\begin{desc} +This procedure is for low-level macros doing things equivalent to +\ex{if-sre-form}. It returns true if the form is a legal SRE. + +Note that neither \ex{sre-form?} nor \ex{if-sre-form} does a deep recursion +over the form in the case where the form is a list. +They simply check the car of the form for one of the legal SRE keywords. +\end{desc} + +\defun {parse-sre}{sre-form compare rename}{re} +\defunx{parse-sres}{sre-forms compare rename}{re} +\begin{desc} +Parse \var{sre-form} into an ADT. Note that if the SRE is dynamic---contains +\ex{,\var{exp}} or \ex{,@\var{exp}} forms, +or has repeat operators whose from/to counts are not constants---then +the returned ADT will have \var{Scheme expressions} in the corresponding +slots of the regexp records instead of the corresponding +integer, char-set, or regexp. +In other words, we use the ADT as its own AST. It's called a ``hack.'' + +\ex{parse-sres} parses a list of SRE forms that comprise an implicit sequence. +\end{desc} + +\defun{regexp->scheme}{re rename}{Scheme-expression} +\begin{desc} +Returns a Scheme expression that will construct the regexp \var{re} +using ADT constructors such as \ex{make-re-seq}, \ex{make-re-repeat}, +and so forth.
+ +If the regexp is static, it will be simplified and pre-translated +to a Posix string as well, which will be part of the constructed +regexp value. +\end{desc} + +\defun{static-regexp?}{re}{\boolean} +\begin{desc} +Is the regexp a static one? +\end{desc} diff --git a/doc/scsh-manual/strings.tex b/doc/scsh-manual/strings.tex new file mode 100644 index 0000000..5ea9ef7 --- /dev/null +++ b/doc/scsh-manual/strings.tex @@ -0,0 +1,496 @@ +% -*- latex -*- +\chapter{Strings and characters} + +Strings are the basic communication medium for {\Unix} processes, so a +Unix programming environment must have reasonable facilities for manipulating +them. +Scsh provides a powerful set of procedures for processing strings and +characters. +Besides the facilities described in this chapter, scsh also provides +\begin{itemize} +\itum{Regular expressions (chapter~\ref{chapt:sre})} + A complete regular-expression system. + +\itum{Field parsing, delimited record I/O and the awk loop + (chapter~\ref{chapt:fr-awk})} + These procedures let you read in chunks of text delimited by selected + characters, and + parse each record into fields based on regular expressions + (for example, splitting a string at every occurrence of colon or + white-space). + The \ex{awk} form allows you to loop over streams of these records + in a convenient way. + +\itum{The SRFI-13 string libraries} + This pair of libraries contains procedures that create, fold, iterate over, + search, compare, assemble, cut, hash, case-map, and otherwise manipulate + strings. + They are provided by the \ex{string-lib} and \ex{string-lib-internals} + packages, and are also available in the default \ex{scsh} package. + + More documentation on these procedures can be found at URLs + \begin{tightinset} + % The gratuitous mbox makes xdvi render the hyperlinks better.
+ \texonly + \mbox{\url{http://srfi.schemers.org/srfi-13/srfi-13.html}}\\ + \url{http://srfi.schemers.org/srfi-13/srfi-13.txt} + \endtexonly + % Changed the \mbox into \urlh for tex2page to avoid problems running tex2page + \htmlonly + \urlh{http://srfi.schemers.org/srfi-13/srfi-13.html}{http://srfi.schemers.org/srfi-13/srfi-13.html}\\ + \urlh{http://srfi.schemers.org/srfi-13/srfi-13.txt}{http://srfi.schemers.org/srfi-13/srfi-13.txt} + \endhtmlonly + \end{tightinset} + +\itum{The SRFI-14 character-set library} + This library provides a set-of-characters abstraction, which is frequently + useful when searching, parsing, filtering or otherwise operating on + strings and character data. The SRFI is provided by the \ex{char-set-lib} + package; its bindings are also available in the default \ex{scsh} package. + + More documentation on this library can be found at URLs + \begin{tightinset} + % The gratuitous mbox makes xdvi render the hyperlinks better. + \texonly + \mbox{\url{http://srfi.schemers.org/srfi-14/srfi-14.html}}\\ + \url{http://srfi.schemers.org/srfi-14/srfi-14.txt} + \endtexonly + % Changed the \mbox into \urlh for tex2page to avoid problems running tex2page + \htmlonly + \urlh{http://srfi.schemers.org/srfi-14/srfi-14.html}{http://srfi.schemers.org/srfi-14/srfi-14.html}\\ + \urlh{http://srfi.schemers.org/srfi-14/srfi-14.txt}{http://srfi.schemers.org/srfi-14/srfi-14.txt} + \endhtmlonly + \end{tightinset} + +\end{itemize} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Manipulating file names} +\label{sec:filenames} + +These procedures do not access the file-system at all; they merely operate +on file-name strings. Much of this structure is patterned after the gnu emacs +design. Perhaps a more sophisticated system would be better, something +like the pathname abstractions of {\CommonLisp} or MIT Scheme. However, +being {\Unix}-specific, we can be a little less general.
+ +\subsection{Terminology} +These procedures carefully adhere to the {\Posix} standard for file-name +resolution, which occasionally entails some slightly odd things. +This section will describe these rules, and give some basic terminology. + +A \emph{file-name} is either the file-system root (``/''), +or a series of slash-terminated directory components, followed by +a file component. +Root is the only file-name that may end in slash. +Some examples: +\begin{center} +\begin{tabular}{lll} + File name & Dir components & File component \\\hline + \ex{src/des/main.c} & \ex{("src" "des")} & \ex{"main.c"} \\ + \ex{/src/des/main.c} & \ex{("" "src" "des")} & \ex{"main.c"} \\ + \ex{main.c} & \ex{()} & \ex{"main.c"} \\ +\end{tabular} +\end{center} + +Note that the relative filename \ex{src/des/main.c} and the absolute filename +\ex{/src/des/main.c} are distinguished by the presence of the root component +\ex{""} in the absolute path. + +Multiple embedded slashes within a path have the same meaning as +a single slash. +More than two leading slashes at the beginning of a path have the same +meaning as a single leading slash---they indicate that the file-name +is an absolute one, with the path leading from root. +However, {\Posix} permits the OS to give special meaning to +\emph{two} leading slashes. +For this reason, the routines in this section do not simplify two leading +slashes to a single slash. + +A file-name in \emph{directory form} is either a file-name terminated by +a slash, \eg, ``\ex{/src/des/}'', or the empty string, ``''. +The empty string corresponds to the current working directory, +whose file-name is dot (``\ex{.}''). +Working backwards from the append-a-slash rule, +we extend the syntax of {\Posix} file-names to define the empty string +to be a file-name form of the root directory ``\ex{/}''. +(However, ``\ex{/}'' is also acceptable as a file-name form for root.)
+So the empty string has two interpretations: +as a file-name form, it is the file-system root; +as a directory form, it is the current working directory. +Slash is also an ambiguous form: \ex{/} is both a directory-form and +a file-name form. + +The directory form of a file-name is very rarely used. +Almost all of the procedures in scsh name directories by giving +their file-name form (without the trailing slash), not their directory form. +So, you say ``\ex{/usr/include}'', and ``\ex{.}'', not +``\ex{/usr/include/}'' and ``''. +The sole exceptions are +\ex{file-name-as-directory} and \ex{directory-as-file-name}, +whose jobs are to convert back-and-forth between these forms, +and \ex{file-name-directory}, whose job it is to split out the +directory portion of a file-name. +However, most procedures that expect a directory argument will coerce +a file-name in directory form to file-name form if it does not have +a trailing slash. +Bear in mind that the ambiguous case, empty string, will be +interpreted in file-name form, \ie, as root. + + + +\subsection{Procedures} + +\defun {file-name-directory?} {fname} \boolean +\defunx {file-name-non-directory?} {fname} \boolean +\begin{desc} +These predicates return true if the string is in directory form, or +file-name form (see the above discussion of these two forms). +Note that they both return true on the ambiguous case of empty string, +which is both a directory (current working directory), and a file name +(the file-system root). +\begin{center} +\begin{tabular}{lll} +File name & \ex{\ldots-directory?} & \ex{\ldots-non-directory?} \\ +\hline +\ex{"src/des"} & \ex{\sharpf} & \ex{\sharpt} \\ +\ex{"src/des/"} & \ex{\sharpt} & \ex{\sharpf} \\ +\ex{"/"} & \ex{\sharpt} & \ex{\sharpf} \\ +\ex{"."} & \ex{\sharpf} & \ex{\sharpt} \\ +\ex{""} & \ex{\sharpt} & \ex{\sharpt} +\end{tabular} +\end{center} +\end{desc} + +\begin{defundesc} {file-name-as-directory} {fname} \str + Convert a file-name to directory form. 
+ Basically, add a trailing slash if needed: + \begin{exampletable} + \ex{(file-name-as-directory "src/des")} & \ex{"src/des/"} \\ + \ex{(file-name-as-directory "src/des/")} & \ex{"src/des/"} \\[2ex] + % + \header{\ex{.}, \ex{/}, and \ex{""} are special:} + \ex{(file-name-as-directory ".")} & \ex{""} \\ + \ex{(file-name-as-directory "/")} & \ex{"/"} \\ + \ex{(file-name-as-directory "")} & \ex{"/"} + \end{exampletable} +\end{defundesc} + +\begin{defundesc} {directory-as-file-name} {fname} \str + Convert a directory to a simple file-name. + Basically, kill a trailing slash if one is present: + \begin{exampletable} + \ex{(directory-as-file-name "foo/bar/")} & \ex{"foo/bar"} \\[2ex] + % + \header{\ex{/} and \ex{""} are special:} + \ex{(directory-as-file-name "/")} & \ex{"/"} \\ + \ex{(directory-as-file-name "")} & \ex{"."} (\ie, the cwd) \\ + \end{exampletable} +\end{defundesc} + +\begin{defundesc} {file-name-absolute?} {fname} \boolean + Does \var{fname} begin with a root or \ex{\~} component? + (Recognising \ex{\~} as a home-directory specification + is an extension of {\Posix} rules.) +% + \begin{exampletable} + \ex{(file-name-absolute? "/usr/shivers")} & {\sharpt} \\ + \ex{(file-name-absolute? "src/des")} & {\sharpf} \\ + \ex{(file-name-absolute? "\~/src/des")} & {\sharpt} \\[2ex] + % + \header{Non-obvious case:} + \ex{(file-name-absolute? "")} & {\sharpt} (\ie, root) + \end{exampletable} +\end{defundesc} + + +\begin{defundesc} {file-name-directory} {fname} {{\str} or false} + Return the directory component of \var{fname} in directory form. + If the file-name is already in directory form, return it as-is. 
+% + \begin{exampletable} + \ex{(file-name-directory "/usr/bdc")} & \ex{"/usr/"} \\ + {\ex{(file-name-directory "/usr/bdc/")}} & + {\ex{"/usr/bdc/"}} \\ + \ex{(file-name-directory "bdc/.login")} & \ex{"bdc/"} \\ + \ex{(file-name-directory "main.c")} & \ex{""} \\[2ex] + % + \header{Root has no directory component:} + \ex{(file-name-directory "/")} & \ex{""} \\ + \ex{(file-name-directory "")} & \ex{""} + \end{exampletable} +\end{defundesc} + + +\begin{defundesc} {file-name-nondirectory} {fname} \str + Return non-directory component of fname. +% + \begin{exampletable} + {\ex{(file-name-nondirectory "/usr/ian")}} & + {\ex{"ian"}} \\ + \ex{(file-name-nondirectory "/usr/ian/")} & \ex{""} \\ + {\ex{(file-name-nondirectory "ian/.login")}} & + {\ex{".login"}} \\ + \ex{(file-name-nondirectory "main.c")} & \ex{"main.c"} \\ + \ex{(file-name-nondirectory "")} & \ex{""} \\ + \ex{(file-name-nondirectory "/")} & \ex{"/"} + \end{exampletable} +\end{defundesc} + + +\begin{defundesc} {split-file-name} {fname} {{\str} list} + Split a file-name into its components. +% + \begin{exampletable} + \splitline{\ex{(split-file-name "src/des/main.c")}} + {\ex{("src" "des" "main.c")}} \\[1.5ex] + % + \splitline{\ex{(split-file-name "/src/des/main.c")}} + {\ex{("" "src" "des" "main.c")}} \\[1.5ex] + % + \splitline{\ex{(split-file-name "main.c")}} {\ex{("main.c")}} \\[1.5ex] + % + \splitline{\ex{(split-file-name "/")}} {\ex{("")}} + \end{exampletable} +\end{defundesc} + + +\begin{defundesc} {path-list->file-name} {path-list [dir]} \str + Inverse of \ex{split-file-name}. +\begin{code} +(path-list->file-name '("src" "des" "main.c")) + {\evalto} "src/des/main.c" +(path-list->file-name '("" "src" "des" "main.c")) + {\evalto} "/src/des/main.c" +\cb +{\rm{}Optional \var{dir} arg anchors relative path-lists:} +(path-list->file-name '("src" "des" "main.c") + "/usr/shivers") + {\evalto} "/usr/shivers/src/des/main.c"\end{code} +% + The optional \var{dir} argument is usefully \ex{(cwd)}. 
+\end{defundesc} + + +\begin{defundesc} {file-name-extension} {fname} \str + Return the file-name's extension. +% + \begin{exampletable} + \ex{(file-name-extension "main.c")} & \ex{".c"} \\ + \ex{(file-name-extension "main.c.old")} & \ex{".old"} \\ + \ex{(file-name-extension "/usr/shivers")} & \ex{""} + \end{exampletable} +% + \begin{exampletable} + \header{Weird cases:} + \ex{(file-name-extension "foo.")} & \ex{"."} \\ + \ex{(file-name-extension "foo..")} & \ex{"."} + \end{exampletable} +% + \begin{exampletable} + \header{Dot files are not extensions:} + \ex{(file-name-extension "/usr/shivers/.login")} & \ex{""} + \end{exampletable} +\end{defundesc} + + +\begin{defundesc} {file-name-sans-extension} {fname} \str + Return everything but the extension. +% + \begin{exampletable} + \ex{(file-name-sans-extension "main.c")} & \ex{"main"} \\ + \ex{(file-name-sans-extension "main.c.old")} & \ex{"main.c"} \\ + \splitline{\ex{(file-name-sans-extension "/usr/shivers")}} + {\ex{"/usr/shivers"}} + \end{exampletable} +% + \begin{exampletable} + \header{Weird cases:} + \ex{(file-name-sans-extension "foo.")} & \ex{"foo"} \\ + \ex{(file-name-sans-extension "foo..")} & \ex{"foo."} \\[2ex] + % + \header{Dot files are not extensions:} + \splitline{\ex{(file-name-sans-extension "/usr/shivers/.login")}} + {\ex{"/usr/shivers/.login"}} + \end{exampletable} + + Note that appending the results of \ex{file-name-extension} and + {\ttt file\=name\=sans\=extension} in all cases produces the original file-name. +\end{defundesc} + + +\begin{defundesc} {parse-file-name} {fname} {[dir name extension]} + Let $f$ be \ex{(file-name-nondirectory \var{fname})}. + This function returns the three values: + \begin{itemize} + \item \ex{(file-name-directory \var{fname})} + \item \ex{(file-name-sans-extension \var{f})} + \item \ex{(file-name-extension \var{f}\/)} + \end{itemize} + The inverse of \ex{parse-file-name}, in all cases, is \ex{string-append}.
+ The boundary case of \ex{/} was chosen to preserve this inverse. +\end{defundesc} + +\begin{defundesc} {replace-extension} {fname ext} \str + This procedure replaces \var{fname}'s extension with \var{ext}. + It is exactly equivalent to + \codex{(string-append (file-name-sans-extension \var{fname}) \var{ext})} +\end{defundesc} + +\defun{simplify-file-name}{fname}\str +\begin{desc} + Removes leading and internal occurrences of dot. + A trailing dot is left alone, as the parent could be a symlink. + Removes internal and trailing double-slashes. + A leading double-slash is left alone, in accordance with {\Posix}. + However, triple and more leading slashes are reduced to a single slash, + in accordance with {\Posix}. + Double-dots (parent directory) are left alone, in case they come after + symlinks or appear in a \ex{/../\var{machine}/\ldots} ``super-root'' form + (which {\Posix} permits). +\end{desc} + +\defun{resolve-file-name}{fname [dir]}\str +\begin{desc} + \begin{itemize} + \item Do \ex{\~} expansion. + \item If \var{dir} is given, + convert a relative file-name to an absolute file-name, + relative to directory \var{dir}. + \end{itemize} +\end{desc} + +\begin{defundesc} {expand-file-name} {fname [dir]} \str +Resolve and simplify the file-name. +\end{defundesc} + +\begin{defundesc} {absolute-file-name} {fname [dir]} \str +Convert file-name \var{fname} into an absolute file name, +relative to directory \var{dir}, which defaults to the current +working directory. The file name is simplified before being +returned. + +This procedure does not treat a leading tilde character specially. +\end{defundesc} + +\begin{defundesc} {home-dir} {[user]} \str + \ex{home-dir} returns \var{user}'s home directory. + \var{User} defaults to the current user. 
+ + \begin{exampletable} + \ex{(home-dir)} & \ex{"/user1/lecturer/shivers"} \\ + \ex{(home-dir "ctkwan")} & \ex{"/user0/research/ctkwan"} + \end{exampletable} +\end{defundesc} + +\begin{defundesc} {home-file} {[user] fname} \str + Returns file-name \var{fname} relative to \var{user}'s home directory; + \var{user} defaults to the current user. +% + \begin{exampletable} + \ex{(home-file "man")} & \ex{"/usr/shivers/man"} \\ + \ex{(home-file "fcmlau" "man")} & \ex{"/usr/fcmlau/man"} + \end{exampletable} +\end{defundesc} + +The general \ex{substitute-env-vars} string procedure, +defined in the following section, +is also frequently useful for expanding file-names. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Other string manipulation facilities} + +\begin{defundesc} {substitute-env-vars} {fname} \str + Replace occurrences of environment variables with their values. + An environment variable is denoted by a dollar sign followed by + alphanumeric chars and underscores, or by a name surrounded by braces. + + \begin{exampletable} + \splitline{\ex{(substitute-env-vars "\$USER/.login")}} + {\ex{"shivers/.login"}} \\ + \cd{(substitute-env-vars "$\{USER\}_log")} & \cd{"shivers_log"} + \end{exampletable} +\end{defundesc} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{ASCII encoding} + +\defun {char->ascii}{\character} \integer +\defunx {ascii->char}{\integer} \character +\begin{desc} + These are identical to \ex{char->integer} and \ex{integer->char} except that + they use the {\Ascii} encoding.
+\end{desc} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Character predicates} + +\defun {char-letter?}\character\boolean +\defunx{char-lower-case?}\character\boolean +\defunx{char-upper-case?}\character\boolean +\defunx{char-title-case?}\character\boolean +\defunx{char-digit?}\character\boolean +\defunx{char-letter+digit?}\character\boolean +\defunx{char-graphic?}\character\boolean +\defunx{char-printing?}\character\boolean +\defunx{char-whitespace?}\character\boolean +\defunx{char-blank?}\character\boolean +\defunx{char-iso-control?}\character\boolean +\defunx{char-punctuation?}\character\boolean +\defunx{char-hex-digit?}\character\boolean +\defunx{char-ascii?}\character\boolean +\begin{desc} +Each of these predicates tests for membership in one of the standard +character sets provided by the SRFI-14 character-set library. +Additionally, the following redundant bindings are provided for {R5RS} +compatibility: +\begin{inset} +\begin{tabular}{ll} + {R5RS} name & scsh definition \\ \hline + \ex{char-alphabetic?} & \ex{char-letter?} \\ + \ex{char-numeric?} & \ex{char-digit?} \\ + \ex{char-alphanumeric?} & \ex{char-letter+digit?} +\end{tabular} +\end{inset} +\end{desc} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Deprecated character-set procedures} +\label{sec:char-sets} + +The SRFI-14 character-set library grew out of an earlier library developed +for scsh. +However, the SRFI standardisation process introduced incompatibilities with +the original scsh bindings. +The current version of scsh provides the library + \ex{obsolete-char-set-lib}, which contains the old bindings found in +previous releases of scsh. +The following table lists the members of this library, along with +the equivalent SRFI-14 binding. This obsolete library is deprecated and +\emph{not} open by default in the standard \ex{scsh} environment; +new code should use the SRFI-14 bindings.
+\begin{inset} +\begin{tabular}{ll} + Old \ex{obsolete-char-set-lib} & SRFI-14 \ex{char-set-lib} \\ \hline + + \ex{char-set-members} & \ex{char-set->list} \\ + \ex{chars->char-set} & \ex{list->char-set} \\ + \ex{ascii-range->char-set} & \ex{ucs-range->char-set} (not exact) \\ + \ex{predicate->char-set} & \ex{char-set-filter} (not exact) \\ + \ex{char-set-every?} & \ex{char-set-every} \\ + \ex{char-set-any?} & \ex{char-set-any} \\ + \\ + \ex{char-set-invert} & \ex{char-set-complement} \\ + \ex{char-set-invert!} & \ex{char-set-complement!} \\ + \\ + \ex{char-set:alphabetic} & \ex{char-set:letter} \\ + \ex{char-set:numeric} & \ex{char-set:digit} \\ + \ex{char-set:alphanumeric} & \ex{char-set:letter+digit} \\ + \ex{char-set:control} & \ex{char-set:iso-control} +\end{tabular} +\end{inset} +Note also that the \ex{->char-set} procedure no longer handles a predicate +argument. diff --git a/doc/scsh-manual/syscalls.tex b/doc/scsh-manual/syscalls.tex new file mode 100644 index 0000000..2a1deb8 --- /dev/null +++ b/doc/scsh-manual/syscalls.tex @@ -0,0 +1,3184 @@ +%&latex -*- latex -*- + +\chapter{System Calls} +\label{chapt:syscalls} + +Scsh provides (almost) complete access to the basic {\Unix} kernel services: +processes, files, signals and so forth. These procedures comprise a +{\Scheme} binding for {\Posix}, with a few of the more standard extensions +thrown in (\eg, symbolic links, \ex{fchown}, \ex{fstat}, sockets). + + +\section{Errors} +Scsh syscalls never return error codes, and do not use a global +\ex{errno} variable to report errors. +Errors are consistently reported by raising exceptions. +This frees up the procedures to return useful values, +and allows the programmer to assume that +\emph{if a syscall returns, it succeeded.} +This greatly simplifies the flow of the code from the programmer's point +of view.
+ +Since {\Scheme} does not yet have a standard exception system, the scsh +definition remains somewhat vague on the actual form of exceptions +and exception handlers. When a standard exception system is defined, +scsh will move to it. For now, scsh uses the {\scm} exception system, +with a simple sugaring on top to hide the details in the common case. + +System call error exceptions contain the {\Unix} \ex{errno} code reported by +the system call. Unlike C, the \ex{errno} value is a part of the exception +packet; it is \emph{not} accessed through a global variable. + +For reference purposes, the {\Unix} \ex{errno} numbers +are bound to the variables \ex{errno/perm}, \ex{errno/noent}, {\etc} +System calls never raise \ex{errno/intr}---they +automatically retry. + +\begin{dfndesc} + {errno-error}{errno syscall .\ data}{\noreturn}{procedure} +Raises a {\Unix} error exception for {\Unix} error number \var{errno}. +The \var{syscall} and \var{data} arguments are packaged up in the exception +packet passed to the exception handler. +\end{dfndesc} + +\defunx{with-errno-handler*}{handler thunk}{value(s) of thunk} +\begin{dfndescx} + {with-errno-handler}{handler-spec . body}{\valueofbody}{syntax} +{\Unix} syscalls raise error exceptions by calling \ex{errno-error}. +Programs can use \ex{with-errno-handler*} to establish +handlers for these exceptions. + +If a {\Unix} error arises while \var{thunk} is executing, +\var{handler} is called on two arguments like this: + \codex{(\var{handler} \var{errno} \var{packet})} +\var{packet} is a list of the form + $$\var{packet} = \ex{(\var{errno-msg} \var{syscall} . \var{data})},$$ +where \var{errno-msg} is the standard {\Unix} error message for the error, + \var{syscall} is the procedure that generated the error, +and \var{data} is a list of information generated by the error, + which varies from syscall to syscall. + +If \var{handler} returns, the handler search continues upwards.
+\var{Handler} can acquire the exception by invoking a saved continuation. +This procedure can be sugared over with the following syntax: +% +\begin{code} +(with-errno-handler + ((\var{errno} \var{packet}) \var{clause} \ldots) + \var{body1} + \var{body2} + \ldots)\end{code} +% +This form executes the body forms with a particular errno handler installed. +When an errno error is raised, the handler search machinery will +bind variable \var{errno} to the error's integer code, and variable +\var{packet} to the error's auxiliary data packet. +Then, the clauses will be checked for a match. +The first clause that matches is executed, and its value is the +value of the entire \ex{with-errno-handler} form. +If no clause matches, the handler search continues. + +Error clauses have two forms: +% +\begin{code} +((\var{errno} \ldots) \var{body} \ldots) +(else \var{body} \ldots)\end{code} +% +In the first type of clause, the \var{errno} forms are integer expressions. +They are evaluated and compared to the error's errno value. +An \ex{else} clause matches any errno value. +Note that the \var{errno} and \var{packet} +variables are lexically visible to the error clauses. + +Example: +\begin{code} +(with-errno-handler + ((errno packet) ; Only handle 3 particular errors. + ((errno/wouldblock errno/again) + (loop)) + ((errno/acces) + (format #t "Not allowed access!") + #f)) + + (foo frobbotz) + (blatz garglemumph))\end{code} +% +It is not defined what dynamic context the handler executes in, +so fluid variables cannot reliably be referenced. + +Note that Scsh system calls always retry when interrupted, so that +the \ex{errno/intr} exception is never raised. +If the programmer wishes to abort a system call on an interrupt, he +should have the interrupt handler explicitly raise an exception or +invoke a stored continuation to throw out of the system call. +\end{dfndescx} + + +\subsection{Interactive mode and error handling} +Scsh runs in two modes: interactive and script mode.
It starts up in +interactive mode if the scsh interpreter is started up with no script +argument. Otherwise, scsh starts up in script mode. The mode determines +whether scsh prints prompts in between reading and evaluating forms, and it +affects the default error handler. In interactive mode, the default error +handler will report the error, and generate an interactive breakpoint so that +the user can interact with the system to examine, fix, or dismiss from the +error. In script mode, the default error handler causes the scsh process to +exit. + +When scsh forks a child with \ex{(fork)}, the child resets to script mode. +This can be overridden if the programmer wishes. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{I/O} + +\subsection{Standard {\R4RS} I/O procedures} +In scsh, most standard {\R4RS} i/o operations (such as \ex{display} or +\ex{read-char}) work on both integer file descriptors and {\Scheme} ports. +When doing i/o with a file descriptor, the i/o operation is done +directly on the file, bypassing any buffered data that may have +accumulated in an associated port. +Note that character-at-a-time operations such as \ex{read-char} +are likely to be quite slow when performed directly upon file +descriptors. + +The standard {\R4RS} procedures \ex{read-char}, \ex{char-ready?}, \ex{write}, +\ex{display}, \ex{newline}, +and \ex{write-char} are all generic, accepting integer file descriptor +arguments as well as ports. +Scsh also mandates the availability of \ex{format}, and further requires +\ex{format} to accept file descriptor arguments as well as ports. + +The procedures \ex{peek-char} and \ex{read} do \emph{not} accept +file descriptor arguments, since these functions require the ability to +read ahead in the input stream, a feature not supported by {\Unix} I/O. 
+ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Port manipulation and standard ports} +\defun {close-after} {port consumer} {value(s) of consumer} +\begin{desc} + Returns \ex{(\var{consumer} \var{port})}, but closes the port on return. + No dynamic-wind magic. \remark{Is there a less-awkward name?} +\end{desc} + +\defun {error-output-port}{} {port} +\begin{desc} +This procedure is analogous to \ex{current-output-port}, but produces +a port used for error messages---the scsh equivalent of stderr. +\end{desc} + +\defun {with-current-input-port*} {port thunk} {value(s) of thunk} +\defunx {with-current-output-port*} {port thunk} {value(s) of thunk} +\defunx {with-error-output-port*} {port thunk} {value(s) of thunk} +\begin{desc} +These procedures install \var{port} as the current input, current output, +and error output port, respectively, for the duration of a call to +\var{thunk}. +\end{desc} + +\dfn {with-current-input-port} {port . body} {value(s) of body} {syntax} +\dfnx {with-current-output-port} {port . body} {value(s) of body} {syntax} +\dfnx {with-error-output-port} {port . body} {value(s) of body} {syntax} +\begin{desc} +These special forms are simply syntactic sugar for the +{\ttt with\=current\=input\=port*} procedure and friends. +\end{desc} + +\defun {set-current-input-port!} {port}{\undefined} +\defunx{set-current-output-port!}{port}{\undefined} +\defunx{set-error-output-port!} {port}{\undefined} +\begin{desc} +These procedures alter the dynamic binding of the current I/O port procedures +to new values. +\end{desc} + +\defun {close} {fd/port} {\boolean} +\begin{desc} + Close the port or file descriptor. + + If \var{fd/port} is a file descriptor, and it has a port allocated to it, + the port is shifted to a new file descriptor created with \ex{(dup + fd/port)} before closing \ex{fd/port}. The port then has its revealed + count set to zero. 
This reflects the design criteria that ports are not + associated with file descriptors, but with open files. + + To close a file descriptor, and any associated port it might have, you + must instead say one of (as appropriate): +\begin{code} +(close (fdes->inport fd)) +(close (fdes->outport fd))\end{code} + + The procedure returns true if it closed an open port. + If the port was already closed, it returns false; + this is not an error. +\end{desc} + +\defun {stdports->stdio}{} {\undefined} +\defunx {stdio->stdports}{} {\undefined} +\begin{desc} + These two procedures are used to synchronise Unix' standard I/O + file descriptors and Scheme's current I/O ports. + + \ex{(stdports->stdio)} causes the standard I/O file descriptors + (0, 1, and 2) to take their values from the current I/O ports. + It is exactly equivalent to the series of + redirections:\footnote{Why not \ex{move->fdes}? + Because the current output port and error port + might be the same port.} +\begin{code} +(dup (current-input-port) 0) +(dup (current-output-port) 1) +(dup (error-output-port) 2)\end{code} +% + \ex{stdio->stdports} causes the bindings of the current I/O ports + to be changed to ports constructed over the standard I/O file + descriptors. + It is exactly equivalent to the series of assignments +\begin{code} +(set-current-input-port! (fdes->inport 0)) +(set-current-output-port! (fdes->outport 1)) +(set-error-output-port! (fdes->outport 2))\end{code} +However, you are more likely to find the dynamic-extent variant, +\ex{with-stdio-ports*}, below, to be of use in general programming. +\end{desc} + +\defun{with-stdio-ports*} {thunk} {value(s) of thunk} +\dfnx {with-stdio-ports} {body \ldots} {value(s) of body}{syntax} +\begin{desc} + \ex{with-stdio-ports*} binds the standard ports \ex{(current-input-port)}, + \ex{(current-output-port)}, and \ex{(error-output-port)} to be ports + on file descriptors 0, 1, 2, and then calls \var{thunk}. 
+ It is equivalent to: +\begin{code} +(with-current-input-port (fdes->inport 0) + (with-current-output-port (fdes->outport 1) + (with-error-output-port (fdes->outport 2) + (thunk))))\end{code} +% +The \ex{with-stdio-ports} special form is merely syntactic sugar. +\end{desc} + + + + +\subsection{String ports} +{\scm} has string ports, which you can use. Scsh has not committed to the +particular interface or names that {\scm} uses, so be warned that the +interface described herein may be liable to change. + +\defun {make-string-input-port} {string} {\port} +\begin{desc} + Returns a port that reads characters from the supplied string. +\end{desc} + +\defun {make-string-output-port} {} {\port} +\defunx {string-output-port-output} {port} {\str} +\begin{desc} +A string output port is a port that collects the characters given to it into +a string. +The accumulated string is retrieved by applying \ex{string-output-port-output} +to the port. +\end{desc} + +\defun {call-with-string-output-port} {procedure} {\str} +\begin{desc} + The \var{procedure} value is called on a port. When it returns, + \ex{call-with-string-output-port} returns a string containing the + characters that were written to that port during the execution + of \var{procedure}. +\end{desc} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Revealed ports and file descriptors} + +The material in this section and the following one is not critical for most +applications. +You may safely skim or completely skip this section on a first reading. + +Dealing with {\Unix} file descriptors in a {\Scheme} environment is difficult. +In {\Unix}, open files are part of the process environment, and are referenced +by small integers called \emph{file descriptors}. Open file descriptors are +the fundamental way i/o redirections are passed to subprocesses, since +file descriptors are preserved across fork's and exec's.
+ +{\Scheme}, on the other hand, uses ports for specifying i/o sources. Ports are +garbage-collected {\Scheme} objects, not integers. Ports can be garbage +collected; when a port is collected, it is also closed. Because file +descriptors are just integers, it's impossible to garbage collect them---you +wouldn't be able to close file descriptor 3 unless there were no 3's in the +system, and you could further prove that your program would never again +compute a 3. This is difficult at best. + +If a {\Scheme} program only used {\Scheme} ports, and never actually used +file descriptors, this would not be a problem. But {\Scheme} code +must descend to the file descriptor level in at least two circumstances: +% +\begin{itemize} + \item when interfacing to foreign code + \item when interfacing to a subprocess. +\end{itemize} +% +This causes a problem. Suppose we have a {\Scheme} port constructed +on top of file descriptor 2. We intend to fork off a program that +will inherit this file descriptor. If we drop references to the port, +the garbage collector may prematurely close file 2 before we fork +the subprocess. The interface described below is intended to fix this and +other problems arising from the mismatch between ports and file descriptors. + +The {\Scheme} kernel maintains a port table that maps a file descriptor +to the {\Scheme} port allocated for it (or, {\sharpf} if there is no port +allocated for this file descriptor). This is used to ensure that +there is at most one open port for each open file descriptor. + +The port data structure for file ports has two fields besides the descriptor: +\var{revealed} and \var{closed?}. +When a file port is closed with \ex{(close port)}, +the port's file descriptor is closed, its entry in the port table is cleared, +and the port's \var{closed?} field is set to true. + +When a file descriptor is closed with \ex{(close fdes)}, any associated +port is shifted to a new file descriptor created with \ex{(dup fdes)}. 
+The port has its revealed count reset to zero (and hence becomes eligible +for closing on GC). See discussion below. +To really put a stake through a descriptor's heart without waiting for +associated ports to be GC'd, you must say one of +% +\begin{code} +(close (fdes->inport fdes)) +(close (fdes->outport fdes))\end{code} + +The \var{revealed} field is an aid to garbage collection. It is an integer +semaphore. If it is zero, the port's file descriptor can be closed when +the port is collected. Essentially, the \var{revealed} field reflects whether +or not the port's file descriptor has escaped to the {\Scheme} user. If +the {\Scheme} user doesn't know what file descriptor is associated with +a given port, then he can't possibly retain an ``integer handle'' on the +port after dropping pointers to the port itself, so the garbage collector +is free to close the file. + +Ports allocated with \ex{open-output-file} and \ex{open-input-file} are +unrevealed ports---\ie, \var{revealed} is initialised to 0. +No one knows the port's file descriptor, so the file descriptor can be closed +when the port is collected. + +The functions \ex{fdes->outport}, \ex{fdes->inport}, and \ex{port->fdes} +are used to shift back and forth between file descriptors and ports. When +\ex{port->fdes} reveals a port's file descriptor, it increments the port's +\var{revealed} field. When the user is through with the file descriptor, he +can call \ex{(release-port-handle \var{port})}, which decrements the count. +The function \ex{(call/fdes fd/port \var{proc})} automates this protocol. +\ex{call/fdes} uses \ex{dynamic-wind} to enforce the protocol. +If \var{proc} throws out of the \ex{call/fdes} application, +the unwind handler releases the descriptor handle; +if the user subsequently tries to throw \emph{back} into \var{proc}'s +context, the wind handler raises an error.
When the user maps a file +descriptor to a port with \ex{fdes->outport} or \ex{fdes->inport}, the port +has its revealed field incremented. + +Not all file descriptors are created by requests to make ports. Some are +inherited on process invocation via \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=exec&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{exec(2)}}, and are simply part of the +global environment. Subprocesses may depend upon them, so if a port is later +allocated for these file descriptors, it should be considered as a revealed +port. For example, when the {\Scheme} shell's process starts up, it opens ports +on file descriptors 0, 1, and 2 for the initial values of +\ex{(current-input-port)}, \ex{(current-output-port)}, and +\ex{(error-output-port)}. +These ports are initialised with \var{revealed} set to 1, +so that stdin, stdout, and stderr are not closed even if the user drops the +port. + +Unrevealed file ports have the nice property that they can be closed when all +pointers to the port are dropped. This can happen during gc, or at an +\ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=exec&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{exec()}}---since all memory is dropped at an \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=exec&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{exec()}}. No one knows the +file descriptor associated with the port, so the exec'd process certainly +can't refer to it. + +This facility preserves the transparent close-on-collect property +for file ports that are used in straightforward ways, yet allows +access to the underlying {\Unix} substrate without interference from +the garbage collector. This is critical, since shell programming +absolutely requires access to the {\Unix} file descriptors, as their +numerical values are a critical part of the process interface.
+ +A port's underlying file descriptor can be shifted around with \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=dup&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{dup(2)}} +when convenient. That is, the actual file descriptor on top of which a port is +constructed can be shifted around underneath the port by the scsh kernel when +necessary. This is important, because when the user is setting up file +descriptors prior to a \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=exec&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{exec(2)}}, he may explicitly use a file descriptor +that has already been allocated to some port. In this case, the scsh kernel +just shifts the port's file descriptor to some new location with \ex{dup}, +freeing up its old descriptor. This prevents errors from happening in the +following scenario. Suppose we have a file open on port \ex{f}. Now we want +to run a program that reads input on file 0, writes output to file 1, errors +to file 2, and logs execution information on file 3. We want to run this +program with input from \ex{f}. +So we write: +% +\begin{code} +(run (/usr/shivers/bin/prog) + (> 1 output.txt) + (> 2 error.log) + (> 3 trace.log) + (= 0 ,f))\end{code} +% +Now, suppose by ill chance that, unbeknownst to us, when the operating system +opened \ex{f}'s file, it allocated descriptor 3 for it. If we blindly redirect +\ex{trace.log} into file descriptor 3, we'll clobber \ex{f}! However, the +port-shuffling machinery saves us: when the \ex{run} form tries to dup +\ex{trace.log}'s file descriptor to 3, \ex{dup} will notice that file +descriptor 3 is already associated with an unrevealed port (\ie, \ex{f}). So, +it will first move \ex{f} to some other file descriptor. This keeps \ex{f} +alive and well so that it can subsequently be dup'd into descriptor 0 for +\ex{prog}'s stdin. 
+ +The port-shifting machinery makes the following guarantee: a port is only +moved when the underlying file descriptor is closed, either by a \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=close&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{close()}} +or a \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=dup2&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{dup2()}} operation. Otherwise a port/file-descriptor association is +stable. + +Under normal circumstances, all this machinery just works behind the scenes to +keep things straightened out. The only time the user has to think about it is +when he starts accessing file descriptors from ports, which he should almost +never have to do. If a user starts asking what file descriptors have been +allocated to what ports, he has to take responsibility for managing this +information. + +\subsection{Port-mapping machinery} + +The procedures provided in this section are almost never needed. +You may safely skim or completely skip this section on a first reading. + +Here are the routines for manipulating ports in scsh. The important +points to remember are: +\begin{itemize} + \item A file port is associated with an open file, not a particular file + descriptor. + \item The association between a file port and a particular file descriptor + is never changed \emph{except} when the file descriptor is explicitly + closed. ``Closing'' includes being used as the target of a \ex{dup2}, so + the set of procedures below that close their targets are + \ex{close}, two-argument \ex{dup}, and \ex{move->fdes}. + If the target file descriptor of one of these routines has an + allocated port, the port will be shifted to another freshly-allocated + file descriptor, and marked as unrevealed, thus preserving the port + but freeing its old file descriptor. +\end{itemize} +These rules are what is necessary to ``make things work out'' with no +surprises in the general case. 
+ +\defun {fdes->inport} {fd} {port} +\defunx {fdes->outport} {fd} {port} +\defunx {port->fdes} {port} {\fixnum} +\begin{desc} + These increment the port's revealed count. +\end{desc} + +\defun {port-revealed} {port} {{\integer} or \sharpf} +\begin{desc} +Return the port's revealed count if positive, otherwise \sharpf. +\end{desc} + +\defun{release-port-handle} {port} {\undefined} +\begin{desc} +Decrement the port's revealed count. +\end{desc} + +\defun {call/fdes} {fd/port consumer} {value(s) of consumer} +\begin{desc} + Calls \var{consumer} on a file descriptor; + takes care of revealed bookkeeping. + If \var{fd/port} is a file descriptor, this is just + \ex{(\var{consumer} \var{fd/port})}. + If \var{fd/port} is a port, + calls \var{consumer} on its underlying file descriptor. + While \var{consumer} is running, the port's revealed count is incremented. + + When \ex{call/fdes} is called with port argument, you are not allowed to + throw into \var{consumer} with a stored continuation, as that would violate + the revealed-count bookkeeping. +\end{desc} + +\defun{move->fdes} {fd/port target-fd} {port or fdes} +\begin{desc} + Maps fd$\rightarrow$fd and port$\rightarrow$port. + + If \var{fd/port} is a file-descriptor not equal to \var{target-fd}, + dup it to \var{target-fd} and close it. Returns \var{target-fd}. + + If \var{fd/port} is a port, it is shifted to \var{target-fd}, + by duping its underlying file-descriptor if necessary. + \var{Fd/port}'s original file descriptor is + closed (if it was different from \var{target-fd}). + Returns the port. + This operation resets \var{fd/port}'s revealed count to 1. + + In all cases when \var{fd/port} is actually shifted, if there is a port + already using \var{target-fd}, it is first relocated to some other file + descriptor. 
+\end{desc} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{{\Unix} I/O} + +\defun {dup} {fd/port [newfd]} {fd/port} +\defunx{dup->inport} {fd/port [newfd]} {port} +\defunx{dup->outport} {fd/port [newfd]} {port} +\defunx{dup->fdes} {fd/port [newfd]} {fd} +\begin{desc} +These procedures provide the functionality of C's \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=dup&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{dup()}} and \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=dup2&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{dup2()}}. +The different routines return different types of values: +\ex{dup->inport}, \ex{dup->outport}, and \ex{dup->fdes} return +input ports, output ports, and integer file descriptors, respectively. +\ex{dup}'s return value depends on the type of +\var{fd/port}---it maps fd$\rightarrow$fd and port$\rightarrow$port. + +These procedures use the {\Unix} \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=dup&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{dup()}} syscall to replicate +the file descriptor or file port \var{fd/port}. +If a \var{newfd} file descriptor is given, it is used as the target of +the dup operation, \ie, the operation is a \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=dup2&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{dup2()}}. +In this case, procedures that return a port (such as \ex{dup->inport}) +will return one with the revealed count set to one. +For example, \ex{(dup (current-input-port) 5)} produces +a new port with underlying file descriptor 5, whose revealed count is 1. +If \var{newfd} is not specified, +then the operating system chooses the file descriptor, +and any returned port is marked as unrevealed.
+ +If the \var{newfd} target is given, +and some port is already using that file descriptor, +the port is first quietly shifted (with another \ex{dup}) +to some other file descriptor (zeroing its revealed count). + +Since {\Scheme} doesn't provide read/write ports, +\ex{dup->inport} and \ex{dup->outport} can be useful for +getting an output version of an input port, or \emph{vice versa}. +For example, if \ex{p} is an input port open on a tty, and +we would like to do output to that tty, we can simply use +\ex{(dup->outport p)} to produce an equivalent output port for the tty. +\end{desc} + +\defun {seek} {fd/port offset [whence]} {\integer} +\begin{desc} +Reposition the I/O cursor for a file descriptor or port. +\var{whence} is one of \{\ex{seek/set}, \ex{seek/delta}, \ex{seek/end}\}, +and defaults to \ex{seek/set}. +If \ex{seek/set}, then \var{offset} is an absolute index into the file; +if \ex{seek/delta}, then \var{offset} is a relative offset from the current + I/O cursor; +if \ex{seek/end}, then \var{offset} is a relative offset from the end of file. +The \var{fd/port} argument may be a port or an integer file descriptor. +Not all such values are seekable; +this is dependent on the OS implementation. +The return value is the resulting position of the I/O cursor in the I/O stream. +\oops{The current implementation doesn't handle \var{offset} arguments + that are not immediate integers (\ie, representable in 30 bits).} +\end{desc} + + +\defun {tell} {fd/port} {\integer} +\begin{desc} +Returns the position of the I/O cursor in the I/O stream. +Not all file descriptors or ports support cursor-reporting; +this is dependent on the OS implementation. +\end{desc} + +\begin{defundesc} {open-file} {fname flags [perms]} {\port} + \var{Perms} defaults to \cd{#o666}. + \var{Flags} is an integer bitmask, composed by or'ing together constants + listed in table~\ref{table:fdes-status-flags} + (page~\pageref{table:fdes-status-flags}).
+ You must use exactly one of the \ex{open/read}, \ex{open/write}, or + \ex{open/read+write} flags. +% + The returned port is an input port if the \var{flags} permit it, + otherwise an output port. \R4RS/\scm/scsh do not have input/output ports, + so it's one or the other. This should be fixed. (You can hack simultaneous + i/o on a file by opening it r/w, taking the result input port, + and duping it to an output port with \ex{dup->outport}.) +\end{defundesc} + +\defun{open-input-file}{fname [flags]}\port +\begin{defundescx}{open-output-file}{fname [flags perms]}\port + These are equivalent to \ex{open-file}, after first setting the + read/write bits of the \var{flags} argument to \ex{open/read} or + \ex{open/write}, respectively. + \var{Flags} defaults to zero for \ex{open-input-file}, + and + \codex{(bitwise-ior open/create open/truncate)} + for \ex{open-output-file}. + These defaults make the procedures backwards-compatible with their + unary {\R4RS} definitions. +\end{defundescx} + +\begin{defundesc} {open-fdes} {fname flags [perms]} \integer + Returns a file descriptor. +\end{defundesc} + +\defun{fdes-flags}{fd/port}{\integer} +\begin{defundescx}{set-fdes-flags}{fd/port \integer}{\undefined} +These procedures allow reading and writing of an open file's flags. +The only such flag defined by {\Posix} is \ex{fdflags/close-on-exec}; +your {\Unix} implementation may provide others. + +These procedures should not be particularly useful to the programmer, +as the scsh runtime already provides automatic control of the close-on-exec +property. +Unrevealed ports always have their file descriptors marked +close-on-exec, as they can be closed when the scsh process execs a new program. +Whenever the user reveals or unreveals a port's file descriptor, +the runtime automatically sets or clears the flag for the programmer. +Programmers that manipulate this flag should be aware of these extra, automatic +operations. 
+\end{defundescx} + +\defun{fdes-status}{fd/port}{\integer} +\begin{defundescx}{set-fdes-status}{fd/port \integer}{\undefined} +These procedures allow reading and writing of an open file's status flags +(table~\ref{table:fdes-status-flags}). +% +\begin{table} +\begin{center} +\begin{tabular}{@{}rp{1.5in}>{\ttfamily}l@{}} +& Allowed operations & Status flag \\ \cline{2-3} +\textbf{Open+Get+Set} & + \parbox[t]{1.5in}{\raggedright + These flags can be used in \ex{open-file}, \ex{fdes-status}, + and \ex{set-fdes-status} calls.} & +% + \begin{tabular}[t]{@{}>{\ttfamily}l@{}} + %% These are gettable and settable + open/append \\ + open/non-blocking \\ + open/async \textrm{(Non-\Posix)} \\ + open/fsync \textrm{(Non-\Posix)} + \end{tabular} +\\\cline{2-3} +\textbf{Open+Get} & + \parbox[t]{1.5in}{\raggedright + These flags can be used in \ex{open-file} and \ex{fdes-status} calls, + but are ignored by \ex{set-fdes-status}.\strut} & +% + \begin{tabular}[t]{@{}>{\ttfamily}l@{}} + %% These are gettable, not settable + open/read \\ + open/write \\ + open/read+write \\ + open/access-mask + \end{tabular} +\\\cline{2-3} +\textbf{Open} & + \parbox[t]{1.5in}{\raggedright + These flags are only relevant in + \ex{open-file} calls; + they are ignored by \ex{fdes-status} and \ex{set-fdes-status} calls.} & +% + \begin{tabular}[t]{@{}>{\ttfamily}l@{}} + %% These are neither gettable nor settable. + open/create \\ + open/exclusive \\ + open/no-control-tty \\ + open/truncate + \end{tabular} +\end{tabular} +\end{center} +\caption{Status flags for \texttt{open-file}, + \texttt{fdes-status} and \texttt{set-fdes-status}. + Only {\Posix} flags are guaranteed to be present; + your operating system may define others. + The \texttt{open/access-mask} value is not an actual flag, + but a bit mask used to select the field for the \texttt{open/read}, + \texttt{open/write} and \texttt{open/read+write} bits. 
+ } +\label{table:fdes-status-flags} +\end{table} + +Note that this file-descriptor state is shared between file descriptors +created by \ex{dup}---if you create port \var{b} by applying \ex{dup} +to port \var{a}, and change {\var{b}}'s status flags, you will also have +changed {\var{a}}'s status flags. +\end{defundescx} + +\begin{defundesc}{pipe}{} {[\var{rport} \var{wport}]} +Returns two ports, the read and write end-points of a {\Unix} pipe. +\end{defundesc} + +\defun{read-string}{nbytes [fd/port]} {{\str} or \sharpf} +\dfnix{read-string!} {str [fd/port start end]} {nread or \sharpf}{procedure} + {read-string"!@\texttt{read-string"!}} +\begin{desc} + These calls read exactly as much data as you requested, unless + there is not enough data (eof). + \ex{read-string!} reads the data into string \var{str} + at the indices in the half-open interval $[\var{start},\var{end})$; + the default interval is the whole string: $\var{start}=0$ and + $\var{end}=\ex{(string-length \var{string})}$. + They will persistently retry on partial reads and when interrupted + until (1) error, (2) eof, or (3) the input request is completely + satisfied. + Partial reads can occur when reading from an intermittent source, + such as a pipe or tty. + + \ex{read-string} returns the string read; \ex{read-string!} returns + the number of characters read. They both return false at eof. + A request to read zero bytes returns immediately, with no eof check. + + The values of \var{start} and \var{end} must specify a well-defined + interval in \var{str}, + \ie, $0 \le \var{start} \le \var{end} \le \ex{(string-length \var{str})}$. + + Any partially-read data is included in the error exception packet. + Error returns on non-blocking input are considered an error. 
+\end{desc} + +\defun {read-string/partial} {nbytes [fd/port]} {{\str} or \sharpf} +\dfnix{read-string!/partial} {str [fd/port start end]} {nread or \sharpf} + {procedure}{read-string"!/partial@\texttt{read-string"!/partial}} +\begin{desc} +% + These are atomic best-effort/forward-progress calls. + Best effort: they may read less than you request if there is a + lesser amount of data immediately available (\eg, because you + are reading from a pipe or a tty). + Forward progress: if no data is immediately available + (\eg, empty pipe), they will block. + Therefore, if you request an $n>0$ byte read, + while you may not get everything you asked for, you will always get something + (barring eof). + + There is one case in which the forward-progress guarantee is cancelled: + when the programmer explicitly sets the port to non-blocking i/o. + In this case, if no data is immediately available, + the procedure will not block, but will immediately return a zero-byte read. + + \ex{read-string/partial} reads the data into a freshly allocated string, + which it returns as its value. + \ex{read-string!/partial} reads the data into string \var{str} + at the indices in the half-open interval $[\var{start},\var{end})$; + the default interval is the whole string: $\var{start}=0$ and + $\var{end}=\ex{(string-length \var{string})}$. + The values of \var{start} and \var{end} must specify a well-defined + interval in \var{str}, + \ie, $0 \le \var{start} \le \var{end} \le \ex{(string-length \var{str})}$. + It returns the number of bytes read. + + A request to read zero bytes returns immediately, with no eof check. + + In sum, there are only three ways you can get a zero-byte read: + (1) you request one, (2) you turn on non-blocking i/o, or (3) you + try to read at eof. + + These are the routines to use for non-blocking input.
+ They are also useful when you wish to efficiently process data + in large blocks, and your algorithm is insensitive to the block size + of any particular read operation. +\end{desc} + +\defun {select }{rvec wvec evec [timeout]}{[rvec' wvec' evec']} +\defunx{select!}{rvec wvec evec [timeout]}{[nr nw ne]} +\begin{desc} +The \ex{select} procedure allows a process to block and wait for events on +multiple I/O channels. +The \var{rvec} and \var{evec} arguments are vectors of input ports and +integer file descriptors; \var{wvec} is a vector of output ports and +integer file descriptors. +The procedure returns three vectors whose elements are subsets of the +corresponding arguments. +Every element of \var{rvec'} is ready for input; +every element of \var{wvec'} is ready for output; +every element of \var{evec'} has an exceptional condition pending. + +The \ex{select} call will block until at least one of the I/O channels +passed to it is ready for operation. +The \var{timeout} value can be used to force the call to time-out +after a given number of seconds. It defaults to the special value +\ex{\#f}, meaning wait indefinitely. A zero value can be used to poll +the I/O channels. + +If an I/O channel appears more than once in a given vector---perhaps +occurring once as a Scheme port, and once as the port's underlying +integer file descriptor---only one of these two references may appear +in the returned vector. +Buffered I/O ports are handled specially---if an input port's buffer is +not empty, or an output port's buffer is not yet full, then these +ports are immediately considered eligible for I/O without using +the actual, primitive \ex{select} system call to check the underlying +file descriptor. +This works pretty well for buffered input ports, but is a little +problematic for buffered output ports. + +The \ex{select!} procedure is similar, but indicates the subset +of active I/O channels by side-effecting the argument vectors.
+Non-active I/O channels in the argument vectors are overwritten with +{\sharpf} values. +The call returns the number of active elements remaining in each +vector. +As a convenience, the vectors passed in to \ex{select!} are +allowed to contain {\sharpf} values as well as integers and ports. + +\remark{I have found the \ex{select!} interface to be the more + useful of the two. After the system call, it allows you + to check a specific I/O channel in constant time.} +\end{desc} + + +\begin{defundescx}{write-string}{string [fd/port start end]}\undefined + This procedure writes all the data requested. + If the procedure cannot perform the write with a single kernel call + (due to interrupts or partial writes), + it will perform multiple write operations until all the data is written + or an error has occurred. + A non-blocking i/o error is considered an error. + (Error exception packets for this syscall include the amount of + data partially transferred before the error occurred.) + + The data written are the characters of \var{string} in the half-open + interval $[\var{start},\var{end})$. + The default interval is the whole string: $\var{start}=0$ and + $\var{end}=\ex{(string-length \var{string})}$. + The values of \var{start} and \var{end} must specify a well-defined + interval in \var{str}, + \ie, $0 \le \var{start} \le \var{end} \le \ex{(string-length \var{str})}$. + A zero-byte write returns immediately, with no error. + + Output to buffered ports: \ex{write-string}'s efforts end as soon + as all the data has been placed in the output buffer. + Errors and true output may not happen until a later time, of course. +\end{defundescx} + +\begin{defundescx}{write-string/partial}{string [fd/port start end]}{nwritten} + This routine is the atomic best-effort/forward-progress analog + to \ex{write-string}. + It returns the number of bytes written, which may be less than you + asked for. 
+ Partial writes can occur when (1) we write off the physical end of + the media, (2) the write is interrupted, or (3) the file descriptor + is set for non-blocking i/o. + + If the file descriptor is not set up for non-blocking i/o, then + a successful return from these procedures makes a forward progress + guarantee---that is, a partial write took place of at least one byte: + \begin{itemize} + \item If we are at the end of physical media, and no write takes place, + an error exception is raised. + So a return implies we wrote \emph{something}. + \item If the call is interrupted after a partial transfer, it returns + immediately. But if the call is interrupted before any data transfer, + then the write is retried. + \end{itemize} + + If we request a zero-byte write, then the call immediately returns 0. + If the file descriptor is set for non-blocking i/o, then the call + may return 0 if it was unable to immediately write anything + (\eg, full pipe). + Barring these two cases, a write either returns $\var{nwritten} > 0$, + or raises an error exception. + + Non-blocking i/o is only available on file descriptors and unbuffered + ports. Doing non-blocking i/o to a buffered port is not well-defined, + and is an error (the problem is the subsequent flush operation). +\end{defundescx} + +\subsection{Buffered I/O} + +{\scm} ports use buffered I/O---data is transferred to or from the +OS in blocks. Scsh provides control of this mechanism: the programmer +may force saved-up output data to be transferred to the OS when +he chooses, +and may also choose which I/O buffering policy to employ for a given +port (or turn buffering off completely). + +It can be useful to turn I/O buffering off in some cases, for example +when an I/O stream is to be shared by multiple subprocesses. +For this reason, scsh allocates an unbuffered port for file descriptor 0 +at start-up time.
+Because shells frequently share stdin with subprocesses, if the shell +does buffered reads, it might ``steal'' input intended for a subprocess. For +this reason, all shells, including sh, csh, and scsh, read stdin unbuffered. +Applications that can tolerate buffered input on stdin can reset +\ex{(current-input-port)} to block buffering for higher performance. + +\begin{defundesc}{set-port-buffering}{port policy [size]}\undefined +This procedure allows the programmer to assign a particular I/O buffering +policy to a port, and to choose the size of the associated buffer. +It may only be used on new ports, \ie, before I/O is performed on the port. +There are three buffering policies that may be chosen: + \begin{inset} + \begin{tabular}{l@{\qquad}l} + \ex{bufpol/block} & General block buffering (general default) \\ + \ex{bufpol/line} & Line buffering (tty default) \\ + \ex{bufpol/none} & Direct I/O---no buffering + \end{tabular} + \end{inset} +The line buffering policy flushes output whenever a newline is output; +whenever the buffer is full; or whenever an input is read from stdin. +Line buffering is the default for ports open on terminal devices. + +The \var{size} argument requests an I/O buffer of \var{size} bytes. +If not given, a reasonable default is used; if given and zero, +buffering is turned off +(\ie, $\var{size} = 0$ for any policy is equivalent to + $\var{policy} = \ex{bufpol/none}$). +\end{defundesc} + +\begin{defundesc}{force-output} {[fd/port]}{\undefined} + This procedure does nothing when applied to an integer file descriptor + or unbuffered port. + It flushes buffered output when applied to a buffered port, + and raises a write-error exception on error. Returns no value. +\end{defundesc} + +\begin{defundesc}{flush-all-ports} {}{\undefined} + This procedure flushes all open output ports with buffered data. +\end{defundesc} + +\subsection{File locking} + +Scsh provides {\Posix} advisory file locking. 
+\emph{Advisory} locks are locks that can be checked by user code, +but do not affect other I/O operations. +For example, if a process has an exclusive lock on a region of a file, +other processes will not be able to obtain locks on that region of the file, +but they will still be able to read and write the file with no hindrance. +Using advisory locks requires cooperation amongst the agents accessing +the shared resource. + +\remark{ +Unfortunately, {\Posix} file locks are associated with actual files, +not with associated open file descriptors. +Once a process locks a file, using some file descriptor \var{fd}, +the next time \emph{any} file descriptor referencing that file is closed, +all associated locks are released. +This severely limits the utility of {\Posix} advisory file locks, +and we'd recommend caution when using them. +It is not without reason that the FreeBSD man pages refer to {\Posix} +file locking as ``completely stupid.'' + +Scsh moves Scheme ports from file descriptor to file descriptor with +\ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=dup&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{dup()}} and \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=close&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{close()}} as required by the runtime, +so it is impossible to keep file locks open across one of these shifts. +Hence we can only offer {\Posix} advisory file locking directly on raw +integer file descriptors; +regrettably, there are no facilities for locking Scheme ports. + +Note that once a Scheme port is revealed in scsh, the runtime will not +shift the port around with \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=dup&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{dup()}} and \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=close&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{close()}}. 
+This means the file-locking procedures can then be applied to the port's +associated file descriptor. +} + +{\Posix} allows the user to lock a region of a file with either +an exclusive or shared lock. +Locked regions are described by the \emph{lock-region} record: +\begin{code} +(define-record lock-region + exclusive? + start + len + whence + proc)\end{code}% +\index{lock-region?}% +\index{lock-region:exclusive?} \index{lock-region:whence}% +\index{lock-region:start} \index{lock-region:end}% +\index{lock-region:len} \index{lock-region:proc}% +% +\begin{itemize} +\item +The \ex{exclusive?} field is true if the lock is exclusive; +false if it is shared. + +\item +The \ex{whence} field is one of the values from the \ex{seek} call: +\ex{seek/set}, \ex{seek/delta}, or \ex{seek/end}, +and determines the interpretation of the \ex{start} field: +\begin{itemize} +\item If \ex{seek/set}, the \ex{start} value is simply an absolute index +into the file. +\item If \ex{seek/delta}, the \ex{start} value is an offset from the +file descriptor's current position in the file. +\item If \ex{seek/end}, the \ex{start} value is an offset from the +end of the file. +\end{itemize} +The region of the file being locked is given by the \ex{start} and \ex{len} +fields; +if \ex{len} is zero, it means ``infinity,'' that is, the region extends +from the starting point through the end of the file, even as the file is +extended by subsequent write operations. + +\item +The \ex{proc} field gives the process object for the process holding the region +lock, when relevant (see \ex{get-lock-region} below). +\end{itemize} + +\begin{defundesc}{make-lock-region}{exclusive? start len [whence]}{lock-region} +This procedure makes a lock-region record. +The \ex{whence} field defaults to \ex{seek/set}. 
+\end{defundesc} + +\defun {lock-region}{fdes lock}{\undefined} +\defunx{lock-region/no-block}{fdes lock}{\boolean} +\begin{desc} +These procedures lock a region of the file referenced by file descriptor +\var{fdes}. +The \ex{lock-region} procedure blocks until the lock is granted; +the non-blocking variant returns a boolean indicating whether or not +the lock was granted. +To take an exclusive (write) lock, you must have the file descriptor +open with write access; +to take a shared (read) lock, you must have the file descriptor +open with read access. +\end{desc} + +\begin{defundesc}{get-lock-region}{fdes lock}{lock-region or \sharpf} +Return the first lock region on \var{fdes} that would conflict with +lock region \var{lock}. +If there is no such lock region, return false. +This procedure fills out the \ex{proc} field of the returned lock region, +and is the only procedure that has anything to do with this field. +(See section~\ref{sec:proc-objects} for a description of process objects.) +Note that if you apply this procedure to a file system that is shared +across multiple operating systems (\ie, an NFS file system), the \ex{proc} +field may be ambiguous. +We note, again, that {\Posix} advisory file locking is not a terribly useful +or well-designed facility. +\end{defundesc} + +\begin{defundesc}{unlock-region}{fdes lock}{\undefined} +Release a lock from a file. +\end{defundesc} + +\defun{with-region-lock*}{fdes lock thunk}{value(s) of thunk} +\dfnx{with-region-lock}{fdes lock body \ldots}{value(s) of body}{syntax} +\begin{desc} +This procedure obtains the requested lock, and then calls +\ex{(\var{thunk})}. When \var{thunk} returns, the lock is released. +A non-local exit (\eg, throwing to a saved continuation or raising +an exception) also causes the lock to be released. + +After a normal return from \var{thunk}, its return values are returned +by \ex{with-region-lock*}. +The \ex{with-region-lock} special form is equivalent syntactic sugar. 
+\end{desc} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\section{File system} + +Besides the following procedures, which allow access to the +computer's file system, scsh also provides a set of procedures +which manipulate file \emph{names}. These string-processing +procedures are documented in section \ref{sec:filenames}. + +\defun {create-directory} {fname [perms override?]} {\undefined} +\defunx{create-fifo} {fname [perms override?]} {\undefined} +\defunx{create-hard-link} {oldname newname [override?]} {\undefined} +\begin{defundescx} + {create-symlink} {old-name new-name [override?]} {\undefined} + + These procedures create objects of various kinds in the file system. + + The \var{override?} argument controls the action if there is already an + object in the file system with the new name: + \begin{optiontable} + \sharpf & signal an error (default) \\ + 'query & prompt the user \\ + \textnormal{\emph{other}}& \parbox[t]{0.7\linewidth}{ + delete the old object (with \ex{delete-file} + or \ex{delete-directory,} as appropriate) before + creating the new object.} + + \end{optiontable} + + \var{Perms} defaults to \cd{#o777} (but is masked by the current umask). + + \remark{Currently, if you try to create a hard or symbolic link from a + file to itself, you will error out with \var{override?} false, and simply + delete your file with \var{override?} true. Catching this will require + some sort of true-name procedure, which I currently do not have.} +\end{defundescx} + +\defun {delete-directory} {fname} \undefined +\defunx{delete-file} {fname} \undefined +\begin{defundescx} {delete-filesys-object} {fname} \undefined +These procedures delete objects from the file system. +The {\ttt delete\=filesys\=object} procedure will delete an object +of any type from the file system: files, (empty) directories, symlinks, fifos, +\etc. 
+ +If the object being deleted doesn't exist, \ex{delete-directory} and +\ex{delete-file} raise an error, +while \ex{delete-filesys-object} simply returns. +\end{defundescx} + +\begin{defundescx}{read-symlink}{fname} \str + Return the filename referenced by symbolic link \ex{fname}. +\end{defundescx} + +\begin{defundescx} {rename-file} {old-fname new-fname [override?]} \undefined + If you override an existing object, then \var{old-fname} + and \var{new-fname} must type-match---either both directories, + or both non-directories. + This is required by the semantics of {\Unix} \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=rename&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{rename()}}. + + \remark{ + There is an unfortunate atomicity problem with the \ex{rename-file} + procedure: if you + specify no-override, but create file \ex{new-fname} sometime between + \ex{rename-file}'s existence check and the actual rename operation, + your file will be clobbered with \ex{old-fname}. There is no way to fix + this problem, given the semantics of {\Unix} \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=rename&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{rename()}}; + at least it is highly unlikely to occur in practice. + } +\end{defundescx} + +\defun {set-file-mode} {fname/fd/port mode} \undefined +\defunx{set-file-owner} {fname/fd/port uid} {\undefined} +\defunx{set-file-group} {fname/fd/port gid} {\undefined} +\begin{desc} + These procedures set the permission bits, owner id, and group id of a + file, respectively. + The file can be specified by giving the file name, or either an + integer file descriptor or a port open on the file. + Setting file user ownership usually requires root privileges. 
+\end{desc} + +\defun {set-file-times} {fname [access-time mod-time]} {\undefined} +\begin{desc} + This procedure sets the access and modified times for the file + \var{fname} to the supplied values (see section~\ref{sec:time} + for the scsh representation of time). + If neither time argument is supplied, they are both taken to be + the current time. You must provide both times or neither. + If the procedure completes successfully, the file's time of last + status-change (\ex{ctime}) is set to the current time. +\end{desc} + +\defun {sync-file} {fd/port} \undefined +\defunx{sync-file-system}{} \undefined +\begin{desc} + Calling \ex{sync-file} + causes {\Unix} to update the disk data structures for a given file. + If \var{fd/port} is a port, any buffered data it may have is first + flushed. + Calling \ex{sync-file-system} synchronises the kernel's entire file + system with the disk. + + These procedures are not {\Posix}. + Interestingly enough, \ex{sync\=file\=system} doesn't actually + do what it is claimed to do. We just threw it in for humor value. + See the \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=sync&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{sync(2)}} man page for {\Unix} enlightenment. +\end{desc} + +\begin{defundesc} {truncate-file} {fname/fd/port len} \undefined + The specified file is truncated to \var{len} bytes in length. +\end{defundesc} + +\begin{defundesc}{file-info} {fname/fd/port [chase?]} {file-info-record} + The \ex{file-info} procedure + returns a record structure containing everything + there is to know about a file. If the \var{chase?} flag is true + (the default), then the procedure chases symlinks and reports on + the files to which they refer. If \var{chase?} is false, then + the procedure checks the actual file itself, even if it's a symlink. + The \var{chase?} flag is ignored if the file argument is a file descriptor + or port. 
+ + The value returned is a \emph{file-info record}, defined to have the + following structure: +\begin{code} +(define-record file-info + type ; \{block-special, char-special, directory, + ; fifo, regular, socket, symlink\} + device ; Device file resides on. + inode ; File's inode. + mode ; File's mode bits: permissions, setuid, setgid + nlinks ; Number of hard links to this file. + uid ; Owner of file. + gid ; File's group id. + size ; Size of file, in bytes. + atime ; Time of last access. + mtime ; Time of last mod. + ctime) ; Time of last status change.\end{code} +\index{file-info:type}\index{file-info:device}\index{file-info:inode}% +\index{file-info:mode}\index{file-info:nlinks}\index{file-info:uid}% +\index{file-info:gid}\index{file-info:size}\index{file-info:atime}% +\index{file-info:mtime}\index{file-info:ctime}% +% + The uid field of a file-info record is accessed with the procedure +\codex{(file-info:uid x)} + and similarly for the other fields. + The \ex{type} field is a symbol; all other fields are integers. + A file-info record is discriminated with the \ex{file-info?} predicate. + +The following procedures all return selected information about +a file; they are built on top of \ex{file-info}, and are +called with the same arguments that are passed to it. +\begin{inset} +\newcommand{\Ex}[1]{\ex{#1}\index{#1@{\tt{#1}}}} +\begin{tabular}{ll} +Procedure & returns \\\hline +\Ex{file-type} & type \\ +\Ex{file-inode} & inode \\ +\Ex{file-mode} & mode \\ +\Ex{file-nlinks} & nlinks \\ +\Ex{file-owner} & uid \\ +\Ex{file-group} & gid \\ +\Ex{file-size} & size \\ +\Ex{file-last-access} & atime \\ +\Ex{file-last-mod} & mtime \\ +\Ex{file-last-status-change} & ctime +\end{tabular} +\end{inset} +% +Example: +\begin{code} +;; All my files in /usr/tmp: +(filter (\l{f} (= (file-owner f) (user-uid))) + (directory-files "/usr/tmp")))\end{code} + +\remark{\ex{file-info} was named \ex{file-attributes} in releases of scsh + prior to release 0.4. 
We changed the name to \ex{file-info} for + consistency with the other information-retrieval procedures in + scsh: \ex{user-info}, \ex{group-info}, \ex{host-info}, + \ex{network-info }, \ex{service-info}, and \ex{protocol-info}. + + The \ex{file-attributes} binding is still supported in the current + release of scsh, but is deprecated, and may go away in a future + release.} +\end{defundesc} + +\defun {file-directory?}{fname/fd/port [chase?]}{\boolean} +\defunx {file-fifo?}{fname/fd/port [chase?]}{\boolean} +\defunx {file-regular?}{fname/fd/port [chase?]}{\boolean} +\defunx {file-socket?}{fname/fd/port [chase?]}{\boolean} +\defunx {file-special?}{fname/fd/port [chase?]}{\boolean} +\defunx {file-symlink?}{fname/fd/port}{\boolean} +\begin{desc} +These procedures are file-type predicates that test the +type of a given file. +They are applied to the same arguments to which \ex{file-info} is applied; +the sole exception is \ex{file-symlink?}, which does not take +the optional \var{chase?} second argument. +\begin{inset} +\newcommand{\Ex}[1]{\ex{#1}\index{\tt{#1}}} +\begin{tabular}{l@{\qquad}l} +\end{tabular} +\end{inset} +For example, +\codex{(file-directory? "/usr/dalbertz")\qquad\evalto\qquad\sharpt} +\end{desc} + +\defun {file-not-readable?} {fname} \boolean +\defunx{file-not-writable?} {fname} \boolean +\defunx{file-not-executable?} {fname} \boolean +\begin{desc} + Returns: + \begin{optiontable} + \textnormal{Value} & meaning \\ \hline + \sharpf & Access permitted \\ + 'search-denied & {\renewcommand{\arraystretch}{1}% + \begin{tabular}[t]{@{}l@{}} + Can't stat---a protected directory \\ + is blocking access.\end{tabular}} \\ + 'permission & Permission denied. \\ + 'no-directory & Some directory doesn't exist. \\ + 'nonexistent & File doesn't exist. + \end{optiontable} +% + A file is considered writeable if either (1) it exists and is writeable + or (2) it doesn't exist and the directory is writeable. 
+ Since symlink permission bits are ignored by the filesystem, these + calls do not take a \var{chase?} flag. + + Note that these procedures use the process' \emph{effective} user + and group ids for permission checking. {\Posix} defines an \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=access&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{access()}} + function that uses the process' real uid and gids. This is handy + for setuid programs that would like to find out if the actual user + has specific rights; scsh ought to provide this functionality (but doesn't + at the current time). + + There are several problems with these procedures. First, there's an + atomicity issue. In between checking permissions for a file and then trying + an operation on the file, another process could change the permissions, + so a return value from these functions guarantees nothing. Second, + the code special-cases permission checking when the uid is root---if + the file exists, root is assumed to have the requested permission. + However, not even root can write a file that is on a read-only file system, + such as a CD ROM. In this case, \ex{file-not-writable?} will lie, saying + that root has write access, when in fact opening the file for write + access will fail. + Finally, write permission confounds write access and create access. + These should be disentangled. + + Some of these problems could be avoided if {\Posix} had an effective-uid + variant of the \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=access&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{access()}} call we could use, but the atomicity + issue is still a problem. In the final analysis, the only way to + find out if you have the right to perform an operation on a file + is to try and open it for the desired operation. These permission-checking + functions are mostly intended for script-writing, where loose guarantees + are tolerated.
+\end{desc} + +\defun {file-readable?} {fname} \boolean +\defunx {file-writable?} {fname} \boolean +\defunx {file-executable?} {fname} \boolean +\begin{desc} + These procedures are the logical negation of the + preceding \ex{file-not-\ldots?} procedures. + Refer to them for a discussion of their problems and limitations. +\end{desc} + +\begin{defundesc}{file-not-exists?} {fname [chase?]} \object +Returns: + \begin{optiontable} + \sharpf & Exists. \\ + \sharpt & Doesn't exist. \\ + 'search-denied & \parbox[t]{0.5\linewidth}{\sloppy\raggedright + Some protected directory + is blocking the search.} + \end{optiontable} +\end{defundesc} + +\begin{defundesc}{file-exists?} {fname [chase?]} \boolean + This is simply + \ex{(not (file-not-exists? \var{fname} \var{[chase?]}))} +\end{defundesc} + +\defun {directory-files} {[dir dotfiles?]} {string list} +\begin{desc} + Return the list of files in directory \var{dir}, + which defaults to the current working directory. + The \var{dotfiles?} flag (default {\sharpf}) causes dot files to be + included in the list. + Regardless of the value of \var{dotfiles?}, the two files \ex{.} and + \ex{..} are \emph{never} returned. + + The directory \var{dir} is not prepended to each file name in the + result list. That is, + \codex{(directory-files "/etc")} + returns + \codex{("chown" "exports" "fstab" \ldots)} + \emph{not} + \codex{("/etc/chown" "/etc/exports" "/etc/fstab" \ldots)} + To use the files in the returned list, the programmer can either manually + prepend the directory: + \codex{(map (\l{f} (string-append dir "/" f)) files)} + or cd to the directory before using the file names: +% +\begin{code} +(with-cwd dir + (for-each delete-file (directory-files)))\end{code} +% + or use the \ex{glob} procedure, defined below. + + A directory list can be generated by \ex{(run/strings (ls))}, but this + is unreliable, as filenames with whitespace in their names will be + split into separate entries. Using \ex{directory-files} is reliable.
+\end{desc} + +\defun {glob} {\vari{pat}1 \ldots} {string list} +\begin{desc} + Glob each pattern against the filesystem and return the sorted list. + Duplicates are not removed. Patterns matching nothing are not included + literally.\footnote{Why bother to mention such a silly possibility? + Because that is what sh does.} + C shell \verb|{a,b,c}| patterns are expanded. Backslash quotes + characters, turning off the special meaning of + \verb|{|, \verb|}|, \cd{*}, \verb|[|, \verb|]|, and \verb|?|. + + Note that the rules of backslash for {\Scheme} strings and glob patterns + work together to require four backslashes in a row to specify a + single literal backslash. Fortunately, it is very rare that a backslash + occurs in a Unix file name. + + A glob subpattern will not match against dot files unless the first + character of the subpattern is a literal ``\ex{.}''. + Further, a dot subpattern will not match the files \ex{.} or \ex{..} + unless it is a constant pattern, as in \ex{(glob "../*/*.c")}. + So a directory's dot files can be reliably generated + with the simple glob pattern \ex{".*"}. + + Some examples: +\begin{inset} +\begin{verbatim} +(glob "*.c" "*.h") + ;; All the C and #include files in my directory. + +(glob "*.c" "*/*.c") + ;; All the C files in this directory and + ;; its immediate subdirectories. + +(glob "lexer/*.c" "parser/*.c") +(glob "{lexer,parser}/*.c") + ;; All the C files in the lexer and parser dirs. + +(glob "\\{lexer,parser\\}/*.c") + ;; All the C files in the strange + ;; directory "{lexer,parser}". + +(glob "*\\*") + ;; All the files ending in "*", e.g. + ;; ("foo*" "bar*") + +(glob "*lexer*") + ("mylexer.c" "lexer1.notes") + ;; All files containing the string "lexer". + +(glob "lexer") + ;; Either ("lexer") or ().\end{verbatim} +\end{inset} +% +If the first character of the pattern (after expanding braces) is a slash, +the search begins at root; otherwise, the search begins in the current +working directory. 
+ +If the last character of the pattern (after expanding braces) is a slash, +then the result matches must be directories, \eg, +\begin{code} +(glob "/usr/man/man?/") \evalto + ("/usr/man/man1/" "/usr/man/man2/" \ldots)\end{code} + +Globbing can sometimes be useful when we need a list of a directory's files +where each element in the list includes the pathname for the file. +Compare: +\begin{code} +(directory-files "../include") \evalto + ("cig.h" "decls.h" \ldots) + +(glob "../include/*") \evalto + ("../include/cig.h" "../include/decls.h" \ldots)\end{code} +\end{desc} + +\defun{glob-quote}{str}\str +\begin{desc} +Returns a constant glob pattern that exactly matches \var{str}. +All wild-card characters in \var{str} are quoted with a backslash. +\begin{code} +(glob-quote "Any *.c files?") + {\evalto}"Any \\*.c files\\?"\end{code} +\end{desc} + + +\begin{defundesc}{file-match}{root dot-files? \vari{pat}1 \vari{pat}2 {\ldots} \vari{pat}n}{string list} + \note{This procedure is deprecated, and will probably either go away or + be substantially altered in a future release. New code should not + call this procedure. The problem is that it relies upon + Posix-notation regular expressions; the rest of scsh has been + converted over to the new SRE notation.} + + \ex{file-match} provides a more powerful file-matching service, at the + expense of a less convenient notation. It is intermediate in + power between most shell matching machinery and recursive \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=find&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{find(1)}}. + + Each pattern is a regexp. The procedure searches from \var{root}, + matching the first-level files against pattern \vari{pat}1, the + second-level files against \vari{pat}2, and so forth. + The list of files matching the whole path pattern is returned, + in sorted order. + The matcher uses Spencer's regular expression package. + + The files \ex{.} and \ex{..} are never matched. 
Other dot files are only + matched if the \var{dot-files?} argument is \sharpt. + + A given \vari{pat}i pattern is matched as a regexp, so it is not forced + to match the entire file name. \Eg, pattern \ex{"t"} matches any + file containing a ``t'' in its name, while pattern \verb|"^t$"| matches + only a file whose entire name is ``\ex{t}''. + + The \vari{pat}i patterns can be more general than stated above. + \begin{itemize} + \item A single pattern can specify multiple levels of the path by + embedding \ex{/} characters within the pattern. For example, + the pattern \ex{"a/b/c"} gives a match equivalent to the + list of patterns \ex{"a" "b" "c"}. + + \item A \vari{pat}i pattern can be a procedure, + which is used as a match predicate. + It will be repeatedly called with a candidate file-name to test. + The file-name will be the entire path accumulated. + If the procedure raises an error condition, \ex{file-match} will + catch the error and treat it as a failed match. + This keeps \ex{file-match} from being blown out of the water + by applying tests to dangling symlinks and other similar situations. + + \end{itemize} + + Some examples: +%% UGH. Because we are using code instead of verbatim, we have to +%% double up on backslashes. +\begin{tightleftinset} +\begin{code} +(file-match "/usr/lib" #f "m$" "^tab") \evalto + ("/usr/lib/term/tab300" "/usr/lib/term/tab300-12" \ldots) +\cb +(file-match "." #f "^lex|parse|codegen$" "\\\\.c$") \evalto + ("lex/lex.c" "lex/lexinit.c" "lex/test.c" + "parse/actions.c" "parse/error.c" "parse/test.c" + "codegen/io.c" "codegen/walk.c") +\cb +(file-match "." #f "^lex|parse|codegen$/\\\\.c$") + ;; The same. +\cb +(file-match "." #f file-directory?) + ;; Return all subdirs of the current directory. +\cb +(file-match "/" #f file-directory?) \evalto + ("/bin" "/dev" "/etc" "/tmp" "/usr") + ;; All subdirs of root. +\cb +(file-match "." #f "\\\\.c") + ;; All the C files in my directory. +\cb +(define (ext extension) + (\l{fn} (string-suffix?
fn extension))) +\cb +(define (true . x) #t) +\cb +(file-match "." #f "./\\\\.c") +(file-match "." #f "" "\\\\.c") +(file-match "." #f true "\\\\.c") +(file-match "." #f true (ext "c")) + ;; All the C files of all my immediate subdirs. +\cb +(file-match "." #f "lexer") \evalto + ("mylexer.c" "lexer.notes") + ;; Compare with (glob "lexer"), above.\end{code} +\end{tightleftinset} + +Note that when \var{root} is the current working directory (\ex{"."}), +when it is converted to directory form, it becomes \ex{""}, and doesn't +show up in the result file-names. + +It is regrettable that the regexp wild card char, ``\ex{.}'', +is such an important file name literal, as dot-file prefix and extension +delimiter. +\end{defundesc} + +\begin{defundesc} {create-temp-file} {[prefix]} \str + \ex{Create-temp-file} creates a new temporary file and returns its name. + The optional argument specifies the filename prefix to use, and defaults + to \ex{"/usr/tmp/\var{pid}"}, where \var{pid} is the current process' id. + The procedure generates a sequence of filenames that have \var{prefix} as + a common prefix, looking for a filename that doesn't already exist in the + file system. When it finds one, it creates it, with permission \cd{#o600} + and returns the filename. (The file permission can be changed to a more + permissive permission with \ex{set-file-mode} after being created.) + + This file is guaranteed to be brand new. No other process will have it + open. This procedure does not simply return a filename that is very + likely to be unused. It returns a filename that definitely did not exist + at the moment \ex{create-temp-file} created it. + + It is not necessary for the process' pid to be a part of the filename + for the uniqueness guarantees to hold. The pid component of the default + prefix simply serves to scatter the name searches into sparse regions, so + that collisions are less likely to occur. This speeds things up, but does + not affect correctness.
+ + Security note: doing i/o to files created this way in \ex{/usr/tmp/} is + not necessarily secure. General users have write access to \ex{/usr/tmp/}, + so even if an attacker cannot access the new temp file, he can delete it + and replace it with one of his own. A subsequent open of this filename + will then give you his file, to which he has access rights. There are + several ways to defeat this attack, + \begin{enumerate} + \item Use \ex{temp-file-iterate}, below, to return the file descriptor + allocated when the file is opened. This will work if the file + only needs to be opened once. + \item If the file needs to be opened twice or more, create it in a + protected directory, \eg, \verb|$HOME|. + \item Ensure that \ex{/usr/tmp} has its sticky bit set. This + requires system administrator privileges. + \end{enumerate} + The actual default prefix used is controlled by the dynamic variable + \ex{*temp-file-template*}, and can be overridden for increased security. + See \ex{temp-file-iterate}. +\end{defundesc} + +\defunx {temp-file-iterate} {maker [template]} {\object\+} +\defvarx {*temp-file-template*} \str +\begin{desc} + This procedure can be used to perform certain atomic transactions on + the file system involving filenames. Some examples: + \begin{itemize} + \item Linking a file to a fresh backup temp name. + \item Creating and opening an unused, secure temp file. + \item Creating an unused temporary directory. + \end{itemize} + + This procedure uses \var{template} to generate a series of trial file + names. + \var{Template} is a \ex{format} control string, and defaults to + \codex{"/usr/tmp/\var{pid}.\~a"} + where \var{pid} is the current process' process id. + File names are generated by calling \ex{format} to instantiate the + template's \verb|~a| field with a varying string. + + \var{Maker} is a procedure which is serially called on each file name + generated. It must return at least one value; it may return multiple + values. 
If the first return value is {\sharpf} or if \var{maker} raises the + \ex{errno/exist} errno exception, \ex{temp-file-iterate} will loop, + generating a new file name and calling \var{maker} again. If the first + return value is true, the loop is terminated, returning whatever value(s) + \var{maker} returned. + + After a number of unsuccessful trials, \ex{temp-file-iterate} may give up + and signal an error. + + Thus, if we ignore its optional \var{prefix} argument, + \ex{create-temp-file} could be defined as: +\begin{code} +(define (create-temp-file) + (let ((flags (bitwise-ior open/create open/exclusive))) + (temp-file-iterate + (\l{f} + (close (open-output-file f flags #o600)) + f))))\end{code} + + To rename a file to a temporary name: +\begin{code} +(temp-file-iterate (\l{backup} + (create-hard-link old-file backup) + backup) + ".#temp.\~a") ; Keep link in cwd. +(delete-file old-file)\end{code} + Recall that scsh reports syscall failure by raising an error + exception, not by returning an error code. This is critical to + this example---the programmer can assume that if the + \ex{temp-file-iterate} call returns, it returns successfully. + So the following \ex{delete-file} call can be reliably invoked, + safe in the knowledge that the backup link has definitely been established. + + To create a unique temporary directory: +\begin{code} +(temp-file-iterate (\l{dir} (create-directory dir) dir) + "/usr/tmp/tempdir.\~a")\end{code} +% + Similar operations can be used to generate unique symlinks and fifos, + or to return values other than the new filename (\eg, an open file + descriptor or port). + + The default template is in fact taken from the value of the dynamic + variable \ex{*temp-file-template*}, which itself defaults to + \ex{"/usr/tmp/\var{pid}.\~a"}, where \var{pid} is the scsh process' + pid. + For increased security, a user may wish to change the template + to use a directory not allowing world write access + (\eg, his home directory).
+\end{desc} + +\defun{temp-file-channel}{} {[inp outp]} +\begin{desc} + This procedure can be used to provide an interprocess communications + channel with arbitrary-sized buffering. It returns two values, an input + port and an output port, both open on a new temp file. The temp file + itself is deleted from the {\Unix} file tree before \ex{temp-file-channel} + returns, so the file is essentially unnamed, and its disk storage is + reclaimed as soon as the two ports are closed. + + \ex{Temp-file-channel} is analogous to \ex{port-pipe} with two exceptions: + \begin{itemize} + \item If the writer process gets ahead of the reader process, it will + not hang waiting for some small pipe buffer to drain. It will simply + buffer the data on disk. This is good. + + \item If the reader process gets ahead of the writer process, it will + also not hang waiting for data from the writer process. It will + simply see and report an end of file. This is bad. + + In order to ensure that an end-of-file returned to the reader is + legitimate, the reader and writer must serialise their i/o. The + simplest way to do this is for the reader to delay doing input + until the writer has completely finished doing output, or exited. + \end{itemize} +\end{desc} + +\section{Processes} + +\defun {exec} {prog arg1 \ldots argn} \noreturn +\defunx {exec-path} {prog arg1 \ldots argn} \noreturn +\defunx {exec/env} {prog env arg1 \ldots argn} \noreturn +\defunx {exec-path/env} {prog env arg1 \ldots argn} \noreturn +\begin{desc} + +The \ex{\ldots/env} variants take an environment specified as a +string$\rightarrow$string alist. +An environment of {\sharpt} is taken to mean the current process' environment +(\ie, the value of the external char \ex{**environ}). + +[Rationale: {\sharpf} is a more convenient marker for the current environment + than {\sharpt}, but would cause an ambiguity on Schemes that identify + {\sharpf} and \ex{()}.] 
+ +The path-searching variants search the directories in the list +{\ttt exec\=path\=list} for the program. +A path-search is not performed if the program name contains +a slash character---it is used directly. So a program with a name like +\ex{"bin/prog"} always executes the program \ex{bin/prog} in the current working +directory. See \verb|$path| and \verb|exec-path-list|, below. + +Note that there is no analog to the C function \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=execv&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{execv()}}. +To get the effect just do +\codex{(apply exec prog arglist)} + +All of these procedures flush buffered output and close unrevealed ports +before executing the new binary. +To avoid flushing buffered output, see \verb|%exec| below. + +Note that the C \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=exec&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{exec()}} procedure allows the zeroth element of the +argument vector to be different from the file being executed, \eg +% +\begin{inset} +\begin{verbatim} +char *argv[] = {"-", "-f", 0}; +exec("/bin/csh", argv, envp);\end{verbatim} +\end{inset} +% +The scsh \ex{exec}, \ex{exec-path}, \ex{exec/env}, and \ex{exec-path/env} +procedures do not give this functionality---element 0 of the arg vector is +always identical to the \ex{prog} argument. In the rare case the user wishes +to differentiate these two items, he can use the low-level \verb|%exec| and +\verb|exec-path-search| procedures. +These procedures never return under any circumstances. +As with any other system call, if there is an error, they raise +an exception. +\end{desc} + + +\defun {\%exec} {prog arglist env} \undefined +\defunx{exec-path-search} {fname pathlist} {{\str} or \sharpf} +\begin{desc} +The \ex{\%exec} procedure is the low-level interface to the system call. +The \var{arglist} parameter is a list of arguments; +\var{env} is either a string$\rightarrow$string alist or {\sharpt}. 
+The new program's \cd{argv[0]} will be taken from \ex{(car \var{arglist})}, +\emph{not} from \var{prog}. +An environment of {\sharpt} means the current process' environment. +\verb|%exec| does not flush buffered output +(see \ex{flush-all-ports}). + +All exec procedures, including \verb|%exec|, coerce the \cd{prog} and \cd{arg} +values to strings using the usual conversion rules: numbers are converted to +decimal numerals, and symbols converted to their print-names. + +\ex{exec-path-search} searches the directories of \var{pathlist} looking for +an occurrence of file \ex{fname}. If no executable file is found, it returns +{\sharpf}. If \ex{fname} contains a slash character, the path search is +short-circuited, but the procedure still checks to ensure that the file exists +and is executable---if not, it still returns {\sharpf}. +Users of this procedure should be aware that it invites a potential race +condition: between checking the file with \ex{exec-path-search} and executing +it with \ex{\%exec}, the file's status might change. +The only atomic way to do the search is to loop over the candidate +file names, exec'ing each one and looping when the exec operation fails. + +See \cd{$path} and \ex{exec-path-list}, below. +\end{desc} + +\defun {exit} {[status]} \noreturn +\defunx {\%exit} {[status]} \noreturn +\begin{desc} +These procedures terminate the current process with a given exit status. +The default exit status is 0. +The low-level \verb|%exit| procedure immediately terminates the process +without flushing buffered output. +\end{desc} + +\begin{defundesc} {call-terminally} {thunk} \noreturn + \ex{call-terminally} calls its thunk. When the thunk returns, the process + exits. Although \ex{call-terminally} could be implemented as + \codex{(\l{thunk} (thunk) (exit 0))} + an implementation can take advantage of the fact that this procedure never + returns. 
For example, the runtime can start with a fresh stack and also + start with a fresh dynamic environment, where shadowed bindings are + discarded. This can allow the old stack and dynamic environment to be + collected (assuming this data is not reachable through some live + continuation). +\end{defundesc} + +\begin{defundesc}{suspend}{} \undefined +Suspend the current process with a SIGSTOP signal. +\end{defundesc} + +\defun {fork} {[thunk]} {proc or \sharpf} +\defunx {\%fork} {[thunk]} {proc or \sharpf} +\begin{desc} + \ex{fork} with no arguments is like C \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=fork&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{fork()}}. + In the parent process, it returns the child's \emph{process object} + (see below for more information on process objects). + In the child process, it returns {\sharpf}. + + \ex{fork} with an argument only returns in the parent process, returning + the child's process object. + The child process calls \var{thunk} and then exits. + + \ex{fork} flushes buffered output before forking, and sets the child + process to non-interactive. \verb|%fork| does not perform this bookkeeping; + it simply forks. +\end{desc} + +\defun {fork/pipe} {[thunk]} {proc or \sharpf} +\defunx{\%fork/pipe} {[thunk]} {proc or \sharpf} +\begin{desc} + Like \ex{fork} and \ex{\%fork}, but the parent and child communicate via a + pipe connecting the parent's stdin to the child's stdout. These procedures + side-effect the parent by changing his stdin. + + In effect, \ex{fork/pipe} splices a process into the data stream + immediately upstream of the current process. + This is the basic function for creating pipelines. + Long pipelines are built by performing a sequence of \ex{fork/pipe} calls. + For example, to create a background two-process pipe \ex{a | b}, we write: +% +\begin{code} +(fork (\l{} (fork/pipe a) (b)))\end{code} +% + which returns the process object for \ex{b}'s process. 
+ + To create a background three-process pipe \ex{a | b | c}, we write: +% +\begin{code} +(fork (\l{} (fork/pipe a) + (fork/pipe b) + (c)))\end{code} +% + which returns the process object for \ex{c}'s process. + + Note that these procedures affect file descriptors, not ports. + That is, the pipe is allocated connecting the child's file descriptor + 1 to the parent's file descriptor 0. + \emph{Any previous Scheme port built over these affected file descriptors + is shifted to a new, unused file descriptor with \ex{dup} before + allocating the I/O pipe.} + This means, for example, that the ports bound to \ex{(current-input-port)} + and \ex{(current-output-port)} in either process are not affected---they + still refer to the same I/O sources and sinks as before. + Remember the simple scsh rule: Scheme ports are bound to I/O sources + and sinks, \emph{not} particular file descriptors. + + If the child process wishes to rebind the current output port + to the pipe on file descriptor 1, it can do this using + \ex{with-current-output-port} or a related form. + Similarly, if the parent wishes to change the current input port + to the pipe on file descriptor 0, it can do this using + \ex{set-current-input-port!} or a related form. + Here is an example showing how to set up the I/O ports on both sides + of the pipe: +\begin{code} +(fork/pipe (\l{} + (with-current-output-port (fdes->outport 1) + (display "Hello, world.\\n")))) + +(set-current-input-port! (fdes->inport 0)) +(read-line) ; Read the string output by the child.\end{code} +None of this is necessary when the I/O is performed by an exec'd +program in the child or parent process, only when the pipe will +be referenced by Scheme code through one of the default current I/O +ports.
+\end{desc} + +\defun {fork/pipe+} {conns [thunk]} {proc or \sharpf} +\defunx {\%fork/pipe+} {conns [thunk]} {proc or \sharpf} +\begin{desc} + Like \ex{fork/pipe}, but the pipe connections between the child and parent + are specified by the connection list \var{conns}. + See the + \codex{(|+ \var{conns} \vari{pf}{\!1} \ldots{} \vari{pf}{\!n})} + process form for a description of connection lists. +\end{desc} + +\subsection{Process objects and process reaping} +\label{sec:proc-objects} +Scsh uses \emph{process objects} to represent Unix processes. +They are created by the \ex{fork} procedure, and have the following +exposed structure: +\begin{code} +(define-record proc + pid)\end{code} +\index{proc}\index{proc?}\index{proc:pid} +The only exposed slot in a proc record is the process' pid, +the integer id assigned by Unix to the process. +The only exported primitive procedures for manipulating process objects +are \ex{proc?} and \ex{proc:pid}. +Process objects are created with the \ex{fork} procedure. + +\begin{defundesc}{pid->proc}{pid [probe?]}{proc} +This procedure maps integer Unix process ids to scsh process objects. +It is intended for use in interactive and debugging code, +and is deprecated for use in production code. +If there is no process object in the system indexed by the given pid, +\ex{pid->proc}'s action is determined by the \var{probe?} parameter +(default \sharpf): +\begin{center} +\begin{tabular}{|l|l|} +\hline +\var{probe?} & Return \\ \hline\hline +\sharpf & \emph{signal error condition.} \\ \hline +\ex{'create} & Create new proc object. \\ \hline +True value & \sharpf \\ \hline +\end{tabular} +\end{center} +\end{defundesc} + +Sometime after a child process terminates, scsh will perform a \ex{wait} +system call on the child in background, caching the process' exit status +in the child's proc object. +This is called ``reaping'' the process. 
+Once the child has been waited, the Unix kernel can free the storage allocated +for the dead process' exit information, so process reaping prevents the process +table from becoming cluttered with un-waited dead child processes +(a.k.a. ``zombies''). +This can be especially severe if the scsh process never waits on child +processes at all; if the process table overflows with forgotten zombies, +the OS may be unable to fork further processes. + +Reaping a child process moves its exit status information from the kernel +into the scsh process, where it is cached inside the child's process object. +If the scsh user drops all pointers to the process object, it will simply be +garbage collected. +On the other hand, if the scsh program retains a pointer to the process object, +it can use scsh's \ex{wait} system call to synchronise with the child and +retrieve its exit status multiple times (this is not possible with simple +Unix integer pids in C---the programmer can only wait on a pid once). + +Thus, process objects allow the scsh programmer to do two things not allowed +in other programming environments: +\begin{itemize} +\item Subprocesses that are never waited on are still removed from the + process table, and their associated exit status data is eventually + automatically garbage collected. +\item Subprocesses can be waited on multiple times. +\end{itemize} + +However, note that once a child has exited, if the scsh programmer +drops all pointers to the child's proc object, the child's exit status +will be reaped and thrown away. +This is the intended behaviour, and it means that integer pids are not +enough to cause a process's exit status to be retained by the scsh runtime. +(This is because it is clearly impossible to GC data referenced by integers.) + +As a convenience for interactive use and debugging, all procedures that +take process objects will also accept integer Unix pids as arguments, +coercing them to the corresponding process objects.
+Since integer process ids are not reliable ways to keep a child's exit +status from being reaped and garbage collected, programmers are encouraged +to use process objects in production code. + +\begin{defundesc}{autoreap-policy}{[policy]}{old-policy} +The scsh programmer can choose different policies for automatic +process reaping. +The policy is determined by applying this procedure to one of the +values \ex{'early}, \ex{'late}, or {\sharpf} (\ie, no autoreap). +\begin{description} +\item [early] + The child is reaped from the {\Unix} kernel's process table + into scsh as soon as it dies. This is done by having a + signal handler for the \ex{SIGCHLD} signal reap the process. + \emph{ + If a scsh program sets its own handler for the \ex{SIGCHLD} + signal, the handler must reap dead children + by calling \ex{wait}, \ex{wait-any}, or \ex{reap-zombies}.} + We deprecate interrupt-driven code, and hope to provide + alternative tools in a future, multi-threaded release of scsh. + +\item [late] + The child is not autoreaped until it dies \emph{and} the scsh program + drops all pointers to its process object. That is, the process + table is cleaned out during garbage collection. + \oops{The \ex{late} policy is not supported under the current + release of scsh. It requires more sophisticated gc hooks than + we can get from the release of {\scm} that we use.} + +\item [\sharpf] + If autoreaping is turned off, process reaping is completely under + control of the programmer, who can force outstanding zombies to + be reaped by manually calling the \ex{reap-zombies} procedure + (see below). +\end{description} +Note that under any of the autoreap policies, a particular process $p$ can +be manually reaped into scsh by simply calling \ex{(wait $p$)}. +\emph{All} zombies can be manually reaped with \ex{reap-zombies}. + +The \ex{autoreap-policy} procedure returns the policy's previous value. 
+Calling \ex{autoreap-policy} with no arguments returns the current +policy without changing it. +\end{defundesc} + + +\begin{defundesc}{reap-zombies}{}{\boolean} +This procedure reaps all outstanding exited child processes into scsh. +It returns true if there are no more child processes to wait on, and +false if there are outstanding processes still running or suspended. +\end{defundesc} + +\subsubsection{Issues with process reaping} +Reaping a process does not reveal its process group at the time of +death; this information is lost when the process is reaped. +This means that a dead, reaped process is \emph{not eligible} as a return +value for a future \ex{wait-process-group} call. +This is not likely to be a problem for most code, as programs almost +never wait on exited processes by process group. +Process group waiting is usually applied to \emph{stopped} processes, +which are never reaped. +So it is unlikely that this will be a problem for most programs. + +%%% Actually, this is *not* a problem if you stick with proc objects, instead +%%% of using pids, so I commented it out. +% +%\paragraph{Pid aliasing} +%Second, once a process has been reaped, its 16-bit process id becomes +%available to Unix for re-use. +%So it is conceivable that a long time in the future, a \ex{fork} operation +%could produce a subprocess with the identical pid, causing \ex{wait} +%operations on the old, dead, reaped child, and the new child to become +%confused. +%This kind of pid aliasing is intrinsic to the nature of Unix's single-use pid +%deallocation policy, +%but is very, very unlikely to happen in practice, +%given the 16-bit size of the pid space. +%Scsh will detect occurrences of pid aliasing, +%in the unlikely event that one occurs. +%When \ex{fork} creates a proc object, it checks to see if the scsh heap +%contains an already existing proc object with the same pid as the newly forked +%process.
+%If so, an exception is raised; if not handled by the program, this will stop +%the program, either killing the process or invoking an interactive debugger. + +Automatic process reaping is a useful programming convenience. +However, if a program is careful to wait for all children, and does not wish +automatic reaping to happen, the programmer can simply turn process +autoreaping off. + +Programs that do not wish to use automatic process reaping should be +aware that some scsh routines create subprocesses but do not return +the child's pid: \ex{run/port*}, and its related procedures and +special forms (\ex{run/strings}, \emph{et al.}). +Automatic process reaping will clean the child processes created by +these procedures out of the kernel's process table. +If a program doesn't use process reaping, it should either avoid these +forms, or use \ex{wait-any} to wait for the children to exit. + +\subsection{Process waiting} + +\defun {wait} {proc/pid [flags]} {status} +\begin{desc} + This procedure waits until a child process exits, and returns its + exit code. The \var{proc/pid} argument is either a process object + (section \ref{sec:proc-objects}) or an integer process id. + \ex{Wait} returns the child's exit status code (or suspension code, + if the \ex{wait/stopped-children} option is used, see below). + Status values can be queried with the procedures in section + \ref{sec:wait-codes}. + + The \var{flags} argument is an integer whose bits specify + additional options. It is composed by or'ing together the following + flags: + \begin{center} + \begin{tabular}{|l|l|} + \hline + Flag & Meaning \\ \hline \hline + \ex{wait/poll} & Return {\sharpf} immediately if + child still active. \\ \hline + \ex{wait/stopped-children} & Wait for suspend as well as exit. \\ \hline + \end{tabular} + \end{center} +\end{desc} + +\begin{defundesc} {wait-any} {[flags]} {[proc status]} + The optional \var{flags} argument is as for \ex{wait}. 
+ This procedure waits for any child process to exit (or stop, if the + \ex{wait/stopped-children} flag is used). + It returns the process' process object and status code. + If there are no children left for which to wait, the two values + \ex{[{\sharpf} {\sharpt}]} are returned. + If the \ex{wait/poll} flag is used, and none of the children + are immediately eligible for waiting, + then the values \ex{[{\sharpf} {\sharpf}]} are returned: + \begin{center} + \begin{tabular}{|l|l|} + \hline + [{\sharpf} {\sharpf}] & Poll, none ready \\ \hline + [{\sharpf} {\sharpt}] & No children \\ \hline + \end{tabular} + \end{center} + + \ex{Wait-any} will not return a process that has been previously waited + by any other process-wait procedure (\ex{wait}, \ex{wait-any}, + and \ex{wait-process-group}). + It will return reaped processes that haven't yet been waited. + + The use of \ex{wait-any} is deprecated. +\end{defundesc} + +\begin{defundesc} {wait-process-group} {proc/pid [flags]} {[proc status]} + This procedure waits for any child whose process group is \var{proc/pid} + (either a process object or a pid). + The \var{flags} argument is as for \ex{wait}. + + Note that if the programmer wishes to wait for exited processes + by process group, the program should take care not to use process + reaping (section \ref{sec:proc-objects}), as this loses + process group information. However, most process-group waiting is + for stopped processes (to implement job control), so this is rarely + an issue, as stopped processes are not subject to reaping. +\end{defundesc} + + +\subsection{Analysing process status codes} +\label{sec:wait-codes} +When a child process dies (or is suspended), its parent can call the \ex{wait} +procedure to recover the exit (or suspension) status of the child. +The exit status is a small integer that encodes information +describing how the child terminated.
+The bit-level format of the exit status is not defined by {\Posix}; +you must use the following three functions to decode one. +However, if a child terminates normally with exit code 0, +{\Posix} does require \ex{wait} to return an exit status that is exactly +zero. +So \ex{(zero? \var{status})} is a correct way to test for non-error, +normal termination, \eg, +\begin{code} +(if (zero? (run (rcp scsh.tar.gz lambda.csd.hku.hk:))) + (delete-file "scsh.tar.gz"))\end{code} + +\defun {status:exit-val}{status}{{\integer} or \sharpf} +\defunx{status:stop-sig}{status}{{\integer} or \sharpf} +\defunx{status:term-sig}{status}{{\integer} or \sharpf} +\begin{desc} +For a given status value produced by calling \ex{wait}, +exactly one of these routines will return a true value. + +If the child process exited normally, \ex{status:exit-val} returns the +exit code for the child process (\ie, the value the child passed to \ex{exit} +or returned from \ex{main}). Otherwise, this function returns false. + +If the child process was suspended by a signal, \ex{status:stop-sig} +returns the signal that suspended the child. +Otherwise, this function returns false. + +If the child process terminated abnormally, \ex{status:term-sig} +returns the signal that terminated the child. +Otherwise, this function returns false. +\end{desc} + +%% Dereleased until we have a more portable implementation. + +%\defun{halts?}{proc}\boolean +%\begin{desc} +%This procedure, ported from early T implementations, +%returns true iff \ex{(\var{proc})} returns at all. +%\remark{The current implementation is a constant function returning {\sharpt}, +% which suffices for all {\Unix} implementations of which we are aware.} +%\end{desc} + +\section{Process state} + +\defun {umask}{} \fixnum +\defunx {set-umask} {perms} \undefined +\defunx {with-umask*} {perms thunk} {value(s) of thunk} +\dfnx {with-umask} {perms . 
body} {value(s) of body} {syntax} +\begin{desc} + The process' current umask is retrieved with \ex{umask}, and set with + \ex{(set-umask \var{perms})}. Calling \ex{with-umask*} changes the umask + to \var{perms} for the duration of the call to \var{thunk}. If the + program throws out of \var{thunk} by invoking a continuation, the umask is + reset to its external value. If the program throws back into \var{thunk} + by calling a stored continuation, the umask is restored to the \var{perms} + value. The special form \ex{with-umask} is equivalent in effect to + the procedure \ex{with-umask*}, but does not require the programmer + to explicitly wrap a \ex{(\l{} \ldots)} around the body of the code + to be executed. +\end{desc} + + + +\defun {chdir} {[fname]} \undefined +\defunx {cwd}{} \str +\defunx {with-cwd*} {fname thunk} {value(s) of thunk} +\dfnx {with-cwd} {fname . body} {value(s) of body} {syntax} +\begin{desc} +These forms manipulate the current working directory. +The cwd can be changed with \ex{chdir} +(although in most cases, \ex{with-cwd} is preferable). +If \ex{chdir} is called with no arguments, it changes the cwd to +the user's home directory. +The \ex{with-cwd*} procedure calls \ex{thunk} with the cwd temporarily +set to \var{fname}; when \var{thunk} returns, or is exited in a non-local +fashion (\eg, by raising an exception or by invoking a continuation), +the cwd is returned to its original value. +The special form \ex{with-cwd} is simply syntactic sugar for \ex{with-cwd*}. +\end{desc} + +\defun {pid}{} \fixnum +\defunx {parent-pid}{} \fixnum +\defunx {process-group} {} \fixnum +\defunx {set-process-group} {[proc/pid] pgrp} \undefined % [not implemented] +\begin{desc} +\ex{(pid)} and \ex{(parent-pid)} retrieve the process id for the +current process and its parent. +\ex{(process-group)} returns the process group of the current process.
+A process' process-group can be set with \ex{set-process-group}; +the value \var{proc/pid} specifies the affected process. It may be either +a process object or an integer process id, and defaults to the current +process. +\end{desc} + +\defun {set-priority} {which who priority} \undefined %; priority stuff unimplemented +\defunx {priority} {which who} \fixnum % ; not implemented +\defunx {nice} {[proc/pid delta]} \undefined %; not implemented +\begin{desc} +These procedures set and access the priority of processes. +I can't remember how \ex{set-priority} and \ex{priority} work, so no + documentation, and besides, they aren't implemented yet, anyway. +\end{desc} + +\defunx {user-login-name}{} \str +\defunx {user-uid}{} \fixnum +\defunx {user-effective-uid}{} \fixnum +\defunx {user-gid}{} \fixnum +\defunx {user-effective-gid}{} \fixnum +\defunx {user-supplementary-gids}{} {{\fixnum} list} +\defunx {set-uid} {uid} \undefined +\defunx {set-gid} {gid} \undefined +\begin{desc} +These routines get and set the effective and real user and group ids. +The \ex{set-uid} and \ex{set-gid} routines correspond to the {\Posix} +\ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=setuid&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{setuid()}} and \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=setgid&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{setgid()}} procedures. +\end{desc} + + +\defun {process-times} {} {[{\fixnum} {\fixnum} {\fixnum} \fixnum]} +\begin{desc} +Returns four values: +\begin{tightinset} +\begin{flushleft} + user CPU time in clock-ticks \\ + system CPU time in clock-ticks \\ + user CPU time of all descendant processes \\ + system CPU time of all descendant processes +\end{flushleft} +\end{tightinset} +Note that CPU time clock resolution is not the same as +the real-time clock resolution provided by \ex{time+ticks}. +That's Unix. 
+\end{desc} + +\defun{cpu-ticks/sec}{} {integer} +\begin{desc} +Returns the resolution of the CPU timer in clock ticks per second. +This can be used to convert the times reported by \ex{process-times} +to seconds. +\end{desc} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{User and group database access} +These procedures are used to access the user and group databases +(\eg, the ones traditionally stored in \ex{/etc/passwd} and \ex{/etc/group}.) + +\defun {user-info} {uid/name} {record} +\begin{desc} + Return a \ex{user-info} record giving the recorded information for a +particular user: +\index{user-info} +\index{user-info:name} +\index{user-info:uid} +\index{user-info:gid} +\index{user-info:home-dir} +\index{user-info:shell} +\begin{code} +(define-record user-info + name uid gid home-dir shell)\end{code} +The \var{uid/name} argument is either an integer uid or a string user-name. +\end{desc} + +\defun {->uid} {uid/name} \fixnum +\defunx {->username} {uid/name} \str +\begin{desc} +These two procedures coerce integer uid's and user names to a particular +form. +\end{desc} + +\defun {group-info} {gid/name} {record} +\begin{desc} + Return a \ex{group-info} record giving the recorded information for a +particular group: +\index{group-info} +\index{group-info:name} +\index{group-info:gid} +\index{group-info:members} +\begin{code} +(define-record group-info + name gid members)\end{code} +The \var{gid/name} argument is either an integer gid or a string group-name. +\end{desc} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Accessing command-line arguments} + +\defvar {command-line-arguments}{{\str} list} +\defunx {command-line}{} {{\str} list} +\begin{desc} +The list of strings \ex{command-line-arguments} contains the arguments +passed to the scsh process on the command line. +Calling \ex{(command-line)} returns the complete \ex{argv} +string list, including the program. 
So if we run a scsh program + \codex{/usr/shivers/bin/myls -CF src} +then \ex{command-line-arguments} is + \codex{("-CF" "src")} +and \ex{(command-line)} returns + \codex{("/usr/shivers/bin/myls" "-CF" "src")} +\ex{command-line} returns a fresh list each time it is called. +In this way, the programmer can get a fresh copy of the original +argument list if \ex{command-line-arguments} has been modified or is lexically +shadowed. +\end{desc} + +\defun {arg} {arglist n [default]} \str +\defunx {arg*} {arglist n [default-thunk]} \str +\defunx {argv} {n [default]} \str +\begin{desc} +These procedures are useful for accessing arguments from argument +lists. +\ex{arg} returns the $n^{\rm{th}}$ element of \var{arglist}. +The index is 1-based. +If \var{n} is too large, \var{default} is returned; +if no \var{default}, then an error is signaled. + +\ex{arg*} is similar, except that the \var{default-thunk} is called to generate +the default value. + +\ex{(argv \var{n})} is simply \ex{(arg (command-line) (+ \var{n} 1))}. +The +1 offset ensures that the two forms +% +\begin{code} +(arg command-line-arguments \var{n}) +(argv \var{n})\end{code} +% +return the same argument +(assuming the user has not rebound or modified \ex{command-line-arguments}). + +Example: +% +\begin{code} +(if (null? command-line-arguments) + (& (xterm -n ,host -title ,host + -name ,(string-append "xterm_" host))) + (let* ((progname (file-name-nondirectory (argv 1))) + (title (string-append host ":" progname))) + (& (xterm -n ,title + -title ,title + -e ,@command-line-arguments))))\end{code} +% +A subtlety: when the scsh interpreter is used to execute a scsh program, +the program name reported in the head of the \ex{(command-line)} list +is the scsh program, {\em not} the interpreter. 
+For example, if we have a shell script in file \ex{fullecho}: +\begin{code} +#!/usr/local/bin/scsh -s +!# +(for-each (\l{arg} (display arg) (display " ")) + (command-line))\end{code} +and we run the program +\codex{fullecho hello world} +the program will print out +\codex{fullecho hello world} +not +\codex{/usr/local/bin/scsh -s fullecho hello world} + +This argument line processing ensures that if a scsh program is subsequently +compiled into a standalone executable or byte-compiled to a heap-image +executable by the {\scm} virtual machine, its semantics will be +unchanged---the arglist processing is invariant. In effect, the + \codex{/usr/local/bin/scsh -s} +is not part of the program; +it's a specification for the machine to execute the program on, so it is +not properly part of the program's argument list. + +\end{desc} + +\section{System parameters} + +%\defun {maximum-fds}{}\fixnum +%\defunx {page-size}{} \fixnum +\defun {system-name}{} \str +\begin{desc} +Returns the name of the host on which we are executing. +This may be a local name, such as ``solar,'' as opposed to a +fully-qualified domain name such as ``solar.csie.ntu.edu.tw.'' +\end{desc} + +\section{Signal system} + +Signal numbers are bound to the variables \ex{signal/hup}, \ex{signal/int}, +\ldots. See tables~\ref{table:signals-and-interrupts} and +\ref{table:uncatchable-signals} for the full list. + +\defun {signal-process} {proc sig} \undefined +\defunx {signal-process-group} {prgrp sig} \undefined +\begin{desc} +These two procedures send signals to a specific process, and all the processes +in a specific process group, respectively. +The \var{proc} and \var{prgrp} arguments are either processes +or integer process ids. +\end{desc} + +\defun{itimer}{???} \undefined +\defunx{pause-until-interrupt}{} \undefined + +\defun{sleep}{secs} \undefined +\defunx{sleep-until}{time}\undefined +\begin{desc} +The \ex{sleep} procedure causes the process to sleep for \var{secs} seconds. 
+The \ex{sleep-until} procedure causes the process to sleep until \var{time} +(see section~\ref{sec:time}). +\end{desc} + +\subsubsection{Interrupt handlers} +Scsh interrupt handlers are complicated by the fact that scsh is implemented on +top of the {\scm} virtual machine, which has its own interrupt system, +independent of the Unix signal system. +This means that {\Unix} signals are delivered in two stages: first, +{\Unix} delivers the signal to the {\scm} virtual machine, then +the {\scm} virtual machine delivers the signal to the executing Scheme program +as a {\scm} interrupt. +This ensures that signal delivery happens between two vm instructions, +keeping individual instructions atomic. + +The {\scm} machine has its own set of interrupts, which includes the +asynchronous {\Unix} signals (table~\ref{table:signals-and-interrupts}). +\begin{table} +\begin{minipage}{\textwidth} +\begin{center} +\newcommand{\kwd}[1]{\index{\texttt{#1}}\texttt{#1}} +\begin{tabular}{lll}\hline +Interrupt & Unix signal & OS Variant \\ \hline\hline +\kwd{interrupt/alrm}\footnote{Also bound to {\scm} interrupt + \kwd{interrupt/alarm}.} + & \kwd{signal/alrm} & \Posix \\ +% +\kwd{interrupt/int}\footnote{Also bound to {\scm} interrupt + \kwd{interrupt/keyboard}.} + & \kwd{signal/int} & \Posix \\ +% +\kwd{interrupt/memory-shortage} & N/A & \\ +\kwd{interrupt/chld} & \kwd{signal/chld} & \Posix \\ +\kwd{interrupt/cont} & \kwd{signal/cont} & \Posix \\ +\kwd{interrupt/hup} & \kwd{signal/hup} & \Posix \\ +\kwd{interrupt/quit} & \kwd{signal/quit} & \Posix \\ +\kwd{interrupt/term} & \kwd{signal/term} & \Posix \\ +\kwd{interrupt/tstp} & \kwd{signal/tstp} & \Posix \\ +\kwd{interrupt/usr1} & \kwd{signal/usr1} & \Posix \\ +\kwd{interrupt/usr2} & \kwd{signal/usr2} & \Posix \\ +\\ +\kwd{interrupt/info} & \kwd{signal/info} & BSD only \\ +\kwd{interrupt/io} & \kwd{signal/io} & BSD + SVR4 \\ +\kwd{interrupt/poll} & \kwd{signal/poll} & SVR4 only \\ +\kwd{interrupt/prof} & \kwd{signal/prof} & BSD + SVR4 \\ 
+\kwd{interrupt/pwr} & \kwd{signal/pwr} & SVR4 only \\ +\kwd{interrupt/urg} & \kwd{signal/urg} & BSD + SVR4 \\ +\kwd{interrupt/vtalrm} & \kwd{signal/vtalrm} & BSD + SVR4 \\ +\kwd{interrupt/winch} & \kwd{signal/winch} & BSD + SVR4 \\ +\kwd{interrupt/xcpu} & \kwd{signal/xcpu} & BSD + SVR4 \\ +\kwd{interrupt/xfsz} & \kwd{signal/xfsz} & BSD + SVR4 \\ +\end{tabular} +\end{center} +\caption{{\scm} virtual-machine interrupts and related {\Unix} signals. + Only the {\Posix} signals are guaranteed to be defined; however, + your implementation and OS may define other signals and + interrupts not listed here.} +\end{minipage} +\label{table:signals-and-interrupts} +\end{table} +% +\begin{table} +\newcommand{\kwd}[1]{\index{\texttt{#1}}\texttt{#1}} +\begin{center} +\begin{tabular}{lll}\hline +Unix signal & Type & OS Variant \\ \hline\hline +\kwd{signal/stop} & Uncatchable & \Posix \\ +\kwd{signal/kill} & Uncatchable & \Posix \\ +\\ +\kwd{signal/abrt} & Synchronous & \Posix \\ +\kwd{signal/fpe} & Synchronous & \Posix \\ +\kwd{signal/ill} & Synchronous & \Posix \\ +\kwd{signal/pipe} & Synchronous & \Posix \\ +\kwd{signal/segv} & Synchronous & \Posix \\ +\kwd{signal/ttin} & Synchronous & \Posix \\ +\kwd{signal/ttou} & Synchronous & \Posix \\ +\\ +\kwd{signal/bus} & Synchronous & BSD + SVR4 \\ +\kwd{signal/emt} & Synchronous & BSD + SVR4 \\ +\kwd{signal/iot} & Synchronous & BSD + SVR4 \\ +\kwd{signal/sys} & Synchronous & BSD + SVR4 \\ +\kwd{signal/trap} & Synchronous & BSD + SVR4 \\ +\end{tabular} +\end{center} +\caption{Uncatchable and synchronous {\Unix} signals. While these signals + may be sent with \texttt{signal-process} or + \texttt{signal-process-group}, + there are no corresponding scsh interrupt handlers. 
+ Only the {\Posix} signals are guaranteed to be defined; however, + your implementation and OS may define other signals not listed + here.} +\label{table:uncatchable-signals} +\end{table} +Note that scsh does \emph{not} support signal handlers for ``synchronous'' +{\Unix} signals, such as \ex{signal/ill} or \ex{signal/pipe} +(see table~\ref{table:uncatchable-signals}). +Synchronous occurrences of these signals are better handled by raising +a Scheme exception. +We recommend you avoid using signal handlers unless you absolutely have +to; we intend to provide a better, higher-level interface to {\Unix} +signals after scsh has been ported to a multi-threaded platform. + +\begin{defundesc}{signal->interrupt}{\integer}{\integer} +The programmer maps from {\Unix} signals to {\scm} interrupts with the +\ex{signal->interrupt} procedure. +If the signal does not have a defined {\scm} interrupt, an error is signaled. +\end{defundesc} + + +\begin{defundesc}{interrupt-set}{\zeroormore{\integer}}{\integer} +This procedure builds interrupt sets from its interrupt arguments. +A set is represented as an integer using a two's-complement representation of +the bit set. +\end{defundesc} + + +\defun{enabled-interrupts}{}{interrupt-set} +\defunx{set-enabled-interrupts}{interrupt-set}{interrupt-set} +\begin{desc} +Get and set the value of the enabled-interrupt set. +Only interrupts in this set have their handlers called when delivered. +When a disabled interrupt is delivered to the {\scm} machine, it is +held pending until it becomes enabled, at which time its handler is invoked. + +Interrupt sets are represented as integer bit sets (constructed with +the \ex{interrupt-set} function). +The \ex{set-enabled-interrupts} procedure returns the previous value of +the enabled-interrupt set. +\end{desc} + +\dfn {with-enabled-interrupts} {interrupt-set . 
body} {value(s) of body} {syntax} +\defunx{with-enabled-interrupts*}{interrupt-set thunk} {value(s) of thunk} +\begin{desc} +Run code with a given set of interrupts enabled. +Note that ``enabling'' an interrupt means enabling delivery from +the {\scm} vm to the scsh program. +Using the {\scm} interrupt system is fairly lightweight, and does not involve +actually making a system call. +Note that enabling an interrupt means that the assigned interrupt handler +is allowed to run when the interrupt is delivered. +Interrupts not enabled are held pending when delivered. + +Interrupt sets are represented as integer bit sets (constructed with +the \ex{interrupt-set} function). +\end{desc} + + +\begin{defundesc}{set-interrupt-handler}{interrupt handler}{old-handler} +Assigns a handler for a given interrupt, +and returns the interrupt's old handler. +The \var{handler} argument is \ex{\#f} (ignore), \ex{\#t} (default), or a +procedure taking an integer argument; +the return value follows the same conventions. +Note that the \var{interrupt} argument is an interrupt value, +not a signal value. +An interrupt is delivered to the {\scm} machine by (1) blocking all interrupts, +and (2) applying the handler procedure to the set of interrupts +that were enabled prior to the interrupt delivery. +If the procedure returns normally (\ie, it doesn't throw to a continuation), +the set of enabled interrupts will be returned to its previous value. +(To restore the enabled-interrupt set before throwing out of an interrupt +handler, see \ex{set-enabled-interrupts}) + +\note{If you set a handler for the \ex{interrupt/chld} interrupt, + you may break scsh's autoreaping process machinery. See the + discussion of autoreaping in section~\ref{sec:proc-objects}.} +\end{defundesc} + +\begin{defundesc}{interrupt-handler}{interrupt}{handler} +Return the handler for a given interrupt. +Note that the argument is an interrupt value, not a signal value. 
+A handler is either \ex{\#f} (ignore), \ex{\#t} (default), or a +procedure taking an integer argument. +\end{defundesc} + +% %set-unix-signal-handler +% %unix-signal-handler + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Time} +\label{sec:time} + +Scsh's time system is fairly sophisticated, particularly with respect +to its careful treatment of time zones. +However, casual users shouldn't be intimidated; +all of the complexity is optional, +and defaulting all the optional arguments reduces the system +to a simple interface. + +\subsection{Terminology} +``UTC'' and ``UCT'' stand for ``universal coordinated time,'' which is the +official name for what is colloquially referred to as ``Greenwich Mean +Time.'' + +{\Posix} allows a single time zone to specify \emph{two} different offsets +from UTC: one standard one, and one for ``summer time.'' +Summer time is frequently some sort of daylight savings time. + +The scsh time package consistently uses this terminology: we never say +``gmt'' or ``dst;'' we always say ``utc'' and ``summer time.'' + +\subsection{Basic data types} +We have two types: \emph{time} and \emph{date}. + +\index{time} +A \emph{time} specifies an instant in the history of the universe. +It is location and time-zone independent.\footnote{Physics pedants please note: + The scsh authors live in a Newtonian universe. We disclaim responsibility + for calculations performed in non-ANSI standard light-cones.} +A time is a real value +giving the number of elapsed seconds since the Unix ``epoch'' +(Midnight, January 1, 1970 UTC). +Time values provide arbitrary time resolution, +limited only by the number system of the underlying Scheme system. + +\index{date} +A \emph{date} is a name for an instant in time that is specified +relative to some location/time-zone in the world, \eg: +\begin{tightinset} + Friday October 31, 1994 3:47:21 pm EST. 
+\end{tightinset} +Dates provide one-second resolution, +and are expressed with the following record type: +% +\begin{code}\index{date} +(define-record date ; A Posix tm struct + seconds ; Seconds after the minute [0-59] + minute ; Minutes after the hour [0-59] + hour ; Hours since midnight [0-23] + month-day ; Day of the month [1-31] + month ; Months since January [0-11] + year ; Years since 1900 + tz-name ; Time-zone name: #f or a string. + tz-secs ; Time-zone offset: #f or an integer. + summer? ; Summer (Daylight Savings) time in effect? + week-day ; Days since Sunday [0-6] + year-day) ; Days since Jan. 1 [0-365]\end{code} +% +If the \ex{tz-secs} field is given, it specifies the time-zone's offset from +UTC in seconds. If it is specified, the \ex{tz-name} and \ex{summer?} +fields are ignored when using the date structure to determine a specific +instant in time. + +If the \ex{tz-name} field is given, it is a time-zone string such as +\ex{"EST"} or \ex{"HKT"} understood by the OS. +Since {\Posix} time-zone strings can specify dual standard/summer time-zones +(e.g., "EST5EDT" specifies U.S. Eastern Standard/Eastern Daylight Time), +the value of the \ex{summer?} field is used to resolve the ambiguous +boundary cases. For example, on the morning of the Fall daylight savings +change-over, 1:00am--2:00am happens twice. Hence the date 1:30 am +on this morning can specify two different seconds; +the \ex{summer?} flag says which one. + +A date with $\ex{tz-name} = \ex{tz-secs} = \ex{\#f}$ is a date that +is specified in terms of the system's current time zone. + +There is redundancy in the \ex{date} data structure. +For example, the \ex{year-day} field is redundant +with the \ex{month-day} and \ex{month} fields. +Either of these implies the values of the \ex{week-day} field. +The \ex{summer?} and \ex{tz-name} fields are redundant with the \ex{tz-secs} +field in terms of specifying an instant in time. 
+This redundancy is provided because consumers of dates may want it broken out +in different ways. +The scsh procedures that produce date records fill them out completely. +However, when date records produced by the programmer are passed to +scsh procedures, the redundancy is resolved by ignoring some of the +secondary fields. +This is described for each procedure below. + +\defun{make-date} {s min h mday mon y [tzn tzs summ? wday yday]} {date} +\begin{desc} + When making a \ex{date} record, the last five elements of the record + are optional, and default to \ex{\#f}, \ex{\#f}, \ex{\#f}, 0, + and 0 respectively. + This is useful when creating a \ex{date} record to pass as an + argument to \ex{time}. +\end{desc} + +\subsection{Time zones} + Several time procedures take time zones as arguments. When optional, + the time zone defaults to the local time zone. Otherwise the time zone + can be one of: +\begin{inset} +\begin{tabular}{lp{0.7\linewidth}} +\ex{\#f} & Local time \\ +Integer & Seconds of offset from UTC. For example, + New York City is -18000 (-5 hours), San Francisco + is -28800 (-8 hours). \\ +String & A {\Posix} time zone string understood by the OS + (\ie, the sort of time zone assigned to the \ex{\$TZ} + environment variable). +\end{tabular} +\end{inset} + An integer time zone gives the number of seconds you must add to UTC + to get time in that zone. It is \emph{not} ``seconds west'' of UTC---that + flips the sign. + + To get UTC time, use a time zone of either 0 or \ex{"UCT0"}. + +\subsection{Procedures} +\defun {time+ticks} {} {[secs ticks]} +\defunx{ticks/sec} {} \real +\begin{desc} + The current time, with sub-second resolution. + Sub-second resolution is not provided by {\Posix}, + but is available on many systems. + The time is returned as elapsed seconds since the Unix epoch, plus + a number of sub-second ``ticks.'' + The length of a tick may vary from implementation to implementation; + it can be determined from \ex{(ticks/sec)}. 
+ + The system clock is not required to report time at the full resolution + given by \ex{(ticks/sec)}. For example, on BSD, time is reported at + $1\mu$s resolution, so \ex{(ticks/sec)} is 1,000,000. That doesn't mean + the system clock has micro-second resolution. + + If the OS does not support sub-second resolution, the \var{ticks} value + is always 0, and \ex{(ticks/sec)} returns 1. + + \begin{remarkenv} + I chose to represent system clock resolution as ticks/sec + instead of sec/tick to increase the odds that the value could + be represented as an exact integer, increasing efficiency and + making it easier for Scheme implementations that don't have + sophisticated numeric support to deal with the quantity. + + You can convert seconds and ticks to seconds with the expression + \codex{(+ \var{secs} (/ \var{ticks} (ticks/sec)))} + Given that, why not have the fine-grain time procedure just + return a non-integer real for time? Following Common Lisp, I chose to + allow the system clock to report sub-second time in its own units to + lower the overhead of determining the time. This would be important + for a system that wanted to precisely time the duration of some + event. Time stamps could be collected with little overhead, deferring + the overhead of precisely calculating with them until after collection. + + This is all a bit academic for the {\scm} implementation, where + we determine time with a heavyweight system call, but it's nice + to plan for the future. + \end{remarkenv} +\end{desc} + +\defun {date} {} {date-record} +\defunx{date} {[time tz]} {date-record} +\begin{desc} + Simple \ex{(date)} returns the current date, in the local time zone. + + With the optional arguments, \ex{date} converts the time to the date as + specified by the time zone \var{tz}. + \var{Time} defaults to the current time; \var{tz} defaults to local time, + and is as described in the time-zone section. 
+ + If the \var{tz} argument is an integer, the date's \ex{tz-name} + field is a {\Posix} time zone of the form + ``\ex{UTC+\emph{hh}:\emph{mm}:\emph{ss}}''; + the trailing \ex{:\emph{mm}:\emph{ss}} portion is deleted if it is zeroes. + + \oops{The Posix facility for converting dates to times, \ex{\urlh{http://www.FreeBSD.org/cgi/man.cgi?query=mktime&apropos=0&sektion=0&manpath=FreeBSD+4.3-RELEASE&format=html}{mktime()}}, + has a broken design: it indicates an error by returning -1, which + is also a legal return value (for date 23:59:59 UCT, 12/31/1969). + Scsh resolves the ambiguity in a paranoid fashion: it always + reports an error if the underlying Unix facility returns -1. + We feel your pain. + } +\end{desc} + +\defun {time} {} \integer +\defunx{time} {[date]} \integer +\begin{desc} + Simple \ex{(time)} returns the current time. + + With the optional date argument, \ex{time} converts a date to a time. + \var{Date} defaults to the current date. + + Note that the input \var{date} record is overconstrained. + \ex{time} ignores \var{date}'s \ex{week-day} and \ex{year-day} fields. + If the date's \ex{tz-secs} field is set, the \ex{tz-name} and + \ex{summer?} fields are ignored. + + If the \ex{tz-secs} field is \ex{\#f}, then the time-zone is taken + from the \ex{tz-name} field. A false \ex{tz-name} means the system's + current time zone. When calculating with time-zones, the date's + \ex{summer?} field is used to resolve ambiguities: +\begin{tightinset} +\begin{tabular}{ll} +\ex{\#f} & Resolve an ambiguous time in favor of non-summer time. \\ +true & Resolve an ambiguous time in favor of summer time. +\end{tabular} +\end{tightinset} + This is useful in boundary cases during the change-over. For example, + in the Fall, when US daylight savings time changes over at 2:00 am, + 1:30 am happens twice---it names two instants in time, an hour apart. + + Outside of these boundary cases, the \ex{summer?} flag is ignored. 
For + example, if the standard/summer change-overs happen in the Fall and the + Spring, then the value of \ex{summer?} is ignored for a January or + July date. A January date would be resolved with standard time, and a + July date with summer time, regardless of the \ex{summer?} value. + + The \ex{summer?} flag is also ignored if the time zone doesn't have + a summer time---for example, simple UTC. +\end{desc} + + +\defun {date->string} {date} \str +\defunx{format-date} {fmt date} \str +\begin{desc} + \ex{Date->string} formats the date as a 24-character string of the + form: + \begin{tightinset} + Sun Sep 16 01:03:52 1973 + \end{tightinset} + + \ex{Format-date} formats the date according to the format string + \var{fmt}. The format string is copied verbatim, except that tilde + characters indicate conversion specifiers that are replaced by fields from + the date record. Figure \ref{fig:dateconv} gives the full set of + conversion specifiers supported by \ex{format-date}. + +\begin{boxedfigure}{tbp} + \renewcommand{\arraystretch}{1.25} + \begin{tabular}{l>{\raggedrightparbox}p{0.9\linewidth}} + \verb|~~| & Converted to the \verb|~| character. 
\\ + \verb|~a| & abbreviated weekday name \\ + \verb|~A| & full weekday name \\ + \verb|~b| & abbreviated month name \\ + \verb|~B| & full month name \\ + \verb|~c| & time and date using the time and date representation + for the locale (\verb|~X ~x|) \\ + \verb|~d| & day of the month as a decimal number (01-31) \\ + \verb|~H| & hour based on a 24-hour clock + as a decimal number (00-23) \\ + \verb|~I| & hour based on a 12-hour clock + as a decimal number (01-12) \\ + \verb|~j| & day of the year as a decimal number (001-366) \\ + \verb|~m| & month as a decimal number (01-12) \\ + \verb|~M| & minute as a decimal number (00-59) \\ + \verb|~p| & AM/PM designation associated with a 12-hour clock \\ + \verb|~S| & second as a decimal number (00-61) \\ + \verb|~U| & week number of the year; + Sunday is first day of week (00-53) \\ + \verb|~w| & weekday as a decimal number (0-6), where Sunday is 0 \\ + \verb|~W| & week number of the year; + Monday is first day of week (00-53) \\ + \verb|~x| & date using the date representation for the locale \\ + \verb|~X| & time using the time representation for the locale \\ + \verb|~y| & year without century (00-99) \\ + \verb|~Y| & year with century (\eg 1990) \\ + \verb|~Z| & time zone name or abbreviation, or no characters + if no time zone is determinable + \end{tabular} + +\caption{\texttt{format-date} conversion specifiers} +\label{fig:dateconv} +\end{boxedfigure} +\end{desc} + +%\defun{utc-offset} {[time tz]} \integer +%\begin{desc} +% Returns the offset from UTC of time zone \var{tz} at instant \var{time}. +% \var{time} defaults to the current time; \var{tz} defaults to local time, +% and is as described in the time-zone section. +% +% The offset is the number of seconds you add to UTC time to get +% local time. +% +% Note: Be aware that other time interfaces (\eg, the BSD C interface) +% give offsets as seconds \emph{west} of UTC, which flips the sign. The scsh +% definition is chosen for arithmetic simplicity. 
It's easy to remember +% the definition of the offset: what you add to UTC to get local. +%\end{desc} +% +%\defun{time-zone} {[summer? tz]} \str +%\begin{desc} +% Returns the name of the time zone as a string. \var{Summer?} is +% used to choose between the summer name and the standard name +% (\eg, ``EST'' and ``EDT'')\@. \var{Summer?} is interpreted as follows: +% \begin{inset} +% \begin{tabular}{lp{0.7\linewidth}} +% Integer & A time value. +% The variant in use at that time is returned. \\ +% \ex{\#f} & The standard time name is returned. \\ +% \emph{Otherwise} & The summer time name is returned. +% \end{tabular} +% \end{inset} +% \ex{Summer?} defaults to the case that pertains at the time of the call. +% It is ignored if the time zone doesn't have a summer variant. +%\end{desc} + +\dfni {fill-in-date!}{date}{date}{procedure} + {fill-in-date"!@\texttt{fill-in-date"!}} +\begin{desc} +This procedure fills in missing, redundant slots in a date record. +In decreasing order of priority: +\begin{itemize} +\itum{year, month, month-day $\Rightarrow$ year-day} + If the \ex{year}, \ex{month}, and \ex{month-day} fields are all + defined (are all integers), the \ex{year-day} + field is set to the corresponding value. +\itum{year, year-day $\Rightarrow$ month, month-day} + If the \ex{month} and \ex{month-day} fields aren't set, but + the \ex{year} and \ex{year-day} fields are set, then + \ex{month} and \ex{month-day} are calculated. +\itum{year, month, month-day, year-day $\Rightarrow$ week-day} + If either of the above rules is able to determine what day it is, + the \ex{week-day} field is then set. +\itum{tz-secs $\Rightarrow$ tz-name} + If \ex{tz-secs} is defined, but \ex{tz-name} is not, it is assigned + a time-zone name of the form ``\ex{UTC+\emph{hh}:\emph{mm}:\emph{ss}}''; + the trailing \ex{:\emph{mm}:\emph{ss}} portion is deleted if it + is zeroes. +\itum{tz-name, date, summer? 
$\Rightarrow$ tz-secs, summer?} + If the date information is provided up to second resolution, + \ex{tz-name} is also provided, and \ex{tz-secs} is not set, + then \ex{tz-secs} and \ex{summer?} are set to their correct values. + Summer-time ambiguities are resolved using the original value of + \ex{summer?}. If the time zone doesn't have a + summer time variant, then \ex{summer?} is set to \ex{\#f}. +\itum{local time, date, summer? $\Rightarrow$ tz-name, tz-secs, summer?} + If the date information is provided up to second resolution, + but no time zone information is provided (both \ex{tz-name} and + \ex{tz-secs} aren't set), then we proceed as in the above case, + except the system's current time zone is used. +\end{itemize} +These rules allow one particular ambiguity to escape: +if both \ex{tz-name} and \ex{tz-secs} are set, they are not brought +into agreement. It isn't clear how to do this, nor is it clear which +one should take precedence. + +\oops{\ex{fill-in-date!} isn't implemented yet.} + +\end{desc} + + +\section{Environment variables} + +\defun {setenv} {var val} \undefined +\defunx {getenv} {var} \str +\begin{desc} +These functions get and set the process environment, stored in the +external C variable \ex{char **environ}. +An environment variable \var{var} is a string. +If an environment variable is set to a string \var{val}, +then the process' global environment structure is altered with an entry +of the form \ex{"\var{var}=\var{val}"}. +If \var{val} is {\sharpf}, then any entry for \var{var} is deleted. +\end{desc} + +\defun {env->alist}{} {{\str$\rightarrow$\str} alist} +\begin{desc} + The \ex{env->alist} procedure converts the entire environment into + an alist, \eg, +\begin{code} +(("TERM" . "vt100") + ("SHELL" . "/usr/local/bin/scsh") + ("PATH" . "/sbin:/usr/sbin:/bin:/usr/bin") + ("EDITOR" . 
"emacs") + \ldots)\end{code} +\end{desc} + +\defun {alist->env} {alist} \undefined +\begin{desc} + \var{Alist} must be an alist whose keys are all strings, and whose values + are all either strings or string lists. String lists are converted to + colon lists (see below). The alist is installed as the current {\Unix} + environment (\ie, converted to a null-terminated C vector of + \ex{"\var{var}=\var{val}"} strings which is assigned to the global + \ex{char **environ}). + +\begin{code} +;;; Note $PATH entry is converted +;;; to /sbin:/usr/sbin:/bin:/usr/bin. +(alist->env '(("TERM" . "vt100") + ("PATH" "/sbin" "/usr/sbin" "/bin" "/usr/bin") + ("SHELL" . "/usr/local/bin/scsh"))) +\end{code} + +Note that \ex{env->alist} and \ex{alist->env} are not exact +inverses---\ex{alist->env} will convert a list value into a single +colon-separated string, but \ex{env->alist} will not parse colon-separated +values into lists. (See the \ex{\$PATH} element in the examples given for +each procedure.) + +\end{desc} + +The following three functions help the programmer manipulate alist +tables in some generally useful ways. They are all defined using +\ex{equal?} for key comparison. + +\begin{defundesc} {alist-delete} {key alist} {alist} + Delete any entry labelled by value \var{key}. +\end{defundesc} + +\begin{defundesc} {alist-update} {key val alist} {alist} + Delete \var{key} from \var{alist}, then cons on a + \ex{(\var{key} . \var{val})} entry. +\end{defundesc} + +\defun{alist-compress} {alist} {alist} +\begin{desc} + Compresses \var{alist} by removing shadowed entries. + Example: +\begin{code} +;;; Shadowed (1 . c) entry removed. +(alist-compress '( (1 . a) (2 . b) (1 . c) (3 . d) )) + {\evalto} ((1 . a) (2 . b) (3 . d))\end{code} +\end{desc} + +\defun {with-env*} {env-alist-delta thunk} {value(s) of thunk} +\defunx {with-total-env*} {env-alist thunk} {value(s) of thunk} +\begin{desc} + These procedures call \var{thunk} in the context of an altered + environment. 
They return whatever values \var{thunk} returns. + Non-local returns restore the environment to its outer value; + throwing back into the thunk by invoking a stored continuation + restores the environment back to its inner value. + + The \var{env-alist-delta} argument specifies + a \emph{modification} to the current en\-vi\-ron\-ment---\var{thunk}'s + environment is the original environment overridden with the + bindings specified by the alist delta. + + The \var{env-alist} argument specifies a complete environment + that is installed for \var{thunk}. +\end{desc} + +\dfn {with-env} {env-alist-delta . body} {value(s) of body} {syntax} +\dfnx {with-total-env} {env-alist . body} {value(s) of body} {syntax} +\begin{desc} + These special forms provide syntactic sugar for \ex{with-env*} + and {\ttt with\=total\=env*}. + The env alists are not evaluated positions, but are implicitly backquoted. + In this way, they tend to resemble binding lists for \ex{let} and + \ex{let*} forms. +\end{desc} + +Example: These four pieces of code all run the mailer with special +\cd{$TERM} and \cd{$EDITOR} values. +{\small +\begin{code} +(with-env (("TERM" . "xterm") ("EDITOR" . ,my-editor)) + (run (mail shivers@lcs.mit.edu))) +\cb +(with-env* `(("TERM" . "xterm") ("EDITOR" . ,my-editor)) + (\l{} (run (mail shivers@csd.hku.hk)))) +\cb +(run (begin (setenv "TERM" "xterm") ; Env mutation happens + (setenv "EDITOR" my-editor) ; in the subshell. + (exec-epf (mail shivers@research.att.com)))) +\cb +;; In this example, we compute an alternate environment ENV2 +;; as an alist, and install it with an explicit call to the +;; EXEC-PATH/ENV procedure. +(let* ((env (env->alist)) ; Get the current environment, + (env1 (alist-update "TERM" "xterm" env)) ; and compute + (env2 (alist-update "EDITOR" my-editor env1))) ; the new env. 
+ (run (begin (exec-path/env "mail" env2 "shivers@cs.cmu.edu"))))\end{code}} + +\subsection{Path lists and colon lists} + +When environment variables such as \ex{\$PATH} need to encode a list of +strings (such as a list of directories to be searched), +the common Unix convention is to separate the list elements with +colon delimiters.\footnote{\ldots and hope the individual list elements +don't contain colons themselves.} +To convert between the colon-separated string encoding and the +list-of-strings representation, see the \ex{infix-splitter} function +(section~\ref{sec:field-splitter}) and the string library's +\ex{string-join} function. +For example, +\begin{code} +(define split (infix-splitter (rx ":"))) +(split "/sbin:/bin::/usr/bin") {\evalsto} + '("/sbin" "/bin" "" "/usr/bin") +(string-join ":" '("/sbin" "/bin" "" "/usr/bin")) {\evalsto} + "/sbin:/bin::/usr/bin"\end{code} +The following two functions are useful for manipulating these ordered lists, +once they have been parsed from their colon-separated form. + +%\remark{An earlier release of scsh provided the \ex{split-colon-list} +% and \ex{string-list->colon-list} functions. These have been +% removed from scsh, and are replaced by the more general +% parsers and unparsers of the field-reader module.} +% +%\defun {split-colon-list} {string} {{\str} list} +%\defunx {string-list->colon-list} {string-list} \str +%\begin{desc} +% Many {\Unix} lists, such as the \cd{$PATH} search path, +% are stored as ``colon lists.'' +% A colon list is a string containing elements delimited by colon characters. +% These functions provide conversions between colon lists and true +% {\Scheme} lists. +%% +%\begin{code} +%(split-colon-list "/foo:/bar::/usr/tmp") \evalto +% ("/foo" "/bar" "" "/usr/tmp")\end{code} +%% +% \ex{string-list->colon-list} is the inverse function. +% +% \ex{with-env*}, \ex{with-total-env*}, and \ex{alist->env} all coerce +% string lists to colon lists where appropriate. 
+%\end{desc} + +\defun {add-before} {elt before list} {list} +\defunx {add-after} {elt after list} {list} +\begin{desc} + These functions are for modifying search-path lists, where element order + is significant. + + \ex{add-before} adds \var{elt} to the list immediately + before the first occurrence of \var{before} in the list. + If \var{before} is not in the list, \var{elt} is added to the end + of the list. + + \ex{add-after} is similar: + \var{elt} is added after the last occurrence of \var{after}. + If \var{after} is not found, + \var{elt} is added to the beginning of the list. + + Neither function destructively alters the original path-list. + The result may share structure with the original list. + Both functions use \ex{equal?} for comparing elements. +\end{desc} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{\protect{\tt\$USER}, \protect{\tt\$HOME}, and \protect{\tt\$PATH}} + +Like sh and unlike csh, scsh has \emph{no} interactive dependencies on +environment variables. +It does, however, initialise certain internal values at startup time from the +initial process environment, in particular \cd{$HOME} and \cd{$PATH}. +Scsh never uses \cd{$USER} at all. +It computes \ex{(user-login-name)} from the system call \ex{(user-uid)}. + +\defvar {home-directory} \str +\defvarx {exec-path-list} {{\str} list fluid} +\begin{desc} + Scsh accesses \cd{$HOME} at start-up time, and stores the value in the + global variable \ex{home-directory}. It uses this value for \ex{\~} + lookups and for returning to home on \ex{(chdir)}. + + Scsh accesses \cd{$PATH} at start-up time, colon-splits the path list, and + stores the value in the fluid \ex{exec-path-list}. This list is + used for \ex{exec-path} and \ex{exec-path/env} searches. + + To access, rebind or side-effect fluid cells, you must open + the \ex{fluids} package. 
+\end{desc} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\input{tty} diff --git a/doc/scsh-manual/test.tex b/doc/scsh-manual/test.tex new file mode 100644 index 0000000..9dbd2d4 --- /dev/null +++ b/doc/scsh-manual/test.tex @@ -0,0 +1,33 @@ +%&latex -*- latex -*- + +\documentclass{report} +\usepackage{code,boxedminipage,draftfooters,palatino,ct,makeidx, + headings,mantitle,array,matter,mysize10} + +\parskip = 3pt plus 3pt +\sloppy + +\input{decls} +%%% End preamble + +\begin{document} + +\begin{tabular}{ll} +{\begin{codebox}[b] +define structure web server + open scsh + scheme + net-hax + + file web\end{codebox}} +& +{\begin{codebox}[b] +(define-structure web-server + (open scheme + scsh + net-hax + \vdots) + (file web))\end{codebox}}\\ +\end{tabular} + +\end{document} diff --git a/doc/scsh-manual/tex2page.sty b/doc/scsh-manual/tex2page.sty new file mode 100644 index 0000000..28bd5bb --- /dev/null +++ b/doc/scsh-manual/tex2page.sty @@ -0,0 +1,9 @@ +% tex2page.sty +% Dorai Sitaram + +% Loading this file in a LaTeX document +% gives it all the macros of tex2page.tex, +% but via a more LaTeX-convenient filename. + +\input{tex2page} + diff --git a/doc/scsh-manual/todo.tex b/doc/scsh-manual/todo.tex new file mode 100644 index 0000000..c8535ab --- /dev/null +++ b/doc/scsh-manual/todo.tex @@ -0,0 +1,40 @@ +%&latex -*- latex -*- + +\chapter{Todo} + +There are always many, many improvements and extensions that could be +made to scsh. +We invite interested hackers to do any of them, and send us the code; +we'll put you on the team. +Visit the Scheme Underground Web page for more information on good hacks at +\begin{inset} \begin{flushleft} +\ex{\urlh{http://www.ai.mit.edu/projects/su/}{http://www.ai.mit.edu/projects/su/}} +\end{flushleft} +\end{inset} +Scsh is a tool that lets you write fun programs that do real things in +an elegant language; go wild. + +\begin{itemize} +\item Threads. +\item An X gui interface. (Needs threads.) 
+\item A better C function/data-structure interface. This is not easy. +\item More network protocols. Telnet and ftp would be the most important. +\item Port Edwin, an emacs text editor written in MIT Scheme, to scsh. + Combine it with scsh's OS interfaces to make a visual shell. +\item Manual hacking. +\begin{itemize} +\item The {\LaTeX} hackery needs yet another serious pass. Most importantly, + long procedure ``declarations'' need to be broken across two lines. + +\item Soup up the markup processor, and redo manual in markup. Generate + {\LaTeX}, HTML, and info versions. Alternatively, persuade some kind + soul to hand-port manual to HTML or info. +\end{itemize} + +\item Job control, after \ex{jcontrol.scm} + +\item Better static heap linker. + +\item Gnu readline lib. + +\end{itemize} diff --git a/doc/scsh-manual/tty.tex b/doc/scsh-manual/tty.tex new file mode 100644 index 0000000..7c068bb --- /dev/null +++ b/doc/scsh-manual/tty.tex @@ -0,0 +1,751 @@ +%&latex -*- latex -*- +% Fix OXTABS footnote bug +% Figures should be dumped out earlier? Pack two to a page? + +\section{Terminal device control} +\label{sect:tty} + +\newcommand{\fr}[1]{\makebox[0pt][r]{#1}} + +% \ex{#1} and also generates an index entry. +\newcommand{\exi}[1]{\index{#1@\texttt{#1}}\ex{#1}} +\newcommand{\indextt}[1]{\index{#1@\texttt{#1}}} + +Scsh provides a complete set of routines for manipulating terminal +devices---putting them in ``raw'' mode, changing and querying their +special characters, modifying their i/o speeds, and so forth. +The scsh interface is designed both for generality and portability +across different Unix platforms, so you don't have to rewrite your +program each time you move to a new system. 
+We've also made an effort to use reasonable, Scheme-like names for +the multitudinous named constants involved, so when you are reading +code, you'll have less likelihood of getting lost in a bewildering +maze of obfuscatory constants named \ex{ICRNL}, \ex{INPCK}, \ex{IUCLC}, +and \ex{ONOCR}. + +This section can only lay out the basic functionality of the terminal +device interface. +For further details, see the termios(3) man page on your system, +or consult one of the standard {\Unix} texts. + +\subsection{Portability across OS variants} +Terminal-control software is inescapably complex, ugly, and low-level. +Unix variants each provide their own way of controlling terminal +devices, making it difficult to provide interfaces that are +portable across different Unix systems. +Scsh's terminal support is based primarily upon the {\Posix} termios +interface. +Programs that can be written using only the {\Posix} interface are likely +to be widely portable. + +The bulk of the documentation that follows consists of several pages worth +of tables defining different named constants that enable and disable different +features of the terminal driver. +Some of these flags are {\Posix}; others are taken from the two common +branches of Unix development, SVR4 and 4.3+ Berkeley. +Scsh guarantees that the non-{\Posix} constants will be bound identifiers. +\begin{itemize} +\item If your OS supports a particular non-{\Posix} flag, + its named constant will be bound to the flag's value. +\item If your OS doesn't support the flag, its named constant + will be present, but bound to \sharpf. +\end{itemize} +This means that if you want to use SVR4 or Berkeley features in a program, +your program can portably test the values of the flags before using +them---the flags can reliably be referenced without producing OS-dependent +``unbound variable'' errors. 
+ +Finally, note that although {\Posix}, SVR4, and Berkeley cover the lion's +share of the terminal-driver functionality, +each operating system inevitably has non-standard extensions. +While a particular scsh implementation may provide these extensions, +they are not portable, and so are not documented here. + +\subsection{Miscellaneous procedures} +\defun{tty?}{fd/port}{\boolean} +\begin{desc} +Return true if the argument is a tty. +\end{desc} + +\defun{tty-file-name}{fd/port}{\str} +\begin{desc} +The argument \var{fd/port} must be a file descriptor or port open on a tty. +Return the file-name of the tty. +\end{desc} + +\subsection{The tty-info record type} + +The primary data-structure that describes a terminal's mode is +a \ex{tty-info} record, defined as follows: +\index{tty-info record type} +\indextt{tty-info:control-chars} +\indextt{tty-info:input-flags} +\indextt{tty-info:output-flags} +\indextt{tty-info:control-flags} +\indextt{tty-info:local-flags} +\indextt{tty-info:input-speed} +\indextt{tty-info:output-speed} +\indextt{tty-info:min} +\indextt{tty-info:time} +\indextt{tty-info?} +\begin{code} +(define-record tty-info + control-chars ; String: Magic input chars + input-flags ; Int: Input processing + output-flags ; Int: Output processing + control-flags ; Int: Serial-line control + local-flags ; Int: Line-editing UI + input-speed ; Int: Code for input speed + output-speed ; Int: Code for output speed + min ; Int: Raw-mode input policy + time) ; Int: Raw-mode input policy\end{code} + +\subsubsection{The control-characters string} +The \ex{control-chars} field is a character string; +its characters may be indexed by integer values taken from +table~\ref{table:ttychars}. + +As discussed above, +only the {\Posix} entries in table~\ref{table:ttychars} are guaranteed +to be legal, integer indices. +A program can reliably test the OS to see if the non-{\Posix} +characters are supported by checking the index constants. 
+If the control-character function is supported by the terminal driver, +then the corresponding index will be bound to an integer; +if it is not supported, the index will be bound to \sharpf. + +To disable a given control-character function, set its corresponding +entry in the \ex{tty-info:control-chars} string to the +special character \exi{disable-tty-char} +(and then use the \ex{(set-tty-info \var{fd/port} \var{info})} procedure +to update the terminal's state). + +\subsubsection{The flag fields} +The \ex{tty-info} record's \ex{input-flags}, \ex{output-flags}, +\ex{control-flags}, and \ex{local-flags} fields are all bit sets +represented as two's-complement integers. +Their values are composed by or'ing together values taken from +the named constants listed in tables~\ref{table:ttyin} +through \ref{table:ttylocal}. + +As discussed above, +only the {\Posix} entries listed in these tables are guaranteed +to be legal, integer flag values. +A program can reliably test the OS to see if the non-{\Posix} +flags are supported by checking the named constants. +If the feature is supported by the terminal driver, +then the corresponding flag will be bound to an integer; +if it is not supported, the flag will be bound to \sharpf. + +%%%%% I managed to squeeze this into the DEFINE-RECORD's comments. +% Here is a small table classifying the four flag fields by +% the kind of features they determine: +% \begin{center} +% \begin{tabular}{|ll|}\hline +% Field & Affects \\ \hline \hline +% \ex{input-flags} & Processing of input chars \\ +% \ex{output-flags} & Processing of output chars \\ +% \ex{control-flags} & Controlling of terminal's serial line \\ +% \ex{local-flags} & Details of the line-editting user interface \\ +% \hline +% \end{tabular} +% \end{center} + +%%% +%%% The figures used to go here. +%%% + +\subsubsection{The speed fields} +The \ex{input-speed} and \ex{output-speed} fields determine the +I/O rate of the terminal's line. 
+The value of these fields is an integer giving the speed +in bits-per-second. +The following speeds are supported by {\Posix}: +\begin{center} +\begin{tabular}{rrrr} +0 & 134 & 600 & 4800 \\ +50 & 150 & 1200 & 9600 \\ +75 & 200 & 1800 & 19200 \\ +110 & 300 & 2400 & 38400 \\ +\end{tabular} +\end{center} +Your OS may accept others; it may also allow the special symbols +\ex{'exta} and \ex{'extb}. + +\subsubsection{The min and time fields} +The integer \ex{min} and \ex{time} fields determine input blocking +behaviour during non-canonical (raw) input; otherwise, they are ignored. +See the termios(3) man page for further details. + +Be warned that {\Posix} allows the base system call's representation +of the \ex{tty-info} record to share storage for the \ex{min} field +and the \ex{ttychar/eof} element of the control-characters string, +and for the \ex{time} field and the \ex{ttychar/eol} element +of the control-characters string. +Many implementations in fact do this. + +To stay out of trouble, set the \ex{min} and \ex{time} fields only +if you are putting the terminal into raw mode; +set the eof and eol control-characters only if you are putting +the terminal into canonical mode. +It's ugly, but it's {\Unix}. + +\subsection{Using tty-info records} + +\defun{make-tty-info}{if of cf lf ispeed ospeed min time} + {tty-info-record} +\defunx{copy-tty-info}{tty-info-record}{tty-info-record} +\begin{desc} +These procedures make it possible to create new \ex{tty-info} records. +The typical method for creating a new record is to copy one retrieved +by a call to the \ex{tty-info} procedure, then modify the copy as desired. +Note that the \ex{make-tty-info} procedure does not take a parameter +to define the new record's control characters.\footnote{ + Why? Because the length of the string varies from Unix to Unix. 
+ For example, the word-erase control character (typically control-w) + is provided by most Unixes, but not part of the {\Posix} spec.} +Instead, it simply returns a \ex{tty-info} record whose control-character +string has all elements initialised to {\Ascii} nul. +You may then install the special characters by assigning to the string. +Similarly, the control-character string in the record produced by +\ex{copy-tty-info} does not share structure with the string in the record +being copied, so you may mutate it freely. +\end{desc} + + +\defun{tty-info}{[fd/port/fname]}{tty-info-record} +\begin{desc} +The \var{fd/port/fname} parameter is an integer file descriptor or +Scheme I/O port opened on a terminal device, +or a file-name for a terminal device; it defaults to the current input port. +This procedure returns a \ex{tty-info} record describing the terminal's +current mode. +\end{desc} + +\defun {set-tty-info/now} {fd/port/fname info}{no-value} +\defunx{set-tty-info/drain}{fd/port/fname info}{no-value} +\defunx{set-tty-info/flush}{fd/port/fname info}{no-value} +\begin{desc} +The \var{fd/port/fname} parameter is an integer file descriptor or +Scheme I/O port opened on a terminal device, +or a file-name for a terminal device. +The procedure chosen determines when and how the terminal's mode is altered: +\begin{center} +\begin{tabular}{|ll|} \hline +Procedure & Meaning \\ \hline \hline +\ex{set-tty-info/now} & Make change immediately. \\ +\ex{set-tty-info/drain} & Drain output, then change. \\ +\ex{set-tty-info/flush} & Drain output, flush input, then change. \\ \hline +\end{tabular} +\end{center} +\oops{If I had defined these with the parameters in the reverse order, + I could have made \var{fd/port/fname} optional. 
Too late now.} +\end{desc} + +\subsection{Other terminal-device procedures} +\defun{send-tty-break}{[fd/port/fname duration]}{no-value} +\begin{desc} +The \var{fd/port/fname} parameter is an integer file descriptor or +Scheme I/O port opened on a terminal device, +or a file-name for a terminal device; it defaults to the current output port. +Send a break signal to the designated terminal. +A break signal is a sequence of continuous zeros on the terminal's transmission +line. + +The \var{duration} argument determines the length of the break signal. +A zero value (the default) causes a break of between +0.25 and 0.5 seconds to be sent; +other values determine a period in a manner that will depend upon local +community standards. +\end{desc} + +\defun{drain-tty}{[fd/port/fname]}{no-value} +\begin{desc} +The \var{fd/port/fname} parameter is an integer file descriptor or +Scheme I/O port opened on a terminal device, +or a file-name for a terminal device; it defaults to the current output port. + +This procedure waits until all the output written to the +terminal device has been transmitted to the device. +If \var{fd/port/fname} is an output port with buffered I/O +enabled, then the port's buffered characters are flushed before +waiting for the device to drain. +\end{desc} + +\defun {flush-tty/input} {[fd/port/fname]}{no-value} +\defunx{flush-tty/output}{[fd/port/fname]}{no-value} +\defunx{flush-tty/both} {[fd/port/fname]}{no-value} +\begin{desc} +The \var{fd/port/fname} parameter is an integer file descriptor or +Scheme I/O port opened on a terminal device, +or a file-name for a terminal device; it defaults to the current input +port (\ex{flush-tty/input} and \ex{flush-tty/both}), +or output port (\ex{flush-tty/output}). + +These procedures discard the unread input chars or unwritten +output chars in the tty's kernel buffers. 
+\end{desc} + +\defun {start-tty-output}{[fd/port/fname]} {no-value} +\defunx{stop-tty-output} {[fd/port/fname]} {no-value} +\defunx{start-tty-input} {[fd/port/fname]} {no-value} +\defunx{stop-tty-input} {[fd/port/fname]} {no-value} +\begin{desc} +These procedures can be used to control a terminal's input and output flow. +The \var{fd/port/fname} parameter is an integer file descriptor or +Scheme I/O port opened on a terminal device, +or a file-name for a terminal device; it defaults to the current input +or output port. + +The \ex{stop-tty-output} and \ex{start-tty-output} procedures suspend +and resume output from a terminal device. +The \ex{stop-tty-input} and \ex{start-tty-input} procedures transmit +the special STOP and START characters to the terminal with the intention +of stopping and starting terminal input flow. +\end{desc} + +% --- Obsolete --- +% \defun {encode-baud-rate}{speed}{code} +% \defunx{decode-baud-rate}{code}{speed} +% \begin{desc} +% These procedures can be used to map between the special codes +% that are legal values for the \ex{tty-info:input-speed} and +% \ex{tty-info:output-speed} fields, and actual integer bits-per-second speeds. +% The codes are the values bound to the +% \ex{baud/4800}, \ex{baud/9600}, and other named constants defined above. +% For example: +% \begin{code} +% (decode-baud-rate baud/9600) {\evalto} 9600 +% +% ;;; These two expressions are identical: +% (set-tty-info:input-speed ti baud/14400) +% (set-tty-info:input-speed ti (encode-baud-rate 14400))\end{code} +% \end{desc} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Control terminals, sessions, and terminal process groups} + +\defun{open-control-tty}{tty-name [flags]}{port} +\begin{desc} +This procedure opens terminal device \var{tty-name} as the process' +control terminal +(see the \ex{termios} man page for more information on control terminals). +The \var{tty-name} argument is a file-name such as \ex{/dev/ttya}. 
+The \var{flags} argument is a value suitable as the second argument +to the \ex{open-file} call; it defaults to \ex{open/read+write}, causing +the terminal to be opened for both input and output. + +The port returned is an input port if the \var{flags} permit it, +otherwise an output port. +\R4RS/\scm/scsh do not have input/output ports, +so it's one or the other. +However, you can get both read and write ports open on a terminal +by opening it read/write, taking the result input port, +and duping it to an output port with \ex{dup->outport}. + +This procedure guarantees to make the opened terminal the +process' control terminal only if the process does not have +an assigned control terminal at the time of the call. +If the scsh process already has a control terminal, the results are undefined. + +To arrange for the process to have no control terminal prior to calling +this procedure, use the \ex{become-session-leader} procedure. + +%\oops{The control terminal code was added just before release time +% for scsh release 0.4. Control terminals are one of the less-standardised +% elements of Unix. We can't guarantee that the terminal is definitely +% attached as a control terminal; we were only able to test this out +% on HP-UX. If you intend to use this feature on your OS, you should +% test it out first. If your OS requires the use of the \ex{TIOCSCTTY} +% \ex{ioctl}, uncomment the appropriate few lines of code in the +% file \ex{tty1.c} and send us email.} +\end{desc} + +\defun{become-session-leader}{}{\integer} +\begin{desc} +This is the C \ex{setsid()} call. +{\Posix} job-control has a three-level hierarchy: +session/process-group/process. +Every session has an associated control terminal. +This procedure places the current process into a brand new session, +and disassociates the process from any previous control terminal. +You may subsequently use \ex{open-control-tty} to open a new control +terminal. 
+ +It is an error to call this procedure if the current process is already +a process-group leader. +One way to guarantee this is not the case is only to call this procedure +after forking. +\end{desc} + + +\defun {tty-process-group}{fd/port/fname}{\integer} +\defunx{set-tty-process-group}{fd/port/fname pgrp}{\undefined} +\begin{desc} +This pair of procedures gets and sets the process group of a given +terminal. +\end{desc} + +\defun{control-tty-file-name}{}{\str} +\begin{desc} +Return the file-name of the process' control tty. +On every version of Unix of which we are aware, this is just the string +\ex{"/dev/tty"}. +However, this procedure uses the official Posix interface, so it is more +portable than simply using a constant string. +\end{desc} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Pseudo-terminals} +Scsh implements an interface to Berkeley-style pseudo-terminals. + +\defun{fork-pty-session}{thunk}{[process pty-in pty-out tty-name]} +\begin{desc} +This procedure gives a convenient high-level interface to pseudo-terminals. +It first allocates a pty/tty pair of devices, and then forks a child +to execute procedure \var{thunk}. +In the child process +\begin{itemize} +\item Stdio and the current I/O ports are bound to the terminal device. +\item The child is placed in its own, new session + (see \ex{become\=session\=leader}). +\item The terminal device becomes the new session's controlling terminal + (see \ex{open-control-tty}). +\item The \ex{(error-output-port)} is unbuffered. +\end{itemize} + +The \ex{fork-pty-session} procedure returns four values: +the child's process object, two ports open on the controlling pty device, +and the name of the child's corresponding terminal device. +\end{desc} + +\defun{open-pty}{}{pty-inport tty-name} +\begin{desc} +This procedure finds a free pty/tty pair, and opens the pty device +with read/write access. 
+It returns a port on the pty, +and the name of the corresponding terminal device. + +The port returned is an input port---Scheme doesn't allow input/output +ports. +However, you can easily use \ex{(dup->outport \var{pty-inport})} +to produce a matching output port. +You may wish to turn off I/O buffering for this output port. +\end{desc} + + +\defun {pty-name->tty-name}{pty-name}{tty-name} +\defunx{tty-name->pty-name}{tty-name}{pty-name} +\begin{desc} +These two procedures map between corresponding terminal and pty controller +names. +For example, +\begin{code} +(pty-name->tty-name "/dev/ptyq3") {\evalto} "/dev/ttyq3" +(tty-name->pty-name "/dev/ttyrc") {\evalto} "/dev/ptyrc"\end{code} + +\remark{This is rather Berkeley-specific. SVR4 ptys are rare enough that + I've no real idea if it generalises across the Unix gap. Experts + are invited to advise. Users feel free to not worry---the majority + of currently popular Unix systems use Berkeley ptys.} +\end{desc} + +\defun{make-pty-generator}{}{\proc} +\begin{desc} +This procedure returns a generator of candidate pty names. +Each time the returned procedure is called, it produces a +new candidate. +Software that wishes to search through the set of available ptys +can use a pty generator to iterate over them. +After producing all the possible ptys, a generator returns {\sharpf} +every time it is called. 
+Example: +\begin{code} +(define pg (make-pty-generator)) +(pg) {\evalto} "/dev/ptyp0" +(pg) {\evalto} "/dev/ptyp1" + \vdots +(pg) {\evalto} "/dev/ptyqe" +(pg) {\evalto} "/dev/ptyqf" \textit{(Last one)} +(pg) {\evalto} {\sharpf} +(pg) {\evalto} {\sharpf} + \vdots\end{code} +\end{desc} + + +% Flag tables +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +% Control-chars indices +%%%%%%%%%%%%%%%%%%%%%%% +\begin{table}[p] +\begin{center} +\begin{tabular}{|lll|} \hline +Scsh & C & Typical char \\ +\hline\hline +{\Posix} & & \\ +\exi{ttychar/delete-char} & \ex{ERASE} & del \\ +\exi{ttychar/delete-line} & \ex{KILL} & \verb|^U| \\ +\exi{ttychar/eof} & \ex{EOF} & \verb|^D| \\ +\exi{ttychar/eol} & \ex{EOL} & \\ +\exi{ttychar/interrupt} & \ex{INTR} & \verb|^C| \\ +\exi{ttychar/quit} & \ex{QUIT} & \verb|^\| \\ +\exi{ttychar/suspend} & \ex{SUSP} & \verb|^Z| \\ +\exi{ttychar/start} & \ex{START} & \verb|^Q| \\ +\exi{ttychar/stop} & \ex{STOP} & \verb|^S| \\ + +\hline\hline +{SVR4 and 4.3+BSD} & & \\ +\exi{ttychar/delayed-suspend} & \ex{DSUSP} & \verb|^Y| \\ +\exi{ttychar/delete-word} & \ex{WERASE} & \verb|^W| \\ +\exi{ttychar/discard} & \ex{DISCARD} & \verb|^O| \\ +\exi{ttychar/eol2} & \ex{EOL2} & \\ +\exi{ttychar/literal-next} & \ex{LNEXT} & \verb|^V| \\ +\exi{ttychar/reprint} & \ex{REPRINT} & \verb|^R| \\ + +\hline\hline +{4.3+BSD} & & \\ +\exi{ttychar/status} & \ex{STATUS} & \verb|^T| \\ +\hline +\end{tabular} +\end{center} +\caption{Indices into the \protect\ex{tty-info} record's + \protect\var{control-chars} string, + and the character traditionally found at each index. + Only the indices for the {\Posix} entries are guaranteed to + be non-\sharpf.} +\label{table:ttychars} +\end{table} + +% Input flags +%%%%%%%%%%%%% +\begin{table}[p] +\begin{center}\small +\begin{tabular}{|lll|} \hline +Scsh & C & Meaning \\ +\hline\hline +\Posix & & \\ +\exi{ttyin/check-parity} + & \ex{INPCK} & Check parity. 
\\ +\exi{ttyin/ignore-bad-parity-chars} + & \ex{IGNPAR} & Ignore chars with parity errors. \\ +\exi{ttyin/mark-parity-errors} + & \ex{PARMRK} & Insert chars to mark parity errors.\\ +\exi{ttyin/ignore-break} + & \ex{IGNBRK} & Ignore breaks. \\ +\exi{ttyin/interrupt-on-break} + & \ex{BRKINT} & Signal on breaks. \\ +\exi{ttyin/7bits} + & \ex{ISTRIP} & Strip char to seven bits. \\ +\exi{ttyin/cr->nl} + & \ex{ICRNL} & Map carriage-return to newline. \\ +\exi{ttyin/ignore-cr} + & \ex{IGNCR} & Ignore carriage-returns. \\ +\exi{ttyin/nl->cr} + & \ex{INLCR} & Map newline to carriage-return. \\ +\exi{ttyin/input-flow-ctl} + & \ex{IXOFF} & Enable input flow control. \\ +\exi{ttyin/output-flow-ctl} + & \ex{IXON} & Enable output flow control. \\ + +\hline\hline +{SVR4 and 4.3+BSD} & & \\ +\exi{ttyin/xon-any} & \ex{IXANY} & Any char restarts after stop. \\ +\exi{ttyin/beep-on-overflow} & \ex{IMAXBEL} & Ring bell when queue full. \\ + +\hline\hline +{SVR4} & & \\ +\exi{ttyin/lowercase} & \ex{IUCLC} & Map upper case to lower case. \\ +\hline +\end{tabular} +\end{center} +\caption{Input-flags. These are the named flags for the \protect\ex{tty-info} + record's \protect\var{input-flags} field. + These flags generally control the processing of input chars. + Only the {\Posix} entries are guaranteed to be non-\sharpf. + } +\label{table:ttyin} +\end{table} + +% Output flags +%%%%%%%%%%%%%% +\begin{table}[p] +\begin{center}%\small +\begin{tabular}{|lll|} \hline +Scsh & C & Meaning \\ \hline\hline + +\multicolumn{3}{|l|}{\Posix} \\ +\exi{ttyout/enable} & \ex{OPOST} & Enable output processing. \\ + +\hline\hline +\multicolumn{3}{|l|}{SVR4 and 4.3+BSD} \\ +\exi{ttyout/nl->crnl} & \ex{ONLCR} & Map nl to cr-nl. \\ + +\hline\hline +\multicolumn{3}{|l|}{4.3+BSD} \\ +\exi{ttyout/discard-eot} & \ex{ONOEOT} & Discard EOT chars. 
\\ +\exi{ttyout/expand-tabs} & \ex{OXTABS}\footnote{ + Note this is distinct from the SVR4-equivalent + \ex{ttyout/tab-delayx} flag defined in + table~\ref{table:ttydelays}.} + & Expand tabs. \\ + +\hline\hline +\multicolumn{3}{|l|}{SVR4} \\ +\exi{ttyout/cr->nl} & \ex{OCRNL} & Map cr to nl. \\ +\exi{ttyout/nl-does-cr} & \ex{ONLRET}& Nl performs cr as well. \\ +\exi{ttyout/no-col0-cr} & \ex{ONOCR} & No cr output in column 0. \\ +\exi{ttyout/delay-w/fill-char} & \ex{OFILL} & Send fill char to delay. \\ +\exi{ttyout/fill-w/del} & \ex{OFDEL} & Fill char is {\Ascii} DEL. \\ +\exi{ttyout/uppercase} & \ex{OLCUC} & Map lower to upper case. \\ +\hline +\end{tabular} +\end{center} +\caption{Output-flags. These are the named flags for the \protect\ex{tty-info} + record's \protect\var{output-flags} field. + These flags generally control the processing of output chars. + Only the {\Posix} entries are guaranteed to be non-\sharpf.} +\label{table:ttyout} +\end{table} + +% Delay flags +%%%%%%%%%%%%% +\begin{table}[p] +\begin{tabular}{r|ll|} \cline{2-3} +& Value & Comment \\ \cline{2-3} +{Backspace delay} & \exi{ttyout/bs-delay} & Bit-field mask \\ + & \exi{ttyout/bs-delay0} & \\ + & \exi{ttyout/bs-delay1} & \\ + +\cline{2-3} +{Carriage-return delay} & \exi{ttyout/cr-delay} & Bit-field mask \\ + & \exi{ttyout/cr-delay0} & \\ + & \exi{ttyout/cr-delay1} & \\ + & \exi{ttyout/cr-delay2} & \\ + & \exi{ttyout/cr-delay3} & \\ + +\cline{2-3} +{Form-feed delay} & \exi{ttyout/ff-delay} & Bit-field mask \\ + & \exi{ttyout/ff-delay0} & \\ + & \exi{ttyout/ff-delay1} & \\ + +\cline{2-3} +{Horizontal-tab delay} & \exi{ttyout/tab-delay} & Bit-field mask \\ + & \exi{ttyout/tab-delay0} & \\ + & \exi{ttyout/tab-delay1} & \\ + & \exi{ttyout/tab-delay2} & \\ + & \exi{ttyout/tab-delayx} & Expand tabs \\ + +\cline{2-3} +{Newline delay} & \exi{ttyout/nl-delay} & Bit-field mask \\ + & \exi{ttyout/nl-delay0} & \\ + & \exi{ttyout/nl-delay1} & \\ + +\cline{2-3} +{Vertical tab delay} & \exi{ttyout/vtab-delay} 
& Bit-field mask \\ + & \exi{ttyout/vtab-delay0} & \\ + & \exi{ttyout/vtab-delay1} & \\ + +\cline{2-3} +{All} & \exi{ttyout/all-delay} & Total bit-field mask \\ +\cline{2-3} +\end{tabular} + +\caption{Delay constants. These are the named flags for the + \protect\ex{tty-info} record's \protect\var{output-flags} field. + These flags control the output delays associated with printing + special characters. + They are non-{\Posix}, and have non-{\sharpf} values + only on SVR4 systems.} +\label{table:ttydelays} +\end{table} + +% Control flags +%%%%%%%%%%%%%%% +\begin{table}[p] +\begin{center}%\small +\begin{tabular}{|lll|} \hline +Scsh & C & Meaning \\ + +\hline\hline +\multicolumn{3}{|l|}{\Posix} \\ +\exi{ttyc/char-size} & \ex{CSIZE} & Character size mask \\ +\exi{ttyc/char-size5} & \ex{CS5} & 5 bits \\ +\exi{ttyc/char-size6} & \ex{CS6} & 6 bits \\ +\exi{ttyc/char-size7} & \ex{CS7} & 7 bits \\ +\exi{ttyc/char-size8} & \ex{CS8} & 8 bits \\ +\exi{ttyc/enable-parity}& \ex{PARENB} & Generate and detect parity. \\ +\exi{ttyc/odd-parity} & \ex{PARODD} & Odd parity. \\ +\exi{ttyc/enable-read} & \ex{CREAD} & Enable reception of chars. \\ +\exi{ttyc/hup-on-close} & \ex{HUPCL} & Hang up on last close. \\ +\exi{ttyc/no-modem-sync}& \ex{CLOCAL} & Ignore modem lines. \\ +\exi{ttyc/2-stop-bits} & \ex{CSTOPB} & Send two stop bits. \\ + +\hline\hline +\multicolumn{3}{|l|}{4.3+BSD} \\ +\exi{ttyc/ignore-flags} & \ex{CIGNORE} & Ignore control flags. \\ +\exi{ttyc/CTS-output-flow-ctl} & \verb|CCTS_OFLOW| & CTS flow control of output \\ +\exi{ttyc/RTS-input-flow-ctl} & \verb|CRTS_IFLOW| & RTS flow control of input \\ +\exi{ttyc/carrier-flow-ctl} & \ex{MDMBUF} & \\ +\hline +\end{tabular} +\end{center} + +\caption{Control-flags. These are the named flags for the \protect\ex{tty-info} + record's \protect\var{control-flags} field. + These flags generally control the details of the terminal's + serial line. 
+ Only the {\Posix} entries are guaranteed to be non-\sharpf.} +\label{table:ttyctl} +\end{table} + +% Local flags +%%%%%%%%%%%%% +\begin{table}[p] +\begin{center}\small +\begin{tabular}{|lll|} \hline +Scsh & C & Meaning \\ + +\hline\hline +\multicolumn{3}{|l|}{\Posix} \\ +\exi{ttyl/canonical} & \ex{ICANON} & Canonical input processing. \\ +\exi{ttyl/echo} & \ex{ECHO} & Enable echoing. \\ +\exi{ttyl/echo-delete-line} & \ex{ECHOK} & Echo newline after line kill. \\ +\exi{ttyl/echo-nl} & \ex{ECHONL} & Echo newline even if echo is off. \\ +\exi{ttyl/visual-delete}& \ex{ECHOE} & Visually erase chars. \\ +\exi{ttyl/enable-signals} & \ex{ISIG} & Enable \verb|^|C, \verb|^|Z signalling. \\ +\exi{ttyl/extended} & \ex{IEXTEN} & Enable extensions. \\ +\exi{ttyl/no-flush-on-interrupt} + & \ex{NOFLSH} & Don't flush after interrupt. \\ +\exi{ttyl/ttou-signal} & \ex{TOSTOP} & \ex{SIGTTOU} on background output. \\ + +\hline\hline +\multicolumn{3}{|l|}{SVR4 and 4.3+BSD} \\ +\exi{ttyl/echo-ctl} & \ex{ECHOCTL} + & Echo control chars as ``\verb|^X|''. \\ +\exi{ttyl/flush-output} & \ex{FLUSHO} & Output is being flushed. \\ +\exi{ttyl/hardcopy-delete} & \ex{ECHOPRT} & Visual erase for hardcopy. \\ +\exi{ttyl/reprint-unread-chars} & \ex{PENDIN} & Retype pending input. \\ +\exi{ttyl/visual-delete-line} & \ex{ECHOKE} & Visually erase a line-kill. \\ + +\hline\hline +\multicolumn{3}{|l|}{4.3+BSD} \\ +\exi{ttyl/alt-delete-word} & \ex{ALTWERASE} & Alternate word erase algorithm \\ +\exi{ttyl/no-kernel-status} & \ex{NOKERNINFO} & No kernel status on \verb|^T|. \\ + +\hline\hline +\multicolumn{3}{|l|}{SVR4} \\ +\exi{ttyl/case-map} & \ex{XCASE} & Canonical case presentation \\ +\hline +\end{tabular} +\end{center} + +\caption{Local-flags. These are the named flags for the \protect\ex{tty-info} + record's \protect\var{local-flags} field. + These flags generally control the details of the line-editing + user interface. 
+ Only the {\Posix} entries are guaranteed to be non-\sharpf.} +\label{table:ttylocal} +\end{table} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/doc/scsh-manual/xman.tex b/doc/scsh-manual/xman.tex new file mode 100644 index 0000000..c936bf5 --- /dev/null +++ b/doc/scsh-manual/xman.tex @@ -0,0 +1,38 @@ +%&latex -*- latex -*- + +% This is the reference manual for the Scheme Shell. + +\documentclass[twoside]{report} +\usepackage{code,boxedminipage,draftfooters,makeidx,palatino,ct, + headings,mantitle,array,matter,a4,tex2page} + +% Style issues +\parskip = 3pt plus 3pt +\sloppy + +\input{decls} +\makeindex +%%% End preamble + +\begin{document} + +\frontmatter +\include{front} + +\mainmatter +\include{intro} +\include{procnotation} +\include{syscalls} +\include{network} +\include{strings} +\include{rdelim} +\include{awk} +\include{miscprocs} +\include{running} +\include{changes} +\include{todo} + +\backmatter +\printindex + +\end{document}