diff --git a/scsh/regexp/COPYRIGHT b/scsh/regexp/COPYRIGHT index 36b9804..30c1f7a 100644 --- a/scsh/regexp/COPYRIGHT +++ b/scsh/regexp/COPYRIGHT @@ -1,19 +1,20 @@ -Copyright (c) 1986, 1993, 1995 by University of Toronto. -Written by Henry Spencer. Not derived from licensed software. +Copyright 1992, 1993, 1994, 1997 Henry Spencer. All rights reserved. +This software is not subject to any license of the American Telephone +and Telegraph Company or of the Regents of the University of California. -Permission is granted to anyone to use this software for any -purpose on any computer system, and to redistribute it in any way, -subject to the following restrictions: +Permission is granted to anyone to use this software for any purpose on +any computer system, and to alter it and redistribute it, subject +to the following restrictions: -1. The author is not responsible for the consequences of use of - this software, no matter how awful, even if they arise - from defects in it. +1. The author is not responsible for the consequences of use of this + software, no matter how awful, even if they arise from flaws in it. -2. The origin of this software must not be misrepresented, either - by explicit claim or by omission. +2. The origin of this software must not be misrepresented, either by + explicit claim or by omission. Since few users ever read sources, + credits must appear in the documentation. -3. Altered versions must be plainly marked as such, and must not - be misrepresented (by explicit claim or omission) as being - the original software. +3. Altered versions must be plainly marked as such, and must not be + misrepresented as being the original software. Since few users + ever read sources, credits must appear in the documentation. -4. This notice must not be removed or altered. +4. This notice may not be removed or altered. 
diff --git a/scsh/regexp/Makefile b/scsh/regexp/Makefile deleted file mode 100644 index e69de29..0000000 diff --git a/scsh/regexp/Makefile.in b/scsh/regexp/Makefile.in index 6aed9ad..e8db2b6 100644 --- a/scsh/regexp/Makefile.in +++ b/scsh/regexp/Makefile.in @@ -1,118 +1,137 @@ -srcdir = @srcdir@ VPATH = @srcdir@ CC = @CC@ CFLAGS1 = @CFLAGS1@ RANLIB = @RANLIB@ -# Things you might want to put in ENV: -# -DERRAVAIL have utzoo-compatible error() function and friends -ENV= +# You probably want to take -DREDEBUG out of CFLAGS, and put something like +# -O in, *after* testing (-DREDEBUG strengthens testing by enabling a lot of +# internal assertion checking and some debugging facilities). +# Put -Dconst= in for a pre-ANSI compiler. +# Do not take -DPOSIX_MISTAKE out. +# REGCFLAGS isn't important to you (it's for my use in some special contexts). +#CFLAGS=-I. -DPOSIX_MISTAKE -DREDEBUG $(REGCFLAGS) +CFLAGS=-I. -DPOSIX_MISTAKE $(REGCFLAGS) $(CFLAGS1) -# Things you might want to put in TEST: -# -DDEBUG debugging hooks -# -I. regexp.h from current directory, not /usr/include -TEST=-I. -I$(srcdir) +# If you have a pre-ANSI compiler, put -o into MKHFLAGS. If you want +# the Berkeley __P macro, put -b in. +MKHFLAGS= -# Things you might want to put in PROF: -# -pg profiler -# PROF= +# Flags for linking but not compiling, if any. +LDFLAGS= -CFLAGS=$(CFLAGS1) $(ENV) $(TEST) $(PROF) -LDFLAGS=$(PROF) +# Extra libraries for linking, if any. +LIBS= -LIB=libregexp.a -OBJ=regexp.o regsub.o regerror.o -TMP=dtr.tmp +# Internal stuff, should not need changing. +OBJPRODN=regcomp.o regexec.o regerror.o regfree.o +OBJS=$(OBJPRODN) split.o debug.o main.o +H=cclass.h cname.h regex2.h utils.h +REGSRC=regcomp.c regerror.c regexec.c regfree.c +ALLSRC=$(REGSRC) engine.c debug.c main.c split.c + +# Stuff that matters only if you're trying to lint the package. +LINTFLAGS=-I. 
-Dstatic= -Dconst= -DREDEBUG +LINTC=regcomp.c regexec.c regerror.c regfree.c debug.c main.c +JUNKLINT=possible pointer alignment|null effect + +# arrangements to build forward-reference header files +.SUFFIXES: .ih .h +.c.ih: + sh ./mkh $(MKHFLAGS) -p $< >$@ default: r -try: try.o $(LIB) - $(CC) $(LDFLAGS) try.o $(LIB) -o try +lib: purge $(OBJPRODN) + rm -f libregex.a + ar crv libregex.a $(OBJPRODN) -# Making timer will probably require putting stuff in $(PROF) and then -# recompiling everything; the following is just the final stage. -timer: timer.o $(LIB) - $(CC) $(LDFLAGS) timer.o $(LIB) -o timer +purge: + rm -f *.o -timer.o: timer.c timer.t.h +# stuff to build regex.h +REGEXH=regex.h +REGEXHSRC=regex2.h $(REGSRC) +$(REGEXH): $(REGEXHSRC) mkh + sh ./mkh $(MKHFLAGS) -i _REGEX_H_ $(REGEXHSRC) >regex.tmp + cmp -s regex.tmp regex.h 2>/dev/null || cp regex.tmp regex.h + rm -f regex.tmp -timer.t.h: tests - sed 's/ /","/g;s/\\/&&/g;s/.*/{"&"},/' tests >timer.t.h +# dependencies +$(OBJPRODN) debug.o: utils.h regex.h regex2.h +regcomp.o: cclass.h cname.h regcomp.ih +regexec.o: engine.c engine.ih +regerror.o: regerror.ih +debug.o: debug.ih +main.o: main.ih -# Regression test. 
-r: try tests - ./try &1 | egrep -v '$(JUNKLINT)' | tee lint -ch.soe: ch $(BITS) - soelim ch >$@ +fullprint: + ti README WHATSNEW notes todo | list + ti *.h | list + list *.c + list regex.3 regex.7 -ch.sml: ch $(BITS) smlize splitfigs - splitfigs ch | soelim | smlize >$@ +print: + ti README WHATSNEW notes todo | list + ti *.h | list + list reg*.c engine.c -fig0 fig1 fig2: ch splitfigs - splitfigs ch >/dev/null -f: fig0 fig1 fig2 figs - groff -Tps -s $(OPT) figs | lpr +mf.tmp: Makefile + sed '/^REGEXH=/s/=.*/=regex.h/' Makefile | sed '/#DEL$$/d' >$@ -fig1.ps: fig0 fig1 - ( cat fig0 ; echo ".LP" ; cat fig1 ) | groff -Tps $(OPT) >$@ +DTRH=cclass.h cname.h regex2.h utils.h +PRE=COPYRIGHT README WHATSNEW +POST=mkh regex.3 regex.7 tests $(DTRH) $(ALLSRC) fake/*.[ch] +FILES=$(PRE) Makefile $(POST) +DTR=$(PRE) Makefile=mf.tmp $(POST) +dtr: $(FILES) mf.tmp + makedtr $(DTR) >$@ + rm mf.tmp -fig2.ps: fig0 fig2 - ( cat fig0 ; echo ".LP" ; cat fig2 ) | groff -Tps $(OPT) >$@ +cio: $(FILES) + cio $(FILES) -fp: fig1.ps fig2.ps +rdf: $(FILES) + rcsdiff -c $(FILES) 2>&1 | p -r.1: regexp.c splitter - splitter regexp.c +# various forms of cleanup +tidy: + rm -f junk* core core.* *.core dtr *.tmp lint -rs.1: regsub.c splitter - splitter regsub.c +clean: tidy + rm -f *.o *.s *.ih re libregex.a -re.1: regerror.c splitter - splitter regerror.c - -rm.h: regmagic.h splitter - splitter regmagic.h - -re.h: regexp.h splitter - splitter regexp.h - -PLAIN=COPYRIGHT README Makefile regexp.3 try.c timer.c tests -FIX=regexp.h regexp.c regsub.c regerror.c regmagic.h -DTR=$(PLAIN) $(FIX) - -dtr: r $(DTR) - rm -rf $(TMP) - mkdir $(TMP) - cp $(PLAIN) $(TMP) - for f in $(FIX) ; do normalize $$f >$(TMP)/$$f ; done - ( cd $(TMP) ; makedtr $(DTR) ) >$@ - rm -rf $(TMP) - -ch.ps: ch Makefile $(BITS) - groff -Tps $(OPT) ch >$@ - -copy: ch.soe ch.sml fp - makedtr REMARKS ch.sml fig*.ps ch.soe >$@ - -go: copy dtr +# don't do this one unless you know what you're doing +spotless: clean + rm -f mkh regex.h diff 
--git a/scsh/regexp/README b/scsh/regexp/README index bcb9cf5..5f5a7ab 100644 --- a/scsh/regexp/README +++ b/scsh/regexp/README @@ -1,57 +1,32 @@ -This is a revision of my well-known regular-expression package, regexp(3). -It gives C programs the ability to use egrep-style regular expressions, and -does it in a much cleaner fashion than the analogous routines in SysV. -It is not, alas, fully POSIX.2-compliant; that is hard. (I'm working on -a full reimplementation that will do that.) +alpha3.7 release. +Fri Nov 21 13:25:21 EST 1997 +henry@zoo.toronto.edu -This version is the one which is examined and explained in one chapter of -"Software Solutions in C" (Dale Schumacher, ed.; AP Professional 1994; -ISBN 0-12-632360-7), plus a couple of insignificant updates, plus one -significant bug fix (done 10 Nov 1995). +See WHATSNEW for change listing. -Although this package was inspired by the Bell V8 regexp(3), this -implementation is *NOT* AT&T/Bell code, and is not derived from licensed -software. Even though U of T is a V8 licensee. This software is based on -a V8 manual page sent to me by Dennis Ritchie (the manual page enclosed -here is a complete rewrite and hence is not covered by AT&T copyright). -I admit to some familiarity with regular-expression implementations of -the past, but the only one that this code traces any ancestry to is the -one published in Kernighan & Plauger's "Software Tools" (from which -this one draws ideas but not code). +installation notes: +-------- +Read the comments at the beginning of Makefile before running. -Simplistically: put this stuff into a source directory, inspect Makefile -for compilation options that need changing to suit your local environment, -and then do "make". This compiles the regexp(3) functions, builds a -library containing them, compiles a test program, and runs a large set of -regression tests. 
If there are no complaints, then put regexp.h into -/usr/include, add regexp.o, regsub.o, and regerror.o into your C library -(or put libre.a into /usr/lib), and install regexp.3 (perhaps with slight -modifications) in your manual-pages directory. +Utils.h contains some things that just might have to be modified on +some systems, as well as a nested include (ugh) of <assert.h>. -The files are: +The "fake" directory contains quick-and-dirty fakes for some header +files and routines that old systems may not have. Note also that +-DUSEBCOPY will make utils.h substitute bcopy() for memmove(). -COPYRIGHT copyright notice -README this text -Makefile instructions to make everything -regexp.3 manual page -regexp.h header file, for /usr/include -regexp.c source for regcomp() and regexec() -regsub.c source for regsub() -regerror.c source for default regerror() -regmagic.h internal header file -try.c source for test program -timer.c source for timing program -tests test list for try and timer +After that, "make r" will build regcomp.o, regexec.o, regfree.o, +and regerror.o (the actual routines), bundle them together into a test +program, and run regression tests on them. No output is good output. -This implementation uses nondeterministic automata rather than the -deterministic ones found in some other implementations, which makes it -simpler, smaller, and faster at compiling regular expressions, but slower -at executing them. Many users have found the speed perfectly adequate, -although replacing the insides of egrep with this code would be a mistake. +"make lib" builds just the .o files for the actual routines (when +you're happy with testing and have adjusted CFLAGS for production), +and puts them together into libregex.a. You can pick up either the +library or *.o ("make lib" makes sure there are no other .o files left +around to confuse things). -This stuff should be pretty portable, given an ANSI C compiler and -appropriate option settings. 
There are no "reserved" char values except for -NUL, and no special significance is attached to the top bit of chars. -The string(3) functions are used a fair bit, on the grounds that they are -probably faster than coding the operations in line. Some attempts at code -tuning have been made, but this is invariably a bit machine-specific. +Main.c, debug.c, split.c are used for regression testing but are not part +of the RE routines themselves. + +Regex.h goes in /usr/include. All other .h files are internal only. +-------- diff --git a/scsh/regexp/WHATSNEW b/scsh/regexp/WHATSNEW index 230ca86..d84f9e3 100644 --- a/scsh/regexp/WHATSNEW +++ b/scsh/regexp/WHATSNEW @@ -1,3 +1,7 @@ +New in alpha3.7: A bit of cleanup aimed at maximizing portability, +possibly at slight cost in efficiency. "ul" suffixes and "unsigned long" +no longer appear, in particular. + New in alpha3.6: A couple more portability glitches fixed. New in alpha3.5: Active development of this code has been stopped -- diff --git a/scsh/regexp/debug.ih b/scsh/regexp/debug.ih deleted file mode 100644 index 5f40ff7..0000000 --- a/scsh/regexp/debug.ih +++ /dev/null @@ -1,14 +0,0 @@ -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === debug.c === */ -void regprint(regex_t *r, FILE *d); -static void s_print(register struct re_guts *g, FILE *d); -static char *regchar(int ch); - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by ./mkh ========= */ diff --git a/scsh/regexp/engine.ih b/scsh/regexp/engine.ih deleted file mode 100644 index cc98334..0000000 --- a/scsh/regexp/engine.ih +++ /dev/null @@ -1,35 +0,0 @@ -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === engine.c === */ -static int matcher(register struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); -static char *dissect(register struct match *m, char *start, char *stop, sopno startst, sopno 
stopst); -static char *backref(register struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); -static char *fast(register struct match *m, char *start, char *stop, sopno startst, sopno stopst); -static char *slow(register struct match *m, char *start, char *stop, sopno startst, sopno stopst); -static states step(register struct re_guts *g, sopno start, sopno stop, register states bef, int ch, register states aft); -#define BOL (OUT+1) -#define EOL (BOL+1) -#define BOLEOL (BOL+2) -#define NOTHING (BOL+3) -#define BOW (BOL+4) -#define EOW (BOL+5) -#define CODEMAX (BOL+5) /* highest code used */ -#define NONCHAR(c) ((c) > CHAR_MAX) -#define NNONCHAR (CODEMAX-CHAR_MAX) -#ifdef REDEBUG -static void print(struct match *m, char *caption, states st, int ch, FILE *d); -#endif -#ifdef REDEBUG -static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); -#endif -#ifdef REDEBUG -static char *pchar(int ch); -#endif - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by ./mkh ========= */ diff --git a/scsh/regexp/main.ih b/scsh/regexp/main.ih deleted file mode 100644 index 5a0118a..0000000 --- a/scsh/regexp/main.ih +++ /dev/null @@ -1,19 +0,0 @@ -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === main.c === */ -void regress(FILE *in); -void try(char *f0, char *f1, char *f2, char *f3, char *f4, int opts); -int options(int type, char *s); -int opt(int c, char *s); -void fixstr(register char *p); -char *check(char *str, regmatch_t sub, char *should); -static char *eprint(int err); -static int efind(char *name); - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by ./mkh ========= */ diff --git a/scsh/regexp/patch-msg b/scsh/regexp/patch-msg deleted file mode 100644 index 36a7ff9..0000000 --- a/scsh/regexp/patch-msg +++ /dev/null @@ -1,803 +0,0 @@ -Date: Mon, 1 Jul 1996 23:22:47 GMT -From: Bill Sommerfeld -To: 
shivers@lcs.mit.edu, bdc@ai.mit.edu -Subject: scsh patch for precompiled regexps.. - -I meant to send this out months ago but I was just too hosed with work. - -Here's what I have right now: - -There are three pieces here: - diffs to the "core" scsh - diffs to Henry Spencer's latest regexp library - a copy of Henry Spencer's latest regexp library.. - -It appears to work (it passes the same regression tests as the C library..). - -Let me know if I didn't include something needed for this to work.. - - - Bill - -diff -rc scsh-0.4.2/scsh/re.scm scsh-0.4.2-regexp/scsh/re.scm -*** scsh-0.4.2/scsh/re.scm Fri Oct 27 04:58:56 1995 ---- scsh-0.4.2-regexp/scsh/re.scm Sat Apr 6 21:07:41 1996 -*************** -*** 34,49 **** - - ;;; Bogus stub definitions for low-level match routines: - -! (define regexp? string?) -! (define (make-regexp str) str) - -! (define (regexp-exec regexp str . maybe-start) - (let ((start (optional-arg maybe-start 0)) - (start-vec (make-vector 10)) - (end-vec (make-vector 10))) -! (and (%regexp-match regexp str start start-vec end-vec) -! (make-regexp-match str start-vec end-vec)))) -! - - ;;; Convert a string into a regex pattern that matches that string exactly -- - ;;; in other words, quote the special chars with backslashes. ---- 34,53 ---- - - ;;; Bogus stub definitions for low-level match routines: - -! (define-record iregexp -! string) - -! (define regexp? iregexp?) -! -! (define (make-regexp str) -! (make-iregexp (compile-regexp str))) -! -! (define (regexp-exec r s . maybe-start) - (let ((start (optional-arg maybe-start 0)) - (start-vec (make-vector 10)) - (end-vec (make-vector 10))) -! (and (%regexp-exec-1 (iregexp:string r) s start start-vec end-vec) -! (make-regexp-match s start-vec end-vec)))) - - ;;; Convert a string into a regex pattern that matches that string exactly -- - ;;; in other words, quote the special chars with backslashes. -*************** -*** 58,75 **** - (cons #\\ result) - result)))))) - -! 
(define-foreign %regexp-match/errno (reg_match (string regexp) -! (string s) -! (integer start) -! (vector-desc start-vec) -! (vector-desc end-vec)) -! static-string ; Error string or #f if all is ok. -! bool) ; match? -! -! (define (%regexp-match regexp string start start-vec end-vec) -! (receive (err match?) (%regexp-match/errno regexp string start -! start-vec end-vec) -! (if err (error err %regexp-match regexp string start) match?))) - - - ;;; I do this one in C, I'm not sure why: ---- 62,79 ---- - (cons #\\ result) - result)))))) - -! ;;;(define-foreign %regexp-match/errno (reg_match (string regexp) -! ;;; (string s) -! ;;; (integer start) -! ;;; (vector-desc start-vec) -! ;;; (vector-desc end-vec)) -! ;;; static-string ; Error string or #f if all is ok. -! ;;; bool) ; match? -! -! ;;;(define (%regexp-match regexp string start start-vec end-vec) -! ;;; (receive (err match?) (%regexp-match/errno regexp string start -! ;;; start-vec end-vec) -! ;;; (if err (error err %regexp-match regexp string start) match?))) - - - ;;; I do this one in C, I'm not sure why: -*************** -*** 79,81 **** ---- 83,166 ---- - (filter_stringvec (string regexp) ((C "char const ** ~a") cvec)) - static-string ; error message -- #f if no error. - integer) ; number of files that pass the filter. -+ -+ ;;; precompiled regexps. -+ -+ (define-foreign %regexp-compiled-length (reg_comp_len (string regexp)) -+ static-string -+ integer) -+ -+ (define-foreign %regexp-compile (reg_comp_comp (string regexp) -+ (string-desc re-buf)) -+ static-string) -+ -+ (define (%regexp-exec-1 r s start sv ev) -+ (receive (err match?) 
(%regexp-exec r s start sv ev) -+ (if err (error err s start) -+ match?))) -+ -+ (define-foreign %regexp-exec (reg_exec (string-desc regexp) -+ (string s) -+ (integer start) -+ (vector-desc start-vec) -+ (vector-desc end-vec)) -+ static-string -+ bool) -+ -+ -+ (define (compile-regexp e) -+ (receive (err len) -+ (%regexp-compiled-length e) -+ (if err (error err e) -+ (let ((buf (make-string len))) -+ (%regexp-compile e buf) -+ buf)))) -+ -+ -+ -+ (define-foreign %regexp-subst (reg_subst (string-desc regexp) -+ (string m) -+ (string s) -+ (integer start) -+ (vector-desc start-vec) -+ (vector-desc end-vec) -+ (string-desc outbuf)) -+ static-string -+ integer) -+ -+ (define-foreign %regexp-subst-len (reg_subst_len (string-desc regexp) -+ (string m) -+ (string s) -+ (integer start) -+ (vector-desc start-vec) -+ (vector-desc end-vec)) -+ static-string -+ integer) -+ -+ -+ (define (regexp-subst re match replacement) -+ (let ((cr (iregexp:string re)) -+ (matchstr (regexp-match:string match)) -+ (startvec (regexp-match:start match)) -+ (endvec (regexp-match:end match))) -+ (receive (err outlen) -+ (%regexp-subst-len cr -+ matchstr -+ replacement -+ 0 -+ startvec -+ endvec) -+ (if err (error err matchstr replacement) -+ (let ((outbuf (make-string outlen))) -+ (receive (err outlen) -+ (%regexp-subst cr -+ matchstr -+ replacement -+ 0 -+ startvec -+ endvec -+ outbuf) -+ (if err (error err matchstr replacement) -+ (substring outbuf 0 outlen)))))))) -+ -+ -\ No newline at end of file -diff -rc scsh-0.4.2/scsh/re1.c scsh-0.4.2-regexp/scsh/re1.c -*** scsh-0.4.2/scsh/re1.c Fri Oct 27 04:58:58 1995 ---- scsh-0.4.2-regexp/scsh/re1.c Sat Apr 6 21:01:15 1996 -*************** -*** 19,24 **** ---- 19,150 ---- - /* Stash error msg in global. */ - void regerror(char *msg) {regexp_error = msg;} - -+ /* -+ ** Return NULL normally, error string on error. 
-+ ** Stash number of bytes needed for compiled regexp into `*len' -+ */ -+ -+ char *reg_comp_len(const char *re, int *len) -+ { -+ int l; -+ -+ regexp_error = NULL; -+ *len = regcomp_len(re); -+ return regexp_error; -+ } -+ -+ /* -+ ** Return NULL normally, error string on error. -+ ** Compile regexp into string described by `cr'. -+ */ -+ -+ char *reg_comp_comp(const char *re, scheme_value cr) -+ { -+ int len = STRING_LENGTH(cr); -+ regexp *r = (regexp *)&STRING_REF(cr, 0); -+ -+ regexp_error = NULL; -+ r = regcomp_comp(re, r, len); -+ return regexp_error; -+ } -+ -+ /* Return NULL normally, error string on error. -+ ** Stash match info in start_vec and end_vec. -+ ** Returns boolean match/no-match in hit. -+ */ -+ -+ char *reg_exec(scheme_value cr, const char *string, int start, -+ scheme_value start_vec, scheme_value end_vec, int *hit) -+ { -+ regexp *r = (regexp *)&STRING_REF(cr, 0); -+ -+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { -+ return "Illegal start vector"; -+ } -+ -+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { -+ return "Illegal end vector"; -+ } -+ -+ regexp_error = 0; -+ *hit = 0; -+ -+ if( regexec(r, string+start) ) { -+ int i; -+ for(i=0; i<NSUBEXP; i++) { -+ const char *s = r->startp[i]; -+ const char *e = r->endp[i]; -+ VECTOR_REF(start_vec,i) = s?ENTER_FIXNUM(s - string):SCHFALSE; -+ VECTOR_REF(end_vec,i) = e?ENTER_FIXNUM(e - string):SCHFALSE; -+ r->startp[i] = NULL; -+ r->endp[i] = NULL; -+ } -+ *hit = 1; -+ } -+ return regexp_error; -+ } -+ -+ char *reg_subst(scheme_value cr, const char *match, -+ const char *src, int start, -+ scheme_value start_vec, scheme_value end_vec, -+ scheme_value outbuf, int *len) -+ { -+ int i; -+ regexp *r = (regexp *)&STRING_REF(cr, 0); -+ -+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { -+ return "Illegal start vector"; -+ } -+ -+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { -+ return "Illegal end vector"; -+ } -+ -+ for (i=0; i<NSUBEXP; i++) { -+ scheme_value se = VECTOR_REF(start_vec,i); -+ scheme_value ee = VECTOR_REF(end_vec,i); -+ r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL; -+ r->endp[i] = FIXNUMP(ee)? 
(match + EXTRACT_FIXNUM(ee)):NULL; -+ } -+ -+ regexp_error = NULL; -+ regnsub (r, src, &STRING_REF(outbuf, 0), STRING_LENGTH(outbuf)); -+ *len = strlen(&STRING_REF(outbuf, 0)); -+ return regexp_error; -+ } -+ -+ char *reg_subst_len(scheme_value cr, const char *match, -+ const char *src, int start, -+ scheme_value start_vec, scheme_value end_vec, -+ int *len) -+ { -+ int i; -+ regexp *r = (regexp *)&STRING_REF(cr, 0); -+ -+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { -+ return "Illegal start vector"; -+ } -+ -+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { -+ return "Illegal end vector"; -+ } -+ -+ for (i=0; i<NSUBEXP; i++) { -+ scheme_value se = VECTOR_REF(start_vec,i); -+ scheme_value ee = VECTOR_REF(end_vec,i); -+ r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL; -+ r->endp[i] = FIXNUMP(ee)? (match + EXTRACT_FIXNUM(ee)):NULL; -+ } -+ -+ regexp_error = NULL; -+ *len = regsublen (r, src); -+ return regexp_error; -+ } -+ -+ -+ #if 0 - /* Return NULL normally, error string on error. - ** Stash match info in start_vec and end_vec. - ** Returns boolean match/no-match in hit. -*************** -*** 56,61 **** ---- 182,188 ---- - Free(prog); - return regexp_error; - } -+ #endif - - - char *filter_stringvec(const char *re, char const **stringvec, int *nummatch) -diff -rc scsh-0.4.2/scsh/re1.h scsh-0.4.2-regexp/scsh/re1.h -*** scsh-0.4.2/scsh/re1.h Sun Oct 22 08:34:34 1995 ---- scsh-0.4.2-regexp/scsh/re1.h Sat Apr 6 17:54:09 1996 -*************** -*** 1,6 **** ---- 1,21 ---- -+ #if 0 - char *reg_match(const char *re, const char *string, int start, - scheme_value start_vec, scheme_value end_vec, - int *hit); -+ #endif - - char *filter_stringvec(const char *re, char const **stringvec, - int *nummatch); -+ -+ char *reg_comp_len(const char *re, int *len); -+ char *reg_comp_comp(const char *re, scheme_value cr); -+ -+ char *reg_exec(scheme_value cr, const char *string, int start, -+ scheme_value start_vec, scheme_value end_vec, int *hit); -+ -+ char *reg_subst(scheme_value cr, const char *match, -+ const char *src, int start, -+ scheme_value start_vec, scheme_value end_vec, -+ 
scheme_value outbuf, int *len); -+ -+ - -Only in scsh-0.4.2-regexp/scsh: re2.scm -diff -rc scsh-0.4.2/scsh/scsh-interfaces.scm scsh-0.4.2-regexp/scsh/scsh-interfaces.scm -*** scsh-0.4.2/scsh/scsh-interfaces.scm Tue Oct 31 19:19:30 1995 ---- scsh-0.4.2-regexp/scsh/scsh-interfaces.scm Sat Apr 6 18:48:12 1996 -*************** -*** 413,418 **** ---- 413,419 ---- - make-regexp - regexp? - regexp-exec -+ regexp-subst - regexp-quote)) - - - -regexp library changes: - -*** Makefile 1996/04/06 19:24:49 1.1 ---- Makefile 1996/04/06 20:46:26 -*************** -*** 5,11 **** - # Things you might want to put in TEST: - # -DDEBUG debugging hooks - # -I. regexp.h from current directory, not /usr/include -! TEST=-I. - - # Things you might want to put in PROF: - # -pg profiler ---- 5,11 ---- - # Things you might want to put in TEST: - # -DDEBUG debugging hooks - # -I. regexp.h from current directory, not /usr/include -! TEST=-I. -DDEBUG - - # Things you might want to put in PROF: - # -pg profiler -*** regexp.c 1996/04/06 19:24:49 1.1 ---- regexp.c 1996/04/06 22:34:55 -*************** -*** 105,110 **** ---- 105,111 ---- - * Utility definitions. - */ - #define FAIL(m) { regerror(m); return(NULL); } -+ #define FAILN(m) { regerror(m); return(-1); } - #define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?') - #define META "^$.[()|?+*\\" - -*************** -*** 162,173 **** - const char *exp; - { - register regexp *r; -! register char *scan; - int flags; - struct comp co; - - if (exp == NULL) -! FAIL("NULL argument to regcomp"); - - /* First pass: determine size, legality. */ - co.regparse = (char *)exp; ---- 163,193 ---- - const char *exp; - { - register regexp *r; -! size_t len; -! -! len = regcomp_len(exp); -! if (len <= 0) -! return NULL; -! -! /* Allocate space. */ -! r = (regexp *)malloc(len); -! -! if (r == NULL) -! FAIL("out of space"); -! return regcomp_comp(exp, r, len); -! } -! -! -! size_t -! regcomp_len(exp) -! const char *exp; -! 
{ - int flags; -+ register regexp *r; - struct comp co; - - if (exp == NULL) -! FAILN("NULL argument to regcomp"); - - /* First pass: determine size, legality. */ - co.regparse = (char *)exp; -*************** -*** 178,198 **** - co.regcode = co.regdummy; - regc(&co, MAGIC); - if (reg(&co, 0, &flags) == NULL) -! return(NULL); - - /* Small enough for pointer-storage convention? */ - if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */ -! FAIL("regexp too big"); - -! /* Allocate space. */ -! r = (regexp *)malloc(sizeof(regexp) + (size_t)co.regsize); -! if (r == NULL) -! FAIL("out of space"); - - /* Second pass: emit code. */ - co.regparse = (char *)exp; - co.regnpar = 1; - co.regcode = r->program; - regc(&co, MAGIC); - if (reg(&co, 0, &flags) == NULL) - return(NULL); ---- 198,228 ---- - co.regcode = co.regdummy; - regc(&co, MAGIC); - if (reg(&co, 0, &flags) == NULL) -! return -1; - - /* Small enough for pointer-storage convention? */ - if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */ -! FAILN("regexp too big"); - -! return (sizeof(regexp) + (size_t)co.regsize); -! } -! -! -! regexp * -! regcomp_comp(exp, r, len) -! const char *exp; -! register regexp *r; -! size_t len; -! { -! register char *scan; -! int flags; -! struct comp co; - - /* Second pass: emit code. */ - co.regparse = (char *)exp; - co.regnpar = 1; - co.regcode = r->program; -+ co.regsize = len - sizeof(regexp); - regc(&co, MAGIC); - if (reg(&co, 0, &flags) == NULL) - return(NULL); -*************** -*** 200,206 **** - /* Dig out information for optimizations. */ - r->regstart = '\0'; /* Worst-case defaults. */ - r->reganch = 0; -! r->regmust = NULL; - r->regmlen = 0; - scan = r->program+1; /* First BRANCH. */ - if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ ---- 230,236 ---- - /* Dig out information for optimizations. */ - r->regstart = '\0'; /* Worst-case defaults. */ - r->reganch = 0; -! r->regmust = 0; - r->regmlen = 0; - scan = r->program+1; /* First BRANCH. 
*/ - if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ -*************** -*** 229,235 **** - longest = OPERAND(scan); - len = strlen(OPERAND(scan)); - } -! r->regmust = longest; - r->regmlen = (int)len; - } - } ---- 259,265 ---- - longest = OPERAND(scan); - len = strlen(OPERAND(scan)); - } -! r->regmust = longest - r->program; - r->regmlen = (int)len; - } - } -*************** -*** 648,655 **** - struct exec { - char *reginput; /* String-input pointer. */ - char *regbol; /* Beginning of input, for ^ check. */ -! char **regstartp; /* Pointer to startp array. */ -! char **regendp; /* Ditto for endp. */ - }; - - /* ---- 678,685 ---- - struct exec { - char *reginput; /* String-input pointer. */ - char *regbol; /* Beginning of input, for ^ check. */ -! const char **regstartp; /* Pointer to startp array. */ -! const char **regendp; /* Ditto for endp. */ - }; - - /* -*************** -*** 690,696 **** - } - - /* If there is a "must appear" string, look for it. */ -! if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL) - return(0); - - /* Mark beginning of line for ^ . */ ---- 720,727 ---- - } - - /* If there is a "must appear" string, look for it. */ -! if ((prog->regmlen > 0) && -! strstr(string, &prog->program[prog->regmust]) == NULL) - return(0); - - /* Mark beginning of line for ^ . */ -*************** -*** 729,736 **** - char *string; - { - register int i; -! register char **stp; -! register char **enp; - - ep->reginput = string; - ---- 760,767 ---- - char *string; - { - register int i; -! register const char **stp; -! register const char **enp; - - ep->reginput = string; - -*************** -*** 1004,1011 **** - printf("start `%c' ", r->regstart); - if (r->reganch) - printf("anchored "); -! if (r->regmust != NULL) -! printf("must have \"%s\"", r->regmust); - printf("\n"); - } - ---- 1035,1042 ---- - printf("start `%c' ", r->regstart); - if (r->reganch) - printf("anchored "); -! if (r->regmlen > 0) -! 
printf("must have \"%s\"", &r->program[r->regmust]); - printf("\n"); - } - -*** regexp.h 1996/04/06 19:24:49 1.1 ---- regexp.h 1996/04/07 01:52:19 -*************** -*** 6,16 **** - */ - #define NSUBEXP 10 - typedef struct regexp { -! char *startp[NSUBEXP]; -! char *endp[NSUBEXP]; - char regstart; /* Internal use only. */ - char reganch; /* Internal use only. */ -! char *regmust; /* Internal use only. */ - int regmlen; /* Internal use only. */ - char program[1]; /* Unwarranted chumminess with compiler. */ - } regexp; ---- 6,16 ---- - */ - #define NSUBEXP 10 - typedef struct regexp { -! const char *startp[NSUBEXP]; -! const char *endp[NSUBEXP]; - char regstart; /* Internal use only. */ - char reganch; /* Internal use only. */ -! int regmust; /* Internal use only. */ - int regmlen; /* Internal use only. */ - char program[1]; /* Unwarranted chumminess with compiler. */ - } regexp; -*************** -*** 18,21 **** ---- 18,27 ---- - extern regexp *regcomp(const char *re); - extern int regexec(regexp *rp, const char *s); - extern void regsub(const regexp *rp, const char *src, char *dst); -+ extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len); -+ extern size_t regsublen(const regexp *rp, const char *src); -+ - extern void regerror(char *message); -+ extern size_t regcomp_len(const char *exp); -+ extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len); -+ -*** regsub.c 1996/04/06 19:24:49 1.1 ---- regsub.c 1996/04/07 02:10:29 -*************** -*** 11,25 **** - /* - - regsub - perform substitutions after a regexp match - */ - void -! regsub(rp, source, dest) - const regexp *rp; - const char *source; - char *dest; - { - register regexp * const prog = (regexp *)rp; -! 
register char *src = (char *)source; - register char *dst = dest; - register char c; - register int no; - register size_t len; ---- 11,42 ---- - /* - - regsub - perform substitutions after a regexp match - */ -+ -+ void regsub(rp, source, dest) -+ const regexp *rp; -+ const char *source; -+ char *dest; -+ { -+ regnsub(rp, source, dest, BUFSIZ); -+ } -+ -+ -+ -+ /* -+ - regnsub - perform bounds-checked substitutions after a regexp match -+ */ - void -! regnsub(rp, source, dest, destlen) - const regexp *rp; - const char *source; - char *dest; -+ size_t destlen; - { - register regexp * const prog = (regexp *)rp; -! register const char *src = (char *)source; - register char *dst = dest; -+ char *dstend = dest + destlen; -+ char *odst; - register char c; - register int no; - register size_t len; -*************** -*** 45,55 **** - if (c == '\\' && (*src == '\\' || *src == '&')) - c = *src++; - *dst++ = c; - } else if (prog->startp[no] != NULL && prog->endp[no] != NULL && -! prog->endp[no] > prog->startp[no]) { - len = prog->endp[no] - prog->startp[no]; -! (void) strncpy(dst, prog->startp[no], len); - dst += len; - if (*(dst-1) == '\0') { /* strncpy hit NUL. */ - regerror("damaged match string"); - return; ---- 62,83 ---- - if (c == '\\' && (*src == '\\' || *src == '&')) - c = *src++; - *dst++ = c; -+ if (dst >= dstend) -+ { -+ regerror("output buffer too small"); -+ return; -+ } - } else if (prog->startp[no] != NULL && prog->endp[no] != NULL && -! prog->endp[no] > prog->startp[no]) { - len = prog->endp[no] - prog->startp[no]; -! odst = dst; - dst += len; -+ if (dst >= dstend) -+ { -+ regerror("output buffer too small"); -+ return; -+ } -+ (void) strncpy(odst, prog->startp[no], len); - if (*(dst-1) == '\0') { /* strncpy hit NUL. 
*/ - regerror("damaged match string"); - return; -*************** -*** 58,60 **** ---- 86,131 ---- - } - *dst++ = '\0'; - } -+ -+ size_t regsublen(rp, source) -+ const regexp *rp; -+ const char *source; -+ { -+ register regexp * const prog = (regexp *)rp; -+ register char *src = (char *)source; -+ register char c; -+ register int no; -+ register int len = 0; -+ -+ if (prog == NULL || source == NULL) { -+ regerror("NULL parameter to regsublen"); -+ return -1; -+ } -+ -+ if ((unsigned char)*(prog->program) != MAGIC) { -+ regerror("damaged regexp"); -+ return -1; -+ } -+ while ((c = *src++) != '\0') { -+ if (c == '&') -+ no = 0; -+ else if (c == '\\' && isdigit(*src)) -+ no = *src++ - '0'; -+ else -+ no = -1; -+ if (no < 0) { /* Ordinary character. */ -+ if (c == '\\' && (*src == '\\' || *src == '&')) -+ src++; -+ len++; -+ } else { -+ const char *s = prog->startp[no]; -+ const char *e = prog->endp[no]; -+ if ((s != NULL) && (e != NULL) && (e > s)) { -+ len += e-s; -+ } -+ } -+ } -+ return len+1; -+ } -+ -+ - -Original regexp code from henry: -[unpacked & deleted -Olin] diff --git a/scsh/regexp/regcomp.ih b/scsh/regexp/regcomp.ih deleted file mode 100644 index 0776e71..0000000 --- a/scsh/regexp/regcomp.ih +++ /dev/null @@ -1,51 +0,0 @@ -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === regcomp.c === */ -static void p_ere(register struct parse *p, int stop); -static void p_ere_exp(register struct parse *p); -static void p_str(register struct parse *p); -static void p_bre(register struct parse *p, register int end1, register int end2); -static int p_simp_re(register struct parse *p, int starordinary); -static int p_count(register struct parse *p); -static void p_bracket(register struct parse *p); -static void p_b_term(register struct parse *p, register cset *cs); -static void p_b_cclass(register struct parse *p, register cset *cs); -static void p_b_eclass(register struct parse *p, register cset *cs); -static 
char p_b_symbol(register struct parse *p); -static char p_b_coll_elem(register struct parse *p, int endc); -static char othercase(int ch); -static void bothcases(register struct parse *p, int ch); -static void ordinary(register struct parse *p, register int ch); -static void nonnewline(register struct parse *p); -static void repeat(register struct parse *p, sopno start, int from, int to); -static int seterr(register struct parse *p, int e); -static cset *allocset(register struct parse *p); -static void freeset(register struct parse *p, register cset *cs); -static int freezeset(register struct parse *p, register cset *cs); -static int firstch(register struct parse *p, register cset *cs); -static int nch(register struct parse *p, register cset *cs); -static void mcadd(register struct parse *p, register cset *cs, register char *cp); -static void mcsub(register cset *cs, register char *cp); -static int mcin(register cset *cs, register char *cp); -static char *mcfind(register cset *cs, register char *cp); -static void mcinvert(register struct parse *p, register cset *cs); -static void mccase(register struct parse *p, register cset *cs); -static int isinsets(register struct re_guts *g, int c); -static int samesets(register struct re_guts *g, int c1, int c2); -static void categorize(struct parse *p, register struct re_guts *g); -static sopno dupl(register struct parse *p, sopno start, sopno finish); -static void doemit(register struct parse *p, sop op, size_t opnd); -static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); -static void dofwd(register struct parse *p, sopno pos, sop value); -static void enlarge(register struct parse *p, sopno size); -static void stripsnug(register struct parse *p, register struct re_guts *g); -static void findmust(register struct parse *p, register struct re_guts *g); -static sopno pluscount(register struct parse *p, register struct re_guts *g); - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by 
./mkh ========= */ diff --git a/scsh/regexp/regerror.c b/scsh/regexp/regerror.c index a280cee..e53dafc 100644 --- a/scsh/regexp/regerror.c +++ b/scsh/regexp/regerror.c @@ -1,18 +1,126 @@ -/* - * regerror - */ +#include #include +#include +#include +#include #include +#include -void -regerror(s) -char *s; +#include "utils.h" +#include "regerror.ih" + +/* + = #define REG_OKAY 0 + = #define REG_NOMATCH 1 + = #define REG_BADPAT 2 + = #define REG_ECOLLATE 3 + = #define REG_ECTYPE 4 + = #define REG_EESCAPE 5 + = #define REG_ESUBREG 6 + = #define REG_EBRACK 7 + = #define REG_EPAREN 8 + = #define REG_EBRACE 9 + = #define REG_BADBR 10 + = #define REG_ERANGE 11 + = #define REG_ESPACE 12 + = #define REG_BADRPT 13 + = #define REG_EMPTY 14 + = #define REG_ASSERT 15 + = #define REG_INVARG 16 + = #define REG_ATOI 255 // convert name to number (!) + = #define REG_ITOA 0400 // convert number to name (!) + */ +static struct rerr { + int code; + char *name; + char *explain; +} rerrs[] = { + REG_OKAY, "REG_OKAY", "no errors detected", + REG_NOMATCH, "REG_NOMATCH", "regexec() failed to match", + REG_BADPAT, "REG_BADPAT", "invalid regular expression", + REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element", + REG_ECTYPE, "REG_ECTYPE", "invalid character class", + REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)", + REG_ESUBREG, "REG_ESUBREG", "invalid backreference number", + REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced", + REG_EPAREN, "REG_EPAREN", "parentheses not balanced", + REG_EBRACE, "REG_EBRACE", "braces not balanced", + REG_BADBR, "REG_BADBR", "invalid repetition count(s)", + REG_ERANGE, "REG_ERANGE", "invalid character range", + REG_ESPACE, "REG_ESPACE", "out of memory", + REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid", + REG_EMPTY, "REG_EMPTY", "empty (sub)expression", + REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug", + REG_INVARG, "REG_INVARG", "invalid argument to regex routine", + -1, "", "*** unknown regexp error code 
***", +}; + +/* + - regerror - the interface to error numbers + = extern size_t regerror(int, const regex_t *, char *, size_t); + */ +/* ARGSUSED */ +size_t +regerror(errcode, preg, errbuf, errbuf_size) +int errcode; +const regex_t *preg; +char *errbuf; +size_t errbuf_size; { -#ifdef ERRAVAIL - error("regexp: %s", s); -#else - fprintf(stderr, "regexp(3): %s\n", s); - exit(EXIT_FAILURE); -#endif - /* NOTREACHED */ + register struct rerr *r; + register size_t len; + register int target = errcode &~ REG_ITOA; + register char *s; + char convbuf[50]; + + if (errcode == REG_ATOI) + s = regatoi(preg, convbuf); + else { + for (r = rerrs; r->code >= 0; r++) + if (r->code == target) + break; + + if (errcode&REG_ITOA) { + if (r->code >= 0) + (void) strcpy(convbuf, r->name); + else + sprintf(convbuf, "REG_0x%x", target); + assert(strlen(convbuf) < sizeof(convbuf)); + s = convbuf; + } else + s = r->explain; + } + + len = strlen(s) + 1; + if (errbuf_size > 0) { + if (errbuf_size > len) + (void) strcpy(errbuf, s); + else { + (void) strncpy(errbuf, s, errbuf_size-1); + errbuf[errbuf_size-1] = '\0'; + } + } + + return(len); +} + +/* + - regatoi - internal routine to implement REG_ATOI + == static char *regatoi(const regex_t *preg, char *localbuf); + */ +static char * +regatoi(preg, localbuf) +const regex_t *preg; +char *localbuf; +{ + register struct rerr *r; + + for (r = rerrs; r->code >= 0; r++) + if (strcmp(r->name, preg->re_endp) == 0) + break; + if (r->code < 0) + return("0"); + + sprintf(localbuf, "%d", r->code); + return(localbuf); } diff --git a/scsh/regexp/regerror.ih b/scsh/regexp/regerror.ih deleted file mode 100644 index 2cb668c..0000000 --- a/scsh/regexp/regerror.ih +++ /dev/null @@ -1,12 +0,0 @@ -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === regerror.c === */ -static char *regatoi(const regex_t *preg, char *localbuf); - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by ./mkh ========= */
diff --git a/scsh/regexp/regex.h b/scsh/regexp/regex.h deleted file mode 100644 index d094d07..0000000 --- a/scsh/regexp/regex.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef _REGEX_H_ -#define _REGEX_H_ /* never again */ -/* ========= begin header generated by ./mkh ========= */ -#ifdef __cplusplus -extern "C" { -#endif - -/* === regex2.h === */ -typedef off_t regoff_t; -typedef struct { - int re_magic; - size_t re_nsub; /* number of parenthesized subexpressions */ - const char *re_endp; /* end pointer for REG_PEND */ - struct re_guts *re_g; /* none of your business :-) */ -} regex_t; -typedef struct { - regoff_t rm_so; /* start of match */ - regoff_t rm_eo; /* end of match */ -} regmatch_t; - - -/* === regcomp.c === */ -extern int regcomp(regex_t *, const char *, int); -#define REG_BASIC 0000 -#define REG_EXTENDED 0001 -#define REG_ICASE 0002 -#define REG_NOSUB 0004 -#define REG_NEWLINE 0010 -#define REG_NOSPEC 0020 -#define REG_PEND 0040 -#define REG_DUMP 0200 - - -/* === regerror.c === */ -#define REG_OKAY 0 -#define REG_NOMATCH 1 -#define REG_BADPAT 2 -#define REG_ECOLLATE 3 -#define REG_ECTYPE 4 -#define REG_EESCAPE 5 -#define REG_ESUBREG 6 -#define REG_EBRACK 7 -#define REG_EPAREN 8 -#define REG_EBRACE 9 -#define REG_BADBR 10 -#define REG_ERANGE 11 -#define REG_ESPACE 12 -#define REG_BADRPT 13 -#define REG_EMPTY 14 -#define REG_ASSERT 15 -#define REG_INVARG 16 -#define REG_ATOI 255 /* convert name to number (!) */ -#define REG_ITOA 0400 /* convert number to name (!) 
*/ -extern size_t regerror(int, const regex_t *, char *, size_t); - - -/* === regexec.c === */ -extern int regexec(const regex_t *, const char *, size_t, regmatch_t [], int); -#define REG_NOTBOL 00001 -#define REG_NOTEOL 00002 -#define REG_STARTEND 00004 -#define REG_TRACE 00400 /* tracing of execution */ -#define REG_LARGE 01000 /* force large representation */ -#define REG_BACKR 02000 /* force use of backref code */ - - -/* === regfree.c === */ -extern void regfree(regex_t *); - -#ifdef __cplusplus -} -#endif -/* ========= end header generated by ./mkh ========= */ -#endif diff --git a/scsh/regexp/regex2.h b/scsh/regexp/regex2.h index e1c75a9..58fd8d8 100644 --- a/scsh/regexp/regex2.h +++ b/scsh/regexp/regex2.h @@ -36,36 +36,36 @@ * In state representations, an operator's bit is on to signify a state * immediately *preceding* "execution" of that operator. */ -typedef unsigned long sop; /* strip operator */ +typedef long sop; /* strip operator */ typedef long sopno; -#define OPRMASK 0xf8000000 -#define OPDMASK 0x07ffffff -#define OPSHIFT ((unsigned)27) +#define OPRMASK 0x7c000000 +#define OPDMASK 0x03ffffff +#define OPSHIFT (26) #define OP(n) ((n)&OPRMASK) #define OPND(n) ((n)&OPDMASK) #define SOP(op, opnd) ((op)|(opnd)) /* operators meaning operand */ /* (back, fwd are offsets) */ -#define OEND (1ul<> (n)) -#define ISSETBACK(v, n) ((v) & ((unsigned long)here >> (n))) +#define FWD(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) << (n)) +#define BACK(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) >> (n)) +#define ISSETBACK(v, n) ((v) & ((unsigned)here >> (n))) /* function names */ #define SNAMES /* engine.c looks after details */ diff --git a/scsh/regexp/regexp.3 b/scsh/regexp/regexp.3 deleted file mode 100644 index 6d2555b..0000000 --- a/scsh/regexp/regexp.3 +++ /dev/null @@ -1,186 +0,0 @@ -.TH REGEXP 3 "2 Sept 1995" -.SH NAME -regcomp, regexec, regsub, regerror \- regular expression handler -.SH SYNOPSIS -.ft B -.nf -#include - -regexp *regcomp(exp) -const char 
*exp; - -int regexec(prog, string) -regexp *prog; -const char *string; - -void regsub(prog, source, dest) -const regexp *prog; -const char *source; -char *dest; - -void regerror(msg) -char *msg; -.SH DESCRIPTION -These functions implement -.IR egrep (1)-style -regular expressions and supporting facilities. -.PP -.I Regcomp -compiles a regular expression into a structure of type -.IR regexp , -and returns a pointer to it. -The space has been allocated using -.IR malloc (3) -and may be released by -.IR free . -.PP -.I Regexec -matches a NUL-terminated \fIstring\fR against the compiled regular expression -in \fIprog\fR. -It returns 1 for success and 0 for failure, and adjusts the contents of -\fIprog\fR's \fIstartp\fR and \fIendp\fR (see below) accordingly. -.PP -The members of a -.I regexp -structure include at least the following (not necessarily in order): -.PP -.RS -char *startp[NSUBEXP]; -.br -char *endp[NSUBEXP]; -.RE -.PP -where -.I NSUBEXP -is defined (as 10) in the header file. -Once a successful \fIregexec\fR has been done using the \fIregexp\fR, -each \fIstartp\fR-\fIendp\fR pair describes one substring -within the \fIstring\fR, -with the \fIstartp\fR pointing to the first character of the substring and -the \fIendp\fR pointing to the first character following the substring. -The 0th substring is the substring of \fIstring\fR that matched the whole -regular expression. -The others are those substrings that matched parenthesized expressions -within the regular expression, with parenthesized expressions numbered -in left-to-right order of their opening parentheses. -.PP -.I Regsub -copies \fIsource\fR to \fIdest\fR, making substitutions according to the -most recent \fIregexec\fR performed using \fIprog\fR. -Each instance of `&' in \fIsource\fR is replaced by the substring -indicated by \fIstartp\fR[\fI0\fR] and -\fIendp\fR[\fI0\fR]. 
-Each instance of `\e\fIn\fR', where \fIn\fR is a digit, is replaced by -the substring indicated by -\fIstartp\fR[\fIn\fR] and -\fIendp\fR[\fIn\fR]. -To get a literal `&' or `\e\fIn\fR' into \fIdest\fR, prefix it with `\e'; -to get a literal `\e' preceding `&' or `\e\fIn\fR', prefix it with -another `\e'. -.PP -.I Regerror -is called whenever an error is detected in \fIregcomp\fR, \fIregexec\fR, -or \fIregsub\fR. -The default \fIregerror\fR writes the string \fImsg\fR, -with a suitable indicator of origin, -on the standard -error output -and invokes \fIexit\fR(2). -.I Regerror -can be replaced by the user if other actions are desirable. -.SH "REGULAR EXPRESSION SYNTAX" -A regular expression is zero or more \fIbranches\fR, separated by `|'. -It matches anything that matches one of the branches. -.PP -A branch is zero or more \fIpieces\fR, concatenated. -It matches a match for the first, followed by a match for the second, etc. -.PP -A piece is an \fIatom\fR possibly followed by `*', `+', or `?'. -An atom followed by `*' matches a sequence of 0 or more matches of the atom. -An atom followed by `+' matches a sequence of 1 or more matches of the atom. -An atom followed by `?' matches a match of the atom, or the null string. -.PP -An atom is a regular expression in parentheses (matching a match for the -regular expression), a \fIrange\fR (see below), `.' -(matching any single character), `^' (matching the null string at the -beginning of the input string), `$' (matching the null string at the -end of the input string), a `\e' followed by a single character (matching -that character), or a single character with no other significance -(matching that character). -.PP -A \fIrange\fR is a sequence of characters enclosed in `[]'. -It normally matches any single character from the sequence. -If the sequence begins with `^', -it matches any single character \fInot\fR from the rest of the sequence. 
-If two characters in the sequence are separated by `\-', this is shorthand -for the full list of ASCII characters between them -(e.g. `[0-9]' matches any decimal digit). -To include a literal `]' in the sequence, make it the first character -(following a possible `^'). -To include a literal `\-', make it the first or last character. -.SH AMBIGUITY -If a regular expression could match two different parts of the input string, -it will match the one which begins earliest. -If both begin in the same place but match different lengths, or match -the same length in different ways, life gets messier, as follows. -.PP -In general, the possibilities in a list of branches are considered in -left-to-right order, the possibilities for `*', `+', and `?' are -considered longest-first, nested constructs are considered from the -outermost in, and concatenated constructs are considered leftmost-first. -The match that will be chosen is the one that uses the earliest -possibility in the first choice that has to be made. -If there is more than one choice, the next will be made in the same manner -(earliest possibility) subject to the decision on the first choice. -And so forth. -.PP -For example, `(ab|a)b*c' could match `abc' in one of two ways. -The first choice is between `ab' and `a'; since `ab' is earlier, and does -lead to a successful overall match, it is chosen. -Since the `b' is already spoken for, -the `b*' must match its last possibility\(emthe empty string\(emsince -it must respect the earlier choice. -.PP -In the particular case where the regular expression does not use `|' -and does not apply `*', `+', or `?' to parenthesized subexpressions, -the net effect is that the longest possible -match will be chosen. -So `ab*', presented with `xabbbby', will match `abbbb'. -Note that if `ab*' is tried against `xabyabbbz', it -will match `ab' just after `x', due to the begins-earliest rule. 
-(In effect, the decision on where to start the match is the first choice -to be made, hence subsequent choices must respect it even if this leads them -to less-preferred alternatives.) -.SH SEE ALSO -egrep(1), expr(1) -.SH DIAGNOSTICS -\fIRegcomp\fR returns NULL for a failure -(\fIregerror\fR permitting), -where failures are syntax errors, exceeding implementation limits, -or applying `+' or `*' to a possibly-null operand. -.SH HISTORY -This is a revised version. -Both code and manual page were -originally written by Henry Spencer at University of Toronto. -They are intended to be compatible with the Bell V8 \fIregexp\fR(3), -but are not derived from Bell code. -.SH BUGS -Empty branches and empty regular expressions are not portable -to other, otherwise-similar, implementations. -.PP -The ban on -applying `*' or `+' to a possibly-null operand is an artifact of the -simplistic implementation. -.PP -The match-choice rules are complex. -A simple ``longest match'' rule would be preferable, -but is harder to implement. -.PP -Although there is a general similarity to POSIX.2 ``extended'' regular -expressions, neither the regular-expression syntax nor the programming -interface is an exact match. -.PP -Due to emphasis on -compactness and simplicity, -it's not strikingly fast. -It does give some attention to handling simple cases quickly. diff --git a/scsh/regexp/regexp.c b/scsh/regexp/regexp.c deleted file mode 100644 index 65d7e88..0000000 --- a/scsh/regexp/regexp.c +++ /dev/null @@ -1,1124 +0,0 @@ -/* - * regcomp and regexec -- regsub and regerror are elsewhere - */ -#include -#include -#include -#include -#include "regmagic.h" - -/* - * The "internal use only" fields in regexp.h are present to pass info from - * compile to execute that permits the execute phase to run lots faster on - * simple cases. They are: - * - * regstart char that must begin a match; '\0' if none obvious - * reganch is the match anchored (at beginning-of-line only)? 
- * regmust string (pointer into program) that match must include, or NULL - * regmlen length of regmust string - * - * Regstart and reganch permit very fast decisions on suitable starting points - * for a match, cutting down the work a lot. Regmust permits fast rejection - * of lines that cannot possibly match. The regmust tests are costly enough - * that regcomp() supplies a regmust only if the r.e. contains something - * potentially expensive (at present, the only such thing detected is * or + - * at the start of the r.e., which can involve a lot of backup). Regmlen is - * supplied because the test in regexec() needs it and regcomp() is computing - * it anyway. - */ - -/* - * Structure for regexp "program". This is essentially a linear encoding - * of a nondeterministic finite-state machine (aka syntax charts or - * "railroad normal form" in parsing technology). Each node is an opcode - * plus a "next" pointer, possibly plus an operand. "Next" pointers of - * all nodes except BRANCH implement concatenation; a "next" pointer with - * a BRANCH on both ends of it is connecting two alternatives. (Here we - * have one of the subtle syntax dependencies: an individual BRANCH (as - * opposed to a collection of them) is never concatenated with anything - * because of operator precedence.) The operand of some types of node is - * a literal string; for others, it is a node leading into a sub-FSM. In - * particular, the operand of a BRANCH node is the first node of the branch. - * (NB this is *not* a tree structure: the tail of the branch connects - * to the thing following the set of BRANCHes.) The opcodes are: - */ - -/* definition number opnd? meaning */ -#define END 0 /* no End of program. */ -#define BOL 1 /* no Match beginning of line. */ -#define EOL 2 /* no Match end of line. */ -#define ANY 3 /* no Match any character. */ -#define ANYOF 4 /* str Match any of these. */ -#define ANYBUT 5 /* str Match any but one of these. 
*/ -#define BRANCH 6 /* node Match this, or the next..\&. */ -#define BACK 7 /* no "next" ptr points backward. */ -#define EXACTLY 8 /* str Match this string. */ -#define NOTHING 9 /* no Match empty string. */ -#define STAR 10 /* node Match this 0 or more times. */ -#define PLUS 11 /* node Match this 1 or more times. */ -#define OPEN 20 /* no Sub-RE starts here. */ - /* OPEN+1 is number 1, etc. */ -#define CLOSE 30 /* no Analogous to OPEN. */ - -/* - * Opcode notes: - * - * BRANCH The set of branches constituting a single choice are hooked - * together with their "next" pointers, since precedence prevents - * anything being concatenated to any individual branch. The - * "next" pointer of the last BRANCH in a choice points to the - * thing following the whole choice. This is also where the - * final "next" pointer of each individual branch points; each - * branch starts with the operand node of a BRANCH node. - * - * BACK Normal "next" pointers all implicitly point forward; BACK - * exists to make loop structures possible. - * - * STAR,PLUS '?', and complex '*' and '+', are implemented as circular - * BRANCH structures using BACK. Simple cases (one character - * per match) are implemented with STAR and PLUS for speed - * and to minimize recursive plunges. - * - * OPEN,CLOSE ...are numbered at compile time. - */ - -/* - * A node is one char of opcode followed by two chars of "next" pointer. - * "Next" pointers are stored as two 8-bit pieces, high order first. The - * value is a positive offset from the opcode of the node containing it. - * An operand, if any, simply follows the node. (Note that much of the - * code generation knows about this implicit relationship.) - * - * Using two bytes for the "next" pointer is vast overkill for most things, - * but allows patterns to get big without disasters. 
- */ -#define OP(p) (*(p)) -#define NEXT(p) (((*((p)+1)&0177)<<8) + (*((p)+2)&0377)) -#define OPERAND(p) ((p) + 3) - -/* - * See regmagic.h for one further detail of program structure. - */ - - -/* - * Utility definitions. - */ -#define FAIL(m) { regerror(m); return(NULL); } -#define FAILN(m) { regerror(m); return(-1); } -#define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?') -#define META "^$.[()|?+*\\" - -/* - * Flags to be passed up and down. - */ -#define HASWIDTH 01 /* Known never to match null string. */ -#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ -#define SPSTART 04 /* Starts with * or +. */ -#define WORST 0 /* Worst case. */ - -/* - * Work-variable struct for regcomp(). - */ -struct comp { - char *regparse; /* Input-scan pointer. */ - int regnpar; /* () count. */ - char *regcode; /* Code-emit pointer; &regdummy = don't. */ - char regdummy[3]; /* NOTHING, 0 next ptr */ - long regsize; /* Code size. */ -}; -#define EMITTING(cp) ((cp)->regcode != (cp)->regdummy) - -/* - * Forward declarations for regcomp()'s friends. - */ -static char *reg(struct comp *cp, int paren, int *flagp); -static char *regbranch(struct comp *cp, int *flagp); -static char *regpiece(struct comp *cp, int *flagp); -static char *regatom(struct comp *cp, int *flagp); -static char *regnode(struct comp *cp, int op); -static char *regnext(char *node); -static void regc(struct comp *cp, int c); -static void reginsert(struct comp *cp, int op, char *opnd); -static void regtail(struct comp *cp, char *p, char *val); -static void regoptail(struct comp *cp, char *p, char *val); - -/* - - regcomp - compile a regular expression into internal code - * - * We can't allocate space until we know how big the compiled form will be, - * but we can't compile it (and thus know how big it is) until we've got a - * place to put the code. So we cheat: we compile it twice, once with code - * generation turned off and size counting turned on, and once "for real".
- * This also means that we don't allocate space until we are sure that the - * thing really will compile successfully, and we never have to move the - * code and thus invalidate pointers into it. (Note that it has to be in - * one piece because free() must be able to free it all.) - * - * Beware that the optimization-preparation code in here knows about some - * of the structure of the compiled regexp. - */ -regexp * -regcomp(exp) -const char *exp; -{ - register regexp *r; - size_t len; - - len = regcomp_len(exp); - if (len <= 0) - return NULL; - - /* Allocate space. */ - r = (regexp *)malloc(len); - - if (r == NULL) - FAIL("out of space"); - return regcomp_comp(exp, r, len); -} - - -size_t -regcomp_len(exp) -const char *exp; -{ - int flags; - register regexp *r; - struct comp co; - - if (exp == NULL) - FAILN("NULL argument to regcomp"); - - /* First pass: determine size, legality. */ - co.regparse = (char *)exp; - co.regnpar = 1; - co.regsize = 0L; - co.regdummy[0] = NOTHING; - co.regdummy[1] = co.regdummy[2] = 0; - co.regcode = co.regdummy; - regc(&co, MAGIC); - if (reg(&co, 0, &flags) == NULL) - return -1; - - /* Small enough for pointer-storage convention? */ - if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */ - FAILN("regexp too big"); - - return (sizeof(regexp) + (size_t)co.regsize); -} - - -regexp * -regcomp_comp(exp, r, len) -const char *exp; -register regexp *r; -size_t len; -{ - register char *scan; - int flags; - struct comp co; - - /* Second pass: emit code. */ - co.regparse = (char *)exp; - co.regnpar = 1; - co.regcode = r->program; - co.regsize = len - sizeof(regexp); - regc(&co, MAGIC); - if (reg(&co, 0, &flags) == NULL) - return(NULL); - - /* Dig out information for optimizations. */ - r->regstart = '\0'; /* Worst-case defaults. */ - r->reganch = 0; - r->regmust = 0; - r->regmlen = 0; - scan = r->program+1; /* First BRANCH. */ - if (OP(regnext(scan)) == END) { /* Only one top-level choice. 
*/ - scan = OPERAND(scan); - - /* Starting-point info. */ - if (OP(scan) == EXACTLY) - r->regstart = *OPERAND(scan); - else if (OP(scan) == BOL) - r->reganch = 1; - - /* - * If there's something expensive in the r.e., find the - * longest literal string that must appear and make it the - * regmust. Resolve ties in favor of later strings, since - * the regstart check works with the beginning of the r.e. - * and avoiding duplication strengthens checking. Not a - * strong reason, but sufficient in the absence of others. - */ - if (flags&SPSTART) { - register char *longest = NULL; - register size_t len = 0; - - for (; scan != NULL; scan = regnext(scan)) - if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { - longest = OPERAND(scan); - len = strlen(OPERAND(scan)); - } - r->regmust = longest - r->program; - r->regmlen = (int)len; - } - } - - return(r); -} - -/* - - reg - regular expression, i.e. main body or parenthesized thing - * - * Caller must absorb opening parenthesis. - * - * Combining parenthesis handling with the base level of regular expression - * is a trifle forced, but the need to tie the tails of the branches to what - * follows makes it hard to avoid. - */ -static char * -reg(cp, paren, flagp) -register struct comp *cp; -int paren; /* Parenthesized? */ -int *flagp; -{ - register char *ret; - register char *br; - register char *ender; - register int parno; - int flags; - - *flagp = HASWIDTH; /* Tentatively. */ - - if (paren) { - /* Make an OPEN node. */ - if (cp->regnpar >= NSUBEXP) - FAIL("too many ()"); - parno = cp->regnpar; - cp->regnpar++; - ret = regnode(cp, OPEN+parno); - } - - /* Pick up the branches, linking them together. */ - br = regbranch(cp, &flags); - if (br == NULL) - return(NULL); - if (paren) - regtail(cp, ret, br); /* OPEN -> first. */ - else - ret = br; - *flagp &= ~(~flags&HASWIDTH); /* Clear bit if bit 0. 
*/ - *flagp |= flags&SPSTART; - while (*cp->regparse == '|') { - cp->regparse++; - br = regbranch(cp, &flags); - if (br == NULL) - return(NULL); - regtail(cp, ret, br); /* BRANCH -> BRANCH. */ - *flagp &= ~(~flags&HASWIDTH); - *flagp |= flags&SPSTART; - } - - /* Make a closing node, and hook it on the end. */ - ender = regnode(cp, (paren) ? CLOSE+parno : END); - regtail(cp, ret, ender); - - /* Hook the tails of the branches to the closing node. */ - for (br = ret; br != NULL; br = regnext(br)) - regoptail(cp, br, ender); - - /* Check for proper termination. */ - if (paren && *cp->regparse++ != ')') { - FAIL("unterminated ()"); - } else if (!paren && *cp->regparse != '\0') { - if (*cp->regparse == ')') { - FAIL("unmatched ()"); - } else - FAIL("internal error: junk on end"); - /* NOTREACHED */ - } - - return(ret); -} - -/* - - regbranch - one alternative of an | operator - * - * Implements the concatenation operator. - */ -static char * -regbranch(cp, flagp) -register struct comp *cp; -int *flagp; -{ - register char *ret; - register char *chain; - register char *latest; - int flags; - register int c; - - *flagp = WORST; /* Tentatively. */ - - ret = regnode(cp, BRANCH); - chain = NULL; - while ((c = *cp->regparse) != '\0' && c != '|' && c != ')') { - latest = regpiece(cp, &flags); - if (latest == NULL) - return(NULL); - *flagp |= flags&HASWIDTH; - if (chain == NULL) /* First piece. */ - *flagp |= flags&SPSTART; - else - regtail(cp, chain, latest); - chain = latest; - } - if (chain == NULL) /* Loop ran zero times. */ - (void) regnode(cp, NOTHING); - - return(ret); -} - -/* - - regpiece - something followed by possible [*+?] - * - * Note that the branching code sequences used for ? and the general cases - * of * and + are somewhat optimized: they use the same NOTHING node as - * both the endmarker for their branch list and the body of the last branch. - * It might seem that this node could be dispensed with entirely, but the - * endmarker role is not redundant. 
- */ -static char * -regpiece(cp, flagp) -register struct comp *cp; -int *flagp; -{ - register char *ret; - register char op; - register char *next; - int flags; - - ret = regatom(cp, &flags); - if (ret == NULL) - return(NULL); - - op = *cp->regparse; - if (!ISREPN(op)) { - *flagp = flags; - return(ret); - } - - if (!(flags&HASWIDTH) && op != '?') - FAIL("*+ operand could be empty"); - switch (op) { - case '*': *flagp = WORST|SPSTART; break; - case '+': *flagp = WORST|SPSTART|HASWIDTH; break; - case '?': *flagp = WORST; break; - } - - if (op == '*' && (flags&SIMPLE)) - reginsert(cp, STAR, ret); - else if (op == '*') { - /* Emit x* as (x&|), where & means "self". */ - reginsert(cp, BRANCH, ret); /* Either x */ - regoptail(cp, ret, regnode(cp, BACK)); /* and loop */ - regoptail(cp, ret, ret); /* back */ - regtail(cp, ret, regnode(cp, BRANCH)); /* or */ - regtail(cp, ret, regnode(cp, NOTHING)); /* null. */ - } else if (op == '+' && (flags&SIMPLE)) - reginsert(cp, PLUS, ret); - else if (op == '+') { - /* Emit x+ as x(&|), where & means "self". */ - next = regnode(cp, BRANCH); /* Either */ - regtail(cp, ret, next); - regtail(cp, regnode(cp, BACK), ret); /* loop back */ - regtail(cp, next, regnode(cp, BRANCH)); /* or */ - regtail(cp, ret, regnode(cp, NOTHING)); /* null. */ - } else if (op == '?') { - /* Emit x? as (x|) */ - reginsert(cp, BRANCH, ret); /* Either x */ - regtail(cp, ret, regnode(cp, BRANCH)); /* or */ - next = regnode(cp, NOTHING); /* null. */ - regtail(cp, ret, next); - regoptail(cp, ret, next); - } - cp->regparse++; - if (ISREPN(*cp->regparse)) - FAIL("nested *?+"); - - return(ret); -} - -/* - - regatom - the lowest level - * - * Optimization: gobbles an entire sequence of ordinary characters so that - * it can turn them into a single node, which is smaller to store and - * faster to run. Backslashed characters are exceptions, each becoming a - * separate node; the code is simpler that way and it's not worth fixing. 
- */ -static char * -regatom(cp, flagp) -register struct comp *cp; -int *flagp; -{ - register char *ret; - int flags; - - *flagp = WORST; /* Tentatively. */ - - switch (*cp->regparse++) { - case '^': - ret = regnode(cp, BOL); - break; - case '$': - ret = regnode(cp, EOL); - break; - case '.': - ret = regnode(cp, ANY); - *flagp |= HASWIDTH|SIMPLE; - break; - case '[': { - register int range; - register int rangeend; - register int c; - - if (*cp->regparse == '^') { /* Complement of range. */ - ret = regnode(cp, ANYBUT); - cp->regparse++; - } else - ret = regnode(cp, ANYOF); - if ((c = *cp->regparse) == ']' || c == '-') { - regc(cp, c); - cp->regparse++; - } - while ((c = *cp->regparse++) != '\0' && c != ']') { - if (c != '-') - regc(cp, c); - else if ((c = *cp->regparse) == ']' || c == '\0') - regc(cp, '-'); - else { - range = (unsigned char)*(cp->regparse-2); - rangeend = (unsigned char)c; - if (range > rangeend) - FAIL("invalid [] range"); - for (range++; range <= rangeend; range++) - regc(cp, range); - cp->regparse++; - } - } - regc(cp, '\0'); - if (c != ']') - FAIL("unmatched []"); - *flagp |= HASWIDTH|SIMPLE; - break; - } - case '(': - ret = reg(cp, 1, &flags); - if (ret == NULL) - return(NULL); - *flagp |= flags&(HASWIDTH|SPSTART); - break; - case '\0': - case '|': - case ')': - /* supposed to be caught earlier */ - FAIL("internal error: \\0|) unexpected"); - break; - case '?': - case '+': - case '*': - FAIL("?+* follows nothing"); - break; - case '\\': - if (*cp->regparse == '\0') - FAIL("trailing \\"); - ret = regnode(cp, EXACTLY); - regc(cp, *cp->regparse++); - regc(cp, '\0'); - *flagp |= HASWIDTH|SIMPLE; - break; - default: { - register size_t len; - register char ender; - - cp->regparse--; - len = strcspn(cp->regparse, META); - if (len == 0) - FAIL("internal error: strcspn 0"); - ender = *(cp->regparse+len); - if (len > 1 && ISREPN(ender)) - len--; /* Back off clear of ?+* operand. 
*/ - *flagp |= HASWIDTH; - if (len == 1) - *flagp |= SIMPLE; - ret = regnode(cp, EXACTLY); - for (; len > 0; len--) - regc(cp, *cp->regparse++); - regc(cp, '\0'); - break; - } - } - - return(ret); -} - -/* - - regnode - emit a node - */ -static char * /* Location. */ -regnode(cp, op) -register struct comp *cp; -char op; -{ - register char *const ret = cp->regcode; - register char *ptr; - - if (!EMITTING(cp)) { - cp->regsize += 3; - return(ret); - } - - ptr = ret; - *ptr++ = op; - *ptr++ = '\0'; /* Null next pointer. */ - *ptr++ = '\0'; - cp->regcode = ptr; - - return(ret); -} - -/* - - regc - emit (if appropriate) a byte of code - */ -static void -regc(cp, b) -register struct comp *cp; -char b; -{ - if (EMITTING(cp)) - *cp->regcode++ = b; - else - cp->regsize++; -} - -/* - - reginsert - insert an operator in front of already-emitted operand - * - * Means relocating the operand. - */ -static void -reginsert(cp, op, opnd) -register struct comp *cp; -char op; -char *opnd; -{ - register char *place; - - if (!EMITTING(cp)) { - cp->regsize += 3; - return; - } - - (void) memmove(opnd+3, opnd, (size_t)(cp->regcode - opnd)); - cp->regcode += 3; - - place = opnd; /* Op node, where operand used to be. */ - *place++ = op; - *place++ = '\0'; - *place++ = '\0'; -} - -/* - - regtail - set the next-pointer at the end of a node chain - */ -static void -regtail(cp, p, val) -register struct comp *cp; -char *p; -char *val; -{ - register char *scan; - register char *temp; - register int offset; - - if (!EMITTING(cp)) - return; - - /* Find last node. */ - for (scan = p; (temp = regnext(scan)) != NULL; scan = temp) - continue; - - offset = (OP(scan) == BACK) ? scan - val : val - scan; - *(scan+1) = (offset>>8)&0177; - *(scan+2) = offset&0377; -} - -/* - - regoptail - regtail on operand of first argument; nop if operandless - */ -static void -regoptail(cp, p, val) -register struct comp *cp; -char *p; -char *val; -{ - /* "Operandless" and "op != BRANCH" are synonymous in practice. 
*/ - if (!EMITTING(cp) || OP(p) != BRANCH) - return; - regtail(cp, OPERAND(p), val); -} - -/* - * regexec and friends - */ - -/* - * Work-variable struct for regexec(). - */ -struct exec { - char *reginput; /* String-input pointer. */ - char *regbol; /* Beginning of input, for ^ check. */ - const char **regstartp; /* Pointer to startp array. */ - const char **regendp; /* Ditto for endp. */ -}; - -/* - * Forwards. - */ -static int regtry(struct exec *ep, regexp *rp, char *string); -static int regmatch(struct exec *ep, char *prog); -static size_t regrepeat(struct exec *ep, char *node); - -#ifdef DEBUG -int regnarrate = 0; -void regdump(); -static char *regprop(); -#endif - -/* - - regexec - match a regexp against a string - */ -int -regexec(prog, str) -register regexp *prog; -const char *str; -{ - register char *string = (char *)str; /* avert const poisoning */ - register char *s; - struct exec ex; - - /* Be paranoid. */ - if (prog == NULL || string == NULL) { - regerror("NULL argument to regexec"); - return(0); - } - - /* Check validity of program. */ - if ((unsigned char)*prog->program != MAGIC) { - regerror("corrupted regexp"); - return(0); - } - - /* If there is a "must appear" string, look for it. */ - if ((prog->regmlen > 0) && - strstr(string, &prog->program[prog->regmust]) == NULL) - return(0); - - /* Mark beginning of line for ^ . */ - ex.regbol = string; - ex.regstartp = prog->startp; - ex.regendp = prog->endp; - - /* Simplest case: anchored match need be tried only once. */ - if (prog->reganch) - return(regtry(&ex, prog, string)); - - /* Messy cases: unanchored match. */ - if (prog->regstart != '\0') { - /* We know what char it must start with. */ - for (s = string; s != NULL; s = strchr(s+1, prog->regstart)) - if (regtry(&ex, prog, s)) - return(1); - return(0); - } else { - /* We don't -- general case. 
*/ - for (s = string; !regtry(&ex, prog, s); s++) - if (*s == '\0') - return(0); - return(1); - } - /* NOTREACHED */ -} - -/* - - regtry - try match at specific point - */ -static int /* 0 failure, 1 success */ -regtry(ep, prog, string) -register struct exec *ep; -regexp *prog; -char *string; -{ - register int i; - register const char **stp; - register const char **enp; - - ep->reginput = string; - - stp = prog->startp; - enp = prog->endp; - for (i = NSUBEXP; i > 0; i--) { - *stp++ = NULL; - *enp++ = NULL; - } - if (regmatch(ep, prog->program + 1)) { - prog->startp[0] = string; - prog->endp[0] = ep->reginput; - return(1); - } else - return(0); -} - -/* - - regmatch - main matching routine - * - * Conceptually the strategy is simple: check to see whether the current - * node matches, call self recursively to see whether the rest matches, - * and then act accordingly. In practice we make some effort to avoid - * recursion, in particular by going through "ordinary" nodes (that don't - * need to know whether the rest of the match failed) by a loop instead of - * by recursion. - */ -static int /* 0 failure, 1 success */ -regmatch(ep, prog) -register struct exec *ep; -char *prog; -{ - register char *scan; /* Current node. */ - char *next; /* Next node. */ - -#ifdef DEBUG - if (prog != NULL && regnarrate) - fprintf(stderr, "%s(\n", regprop(prog)); -#endif - for (scan = prog; scan != NULL; scan = next) { -#ifdef DEBUG - if (regnarrate) - fprintf(stderr, "%s...\n", regprop(scan)); -#endif - next = regnext(scan); - - switch (OP(scan)) { - case BOL: - if (ep->reginput != ep->regbol) - return(0); - break; - case EOL: - if (*ep->reginput != '\0') - return(0); - break; - case ANY: - if (*ep->reginput == '\0') - return(0); - ep->reginput++; - break; - case EXACTLY: { - register size_t len; - register char *const opnd = OPERAND(scan); - - /* Inline the first character, for speed. 
*/ - if (*opnd != *ep->reginput) - return(0); - len = strlen(opnd); - if (len > 1 && strncmp(opnd, ep->reginput, len) != 0) - return(0); - ep->reginput += len; - break; - } - case ANYOF: - if (*ep->reginput == '\0' || - strchr(OPERAND(scan), *ep->reginput) == NULL) - return(0); - ep->reginput++; - break; - case ANYBUT: - if (*ep->reginput == '\0' || - strchr(OPERAND(scan), *ep->reginput) != NULL) - return(0); - ep->reginput++; - break; - case NOTHING: - break; - case BACK: - break; - case OPEN+1: case OPEN+2: case OPEN+3: - case OPEN+4: case OPEN+5: case OPEN+6: - case OPEN+7: case OPEN+8: case OPEN+9: { - register const int no = OP(scan) - OPEN; - register char *const input = ep->reginput; - - if (regmatch(ep, next)) { - /* - * Don't set startp if some later - * invocation of the same parentheses - * already has. - */ - if (ep->regstartp[no] == NULL) - ep->regstartp[no] = input; - return(1); - } else - return(0); - break; - } - case CLOSE+1: case CLOSE+2: case CLOSE+3: - case CLOSE+4: case CLOSE+5: case CLOSE+6: - case CLOSE+7: case CLOSE+8: case CLOSE+9: { - register const int no = OP(scan) - CLOSE; - register char *const input = ep->reginput; - - if (regmatch(ep, next)) { - /* - * Don't set endp if some later - * invocation of the same parentheses - * already has. - */ - if (ep->regendp[no] == NULL) - ep->regendp[no] = input; - return(1); - } else - return(0); - break; - } - case BRANCH: { - register char *const save = ep->reginput; - - if (OP(next) != BRANCH) /* No choice. */ - next = OPERAND(scan); /* Avoid recursion. */ - else { - while (OP(scan) == BRANCH) { - if (regmatch(ep, OPERAND(scan))) - return(1); - ep->reginput = save; - scan = regnext(scan); - } - return(0); - /* NOTREACHED */ - } - break; - } - case STAR: case PLUS: { - register const char nextch = - (OP(next) == EXACTLY) ? *OPERAND(next) : '\0'; - register size_t no; - register char *const save = ep->reginput; - register const size_t min = (OP(scan) == STAR) ? 
0 : 1; - - for (no = regrepeat(ep, OPERAND(scan)) + 1; no > min; no--) { - ep->reginput = save + no - 1; - /* If it could work, try it. */ - if (nextch == '\0' || *ep->reginput == nextch) - if (regmatch(ep, next)) - return(1); - } - return(0); - break; - } - case END: - return(1); /* Success! */ - break; - default: - regerror("regexp corruption"); - return(0); - break; - } - } - - /* - * We get here only if there's trouble -- normally "case END" is - * the terminating point. - */ - regerror("corrupted pointers"); - return(0); -} - -/* - - regrepeat - report how many times something simple would match - */ -static size_t -regrepeat(ep, node) -register struct exec *ep; -char *node; -{ - register size_t count; - register char *scan; - register char ch; - - switch (OP(node)) { - case ANY: - return(strlen(ep->reginput)); - break; - case EXACTLY: - ch = *OPERAND(node); - count = 0; - for (scan = ep->reginput; *scan == ch; scan++) - count++; - return(count); - break; - case ANYOF: - return(strspn(ep->reginput, OPERAND(node))); - break; - case ANYBUT: - return(strcspn(ep->reginput, OPERAND(node))); - break; - default: /* Oh dear. Called inappropriately. */ - regerror("internal error: bad call of regrepeat"); - return(0); /* Best compromise. */ - break; - } - /* NOTREACHED */ -} - -/* - - regnext - dig the "next" pointer out of a node - */ -static char * -regnext(p) -register char *p; -{ - register const int offset = NEXT(p); - - if (offset == 0) - return(NULL); - - return((OP(p) == BACK) ? p-offset : p+offset); -} - -#ifdef DEBUG - -static char *regprop(); - -/* - - regdump - dump a regexp onto stdout in vaguely comprehensible form - */ -void -regdump(r) -regexp *r; -{ - register char *s; - register char op = EXACTLY; /* Arbitrary non-END op. */ - register char *next; - - - s = r->program + 1; - while (op != END) { /* While that wasn't END last time... */ - op = OP(s); - printf("%2d%s", s-r->program, regprop(s)); /* Where, what. 
*/ - next = regnext(s); - if (next == NULL) /* Next ptr. */ - printf("(0)"); - else - printf("(%d)", (s-r->program)+(next-s)); - s += 3; - if (op == ANYOF || op == ANYBUT || op == EXACTLY) { - /* Literal string, where present. */ - while (*s != '\0') { - putchar(*s); - s++; - } - s++; - } - putchar('\n'); - } - - /* Header fields of interest. */ - if (r->regstart != '\0') - printf("start `%c' ", r->regstart); - if (r->reganch) - printf("anchored "); - if (r->regmlen > 0) - printf("must have \"%s\"", &r->program[r->regmust]); - printf("\n"); -} - -/* - - regprop - printable representation of opcode - */ -static char * -regprop(op) -char *op; -{ - register char *p; - static char buf[50]; - - (void) strcpy(buf, ":"); - - switch (OP(op)) { - case BOL: - p = "BOL"; - break; - case EOL: - p = "EOL"; - break; - case ANY: - p = "ANY"; - break; - case ANYOF: - p = "ANYOF"; - break; - case ANYBUT: - p = "ANYBUT"; - break; - case BRANCH: - p = "BRANCH"; - break; - case EXACTLY: - p = "EXACTLY"; - break; - case NOTHING: - p = "NOTHING"; - break; - case BACK: - p = "BACK"; - break; - case END: - p = "END"; - break; - case OPEN+1: - case OPEN+2: - case OPEN+3: - case OPEN+4: - case OPEN+5: - case OPEN+6: - case OPEN+7: - case OPEN+8: - case OPEN+9: - sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); - p = NULL; - break; - case CLOSE+1: - case CLOSE+2: - case CLOSE+3: - case CLOSE+4: - case CLOSE+5: - case CLOSE+6: - case CLOSE+7: - case CLOSE+8: - case CLOSE+9: - sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); - p = NULL; - break; - case STAR: - p = "STAR"; - break; - case PLUS: - p = "PLUS"; - break; - default: - regerror("corrupted opcode"); - break; - } - if (p != NULL) - (void) strcat(buf, p); - return(buf); -} -#endif diff --git a/scsh/regexp/regexp.h b/scsh/regexp/regexp.h deleted file mode 100644 index 48af08c..0000000 --- a/scsh/regexp/regexp.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Definitions etc. for regexp(3) routines. 
- * - * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof], - * not the System V one. - */ -#define NSUBEXP 10 -typedef struct regexp { - const char *startp[NSUBEXP]; - const char *endp[NSUBEXP]; - char regstart; /* Internal use only. */ - char reganch; /* Internal use only. */ - int regmust; /* Internal use only. */ - int regmlen; /* Internal use only. */ - char program[1]; /* Unwarranted chumminess with compiler. */ -} regexp; - -extern regexp *regcomp(const char *re); -extern int regexec(regexp *rp, const char *s); -extern void regsub(const regexp *rp, const char *src, char *dst); -extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len); -extern size_t regsublen(const regexp *rp, const char *src); - -extern void regerror(char *message); -extern size_t regcomp_len(const char *exp); -extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len); - diff --git a/scsh/regexp/regmagic.h b/scsh/regexp/regmagic.h deleted file mode 100644 index 5acf447..0000000 --- a/scsh/regexp/regmagic.h +++ /dev/null @@ -1,5 +0,0 @@ -/* - * The first byte of the regexp internal "program" is actually this magic - * number; the start node begins in the second byte. 
- */ -#define MAGIC 0234 diff --git a/scsh/regexp/regsub.c b/scsh/regexp/regsub.c deleted file mode 100644 index bc98845..0000000 --- a/scsh/regexp/regsub.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * regsub - */ -#include -#include -#include -#include -#include -#include "regmagic.h" - -/* - - regsub - perform substitutions after a regexp match - */ - -void regsub(rp, source, dest) -const regexp *rp; -const char *source; -char *dest; -{ - regnsub(rp, source, dest, BUFSIZ); -} - - - -/* - - regnsub - perform bounds-checked substitutions after a regexp match - */ -void -regnsub(rp, source, dest, destlen) -const regexp *rp; -const char *source; -char *dest; -size_t destlen; -{ - register regexp * const prog = (regexp *)rp; - register const char *src = (char *)source; - register char *dst = dest; - char *dstend = dest + destlen; - char *odst; - register char c; - register int no; - register size_t len; - - if (prog == NULL || source == NULL || dest == NULL) { - regerror("NULL parameter to regsub"); - return; - } - if ((unsigned char)*(prog->program) != MAGIC) { - regerror("damaged regexp"); - return; - } - - while ((c = *src++) != '\0') { - if (c == '&') - no = 0; - else if (c == '\\' && isdigit(*src)) - no = *src++ - '0'; - else - no = -1; - - if (no < 0) { /* Ordinary character. */ - if (c == '\\' && (*src == '\\' || *src == '&')) - c = *src++; - *dst++ = c; - if (dst >= dstend) - { - regerror("output buffer too small"); - return; - } - } else if (prog->startp[no] != NULL && prog->endp[no] != NULL && - prog->endp[no] > prog->startp[no]) { - len = prog->endp[no] - prog->startp[no]; - odst = dst; - dst += len; - if (dst >= dstend) - { - regerror("output buffer too small"); - return; - } - (void) strncpy(odst, prog->startp[no], len); - if (*(dst-1) == '\0') { /* strncpy hit NUL. 
*/ - regerror("damaged match string"); - return; - } - } - } - *dst++ = '\0'; -} - -size_t regsublen(rp, source) -const regexp *rp; -const char *source; -{ - register regexp * const prog = (regexp *)rp; - register char *src = (char *)source; - register char c; - register int no; - register int len = 0; - - if (prog == NULL || source == NULL) { - regerror("NULL parameter to regsublen"); - return -1; - } - - if ((unsigned char)*(prog->program) != MAGIC) { - regerror("damaged regexp"); - return -1; - } - while ((c = *src++) != '\0') { - if (c == '&') - no = 0; - else if (c == '\\' && isdigit(*src)) - no = *src++ - '0'; - else - no = -1; - if (no < 0) { /* Ordinary character. */ - if (c == '\\' && (*src == '\\' || *src == '&')) - src++; - len++; - } else { - const char *s = prog->startp[no]; - const char *e = prog->endp[no]; - if ((s != NULL) && (e != NULL) && (e > s)) { - len += e-s; - } - } - } - return len+1; -} - - diff --git a/scsh/regexp/tests b/scsh/regexp/tests index 10aa6f9..e4d928d 100644 --- a/scsh/regexp/tests +++ b/scsh/regexp/tests @@ -1,127 +1,477 @@ -abc abc y & abc -abc xbc n - - -abc axc n - - -abc abx n - - -abc xabcy y & abc -abc ababc y & abc -ab*c abc y & abc -ab*bc abc y & abc -ab*bc abbc y & abbc -ab*bc abbbbc y & abbbbc -ab+bc abbc y & abbc -ab+bc abc n - - -ab+bc abq n - - -ab+bc abbbbc y & abbbbc -ab?bc abbc y & abbc -ab?bc abc y & abc -ab?bc abbbbc n - - -ab?c abc y & abc -^abc$ abc y & abc -^abc$ abcc n - - -^abc abcc y & abc -^abc$ aabc n - - -abc$ aabc y & abc -^ abc y & -$ abc y & -a.c abc y & abc -a.c axc y & axc -a.*c axyzc y & axyzc -a.*c axyzd n - - -a[bc]d abc n - - -a[bc]d abd y & abd -a[b-d]e abd n - - -a[b-d]e ace y & ace -a[b-d] aac y & ac -a[-b] a- y & a- -a[b-] a- y & a- -[k] ab n - - -a[b-a] - c - - -a[]b - c - - -a[ - c - - -a] a] y & a] -a[]]b a]b y & a]b -a[^bc]d aed y & aed -a[^bc]d abd n - - -a[^-b]c adc y & adc -a[^-b]c a-c n - - -a[^]b]c a]c n - - -a[^]b]c adc y & adc -ab|cd abc y & ab -ab|cd abcd y & ab -()ef def y 
&-\1 ef- -()* - c - - -*a - c - - -^* - c - - -$* - c - - -(*)b - c - - -$b b n - - -a\ - c - - -a\(b a(b y &-\1 a(b- -a\(*b ab y & ab -a\(*b a((b y & a((b -a\\b a\b y & a\b -abc) - c - - -(abc - c - - -((a)) abc y &-\1-\2 a-a-a -(a)b(c) abc y &-\1-\2 abc-a-c -a+b+c aabbabc y & abc -a** - c - - -a*? - c - - -(a*)* - c - - -(a*)+ - c - - -(a|)* - c - - -(a*|b)* - c - - -(a+|b)* ab y &-\1 ab-b -(a+|b)+ ab y &-\1 ab-b -(a+|b)? ab y &-\1 a-a -[^ab]* cde y & cde -(^)* - c - - -(ab|)* - c - - -)( - c - - - abc y & -abc n - - -a* y & -abcd abcd y &-\&-\\& abcd-&-\abcd -a(bc)d abcd y \1-\\1-\\\1 bc-\1-\bc -([abc])*d abbbcd y &-\1 abbbcd-c -([abc])*bcd abcd y &-\1 abcd-a -a|b|c|d|e e y & e -(a|b|c|d|e)f ef y &-\1 ef-e -((a*|b))* - c - - -abcd*efg abcdefg y & abcdefg -ab* xabyabbbz y & ab -ab* xayabbbz y & a -(ab|cd)e abcde y &-\1 cde-cd -[abhgefdc]ij hij y & hij -^(ab|cd)e abcde n x\1y xy -(abc|)ef abcdef y &-\1 ef- -(a|b)c*d abcd y &-\1 bcd-b -(ab|ab*)bc abc y &-\1 abc-a -a([bc]*)c* abc y &-\1 abc-bc -a([bc]*)(c*d) abcd y &-\1-\2 abcd-bc-d -a([bc]+)(c*d) abcd y &-\1-\2 abcd-bc-d -a([bc]*)(c+d) abcd y &-\1-\2 abcd-b-cd -a[bcd]*dcdcde adcdcde y & adcdcde -a[bcd]+dcdcde adcdcde n - - -(ab|a)b*c abc y &-\1 abc-ab -((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d -[ -~]* abc y & abc -[ -~ -~]* abc y & abc -[ -~ -~ -~]* abc y & abc -[ -~ -~ -~ -~]* abc y & abc -[ -~ -~ -~ -~ -~]* abc y & abc -[ -~ -~ -~ -~ -~ -~]* abc y & abc -[ -~ -~ -~ -~ -~ -~ -~]* abc y & abc -[a-zA-Z_][a-zA-Z0-9_]* alpha y & alpha -^a(bc+|b[eh])g|.h$ abh y &-\1 bh- -(bc+d$|ef*g.|h?i(j|k)) effgz y &-\1-\2 effgz-effgz- -(bc+d$|ef*g.|h?i(j|k)) ij y &-\1-\2 ij-ij-j -(bc+d$|ef*g.|h?i(j|k)) effg n - - -(bc+d$|ef*g.|h?i(j|k)) bcdd n - - -(bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz- -((((((((((a)))))))))) - c - - -(((((((((a))))))))) a y & a -multiple words of text uh-uh n - - -multiple words multiple words, yeah y & multiple words -(.*)c(.*) abcde y &-\1-\2 abcde-ab-de -\((.*), (.*)\) (a, b) y (\2, \1) (b, a) +# 
regular expression test set +# Lines are at least three fields, separated by one or more tabs. "" stands +# for an empty field. First field is an RE. Second field is flags. If +# C flag given, regcomp() is expected to fail, and the third field is the +# error name (minus the leading REG_). +# +# Otherwise it is expected to succeed, and the third field is the string to +# try matching it against. If there is no fourth field, the match is +# expected to fail. If there is a fourth field, it is the substring that +# the RE is expected to match. If there is a fifth field, it is a comma- +# separated list of what the subexpressions should match, with - indicating +# no match for that one. In both the fourth and fifth fields, a (sub)field +# starting with @ indicates that the (sub)expression is expected to match +# a null string followed by the stuff after the @; this provides a way to +# test where null strings match. The character `N' in REs and strings +# is newline, `S' is space, `T' is tab, `Z' is NUL. +# +# The full list of flags: +# - placeholder, does nothing +# b RE is a BRE, not an ERE +# & try it as both an ERE and a BRE +# C regcomp() error expected, third field is error name +# i REG_ICASE +# m ("mundane") REG_NOSPEC +# s REG_NOSUB (not really testable) +# n REG_NEWLINE +# ^ REG_NOTBOL +# $ REG_NOTEOL +# # REG_STARTEND (see below) +# p REG_PEND +# +# For REG_STARTEND, the start/end offsets are those of the substring +# enclosed in (). 
+ +# basics +a & a a +abc & abc abc +abc|de - abc abc +a|b|c - abc a + +# parentheses and perversions thereof +a(b)c - abc abc +a\(b\)c b abc abc +a( C EPAREN +a( b a( a( +a\( - a( a( +a\( bC EPAREN +a\(b bC EPAREN +a(b C EPAREN +a(b b a(b a(b +# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) +a) - a) a) +) - ) ) +# end gagging (in a just world, those *should* give EPAREN) +a) b a) a) +a\) bC EPAREN +\) bC EPAREN +a()b - ab ab +a\(\)b b ab ab + +# anchoring and REG_NEWLINE +^abc$ & abc abc +a^b - a^b +a^b b a^b a^b +a$b - a$b +a$b b a$b a$b +^ & abc @abc +$ & abc @ +^$ & "" @ +$^ - "" @ +\($\)\(^\) b "" @ +# stop retching, those are legitimate (although disgusting) +^^ - "" @ +$$ - "" @ +b$ & abNc +b$ &n abNc b +^b$ & aNbNc +^b$ &n aNbNc b +^$ &n aNNb @Nb +^$ n abc +^$ n abcN @ +$^ n aNNb @Nb +\($\)\(^\) bn aNNb @Nb +^^ n^ aNNb @Nb +$$ n aNNb @NN +^a ^ a +a$ $ a +^a ^n aNb +^b ^n aNb b +a$ $n bNa +b$ $n bNa b +a*(^b$)c* - b b +a*\(^b$\)c* b b b + +# certain syntax errors and non-errors +| C EMPTY +| b | | +* C BADRPT +* b * * ++ C BADRPT +? C BADRPT +"" &C EMPTY +() - abc @abc +\(\) b abc @abc +a||b C EMPTY +|ab C EMPTY +ab| C EMPTY +(|a)b C EMPTY +(a|)b C EMPTY +(*a) C BADRPT +(+a) C BADRPT +(?a) C BADRPT +({1}a) C BADRPT +\(\{1\}a\) bC BADRPT +(a|*b) C BADRPT +(a|+b) C BADRPT +(a|?b) C BADRPT +(a|{1}b) C BADRPT +^* C BADRPT +^* b * * +^+ C BADRPT +^? 
C BADRPT +^{1} C BADRPT +^\{1\} bC BADRPT + +# metacharacters, backslashes +a.c & abc abc +a[bc]d & abd abd +a\*c & a*c a*c +a\\b & a\b a\b +a\\\*b & a\*b a\*b +a\bc & abc abc +a\ &C EESCAPE +a\\bc & a\bc a\bc +\{ bC BADRPT +a\[b & a[b a[b +a[b &C EBRACK +# trailing $ is a peculiar special case for the BRE code +a$ & a a +a$ & a$ +a\$ & a +a\$ & a$ a$ +a\\$ & a +a\\$ & a$ +a\\$ & a\$ +a\\$ & a\ a\ + +# back references, ugh +a\(b\)\2c bC ESUBREG +a\(b\1\)c bC ESUBREG +a\(b*\)c\1d b abbcbbd abbcbbd bb +a\(b*\)c\1d b abbcbd +a\(b*\)c\1d b abbcbbbd +^\(.\)\1 b abc +a\([bc]\)\1d b abcdabbd abbd b +a\(\([bc]\)\2\)*d b abbccd abbccd +a\(\([bc]\)\2\)*d b abbcbd +# actually, this next one probably ought to fail, but the spec is unclear +a\(\(b\)*\2\)*d b abbbd abbbd +# here is a case that no NFA implementation does right +\(ab*\)[ab]*\1 b ababaaa ababaaa a +# check out normal matching in the presence of back refs +\(a\)\1bcd b aabcd aabcd +\(a\)\1bc*d b aabcd aabcd +\(a\)\1bc*d b aabd aabd +\(a\)\1bc*d b aabcccd aabcccd +\(a\)\1bc*[ce]d b aabcccd aabcccd +^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd + +# ordinary repetitions +ab*c & abc abc +ab+c - abc abc +ab?c - abc abc +a\(*\)b b a*b a*b +a\(**\)b b ab ab +a\(***\)b bC BADRPT +*a b *a *a +**a b a a +***a bC BADRPT + +# the dreaded bounded repetitions +{ & { { +{abc & {abc {abc +{1 C BADRPT +{1} C BADRPT +a{b & a{b a{b +a{1}b - ab ab +a\{1\}b b ab ab +a{1,}b - ab ab +a\{1,\}b b ab ab +a{1,2}b - aab aab +a\{1,2\}b b aab aab +a{1 C EBRACE +a\{1 bC EBRACE +a{1a C EBRACE +a\{1a bC EBRACE +a{1a} C BADBR +a\{1a\} bC BADBR +a{,2} - a{,2} a{,2} +a\{,2\} bC BADBR +a{,} - a{,} a{,} +a\{,\} bC BADBR +a{1,x} C BADBR +a\{1,x\} bC BADBR +a{1,x C EBRACE +a\{1,x bC EBRACE +a{300} C BADBR +a\{300\} bC BADBR +a{1,0} C BADBR +a\{1,0\} bC BADBR +ab{0,0}c - abcac ac +ab\{0,0\}c b abcac ac +ab{0,1}c - abcac abc +ab\{0,1\}c b abcac abc +ab{0,3}c - abbcac abbc +ab\{0,3\}c b abbcac abbc +ab{1,1}c - acabc abc +ab\{1,1\}c b acabc abc +ab{1,3}c - acabc abc 
+ab\{1,3\}c b acabc abc +ab{2,2}c - abcabbc abbc +ab\{2,2\}c b abcabbc abbc +ab{2,4}c - abcabbc abbc +ab\{2,4\}c b abcabbc abbc +((a{1,10}){1,10}){1,10} - a a a,a + +# multiple repetitions +a** &C BADRPT +a++ C BADRPT +a?? C BADRPT +a*+ C BADRPT +a*? C BADRPT +a+* C BADRPT +a+? C BADRPT +a?* C BADRPT +a?+ C BADRPT +a{1}{1} C BADRPT +a*{1} C BADRPT +a+{1} C BADRPT +a?{1} C BADRPT +a{1}* C BADRPT +a{1}+ C BADRPT +a{1}? C BADRPT +a*{b} - a{b} a{b} +a\{1\}\{1\} bC BADRPT +a*\{1\} bC BADRPT +a\{1\}* bC BADRPT + +# brackets, and numerous perversions thereof +a[b]c & abc abc +a[ab]c & abc abc +a[^ab]c & adc adc +a[]b]c & a]c a]c +a[[b]c & a[c a[c +a[-b]c & a-c a-c +a[^]b]c & adc adc +a[^-b]c & adc adc +a[b-]c & a-c a-c +a[b &C EBRACK +a[] &C EBRACK +a[1-3]c & a2c a2c +a[3-1]c &C ERANGE +a[1-3-5]c &C ERANGE +a[[.-.]--]c & a-c a-c +a[1- &C ERANGE +a[[. &C EBRACK +a[[.x &C EBRACK +a[[.x. &C EBRACK +a[[.x.] &C EBRACK +a[[.x.]] & ax ax +a[[.x,.]] &C ECOLLATE +a[[.one.]]b & a1b a1b +a[[.notdef.]]b &C ECOLLATE +a[[.].]]b & a]b a]b +a[[:alpha:]]c & abc abc +a[[:notdef:]]c &C ECTYPE +a[[: &C EBRACK +a[[:alpha &C EBRACK +a[[:alpha:] &C EBRACK +a[[:alpha,:] &C ECTYPE +a[[:]:]]b &C ECTYPE +a[[:-:]]b &C ECTYPE +a[[:alph:]] &C ECTYPE +a[[:alphabet:]] &C ECTYPE +[[:alnum:]]+ - -%@a0X- a0X +[[:alpha:]]+ - -%@aX0- aX +[[:blank:]]+ - aSSTb SST +[[:cntrl:]]+ - aNTb NT +[[:digit:]]+ - a019b 019 +[[:graph:]]+ - Sa%bS a%b +[[:lower:]]+ - AabC ab +[[:print:]]+ - NaSbN aSb +[[:punct:]]+ - S%-&T %-& +[[:space:]]+ - aSNTb SNT +[[:upper:]]+ - aBCd BC +[[:xdigit:]]+ - p0f3Cq 0f3C +a[[=b=]]c & abc abc +a[[= &C EBRACK +a[[=b &C EBRACK +a[[=b= &C EBRACK +a[[=b=] &C EBRACK +a[[=b,=]] &C ECOLLATE +a[[=one=]]b & a1b a1b + +# complexities +a(((b)))c - abc abc +a(b|(c))d - abd abd +a(b*|c)d - abbd abbd +# just gotta have one DFA-buster, of course +a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab +# and an inline expansion in case somebody gets tricky 
+a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab +# and in case somebody just slips in an NFA... +a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights +# fish for anomalies as the number of states passes 32 +12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789 +123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890 +1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901 +12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012 +123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123 +# and one really big one, beyond any plausible word width +1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890 +# fish for problems as brackets go past 8 +[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm +[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo +[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq +[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq + +# subtleties of matching +abc & xabcy abc +a\(b\)?c\1d b acd +aBc i Abc Abc +a[Bc]*d i abBCcd abBCcd +0[[:upper:]]1 &i 0a1 0a1 +0[[:lower:]]1 &i 0A1 0A1 +a[^b]c &i abc +a[^b]c &i aBc +a[^b]c &i adc adc +[a]b[c] - abc abc +[a]b[a] - aba aba +[abc]b[abc] - abc abc +[abc]b[abd] - abd abd +a(b?c)+d - accd accd +(wee|week)(knights|night) - weeknights weeknights +(we|wee|week|frob)(knights|night|day) - weeknights weeknights +a[bc]d - xyzaaabcaababdacd abd +a[ab]c - aaabc abc +abc s abc abc +a* & b @b + +# Let's have some fun -- try to 
match a C comment. +# first the obvious, which looks okay at first glance... +/\*.*\*/ - /*x*/ /*x*/ +# but... +/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/ +# okay, we must not match */ inside; try to do that... +/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/ +/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/ +# but... +/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/ +# and a still fancier version, which does it right (I think)... +/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/ +/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/ +/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/ +/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/ +/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/ +/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/ + +# subexpressions +.* - abc abc - +a(b)(c)d - abcd abcd b,c +a(((b)))c - abc abc b,b,b +a(b|(c))d - abd abd b,- +a(b*|c|e)d - abbd abbd bb +a(b*|c|e)d - acd acd c +a(b*|c|e)d - ad ad @d +a(b?)c - abc abc b +a(b?)c - ac ac @c +a(b+)c - abc abc b +a(b+)c - abbbc abbbc bbb +a(b*)c - ac ac @c +(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de +# the regression tester only asks for 9 subexpressions +a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j +a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k +a([bc]?)c - abc abc b +a([bc]?)c - ac ac @c +a([bc]+)c - abc abc b +a([bc]+)c - abcc abcc bc +a([bc]+)bc - abcbc abcbc bc +a(bb+|b)b - abb abb b +a(bbb+|bb+|b)b - abb abb b +a(bbb+|bb+|b)b - abbb abbb bb +a(bbb+|bb+|b)bb - abbb abbb b +(.*).* - abcdef abcdef abcdef +(a*)* - bc @b @b + +# do we get the right subexpression when it is used more than once? 
+a(b|c)*d - ad ad - +a(b|c)*d - abcd abcd c +a(b|c)+d - abd abd b +a(b|c)+d - abcd abcd c +a(b|c?)+d - ad ad @d +a(b|c?)+d - abcd abcd @d +a(b|c){0,0}d - ad ad - +a(b|c){0,1}d - ad ad - +a(b|c){0,1}d - abd abd b +a(b|c){0,2}d - ad ad - +a(b|c){0,2}d - abcd abcd c +a(b|c){0,}d - ad ad - +a(b|c){0,}d - abcd abcd c +a(b|c){1,1}d - abd abd b +a(b|c){1,1}d - acd acd c +a(b|c){1,2}d - abd abd b +a(b|c){1,2}d - abcd abcd c +a(b|c){1,}d - abd abd b +a(b|c){1,}d - abcd abcd c +a(b|c){2,2}d - acbd acbd b +a(b|c){2,2}d - abcd abcd c +a(b|c){2,4}d - abcd abcd c +a(b|c){2,4}d - abcbd abcbd b +a(b|c){2,4}d - abcbcd abcbcd c +a(b|c){2,}d - abcd abcd c +a(b|c){2,}d - abcbd abcbd b +a(b+|((c)*))+d - abd abd @d,@d,- +a(b+|((c)*))+d - abcd abcd @d,@d,- + +# check out the STARTEND option +[abc] &# a(b)c b +[abc] &# a(d)c +[abc] &# a(bc)d b +[abc] &# a(dc)d c +. &# a()c +b.*c &# b(bc)c bc +b.* &# b(bc)c bc +.*c &# b(bc)c bc + +# plain strings, with the NOSPEC flag +abc m abc abc +abc m xabcy abc +abc m xyz +a*b m aba*b a*b +a*b m ab +"" mC EMPTY + +# cases involving NULs +aZb & a a +aZb &p a +aZb &p# (aZb) aZb +aZ*b &p# (ab) ab +a.b &# (aZb) aZb +a.* &# (aZb)c aZb + +# word boundaries (ick) +[[:<:]]a & a a +[[:<:]]a & ba +[[:<:]]a & -a a +a[[:>:]] & a a +a[[:>:]] & ab +a[[:>:]] & a- a +[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc +[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc +[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc +[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc +[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_ +[[:<:]]a_b[[:>:]] & x_a_b + +# past problems, and suspected problems +(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1 +abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop +abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv +(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11 
+CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11 +Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz +a?b - ab ab +-\{0,1\}[0-9]*$ b -5 -5 +a*a*a*a*a*a*a* & aaaaaa aaaaaa diff --git a/scsh/regexp/timer.c b/scsh/regexp/timer.c deleted file mode 100644 index c104a4f..0000000 --- a/scsh/regexp/timer.c +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Simple timing program for regcomp(). - * Usage: timer ncomp nexec nsub - * or - * timer ncomp nexec nsub regexp string [ answer [ sub ] ] - * - * The second form is for timing repetitions of a single test case. - * The first form's test data is a compiled-in copy of the "tests" file. - * Ncomp, nexec, nsub are how many times to do each regcomp, regexec, - * and regsub. The way to time an operation individually is to do something - * like "timer 1 50 1". - */ -#include - -struct try { - char *re, *str, *ans, *src, *dst; -} tests[] = { -#include "timer.t.h" -{ NULL, NULL, NULL, NULL, NULL } -}; - -#include - -int errreport = 0; /* Report errors via errseen? */ -char *errseen = NULL; /* Error message. 
*/ - -char *progname; - -/* ARGSUSED */ -main(argc, argv) -int argc; -char *argv[]; -{ - int ncomp, nexec, nsub; - struct try one; - char dummy[512]; - - if (argc < 4) { - ncomp = 1; - nexec = 1; - nsub = 1; - } else { - ncomp = atoi(argv[1]); - nexec = atoi(argv[2]); - nsub = atoi(argv[3]); - } - - progname = argv[0]; - if (argc > 5) { - one.re = argv[4]; - one.str = argv[5]; - if (argc > 6) - one.ans = argv[6]; - else - one.ans = "y"; - if (argc > 7) { - one.src = argv[7]; - one.dst = "xxx"; - } else { - one.src = "x"; - one.dst = "x"; - } - errreport = 1; - try(one, ncomp, nexec, nsub); - } else - multiple(ncomp, nexec, nsub); - exit(0); -} - -void -regerror(s) -char *s; -{ - if (errreport) - errseen = s; - else - error(s, ""); -} - -#ifndef ERRAVAIL -error(s1, s2) -char *s1; -char *s2; -{ - fprintf(stderr, "regexp: "); - fprintf(stderr, s1, s2); - fprintf(stderr, "\n"); - exit(1); -} -#endif - -int lineno = 0; - -multiple(ncomp, nexec, nsub) -int ncomp, nexec, nsub; -{ - register int i; - extern char *strchr(); - - errreport = 1; - for (i = 0; tests[i].re != NULL; i++) { - lineno++; - try(tests[i], ncomp, nexec, nsub); - } -} - -try(fields, ncomp, nexec, nsub) -struct try fields; -int ncomp, nexec, nsub; -{ - regexp *r; - char dbuf[BUFSIZ]; - register int i; - - errseen = NULL; - r = regcomp(fields.re); - if (r == NULL) { - if (*fields.ans != 'c') - complain("regcomp failure in `%s'", fields.re); - return; - } - if (*fields.ans == 'c') { - complain("unexpected regcomp success in `%s'", fields.re); - free((char *)r); - return; - } - for (i = ncomp-1; i > 0; i--) { - free((char *)r); - r = regcomp(fields.re); - } - if (!regexec(r, fields.str)) { - if (*fields.ans != 'n') - complain("regexec failure in `%s'", ""); - free((char *)r); - return; - } - if (*fields.ans == 'n') { - complain("unexpected regexec success", ""); - free((char *)r); - return; - } - for (i = nexec-1; i > 0; i--) - (void) regexec(r, fields.str); - errseen = NULL; - for (i = nsub; i > 0; i--) - 
regsub(r, fields.src, dbuf); - if (errseen != NULL) { - complain("regsub complaint", ""); - free((char *)r); - return; - } - if (strcmp(dbuf, fields.dst) != 0) - complain("regsub result `%s' wrong", dbuf); - free((char *)r); -} - -complain(s1, s2) -char *s1; -char *s2; -{ - fprintf(stderr, "try: %d: ", lineno); - fprintf(stderr, s1, s2); - fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : ""); -} diff --git a/scsh/regexp/try.c b/scsh/regexp/try.c deleted file mode 100644 index 9b6424b..0000000 --- a/scsh/regexp/try.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Simple test program for regexp(3) stuff. Knows about debugging hooks. - * Usage: try re [string [output [-]]] - * The re is compiled and dumped, regexeced against the string, the result - * is applied to output using regsub(). The - triggers a running narrative - * from regexec(). Dumping and narrative don't happen unless DEBUG. - * - * If there are no arguments, stdin is assumed to be a stream of lines with - * five fields: a r.e., a string to match it against, a result code, a - * source string for regsub, and the proper result. Result codes are 'c' - * for compile failure, 'y' for match success, 'n' for match failure. - * Field separator is tab. - */ -#include -#include - -#ifdef ERRAVAIL -char *progname; -extern char *mkprogname(); -#endif - -#ifdef DEBUG -extern int regnarrate; -#endif - -char buf[BUFSIZ]; - -int errreport = 0; /* Report errors via errseen? */ -char *errseen = NULL; /* Error message. */ -int status = 0; /* Exit status. 
*/ - -/* ARGSUSED */ -main(argc, argv) -int argc; -char *argv[]; -{ - regexp *r; - int i; - -#ifdef ERRAVAIL - progname = mkprogname(argv[0]); -#endif - - if (argc == 1) { - multiple(); - exit(status); - } - - r = regcomp(argv[1]); - if (r == NULL) - error("regcomp failure", ""); -#ifdef DEBUG - regdump(r); - if (argc > 4) - regnarrate++; -#endif - if (argc > 2) { - i = regexec(r, argv[2]); - printf("%d", i); - for (i = 1; i < NSUBEXP; i++) - if (r->startp[i] != NULL && r->endp[i] != NULL) - printf(" \\%d", i); - printf("\n"); - } - if (argc > 3) { - regsub(r, argv[3], buf); - printf("%s\n", buf); - } - exit(status); -} - -void -regerror(s) -char *s; -{ - if (errreport) - errseen = s; - else - error(s, ""); -} - -#ifndef ERRAVAIL -error(s1, s2) -char *s1; -char *s2; -{ - fprintf(stderr, "regexp: "); - fprintf(stderr, s1, s2); - fprintf(stderr, "\n"); - exit(1); -} -#endif - -int lineno; - -regexp badregexp; /* Implicit init to 0. */ - -multiple() -{ - char rbuf[BUFSIZ]; - char *field[5]; - char *scan; - int i; - regexp *r; - extern char *strchr(); - - errreport = 1; - lineno = 0; - while (fgets(rbuf, sizeof(rbuf), stdin) != NULL) { - rbuf[strlen(rbuf)-1] = '\0'; /* Dispense with \n. */ - lineno++; - scan = rbuf; - for (i = 0; i < 5; i++) { - field[i] = scan; - if (field[i] == NULL) { - complain("bad testfile format", ""); - exit(1); - } - scan = strchr(scan, '\t'); - if (scan != NULL) - *scan++ = '\0'; - } - try(field); - } - - /* And finish up with some internal testing... */ - lineno = 9990; - errseen = NULL; - if (regcomp((char *)NULL) != NULL || errseen == NULL) - complain("regcomp(NULL) doesn't complain", ""); - lineno = 9991; - errseen = NULL; - if (regexec((regexp *)NULL, "foo") || errseen == NULL) - complain("regexec(NULL, ...) 
doesn't complain", ""); - lineno = 9992; - r = regcomp("foo"); - if (r == NULL) { - complain("regcomp(\"foo\") fails", ""); - return; - } - lineno = 9993; - errseen = NULL; - if (regexec(r, (char *)NULL) || errseen == NULL) - complain("regexec(..., NULL) doesn't complain", ""); - lineno = 9994; - errseen = NULL; - regsub((regexp *)NULL, "foo", rbuf); - if (errseen == NULL) - complain("regsub(NULL, ..., ...) doesn't complain", ""); - lineno = 9995; - errseen = NULL; - regsub(r, (char *)NULL, rbuf); - if (errseen == NULL) - complain("regsub(..., NULL, ...) doesn't complain", ""); - lineno = 9996; - errseen = NULL; - regsub(r, "foo", (char *)NULL); - if (errseen == NULL) - complain("regsub(..., ..., NULL) doesn't complain", ""); - lineno = 9997; - errseen = NULL; - if (regexec(&badregexp, "foo") || errseen == NULL) - complain("regexec(nonsense, ...) doesn't complain", ""); - lineno = 9998; - errseen = NULL; - regsub(&badregexp, "foo", rbuf); - if (errseen == NULL) - complain("regsub(nonsense, ..., ...) 
doesn't complain", ""); -} - -try(fields) -char **fields; -{ - regexp *r; - char dbuf[BUFSIZ]; - - errseen = NULL; - r = regcomp(fields[0]); - if (r == NULL) { - if (*fields[2] != 'c') - complain("regcomp failure in `%s'", fields[0]); - return; - } - if (*fields[2] == 'c') { - complain("unexpected regcomp success in `%s'", fields[0]); - free((char *)r); - return; - } - if (!regexec(r, fields[1])) { - if (*fields[2] != 'n') - complain("regexec failure in `%s'", fields[0]); - free((char *)r); - return; - } - if (*fields[2] == 'n') { - complain("unexpected regexec success", ""); - free((char *)r); - return; - } - errseen = NULL; - regsub(r, fields[3], dbuf); - if (errseen != NULL) { - complain("regsub complaint", ""); - free((char *)r); - return; - } - if (strcmp(dbuf, fields[4]) != 0) - complain("regsub result `%s' wrong", dbuf); - free((char *)r); -} - -complain(s1, s2) -char *s1; -char *s2; -{ - fprintf(stderr, "try: %d: ", lineno); - fprintf(stderr, s1, s2); - fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : ""); - status = 1; -}