From a61145fa6ad4451a8c662aa5cc8bee32c364a219 Mon Sep 17 00:00:00 2001 From: bdc Date: Tue, 24 Sep 1996 01:29:51 +0000 Subject: [PATCH] updated regexp --- scsh/regexp/COPYRIGHT | 19 + scsh/regexp/Makefile.in | 160 +-- scsh/regexp/README | 95 +- scsh/regexp/patch-msg | 803 +++++++++++++++ scsh/regexp/regerror.c | 18 +- scsh/regexp/regexp.3 | 186 ++++ scsh/regexp/regexp.c | 2069 +++++++++++++++++++-------------------- scsh/regexp/regexp.h | 22 +- scsh/regexp/regsub.c | 168 ++-- scsh/regexp/tests | 127 +++ scsh/regexp/timer.c | 164 ++++ scsh/regexp/try.c | 384 ++++---- 12 files changed, 2732 insertions(+), 1483 deletions(-) create mode 100644 scsh/regexp/COPYRIGHT create mode 100644 scsh/regexp/patch-msg create mode 100644 scsh/regexp/regexp.3 create mode 100644 scsh/regexp/tests create mode 100644 scsh/regexp/timer.c diff --git a/scsh/regexp/COPYRIGHT b/scsh/regexp/COPYRIGHT new file mode 100644 index 0000000..36b9804 --- /dev/null +++ b/scsh/regexp/COPYRIGHT @@ -0,0 +1,19 @@ +Copyright (c) 1986, 1993, 1995 by University of Toronto. +Written by Henry Spencer. Not derived from licensed software. + +Permission is granted to anyone to use this software for any +purpose on any computer system, and to redistribute it in any way, +subject to the following restrictions: + +1. The author is not responsible for the consequences of use of + this software, no matter how awful, even if they arise + from defects in it. + +2. The origin of this software must not be misrepresented, either + by explicit claim or by omission. + +3. Altered versions must be plainly marked as such, and must not + be misrepresented (by explicit claim or omission) as being + the original software. + +4. This notice must not be removed or altered. 
diff --git a/scsh/regexp/Makefile.in b/scsh/regexp/Makefile.in index c464295..6aed9ad 100644 --- a/scsh/regexp/Makefile.in +++ b/scsh/regexp/Makefile.in @@ -5,92 +5,114 @@ CFLAGS1 = @CFLAGS1@ RANLIB = @RANLIB@ -# Things you might want to put in ENV and LENV: -# -Dvoid=int compilers that don't do void -# -DCHARBITS=0377 compilers that don't do unsigned char -# -DSTATIC=extern compilers that don't like "static foo();" as forward decl -# -DSTRCSPN library does not have strcspn() -# -Dstrchr=index library does not have strchr() +# Things you might want to put in ENV: # -DERRAVAIL have utzoo-compatible error() function and friends -# ENV=-DSTRCSPN -# LENV=-DSTRCSPN - +ENV= + # Things you might want to put in TEST: # -DDEBUG debugging hooks # -I. regexp.h from current directory, not /usr/include TEST=-I. -I$(srcdir) - + # Things you might want to put in PROF: -# -Dstatic='/* */' make everything global so profiler can see it. -# -p profiler -PROF= +# -pg profiler +# PROF= -INCDEST=/contrib/share/include -LIBDEST=/contrib/system/lib -MANDEST=/contrib/share/man/man3 +CFLAGS=$(CFLAGS1) $(ENV) $(TEST) $(PROF) +LDFLAGS=$(PROF) -# CC = cc -# CFLAGS1 = -O -Q +LIB=libregexp.a +OBJ=regexp.o regsub.o regerror.o +TMP=dtr.tmp -LINTFLAGS=$(LENV) $(TEST) -ha -# LDFLAGS=-i +default: r -CFLAGS = $(CFLAGS1) $(ENV) $(TEST) $(PROF) +try: try.o $(LIB) + $(CC) $(LDFLAGS) try.o $(LIB) -o try -OBJ=regexp.o regsub.o -LIBOBJ= $(OBJ) regerror.o -LSRC=regexp.c regsub.c regerror.c -DTR=README dMakefile regexp.3 regexp.h regexp.c regsub.c regerror.c \ - regmagic.h try.c timer.c tests -DEST = .. 
- -# we don't use the library anymore -bri -all: $(OBJ) # libregexp.a try - -libregexp.a: $(LIBOBJ) - ar r libregexp.a $(LIBOBJ) - $(RANLIB) libregexp.a - -install: - install -c libregexp.a $(LIBDEST)/libregexp.a - $(RANLIB) $(LIBDEST)/libregexp.a - install -c regexp.h $(INCDEST)/regexp.h - install -c regexp.3 $(MANDEST)/regexp.3 - -try: try.o $(OBJ) - $(CC) $(LDFLAGS) try.o $(OBJ) -o try - # Making timer will probably require putting stuff in $(PROF) and then # recompiling everything; the following is just the final stage. -timer: timer.o $(OBJ) - $(CC) $(LDFLAGS) $(PROF) timer.o $(OBJ) -o timer - +timer: timer.o $(LIB) + $(CC) $(LDFLAGS) timer.o $(LIB) -o timer + timer.o: timer.c timer.t.h - + timer.t.h: tests sed 's/ /","/g;s/\\/&&/g;s/.*/{"&"},/' tests >timer.t.h - + # Regression test. -r: ./try tests - @echo 'No news is good news...' - ./try dtr - -dMakefile: Makefile - sed '/^L*ENV=/s/ *-DERRAVAIL//' Makefile >dMakefile - -mv: $(OBJ) regerror.o - mv $(OBJ) regerror.o $(DEST) + rm -f *.o core mon.out gmon.out timer.t.h dtr copy try timer r.* + rm -f residue rs.* re.1 rm.h re.h ch.soe ch.ps j badcom fig[012] + rm -f ch.sml fig[12].ps $(LIB) + rm -rf $(TMP) + +# the rest of this is unlikely to be of use to you + +BITS = r.1 rs.1 re.1 rm.h re.h +OPT=-p -ms + +ch.soe: ch $(BITS) + soelim ch >$@ + +ch.sml: ch $(BITS) smlize splitfigs + splitfigs ch | soelim | smlize >$@ + +fig0 fig1 fig2: ch splitfigs + splitfigs ch >/dev/null + +f: fig0 fig1 fig2 figs + groff -Tps -s $(OPT) figs | lpr + +fig1.ps: fig0 fig1 + ( cat fig0 ; echo ".LP" ; cat fig1 ) | groff -Tps $(OPT) >$@ + +fig2.ps: fig0 fig2 + ( cat fig0 ; echo ".LP" ; cat fig2 ) | groff -Tps $(OPT) >$@ + +fp: fig1.ps fig2.ps + +r.1: regexp.c splitter + splitter regexp.c + +rs.1: regsub.c splitter + splitter regsub.c + +re.1: regerror.c splitter + splitter regerror.c + +rm.h: regmagic.h splitter + splitter regmagic.h + +re.h: regexp.h splitter + splitter regexp.h + +PLAIN=COPYRIGHT README Makefile regexp.3 try.c 
timer.c tests +FIX=regexp.h regexp.c regsub.c regerror.c regmagic.h +DTR=$(PLAIN) $(FIX) + +dtr: r $(DTR) + rm -rf $(TMP) + mkdir $(TMP) + cp $(PLAIN) $(TMP) + for f in $(FIX) ; do normalize $$f >$(TMP)/$$f ; done + ( cd $(TMP) ; makedtr $(DTR) ) >$@ + rm -rf $(TMP) + +ch.ps: ch Makefile $(BITS) + groff -Tps $(OPT) ch >$@ + +copy: ch.soe ch.sml fp + makedtr REMARKS ch.sml fig*.ps ch.soe >$@ + +go: copy dtr diff --git a/scsh/regexp/README b/scsh/regexp/README index a18c796..bcb9cf5 100644 --- a/scsh/regexp/README +++ b/scsh/regexp/README @@ -1,55 +1,37 @@ -This is a nearly-public-domain reimplementation of the V8 regexp(3) package. +This is a revision of my well-known regular-expression package, regexp(3). It gives C programs the ability to use egrep-style regular expressions, and does it in a much cleaner fashion than the analogous routines in SysV. - - Copyright (c) 1986 by University of Toronto. - Written by Henry Spencer. Not derived from licensed software. - - Permission is granted to anyone to use this software for any - purpose on any computer system, and to redistribute it freely, - subject to the following restrictions: - - 1. The author is not responsible for the consequences of use of - this software, no matter how awful, even if they arise - from defects in it. - - 2. The origin of this software must not be misrepresented, either - by explicit claim or by omission. - - 3. Altered versions must be plainly marked as such, and must not - be misrepresented as being the original software. - -Barring a couple of small items in the BUGS list, this implementation is -believed 100% compatible with V8. It should even be binary-compatible, -sort of, since the only fields in a "struct regexp" that other people have -any business touching are declared in exactly the same way at the same -location in the struct (the beginning). - -This implementation is *NOT* AT&T/Bell code, and is not derived from licensed +It is not, alas, fully POSIX.2-compliant; that is hard. 
(I'm working on +a full reimplementation that will do that.) + +This version is the one which is examined and explained in one chapter of +"Software Solutions in C" (Dale Schumacher, ed.; AP Professional 1994; +ISBN 0-12-632360-7), plus a couple of insignificant updates, plus one +significant bug fix (done 10 Nov 1995). + +Although this package was inspired by the Bell V8 regexp(3), this +implementation is *NOT* AT&T/Bell code, and is not derived from licensed software. Even though U of T is a V8 licensee. This software is based on a V8 manual page sent to me by Dennis Ritchie (the manual page enclosed here is a complete rewrite and hence is not covered by AT&T copyright). -The software was nearly complete at the time of arrival of our V8 tape. -I haven't even looked at V8 yet, although a friend elsewhere at U of T has -been kind enough to run a few test programs using the V8 regexp(3) to resolve -a few fine points. I admit to some familiarity with regular-expression -implementations of the past, but the only one that this code traces any -ancestry to is the one published in Kernighan & Plauger (from which this -one draws ideas but not code). - -Simplistically: put this stuff into a source directory, copy regexp.h into -/usr/include, inspect Makefile for compilation options that need changing -to suit your local environment, and then do "make r". This compiles the -regexp(3) functions, compiles a test program, and runs a large set of -regression tests. If there are no complaints, then put regexp.o, regsub.o, -and regerror.o into your C library, and regexp.3 into your manual-pages -directory. - -Note that if you don't put regexp.h into /usr/include *before* compiling, -you'll have to add "-I." to CFLAGS before compiling. 
- +I admit to some familiarity with regular-expression implementations of +the past, but the only one that this code traces any ancestry to is the +one published in Kernighan & Plauger's "Software Tools" (from which +this one draws ideas but not code). + +Simplistically: put this stuff into a source directory, inspect Makefile +for compilation options that need changing to suit your local environment, +and then do "make". This compiles the regexp(3) functions, builds a +library containing them, compiles a test program, and runs a large set of +regression tests. If there are no complaints, then put regexp.h into +/usr/include, add regexp.o, regsub.o, and regerror.o into your C library +(or put libre.a into /usr/lib), and install regexp.3 (perhaps with slight +modifications) in your manual-pages directory. + The files are: - + +COPYRIGHT copyright notice +README this text Makefile instructions to make everything regexp.3 manual page regexp.h header file, for /usr/include @@ -60,24 +42,15 @@ regmagic.h internal header file try.c source for test program timer.c source for timing program tests test list for try and timer - + This implementation uses nondeterministic automata rather than the deterministic ones found in some other implementations, which makes it simpler, smaller, and faster at compiling regular expressions, but slower -at executing them. In theory, anyway. This implementation does employ -some special-case optimizations to make the simpler cases (which do make -up the bulk of regular expressions actually used) run quickly. In general, -if you want blazing speed you're in the wrong place. Replacing the insides -of egrep with this stuff is probably a mistake; if you want your own egrep -you're going to have to do a lot more work. But if you want to use regular -expressions a little bit in something else, you're in luck. Note that many -existing text editors use nondeterministic regular-expression implementations, -so you're in good company. 
- -This stuff should be pretty portable, given appropriate option settings. -If your chars have less than 8 bits, you're going to have to change the -internal representation of the automaton, although knowledge of the details -of this is fairly localized. There are no "reserved" char values except for +at executing them. Many users have found the speed perfectly adequate, +although replacing the insides of egrep with this code would be a mistake. + +This stuff should be pretty portable, given an ANSI C compiler and +appropriate option settings. There are no "reserved" char values except for NUL, and no special significance is attached to the top bit of chars. The string(3) functions are used a fair bit, on the grounds that they are probably faster than coding the operations in line. Some attempts at code diff --git a/scsh/regexp/patch-msg b/scsh/regexp/patch-msg new file mode 100644 index 0000000..36a7ff9 --- /dev/null +++ b/scsh/regexp/patch-msg @@ -0,0 +1,803 @@ +Date: Mon, 1 Jul 1996 23:22:47 GMT +From: Bill Sommerfeld +To: shivers@lcs.mit.edu, bdc@ai.mit.edu +Subject: scsh patch for precompiled regexps.. + +I meant to send this out months ago but I was just too hosed with work. + +Here's what I have right now: + +There are three pieces here: + diffs to the "core" scsh + diffs to Henry Spencer's latest regexp library + a copy of Henry Spencer's latest regexp library.. + +It appears to work (it passes the same regression tests as the C library..). + +Let me know if I didn't include something needed for this to work.. + + - Bill + +diff -rc scsh-0.4.2/scsh/re.scm scsh-0.4.2-regexp/scsh/re.scm +*** scsh-0.4.2/scsh/re.scm Fri Oct 27 04:58:56 1995 +--- scsh-0.4.2-regexp/scsh/re.scm Sat Apr 6 21:07:41 1996 +*************** +*** 34,49 **** + + ;;; Bogus stub definitions for low-level match routines: + +! (define regexp? string?) +! (define (make-regexp str) str) + +! (define (regexp-exec regexp str . 
maybe-start) + (let ((start (optional-arg maybe-start 0)) + (start-vec (make-vector 10)) + (end-vec (make-vector 10))) +! (and (%regexp-match regexp str start start-vec end-vec) +! (make-regexp-match str start-vec end-vec)))) +! + + ;;; Convert a string into a regex pattern that matches that string exactly -- + ;;; in other words, quote the special chars with backslashes. +--- 34,53 ---- + + ;;; Bogus stub definitions for low-level match routines: + +! (define-record iregexp +! string) + +! (define regexp? iregexp?) +! +! (define (make-regexp str) +! (make-iregexp (compile-regexp str))) +! +! (define (regexp-exec r s . maybe-start) + (let ((start (optional-arg maybe-start 0)) + (start-vec (make-vector 10)) + (end-vec (make-vector 10))) +! (and (%regexp-exec-1 (iregexp:string r) s start start-vec end-vec) +! (make-regexp-match s start-vec end-vec)))) + + ;;; Convert a string into a regex pattern that matches that string exactly -- + ;;; in other words, quote the special chars with backslashes. +*************** +*** 58,75 **** + (cons #\\ result) + result)))))) + +! (define-foreign %regexp-match/errno (reg_match (string regexp) +! (string s) +! (integer start) +! (vector-desc start-vec) +! (vector-desc end-vec)) +! static-string ; Error string or #f if all is ok. +! bool) ; match? +! +! (define (%regexp-match regexp string start start-vec end-vec) +! (receive (err match?) (%regexp-match/errno regexp string start +! start-vec end-vec) +! (if err (error err %regexp-match regexp string start) match?))) + + + ;;; I do this one in C, I'm not sure why: +--- 62,79 ---- + (cons #\\ result) + result)))))) + +! ;;;(define-foreign %regexp-match/errno (reg_match (string regexp) +! ;;; (string s) +! ;;; (integer start) +! ;;; (vector-desc start-vec) +! ;;; (vector-desc end-vec)) +! ;;; static-string ; Error string or #f if all is ok. +! ;;; bool) ; match? +! +! ;;;(define (%regexp-match regexp string start start-vec end-vec) +! ;;; (receive (err match?) 
(%regexp-match/errno regexp string start +! ;;; start-vec end-vec) +! ;;; (if err (error err %regexp-match regexp string start) match?))) + + + ;;; I do this one in C, I'm not sure why: +*************** +*** 79,81 **** +--- 83,166 ---- + (filter_stringvec (string regexp) ((C "char const ** ~a") cvec)) + static-string ; error message -- #f if no error. + integer) ; number of files that pass the filter. ++ ++ ;;; precompiled regexps. ++ ++ (define-foreign %regexp-compiled-length (reg_comp_len (string regexp)) ++ static-string ++ integer) ++ ++ (define-foreign %regexp-compile (reg_comp_comp (string regexp) ++ (string-desc re-buf)) ++ static-string) ++ ++ (define (%regexp-exec-1 r s start sv ev) ++ (receive (err match?) (%regexp-exec r s start sv ev) ++ (if err (error err s start) ++ match?))) ++ ++ (define-foreign %regexp-exec (reg_exec (string-desc regexp) ++ (string s) ++ (integer start) ++ (vector-desc start-vec) ++ (vector-desc end-vec)) ++ static-string ++ bool) ++ ++ ++ (define (compile-regexp e) ++ (receive (err len) ++ (%regexp-compiled-length e) ++ (if err (error err e) ++ (let ((buf (make-string len))) ++ (%regexp-compile e buf) ++ buf)))) ++ ++ ++ ++ (define-foreign %regexp-subst (reg_subst (string-desc regexp) ++ (string m) ++ (string s) ++ (integer start) ++ (vector-desc start-vec) ++ (vector-desc end-vec) ++ (string-desc outbuf)) ++ static-string ++ integer) ++ ++ (define-foreign %regexp-subst-len (reg_subst_len (string-desc regexp) ++ (string m) ++ (string s) ++ (integer start) ++ (vector-desc start-vec) ++ (vector-desc end-vec)) ++ static-string ++ integer) ++ ++ ++ (define (regexp-subst re match replacement) ++ (let ((cr (iregexp:string re)) ++ (matchstr (regexp-match:string match)) ++ (startvec (regexp-match:start match)) ++ (endvec (regexp-match:end match))) ++ (receive (err outlen) ++ (%regexp-subst-len cr ++ matchstr ++ replacement ++ 0 ++ startvec ++ endvec) ++ (if err (error err matchstr replacement) ++ (let ((outbuf (make-string outlen))) ++ 
(receive (err outlen) ++ (%regexp-subst cr ++ matchstr ++ replacement ++ 0 ++ startvec ++ endvec ++ outbuf) ++ (if err (error err matchstr replacement) ++ (substring outbuf 0 outlen)))))))) ++ ++ +\ No newline at end of file +diff -rc scsh-0.4.2/scsh/re1.c scsh-0.4.2-regexp/scsh/re1.c +*** scsh-0.4.2/scsh/re1.c Fri Oct 27 04:58:58 1995 +--- scsh-0.4.2-regexp/scsh/re1.c Sat Apr 6 21:01:15 1996 +*************** +*** 19,24 **** +--- 19,150 ---- + /* Stash error msg in global. */ + void regerror(char *msg) {regexp_error = msg;} + ++ /* ++ ** Return NULL normally, error string on error. ++ ** Stash number of bytes needed for compiled regexp into `*len' ++ */ ++ ++ char *reg_comp_len(const char *re, int *len) ++ { ++ int l; ++ ++ regexp_error = NULL; ++ *len = regcomp_len(re); ++ return regexp_error; ++ } ++ ++ /* ++ ** Return NULL normally, error string on error. ++ ** Compile regexp into string described by `cr'. ++ */ ++ ++ char *reg_comp_comp(const char *re, scheme_value cr) ++ { ++ int len = STRING_LENGTH(cr); ++ regexp *r = (regexp *)&STRING_REF(cr, 0); ++ ++ regexp_error = NULL; ++ r = regcomp_comp(re, r, len); ++ return regexp_error; ++ } ++ ++ /* Return NULL normally, error string on error. ++ ** Stash match info in start_vec and end_vec. ++ ** Returns boolean match/no-match in hit. 
++ */ ++ ++ char *reg_exec(scheme_value cr, const char *string, int start, ++ scheme_value start_vec, scheme_value end_vec, int *hit) ++ { ++ regexp *r = (regexp *)&STRING_REF(cr, 0); ++ ++ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { ++ return "Illegal start vector"; ++ } ++ ++ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { ++ return "Illegal end vector"; ++ } ++ ++ regexp_error = 0; ++ *hit = 0; ++ ++ if( regexec(r, string+start) ) { ++ int i; ++ for(i=0; istartp[i]; ++ const char *e = r->endp[i]; ++ VECTOR_REF(start_vec,i) = s?ENTER_FIXNUM(s - string):SCHFALSE; ++ VECTOR_REF(end_vec,i) = e?ENTER_FIXNUM(e - string):SCHFALSE; ++ r->startp[i] = NULL; ++ r->endp[i] = NULL; ++ } ++ *hit = 1; ++ } ++ return regexp_error; ++ } ++ ++ char *reg_subst(scheme_value cr, const char *match, ++ const char *src, int start, ++ scheme_value start_vec, scheme_value end_vec, ++ scheme_value outbuf, int *len) ++ { ++ int i; ++ regexp *r = (regexp *)&STRING_REF(cr, 0); ++ ++ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { ++ return "Illegal start vector"; ++ } ++ ++ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { ++ return "Illegal end vector"; ++ } ++ ++ for (i=0; istartp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL; ++ r->endp[i] = FIXNUMP(ee)? (match + EXTRACT_FIXNUM(ee)):NULL; ++ } ++ ++ regexp_error = NULL; ++ regnsub (r, src, &STRING_REF(outbuf, 0), STRING_LENGTH(outbuf)); ++ *len = strlen(&STRING_REF(outbuf, 0)); ++ return regexp_error; ++ } ++ ++ char *reg_subst_len(scheme_value cr, const char *match, ++ const char *src, int start, ++ scheme_value start_vec, scheme_value end_vec, ++ int *len) ++ { ++ int i; ++ regexp *r = (regexp *)&STRING_REF(cr, 0); ++ ++ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { ++ return "Illegal start vector"; ++ } ++ ++ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { ++ return "Illegal end vector"; ++ } ++ ++ for (i=0; istartp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL; ++ r->endp[i] = FIXNUMP(ee)? 
(match + EXTRACT_FIXNUM(ee)):NULL; ++ } ++ ++ regexp_error = NULL; ++ *len = regsublen (r, src); ++ return regexp_error; ++ } ++ ++ ++ #if 0 + /* Return NULL normally, error string on error. + ** Stash match info in start_vec and end_vec. + ** Returns boolean match/no-match in hit. +*************** +*** 56,61 **** +--- 182,188 ---- + Free(prog); + return regexp_error; + } ++ #endif + + + char *filter_stringvec(const char *re, char const **stringvec, int *nummatch) +diff -rc scsh-0.4.2/scsh/re1.h scsh-0.4.2-regexp/scsh/re1.h +*** scsh-0.4.2/scsh/re1.h Sun Oct 22 08:34:34 1995 +--- scsh-0.4.2-regexp/scsh/re1.h Sat Apr 6 17:54:09 1996 +*************** +*** 1,6 **** +--- 1,21 ---- ++ #if 0 + char *reg_match(const char *re, const char *string, int start, + scheme_value start_vec, scheme_value end_vec, + int *hit); ++ #endif + + char *filter_stringvec(const char *re, char const **stringvec, + int *nummatch); ++ ++ char *reg_comp_len(const char *re, int *len); ++ char *reg_comp_comp(const char *re, scheme_value cr); ++ ++ char *reg_exec(scheme_value cr, const char *string, int start, ++ scheme_value start_vec, scheme_value end_vec, int *hit); ++ ++ char *reg_subst(scheme_value cr, const char *match, ++ const char *src, int start, ++ scheme_value start_vec, scheme_value end_vec, ++ scheme_value outbuf, int *len); ++ ++ + +Only in scsh-0.4.2-regexp/scsh: re2.scm +diff -rc scsh-0.4.2/scsh/scsh-interfaces.scm scsh-0.4.2-regexp/scsh/scsh-interfaces.scm +*** scsh-0.4.2/scsh/scsh-interfaces.scm Tue Oct 31 19:19:30 1995 +--- scsh-0.4.2-regexp/scsh/scsh-interfaces.scm Sat Apr 6 18:48:12 1996 +*************** +*** 413,418 **** +--- 413,419 ---- + make-regexp + regexp? + regexp-exec ++ regexp-subst + regexp-quote)) + + + +regexp library changes: + +*** Makefile 1996/04/06 19:24:49 1.1 +--- Makefile 1996/04/06 20:46:26 +*************** +*** 5,11 **** + # Things you might want to put in TEST: + # -DDEBUG debugging hooks + # -I. regexp.h from current directory, not /usr/include +! 
TEST=-I. + + # Things you might want to put in PROF: + # -pg profiler +--- 5,11 ---- + # Things you might want to put in TEST: + # -DDEBUG debugging hooks + # -I. regexp.h from current directory, not /usr/include +! TEST=-I. -DDEBUG + + # Things you might want to put in PROF: + # -pg profiler +*** regexp.c 1996/04/06 19:24:49 1.1 +--- regexp.c 1996/04/06 22:34:55 +*************** +*** 105,110 **** +--- 105,111 ---- + * Utility definitions. + */ + #define FAIL(m) { regerror(m); return(NULL); } ++ #define FAILN(m) { regerror(m); return(-1); } + #define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?') + #define META "^$.[()|?+*\\" + +*************** +*** 162,173 **** + const char *exp; + { + register regexp *r; +! register char *scan; + int flags; + struct comp co; + + if (exp == NULL) +! FAIL("NULL argument to regcomp"); + + /* First pass: determine size, legality. */ + co.regparse = (char *)exp; +--- 163,193 ---- + const char *exp; + { + register regexp *r; +! size_t len; +! +! len = regcomp_len(exp); +! if (len <= 0) +! return NULL; +! +! /* Allocate space. */ +! r = (regexp *)malloc(len); +! +! if (r == NULL) +! FAIL("out of space"); +! return regcomp_comp(exp, r, len); +! } +! +! +! size_t +! regcomp_len(exp) +! const char *exp; +! { + int flags; ++ register regexp *r; + struct comp co; + + if (exp == NULL) +! FAILN("NULL argument to regcomp"); + + /* First pass: determine size, legality. */ + co.regparse = (char *)exp; +*************** +*** 178,198 **** + co.regcode = co.regdummy; + regc(&co, MAGIC); + if (reg(&co, 0, &flags) == NULL) +! return(NULL); + + /* Small enough for pointer-storage convention? */ + if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */ +! FAIL("regexp too big"); + +! /* Allocate space. */ +! r = (regexp *)malloc(sizeof(regexp) + (size_t)co.regsize); +! if (r == NULL) +! FAIL("out of space"); + + /* Second pass: emit code. 
*/ + co.regparse = (char *)exp; + co.regnpar = 1; + co.regcode = r->program; + regc(&co, MAGIC); + if (reg(&co, 0, &flags) == NULL) + return(NULL); +--- 198,228 ---- + co.regcode = co.regdummy; + regc(&co, MAGIC); + if (reg(&co, 0, &flags) == NULL) +! return -1; + + /* Small enough for pointer-storage convention? */ + if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */ +! FAILN("regexp too big"); + +! return (sizeof(regexp) + (size_t)co.regsize); +! } +! +! +! regexp * +! regcomp_comp(exp, r, len) +! const char *exp; +! register regexp *r; +! size_t len; +! { +! register char *scan; +! int flags; +! struct comp co; + + /* Second pass: emit code. */ + co.regparse = (char *)exp; + co.regnpar = 1; + co.regcode = r->program; ++ co.regsize = len - sizeof(regexp); + regc(&co, MAGIC); + if (reg(&co, 0, &flags) == NULL) + return(NULL); +*************** +*** 200,206 **** + /* Dig out information for optimizations. */ + r->regstart = '\0'; /* Worst-case defaults. */ + r->reganch = 0; +! r->regmust = NULL; + r->regmlen = 0; + scan = r->program+1; /* First BRANCH. */ + if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ +--- 230,236 ---- + /* Dig out information for optimizations. */ + r->regstart = '\0'; /* Worst-case defaults. */ + r->reganch = 0; +! r->regmust = 0; + r->regmlen = 0; + scan = r->program+1; /* First BRANCH. */ + if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ +*************** +*** 229,235 **** + longest = OPERAND(scan); + len = strlen(OPERAND(scan)); + } +! r->regmust = longest; + r->regmlen = (int)len; + } + } +--- 259,265 ---- + longest = OPERAND(scan); + len = strlen(OPERAND(scan)); + } +! r->regmust = longest - r->program; + r->regmlen = (int)len; + } + } +*************** +*** 648,655 **** + struct exec { + char *reginput; /* String-input pointer. */ + char *regbol; /* Beginning of input, for ^ check. */ +! char **regstartp; /* Pointer to startp array. */ +! char **regendp; /* Ditto for endp. 
*/ + }; + + /* +--- 678,685 ---- + struct exec { + char *reginput; /* String-input pointer. */ + char *regbol; /* Beginning of input, for ^ check. */ +! const char **regstartp; /* Pointer to startp array. */ +! const char **regendp; /* Ditto for endp. */ + }; + + /* +*************** +*** 690,696 **** + } + + /* If there is a "must appear" string, look for it. */ +! if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL) + return(0); + + /* Mark beginning of line for ^ . */ +--- 720,727 ---- + } + + /* If there is a "must appear" string, look for it. */ +! if ((prog->regmlen > 0) && +! strstr(string, &prog->program[prog->regmust]) == NULL) + return(0); + + /* Mark beginning of line for ^ . */ +*************** +*** 729,736 **** + char *string; + { + register int i; +! register char **stp; +! register char **enp; + + ep->reginput = string; + +--- 760,767 ---- + char *string; + { + register int i; +! register const char **stp; +! register const char **enp; + + ep->reginput = string; + +*************** +*** 1004,1011 **** + printf("start `%c' ", r->regstart); + if (r->reganch) + printf("anchored "); +! if (r->regmust != NULL) +! printf("must have \"%s\"", r->regmust); + printf("\n"); + } + +--- 1035,1042 ---- + printf("start `%c' ", r->regstart); + if (r->reganch) + printf("anchored "); +! if (r->regmlen > 0) +! printf("must have \"%s\"", &r->program[r->regmust]); + printf("\n"); + } + +*** regexp.h 1996/04/06 19:24:49 1.1 +--- regexp.h 1996/04/07 01:52:19 +*************** +*** 6,16 **** + */ + #define NSUBEXP 10 + typedef struct regexp { +! char *startp[NSUBEXP]; +! char *endp[NSUBEXP]; + char regstart; /* Internal use only. */ + char reganch; /* Internal use only. */ +! char *regmust; /* Internal use only. */ + int regmlen; /* Internal use only. */ + char program[1]; /* Unwarranted chumminess with compiler. */ + } regexp; +--- 6,16 ---- + */ + #define NSUBEXP 10 + typedef struct regexp { +! const char *startp[NSUBEXP]; +! 
const char *endp[NSUBEXP]; + char regstart; /* Internal use only. */ + char reganch; /* Internal use only. */ +! int regmust; /* Internal use only. */ + int regmlen; /* Internal use only. */ + char program[1]; /* Unwarranted chumminess with compiler. */ + } regexp; +*************** +*** 18,21 **** +--- 18,27 ---- + extern regexp *regcomp(const char *re); + extern int regexec(regexp *rp, const char *s); + extern void regsub(const regexp *rp, const char *src, char *dst); ++ extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len); ++ extern size_t regsublen(const regexp *rp, const char *src); ++ + extern void regerror(char *message); ++ extern size_t regcomp_len(const char *exp); ++ extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len); ++ +*** regsub.c 1996/04/06 19:24:49 1.1 +--- regsub.c 1996/04/07 02:10:29 +*************** +*** 11,25 **** + /* + - regsub - perform substitutions after a regexp match + */ + void +! regsub(rp, source, dest) + const regexp *rp; + const char *source; + char *dest; + { + register regexp * const prog = (regexp *)rp; +! register char *src = (char *)source; + register char *dst = dest; + register char c; + register int no; + register size_t len; +--- 11,42 ---- + /* + - regsub - perform substitutions after a regexp match + */ ++ ++ void regsub(rp, source, dest) ++ const regexp *rp; ++ const char *source; ++ char *dest; ++ { ++ regnsub(rp, source, dest, BUFSIZ); ++ } ++ ++ ++ ++ /* ++ - regnsub - perform bounds-checked substitutions after a regexp match ++ */ + void +! regnsub(rp, source, dest, destlen) + const regexp *rp; + const char *source; + char *dest; ++ size_t destlen; + { + register regexp * const prog = (regexp *)rp; +! 
register const char *src = (char *)source; + register char *dst = dest; ++ char *dstend = dest + destlen; ++ char *odst; + register char c; + register int no; + register size_t len; +*************** +*** 45,55 **** + if (c == '\\' && (*src == '\\' || *src == '&')) + c = *src++; + *dst++ = c; + } else if (prog->startp[no] != NULL && prog->endp[no] != NULL && +! prog->endp[no] > prog->startp[no]) { + len = prog->endp[no] - prog->startp[no]; +! (void) strncpy(dst, prog->startp[no], len); + dst += len; + if (*(dst-1) == '\0') { /* strncpy hit NUL. */ + regerror("damaged match string"); + return; +--- 62,83 ---- + if (c == '\\' && (*src == '\\' || *src == '&')) + c = *src++; + *dst++ = c; ++ if (dst >= dstend) ++ { ++ regerror("output buffer too small"); ++ return; ++ } + } else if (prog->startp[no] != NULL && prog->endp[no] != NULL && +! prog->endp[no] > prog->startp[no]) { + len = prog->endp[no] - prog->startp[no]; +! odst = dst; + dst += len; ++ if (dst >= dstend) ++ { ++ regerror("output buffer too small"); ++ return; ++ } ++ (void) strncpy(odst, prog->startp[no], len); + if (*(dst-1) == '\0') { /* strncpy hit NUL. */ + regerror("damaged match string"); + return; +*************** +*** 58,60 **** +--- 86,131 ---- + } + *dst++ = '\0'; + } ++ ++ size_t regsublen(rp, source) ++ const regexp *rp; ++ const char *source; ++ { ++ register regexp * const prog = (regexp *)rp; ++ register char *src = (char *)source; ++ register char c; ++ register int no; ++ register int len = 0; ++ ++ if (prog == NULL || source == NULL) { ++ regerror("NULL parameter to regsublen"); ++ return -1; ++ } ++ ++ if ((unsigned char)*(prog->program) != MAGIC) { ++ regerror("damaged regexp"); ++ return -1; ++ } ++ while ((c = *src++) != '\0') { ++ if (c == '&') ++ no = 0; ++ else if (c == '\\' && isdigit(*src)) ++ no = *src++ - '0'; ++ else ++ no = -1; ++ if (no < 0) { /* Ordinary character. 
*/ ++ if (c == '\\' && (*src == '\\' || *src == '&')) ++ src++; ++ len++; ++ } else { ++ const char *s = prog->startp[no]; ++ const char *e = prog->endp[no]; ++ if ((s != NULL) && (e != NULL) && (e > s)) { ++ len += e-s; ++ } ++ } ++ } ++ return len+1; ++ } ++ ++ + +Original regexp code from henry: +[unpacked & deleted -Olin] diff --git a/scsh/regexp/regerror.c b/scsh/regexp/regerror.c index 2d682c2..a280cee 100644 --- a/scsh/regexp/regerror.c +++ b/scsh/regexp/regerror.c @@ -1,14 +1,18 @@ +/* + * regerror + */ #include - -void +#include + +void regerror(s) - char *s; +char *s; { #ifdef ERRAVAIL - error("regexp: %s", s); + error("regexp: %s", s); #else - fprintf(stderr, "regexp(3): %s", s); - exit(1); + fprintf(stderr, "regexp(3): %s\n", s); + exit(EXIT_FAILURE); #endif - /* NOTREACHED */ + /* NOTREACHED */ } diff --git a/scsh/regexp/regexp.3 b/scsh/regexp/regexp.3 new file mode 100644 index 0000000..6d2555b --- /dev/null +++ b/scsh/regexp/regexp.3 @@ -0,0 +1,186 @@ +.TH REGEXP 3 "2 Sept 1995" +.SH NAME +regcomp, regexec, regsub, regerror \- regular expression handler +.SH SYNOPSIS +.ft B +.nf +#include + +regexp *regcomp(exp) +const char *exp; + +int regexec(prog, string) +regexp *prog; +const char *string; + +void regsub(prog, source, dest) +const regexp *prog; +const char *source; +char *dest; + +void regerror(msg) +char *msg; +.SH DESCRIPTION +These functions implement +.IR egrep (1)-style +regular expressions and supporting facilities. +.PP +.I Regcomp +compiles a regular expression into a structure of type +.IR regexp , +and returns a pointer to it. +The space has been allocated using +.IR malloc (3) +and may be released by +.IR free . +.PP +.I Regexec +matches a NUL-terminated \fIstring\fR against the compiled regular expression +in \fIprog\fR. +It returns 1 for success and 0 for failure, and adjusts the contents of +\fIprog\fR's \fIstartp\fR and \fIendp\fR (see below) accordingly. 
+.PP +The members of a +.I regexp +structure include at least the following (not necessarily in order): +.PP +.RS +char *startp[NSUBEXP]; +.br +char *endp[NSUBEXP]; +.RE +.PP +where +.I NSUBEXP +is defined (as 10) in the header file. +Once a successful \fIregexec\fR has been done using the \fIregexp\fR, +each \fIstartp\fR-\fIendp\fR pair describes one substring +within the \fIstring\fR, +with the \fIstartp\fR pointing to the first character of the substring and +the \fIendp\fR pointing to the first character following the substring. +The 0th substring is the substring of \fIstring\fR that matched the whole +regular expression. +The others are those substrings that matched parenthesized expressions +within the regular expression, with parenthesized expressions numbered +in left-to-right order of their opening parentheses. +.PP +.I Regsub +copies \fIsource\fR to \fIdest\fR, making substitutions according to the +most recent \fIregexec\fR performed using \fIprog\fR. +Each instance of `&' in \fIsource\fR is replaced by the substring +indicated by \fIstartp\fR[\fI0\fR] and +\fIendp\fR[\fI0\fR]. +Each instance of `\e\fIn\fR', where \fIn\fR is a digit, is replaced by +the substring indicated by +\fIstartp\fR[\fIn\fR] and +\fIendp\fR[\fIn\fR]. +To get a literal `&' or `\e\fIn\fR' into \fIdest\fR, prefix it with `\e'; +to get a literal `\e' preceding `&' or `\e\fIn\fR', prefix it with +another `\e'. +.PP +.I Regerror +is called whenever an error is detected in \fIregcomp\fR, \fIregexec\fR, +or \fIregsub\fR. +The default \fIregerror\fR writes the string \fImsg\fR, +with a suitable indicator of origin, +on the standard +error output +and invokes \fIexit\fR(2). +.I Regerror +can be replaced by the user if other actions are desirable. +.SH "REGULAR EXPRESSION SYNTAX" +A regular expression is zero or more \fIbranches\fR, separated by `|'. +It matches anything that matches one of the branches. +.PP +A branch is zero or more \fIpieces\fR, concatenated. 
+It matches a match for the first, followed by a match for the second, etc. +.PP +A piece is an \fIatom\fR possibly followed by `*', `+', or `?'. +An atom followed by `*' matches a sequence of 0 or more matches of the atom. +An atom followed by `+' matches a sequence of 1 or more matches of the atom. +An atom followed by `?' matches a match of the atom, or the null string. +.PP +An atom is a regular expression in parentheses (matching a match for the +regular expression), a \fIrange\fR (see below), `.' +(matching any single character), `^' (matching the null string at the +beginning of the input string), `$' (matching the null string at the +end of the input string), a `\e' followed by a single character (matching +that character), or a single character with no other significance +(matching that character). +.PP +A \fIrange\fR is a sequence of characters enclosed in `[]'. +It normally matches any single character from the sequence. +If the sequence begins with `^', +it matches any single character \fInot\fR from the rest of the sequence. +If two characters in the sequence are separated by `\-', this is shorthand +for the full list of ASCII characters between them +(e.g. `[0-9]' matches any decimal digit). +To include a literal `]' in the sequence, make it the first character +(following a possible `^'). +To include a literal `\-', make it the first or last character. +.SH AMBIGUITY +If a regular expression could match two different parts of the input string, +it will match the one which begins earliest. +If both begin in the same place but match different lengths, or match +the same length in different ways, life gets messier, as follows. +.PP +In general, the possibilities in a list of branches are considered in +left-to-right order, the possibilities for `*', `+', and `?' are +considered longest-first, nested constructs are considered from the +outermost in, and concatenated constructs are considered leftmost-first. 
+The match that will be chosen is the one that uses the earliest +possibility in the first choice that has to be made. +If there is more than one choice, the next will be made in the same manner +(earliest possibility) subject to the decision on the first choice. +And so forth. +.PP +For example, `(ab|a)b*c' could match `abc' in one of two ways. +The first choice is between `ab' and `a'; since `ab' is earlier, and does +lead to a successful overall match, it is chosen. +Since the `b' is already spoken for, +the `b*' must match its last possibility\(emthe empty string\(emsince +it must respect the earlier choice. +.PP +In the particular case where the regular expression does not use `|' +and does not apply `*', `+', or `?' to parenthesized subexpressions, +the net effect is that the longest possible +match will be chosen. +So `ab*', presented with `xabbbby', will match `abbbb'. +Note that if `ab*' is tried against `xabyabbbz', it +will match `ab' just after `x', due to the begins-earliest rule. +(In effect, the decision on where to start the match is the first choice +to be made, hence subsequent choices must respect it even if this leads them +to less-preferred alternatives.) +.SH SEE ALSO +egrep(1), expr(1) +.SH DIAGNOSTICS +\fIRegcomp\fR returns NULL for a failure +(\fIregerror\fR permitting), +where failures are syntax errors, exceeding implementation limits, +or applying `+' or `*' to a possibly-null operand. +.SH HISTORY +This is a revised version. +Both code and manual page were +originally written by Henry Spencer at University of Toronto. +They are intended to be compatible with the Bell V8 \fIregexp\fR(3), +but are not derived from Bell code. +.SH BUGS +Empty branches and empty regular expressions are not portable +to other, otherwise-similar, implementations. +.PP +The ban on +applying `*' or `+' to a possibly-null operand is an artifact of the +simplistic implementation. +.PP +The match-choice rules are complex. 
+A simple ``longest match'' rule would be preferable, +but is harder to implement. +.PP +Although there is a general similarity to POSIX.2 ``extended'' regular +expressions, neither the regular-expression syntax nor the programming +interface is an exact match. +.PP +Due to emphasis on +compactness and simplicity, +it's not strikingly fast. +It does give some attention to handling simple cases quickly. diff --git a/scsh/regexp/regexp.c b/scsh/regexp/regexp.c index 5f7fa6d..65d7e88 100644 --- a/scsh/regexp/regexp.c +++ b/scsh/regexp/regexp.c @@ -1,45 +1,21 @@ /* - * regcomp and regexec -- regsub and regerror are elsewhere @(#)regexp.c 1.3 - * of 18 April 87 - * - * Copyright (c) 1986 by University of Toronto. Written by Henry Spencer. Not - * derived from licensed software. - * - * Permission is granted to anyone to use this software for any purpose on any - * computer system, and to redistribute it freely, subject to the following - * restrictions: - * - * 1. The author is not responsible for the consequences of use of this - * software, no matter how awful, even if they arise from defects in it. - * - * 2. The origin of this software must not be misrepresented, either by explicit - * claim or by omission. - * - * 3. Altered versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * - * Beware that some of this code is subtly aware of the way operator precedence - * is structured in regular expressions. Serious changes in - * regular-expression syntax might require a total rethink. + * regcomp and regexec -- regsub and regerror are elsewhere */ #include -#ifdef AMIGA -#undef min -#include "regexp.h" -#else +#include +#include #include -#endif #include "regmagic.h" - + /* * The "internal use only" fields in regexp.h are present to pass info from * compile to execute that permits the execute phase to run lots faster on - * simple cases. They are: + * simple cases. 
They are: * - * regstart char that must begin a match; '\0' if none obvious reganch - * is the match anchored (at beginning-of-line only)? regmust string - * (pointer into program) that match must include, or NULL regmlen - * length of regmust string + * regstart char that must begin a match; '\0' if none obvious + * reganch is the match anchored (at beginning-of-line only)? + * regmust string (pointer into program) that match must include, or NULL + * regmlen length of regmust string * * Regstart and reganch permit very fast decisions on suitable starting points * for a match, cutting down the work a lot. Regmust permits fast rejection @@ -48,1164 +24,1101 @@ * potentially expensive (at present, the only such thing detected is * or + * at the start of the r.e., which can involve a lot of backup). Regmlen is * supplied because the test in regexec() needs it and regcomp() is computing - * it anyway. + * it anyway. */ - + /* - * Structure for regexp "program". This is essentially a linear encoding of - * a nondeterministic finite-state machine (aka syntax charts or "railroad - * normal form" in parsing technology). Each node is an opcode plus a "next" - * pointer, possibly plus an operand. "Next" pointers of all nodes except - * BRANCH implement concatenation; a "next" pointer with a BRANCH on both - * ends of it is connecting two alternatives. (Here we have one of the - * subtle syntax dependencies: an individual BRANCH (as opposed to a - * collection of them) is never concatenated with anything because of - * operator precedence.) The operand of some types of node is a literal - * string; for others, it is a node leading into a sub-FSM. In particular, - * the operand of a BRANCH node is the first node of the branch. (NB this is - * *not* a tree structure: the tail of the branch connects to the thing - * following the set of BRANCHes.) The opcodes are: + * Structure for regexp "program". 
This is essentially a linear encoding + * of a nondeterministic finite-state machine (aka syntax charts or + * "railroad normal form" in parsing technology). Each node is an opcode + * plus a "next" pointer, possibly plus an operand. "Next" pointers of + * all nodes except BRANCH implement concatenation; a "next" pointer with + * a BRANCH on both ends of it is connecting two alternatives. (Here we + * have one of the subtle syntax dependencies: an individual BRANCH (as + * opposed to a collection of them) is never concatenated with anything + * because of operator precedence.) The operand of some types of node is + * a literal string; for others, it is a node leading into a sub-FSM. In + * particular, the operand of a BRANCH node is the first node of the branch. + * (NB this is *not* a tree structure: the tail of the branch connects + * to the thing following the set of BRANCHes.) The opcodes are: */ - + /* definition number opnd? meaning */ -#define END 0 /* no End of program. */ -#define BOL 1 /* no Match "" at beginning of line. */ -#define EOL 2 /* no Match "" at end of line. */ -#define ANY 3 /* no Match any one character. */ -#define ANYOF 4 /* str Match any character in this string. */ -#define ANYBUT 5 /* str Match any character not in this - * string. */ -#define BRANCH 6 /* node Match this alternative, or the - * next... */ -#define BACK 7 /* no Match "", "next" ptr points backward. */ -#define EXACTLY 8 /* str Match this string. */ -#define NOTHING 9 /* no Match empty string. */ -#define STAR 10 /* node Match this (simple) thing 0 or more - * times. */ -#define PLUS 11 /* node Match this (simple) thing 1 or more - * times. */ -#define OPEN 20 /* no Mark this point in input as start of - * #n. */ -/* OPEN+1 is number 1, etc. */ -#define CLOSE 30 /* no Analogous to OPEN. */ - +#define END 0 /* no End of program. */ +#define BOL 1 /* no Match beginning of line. */ +#define EOL 2 /* no Match end of line. */ +#define ANY 3 /* no Match any character. 
*/ +#define ANYOF 4 /* str Match any of these. */ +#define ANYBUT 5 /* str Match any but one of these. */ +#define BRANCH 6 /* node Match this, or the next..\&. */ +#define BACK 7 /* no "next" ptr points backward. */ +#define EXACTLY 8 /* str Match this string. */ +#define NOTHING 9 /* no Match empty string. */ +#define STAR 10 /* node Match this 0 or more times. */ +#define PLUS 11 /* node Match this 1 or more times. */ +#define OPEN 20 /* no Sub-RE starts here. */ + /* OPEN+1 is number 1, etc. */ +#define CLOSE 30 /* no Analogous to OPEN. */ + /* - * Opcode notes: + * Opcode notes: * - * BRANCH The set of branches constituting a single choice are hooked together - * with their "next" pointers, since precedence prevents anything being - * concatenated to any individual branch. The "next" pointer of the last - * BRANCH in a choice points to the thing following the whole choice. This - * is also where the final "next" pointer of each individual branch points; - * each branch starts with the operand node of a BRANCH node. + * BRANCH The set of branches constituting a single choice are hooked + * together with their "next" pointers, since precedence prevents + * anything being concatenated to any individual branch. The + * "next" pointer of the last BRANCH in a choice points to the + * thing following the whole choice. This is also where the + * final "next" pointer of each individual branch points; each + * branch starts with the operand node of a BRANCH node. * - * BACK Normal "next" pointers all implicitly point forward; BACK exists to - * make loop structures possible. + * BACK Normal "next" pointers all implicitly point forward; BACK + * exists to make loop structures possible. * * STAR,PLUS '?', and complex '*' and '+', are implemented as circular - * BRANCH structures using BACK. Simple cases (one character per match) are - * implemented with STAR and PLUS for speed and to minimize recursive - * plunges. + * BRANCH structures using BACK. 
Simple cases (one character + * per match) are implemented with STAR and PLUS for speed + * and to minimize recursive plunges. * - * OPEN,CLOSE ...are numbered at compile time. + * OPEN,CLOSE ...are numbered at compile time. */ - + /* * A node is one char of opcode followed by two chars of "next" pointer. * "Next" pointers are stored as two 8-bit pieces, high order first. The - * value is a positive offset from the opcode of the node containing it. An - * operand, if any, simply follows the node. (Note that much of the code - * generation knows about this implicit relationship.) + * value is a positive offset from the opcode of the node containing it. + * An operand, if any, simply follows the node. (Note that much of the + * code generation knows about this implicit relationship.) * - * Using two bytes for the "next" pointer is vast overkill for most things, but - * allows patterns to get big without disasters. + * Using two bytes for the "next" pointer is vast overkill for most things, + * but allows patterns to get big without disasters. */ -#define OP(p) (*(p)) -#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) -#define OPERAND(p) ((p) + 3) - +#define OP(p) (*(p)) +#define NEXT(p) (((*((p)+1)&0177)<<8) + (*((p)+2)&0377)) +#define OPERAND(p) ((p) + 3) + /* - * See regmagic.h for one further detail of program structure. + * See regmagic.h for one further detail of program structure. */ - - + + /* - * Utility definitions. + * Utility definitions. */ -#ifndef CHARBITS -#define UCHARAT(p) ((int)*(unsigned char *)(p)) -#else -#define UCHARAT(p) ((int)*(p)&CHARBITS) -#endif - -#define FAIL(m) { regerror(m); return(NULL); } -#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') -#define META "^$.[()|?+*\\" - +#define FAIL(m) { regerror(m); return(NULL); } +#define FAILN(m) { regerror(m); return(-1); } +#define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define META "^$.[()|?+*\\" + /* - * Flags to be passed up and down. 
+ * Flags to be passed up and down. */ -#define HASWIDTH 01 /* Known never to match null string. */ -#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ -#define SPSTART 04 /* Starts with * or +. */ -#define WORST 0 /* Worst case. */ - +#define HASWIDTH 01 /* Known never to match null string. */ +#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ +#define SPSTART 04 /* Starts with * or +. */ +#define WORST 0 /* Worst case. */ + /* - * Global work variables for regcomp(). + * Work-variable struct for regcomp(). */ -static char *regparse; /* Input-scan pointer. */ -static int regnpar; /* () count. */ -static char regdummy; -static char *regcode; /* Code-emit pointer; &regdummy = don't. */ -static long regsize; /* Code size. */ - +struct comp { + char *regparse; /* Input-scan pointer. */ + int regnpar; /* () count. */ + char *regcode; /* Code-emit pointer; &regdummy = don't. */ + char regdummy[3]; /* NOTHING, 0 next ptr */ + long regsize; /* Code size. */ +}; +#define EMITTING(cp) ((cp)->regcode != (cp)->regdummy) + /* - * Forward declarations for regcomp()'s friends. + * Forward declarations for regcomp()'s friends.
*/ -#ifndef STATIC -#define STATIC static -#endif -STATIC char *reg(); -STATIC char *regbranch(); -STATIC char *regpiece(); -STATIC char *regatom(); -STATIC char *regnode(); -STATIC char *regnext(); -STATIC void regc(); -STATIC void reginsert(); -STATIC void regtail(); -STATIC void regoptail(); -#ifdef STRCSPN -int strcspn(); -#endif - +static char *reg(struct comp *cp, int paren, int *flagp); +static char *regbranch(struct comp *cp, int *flagp); +static char *regpiece(struct comp *cp, int *flagp); +static char *regatom(struct comp *cp, int *flagp); +static char *regnode(struct comp *cp, int op); +static char *regnext(char *node); +static void regc(struct comp *cp, int c); +static void reginsert(struct comp *cp, int op, char *opnd); +static void regtail(struct comp *cp, char *p, char *val); +static void regoptail(struct comp *cp, char *p, char *val); + /* - * - regcomp - compile a regular expression into internal code + - regcomp - compile a regular expression into internal code * - * We can't allocate space until we know how big the compiled form will be, but - * we can't compile it (and thus know how big it is) until we've got a place - * to put the code. So we cheat: we compile it twice, once with code + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code * generation turned off and size counting turned on, and once "for real". * This also means that we don't allocate space until we are sure that the - * thing really will compile successfully, and we never have to move the code - * and thus invalidate pointers into it. (Note that it has to be in one - * piece because free() must be able to free it all.) + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. 
(Note that it has to be in + * one piece because free() must be able to free it all.) * - * Beware that the optimization-preparation code in here knows about some of the - * structure of the compiled regexp. + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. */ -regexp * +regexp * regcomp(exp) - char *exp; +const char *exp; { - register regexp *r; - register char *scan; - register char *longest; - register int len; - int flags; - extern char *malloc(); - - if (exp == NULL) - FAIL("NULL argument"); - - /* First pass: determine size, legality. */ - regparse = exp; - regnpar = 1; - regsize = 0L; - regcode = &regdummy; - regc(MAGIC); - if (reg(0, &flags) == NULL) - return (NULL); - - /* Small enough for pointer-storage convention? */ - if (regsize >= 32767L) /* Probably could be 65535L. */ - FAIL("regexp too big"); - - /* Allocate space. */ - r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize); - if (r == NULL) - FAIL("out of space"); - - /* Second pass: emit code. */ - regparse = exp; - regnpar = 1; - regcode = r->program; - regc(MAGIC); - if (reg(0, &flags) == NULL) - return (NULL); - - /* Dig out information for optimizations. */ - r->regstart = '\0'; /* Worst-case defaults. */ - r->reganch = 0; - r->regmust = NULL; - r->regmlen = 0; - scan = r->program + 1; /* First BRANCH. */ - if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ - scan = OPERAND(scan); - - /* Starting-point info. */ - if (OP(scan) == EXACTLY) - r->regstart = *OPERAND(scan); - else if (OP(scan) == BOL) - r->reganch++; - - /* - * If there's something expensive in the r.e., find the longest - * literal string that must appear and make it the regmust. Resolve - * ties in favor of later strings, since the regstart check works - * with the beginning of the r.e. and avoiding duplication - * strengthens checking. Not a strong reason, but sufficient in the - * absence of others.
- */ - if (flags & SPSTART) { - longest = NULL; - len = 0; - for (; scan != NULL; scan = regnext(scan)) - if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { - longest = OPERAND(scan); - len = strlen(OPERAND(scan)); + register regexp *r; + size_t len; + + len = regcomp_len(exp); + if (len <= 0) + return NULL; + + /* Allocate space. */ + r = (regexp *)malloc(len); + + if (r == NULL) + FAIL("out of space"); + return regcomp_comp(exp, r, len); +} + + +size_t +regcomp_len(exp) +const char *exp; +{ + int flags; + register regexp *r; + struct comp co; + + if (exp == NULL) + FAILN("NULL argument to regcomp"); + + /* First pass: determine size, legality. */ + co.regparse = (char *)exp; + co.regnpar = 1; + co.regsize = 0L; + co.regdummy[0] = NOTHING; + co.regdummy[1] = co.regdummy[2] = 0; + co.regcode = co.regdummy; + regc(&co, MAGIC); + if (reg(&co, 0, &flags) == NULL) + return -1; + + /* Small enough for pointer-storage convention? */ + if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */ + FAILN("regexp too big"); + + return (sizeof(regexp) + (size_t)co.regsize); +} + + +regexp * +regcomp_comp(exp, r, len) +const char *exp; +register regexp *r; +size_t len; +{ + register char *scan; + int flags; + struct comp co; + + /* Second pass: emit code. */ + co.regparse = (char *)exp; + co.regnpar = 1; + co.regcode = r->program; + co.regsize = len - sizeof(regexp); + regc(&co, MAGIC); + if (reg(&co, 0, &flags) == NULL) + return(NULL); + + /* Dig out information for optimizations. */ + r->regstart = '\0'; /* Worst-case defaults. */ + r->reganch = 0; + r->regmust = 0; + r->regmlen = 0; + scan = r->program+1; /* First BRANCH. */ + if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ + scan = OPERAND(scan); + + /* Starting-point info. 
*/ + if (OP(scan) == EXACTLY) + r->regstart = *OPERAND(scan); + else if (OP(scan) == BOL) + r->reganch = 1; + + /* + * If there's something expensive in the r.e., find the + * longest literal string that must appear and make it the + * regmust. Resolve ties in favor of later strings, since + * the regstart check works with the beginning of the r.e. + * and avoiding duplication strengthens checking. Not a + * strong reason, but sufficient in the absence of others. + */ + if (flags&SPSTART) { + register char *longest = NULL; + register size_t len = 0; + + for (; scan != NULL; scan = regnext(scan)) + if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { + longest = OPERAND(scan); + len = strlen(OPERAND(scan)); + } + r->regmust = longest - r->program; + r->regmlen = (int)len; } - r->regmust = longest; - r->regmlen = len; } - } - return (r); + + return(r); } - + /* - * - reg - regular expression, i.e. main body or parenthesized thing + - reg - regular expression, i.e. main body or parenthesized thing * - * Caller must absorb opening parenthesis. + * Caller must absorb opening parenthesis. * - * Combining parenthesis handling with the base level of regular expression is a - * trifle forced, but the need to tie the tails of the branches to what - * follows makes it hard to avoid. + * Combining parenthesis handling with the base level of regular expression + * is a trifle forced, but the need to tie the tails of the branches to what + * follows makes it hard to avoid. */ -static char * -reg(paren, flagp) - int paren; /* Parenthesized? */ - int *flagp; +static char * +reg(cp, paren, flagp) +register struct comp *cp; +int paren; /* Parenthesized? */ +int *flagp; { - register char *ret; - register char *br; - register char *ender; - register int parno; - int flags; - - *flagp = HASWIDTH; /* Tentatively. */ - - /* Make an OPEN node, if parenthesized. 
*/ - if (paren) { - if (regnpar >= NSUBEXP) - FAIL("too many ()"); - parno = regnpar; - regnpar++; - ret = regnode(OPEN + parno); - } else - ret = NULL; - - /* Pick up the branches, linking them together. */ - br = regbranch(&flags); - if (br == NULL) - return (NULL); - if (ret != NULL) - regtail(ret, br); /* OPEN -> first. */ - else - ret = br; - if (!(flags & HASWIDTH)) - *flagp &= ~HASWIDTH; - *flagp |= flags & SPSTART; - while (*regparse == '|') { - regparse++; - br = regbranch(&flags); + register char *ret; + register char *br; + register char *ender; + register int parno; + int flags; + + *flagp = HASWIDTH; /* Tentatively. */ + + if (paren) { + /* Make an OPEN node. */ + if (cp->regnpar >= NSUBEXP) + FAIL("too many ()"); + parno = cp->regnpar; + cp->regnpar++; + ret = regnode(cp, OPEN+parno); + } + + /* Pick up the branches, linking them together. */ + br = regbranch(cp, &flags); if (br == NULL) - return (NULL); - regtail(ret, br); /* BRANCH -> BRANCH. */ - if (!(flags & HASWIDTH)) - *flagp &= ~HASWIDTH; - *flagp |= flags & SPSTART; - } - - /* Make a closing node, and hook it on the end. */ - ender = regnode((paren) ? CLOSE + parno : END); - regtail(ret, ender); - - /* Hook the tails of the branches to the closing node. */ - for (br = ret; br != NULL; br = regnext(br)) - regoptail(br, ender); - - /* Check for proper termination. */ - if (paren && *regparse++ != ')') { - FAIL("unmatched ()"); - } else if (!paren && *regparse != '\0') { - if (*regparse == ')') { - FAIL("unmatched ()"); - } else - FAIL("junk on end");/* "Can't happen". */ - /* NOTREACHED */ - } - return (ret); -} - -/* - * - regbranch - one alternative of an | operator - * - * Implements the concatenation operator. - */ -static char * -regbranch(flagp) - int *flagp; -{ - register char *ret; - register char *chain; - register char *latest; - int flags; - - *flagp = WORST; /* Tentatively. 
*/ - - ret = regnode(BRANCH); - chain = NULL; - while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { - latest = regpiece(&flags); - if (latest == NULL) - return (NULL); - *flagp |= flags & HASWIDTH; - if (chain == NULL) /* First piece. */ - *flagp |= flags & SPSTART; + return(NULL); + if (paren) + regtail(cp, ret, br); /* OPEN -> first. */ else - regtail(chain, latest); - chain = latest; - } - if (chain == NULL) /* Loop ran zero times. */ - (void) regnode(NOTHING); - - return (ret); -} - -/* - * - regpiece - something followed by possible [*+?] - * - * Note that the branching code sequences used for ? and the general cases of * - * and + are somewhat optimized: they use the same NOTHING node as both the - * endmarker for their branch list and the body of the last branch. It might - * seem that this node could be dispensed with entirely, but the endmarker - * role is not redundant. - */ -static char * -regpiece(flagp) - int *flagp; -{ - register char *ret; - register char op; - register char *next; - int flags; - - ret = regatom(&flags); - if (ret == NULL) - return (NULL); - - op = *regparse; - if (!ISMULT(op)) { - *flagp = flags; - return (ret); - } - if (!(flags & HASWIDTH) && op != '?') - FAIL("*+ operand could be empty"); - *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH); - - if (op == '*' && (flags & SIMPLE)) - reginsert(STAR, ret); - else if (op == '*') { - /* Emit x* as (x&|), where & means "self". */ - reginsert(BRANCH, ret); /* Either x */ - regoptail(ret, regnode(BACK)); /* and loop */ - regoptail(ret, ret); /* back */ - regtail(ret, regnode(BRANCH)); /* or */ - regtail(ret, regnode(NOTHING)); /* null. */ - } else if (op == '+' && (flags & SIMPLE)) - reginsert(PLUS, ret); - else if (op == '+') { - /* Emit x+ as x(&|), where & means "self". 
*/ - next = regnode(BRANCH); /* Either */ - regtail(ret, next); - regtail(regnode(BACK), ret); /* loop back */ - regtail(next, regnode(BRANCH)); /* or */ - regtail(ret, regnode(NOTHING)); /* null. */ - } else if (op == '?') { - /* Emit x? as (x|) */ - reginsert(BRANCH, ret); /* Either x */ - regtail(ret, regnode(BRANCH)); /* or */ - next = regnode(NOTHING);/* null. */ - regtail(ret, next); - regoptail(ret, next); - } - regparse++; - if (ISMULT(*regparse)) - FAIL("nested *?+"); - - return (ret); -} - -/* - * - regatom - the lowest level - * - * Optimization: gobbles an entire sequence of ordinary characters so that it - * can turn them into a single node, which is smaller to store and faster to - * run. Backslashed characters are exceptions, each becoming a separate - * node; the code is simpler that way and it's not worth fixing. - */ -static char * -regatom(flagp) - int *flagp; -{ - register char *ret; - int flags; - - *flagp = WORST; /* Tentatively. */ - - switch (*regparse++) { - case '^': - ret = regnode(BOL); - break; - case '$': - ret = regnode(EOL); - break; - case '.': - ret = regnode(ANY); - *flagp |= HASWIDTH | SIMPLE; - break; - case '[':{ - register int class; - register int classend; - - if (*regparse == '^') { /* Complement of range. */ - ret = regnode(ANYBUT); - regparse++; - } else - ret = regnode(ANYOF); - if (*regparse == ']' || *regparse == '-') - regc(*regparse++); - while (*regparse != '\0' && *regparse != ']') { - if (*regparse == '-') { - regparse++; - if (*regparse == ']' || *regparse == '\0') - regc('-'); - else { - class = UCHARAT(regparse - 2) + 1; - classend = UCHARAT(regparse); - if (class > classend + 1) - FAIL("invalid [] range"); - for (; class <= classend; class++) - regc(class); - regparse++; - } + ret = br; + *flagp &= ~(~flags&HASWIDTH); /* Clear bit if bit 0. 
*/ + *flagp |= flags&SPSTART; + while (*cp->regparse == '|') { + cp->regparse++; + br = regbranch(cp, &flags); + if (br == NULL) + return(NULL); + regtail(cp, ret, br); /* BRANCH -> BRANCH. */ + *flagp &= ~(~flags&HASWIDTH); + *flagp |= flags&SPSTART; + } + + /* Make a closing node, and hook it on the end. */ + ender = regnode(cp, (paren) ? CLOSE+parno : END); + regtail(cp, ret, ender); + + /* Hook the tails of the branches to the closing node. */ + for (br = ret; br != NULL; br = regnext(br)) + regoptail(cp, br, ender); + + /* Check for proper termination. */ + if (paren && *cp->regparse++ != ')') { + FAIL("unterminated ()"); + } else if (!paren && *cp->regparse != '\0') { + if (*cp->regparse == ')') { + FAIL("unmatched ()"); } else - regc(*regparse++); - } - regc('\0'); - if (*regparse != ']') - FAIL("unmatched []"); - regparse++; - *flagp |= HASWIDTH | SIMPLE; + FAIL("internal error: junk on end"); + /* NOTREACHED */ } - break; - case '(': - ret = reg(1, &flags); - if (ret == NULL) - return (NULL); - *flagp |= flags & (HASWIDTH | SPSTART); - break; - case '\0': - case '|': - case ')': - FAIL("internal urp"); /* Supposed to be caught earlier. */ - break; - case '?': - case '+': - case '*': - FAIL("?+* follows nothing"); - break; - case '\\': - if (*regparse == '\0') - FAIL("trailing \\"); - ret = regnode(EXACTLY); - regc(*regparse++); - regc('\0'); - *flagp |= HASWIDTH | SIMPLE; - break; - default:{ - register int len; - register char ender; - - regparse--; - len = strcspn(regparse, META); - if (len <= 0) - FAIL("internal disaster"); - ender = *(regparse + len); - if (len > 1 && ISMULT(ender)) - len--; /* Back off clear of ?+* operand. */ - *flagp |= HASWIDTH; - if (len == 1) - *flagp |= SIMPLE; - ret = regnode(EXACTLY); - while (len > 0) { - regc(*regparse++); - len--; - } - regc('\0'); - } - break; - } - - return (ret); + + return(ret); } - + /* - * - regnode - emit a node - */ -static char * /* Location. 
*/ -regnode(op) - char op; -{ - register char *ret; - register char *ptr; - - ret = regcode; - if (ret == ®dummy) { - regsize += 3; - return (ret); - } - ptr = ret; - *ptr++ = op; - *ptr++ = '\0'; /* Null "next" pointer. */ - *ptr++ = '\0'; - regcode = ptr; - - return (ret); -} - -/* - * - regc - emit (if appropriate) a byte of code - */ -static void -regc(b) - char b; -{ - if (regcode != ®dummy) - *regcode++ = b; - else - regsize++; -} - -/* - * - reginsert - insert an operator in front of already-emitted operand + - regbranch - one alternative of an | operator * - * Means relocating the operand. + * Implements the concatenation operator. + */ +static char * +regbranch(cp, flagp) +register struct comp *cp; +int *flagp; +{ + register char *ret; + register char *chain; + register char *latest; + int flags; + register int c; + + *flagp = WORST; /* Tentatively. */ + + ret = regnode(cp, BRANCH); + chain = NULL; + while ((c = *cp->regparse) != '\0' && c != '|' && c != ')') { + latest = regpiece(cp, &flags); + if (latest == NULL) + return(NULL); + *flagp |= flags&HASWIDTH; + if (chain == NULL) /* First piece. */ + *flagp |= flags&SPSTART; + else + regtail(cp, chain, latest); + chain = latest; + } + if (chain == NULL) /* Loop ran zero times. */ + (void) regnode(cp, NOTHING); + + return(ret); +} + +/* + - regpiece - something followed by possible [*+?] + * + * Note that the branching code sequences used for ? and the general cases + * of * and + are somewhat optimized: they use the same NOTHING node as + * both the endmarker for their branch list and the body of the last branch. + * It might seem that this node could be dispensed with entirely, but the + * endmarker role is not redundant. 
+ */ +static char * +regpiece(cp, flagp) +register struct comp *cp; +int *flagp; +{ + register char *ret; + register char op; + register char *next; + int flags; + + ret = regatom(cp, &flags); + if (ret == NULL) + return(NULL); + + op = *cp->regparse; + if (!ISREPN(op)) { + *flagp = flags; + return(ret); + } + + if (!(flags&HASWIDTH) && op != '?') + FAIL("*+ operand could be empty"); + switch (op) { + case '*': *flagp = WORST|SPSTART; break; + case '+': *flagp = WORST|SPSTART|HASWIDTH; break; + case '?': *flagp = WORST; break; + } + + if (op == '*' && (flags&SIMPLE)) + reginsert(cp, STAR, ret); + else if (op == '*') { + /* Emit x* as (x&|), where & means "self". */ + reginsert(cp, BRANCH, ret); /* Either x */ + regoptail(cp, ret, regnode(cp, BACK)); /* and loop */ + regoptail(cp, ret, ret); /* back */ + regtail(cp, ret, regnode(cp, BRANCH)); /* or */ + regtail(cp, ret, regnode(cp, NOTHING)); /* null. */ + } else if (op == '+' && (flags&SIMPLE)) + reginsert(cp, PLUS, ret); + else if (op == '+') { + /* Emit x+ as x(&|), where & means "self". */ + next = regnode(cp, BRANCH); /* Either */ + regtail(cp, ret, next); + regtail(cp, regnode(cp, BACK), ret); /* loop back */ + regtail(cp, next, regnode(cp, BRANCH)); /* or */ + regtail(cp, ret, regnode(cp, NOTHING)); /* null. */ + } else if (op == '?') { + /* Emit x? as (x|) */ + reginsert(cp, BRANCH, ret); /* Either x */ + regtail(cp, ret, regnode(cp, BRANCH)); /* or */ + next = regnode(cp, NOTHING); /* null. */ + regtail(cp, ret, next); + regoptail(cp, ret, next); + } + cp->regparse++; + if (ISREPN(*cp->regparse)) + FAIL("nested *?+"); + + return(ret); +} + +/* + - regatom - the lowest level + * + * Optimization: gobbles an entire sequence of ordinary characters so that + * it can turn them into a single node, which is smaller to store and + * faster to run. Backslashed characters are exceptions, each becoming a + * separate node; the code is simpler that way and it's not worth fixing. 
+ */ +static char * +regatom(cp, flagp) +register struct comp *cp; +int *flagp; +{ + register char *ret; + int flags; + + *flagp = WORST; /* Tentatively. */ + + switch (*cp->regparse++) { + case '^': + ret = regnode(cp, BOL); + break; + case '$': + ret = regnode(cp, EOL); + break; + case '.': + ret = regnode(cp, ANY); + *flagp |= HASWIDTH|SIMPLE; + break; + case '[': { + register int range; + register int rangeend; + register int c; + + if (*cp->regparse == '^') { /* Complement of range. */ + ret = regnode(cp, ANYBUT); + cp->regparse++; + } else + ret = regnode(cp, ANYOF); + if ((c = *cp->regparse) == ']' || c == '-') { + regc(cp, c); + cp->regparse++; + } + while ((c = *cp->regparse++) != '\0' && c != ']') { + if (c != '-') + regc(cp, c); + else if ((c = *cp->regparse) == ']' || c == '\0') + regc(cp, '-'); + else { + range = (unsigned char)*(cp->regparse-2); + rangeend = (unsigned char)c; + if (range > rangeend) + FAIL("invalid [] range"); + for (range++; range <= rangeend; range++) + regc(cp, range); + cp->regparse++; + } + } + regc(cp, '\0'); + if (c != ']') + FAIL("unmatched []"); + *flagp |= HASWIDTH|SIMPLE; + break; + } + case '(': + ret = reg(cp, 1, &flags); + if (ret == NULL) + return(NULL); + *flagp |= flags&(HASWIDTH|SPSTART); + break; + case '\0': + case '|': + case ')': + /* supposed to be caught earlier */ + FAIL("internal error: \\0|) unexpected"); + break; + case '?': + case '+': + case '*': + FAIL("?+* follows nothing"); + break; + case '\\': + if (*cp->regparse == '\0') + FAIL("trailing \\"); + ret = regnode(cp, EXACTLY); + regc(cp, *cp->regparse++); + regc(cp, '\0'); + *flagp |= HASWIDTH|SIMPLE; + break; + default: { + register size_t len; + register char ender; + + cp->regparse--; + len = strcspn(cp->regparse, META); + if (len == 0) + FAIL("internal error: strcspn 0"); + ender = *(cp->regparse+len); + if (len > 1 && ISREPN(ender)) + len--; /* Back off clear of ?+* operand. 
*/ + *flagp |= HASWIDTH; + if (len == 1) + *flagp |= SIMPLE; + ret = regnode(cp, EXACTLY); + for (; len > 0; len--) + regc(cp, *cp->regparse++); + regc(cp, '\0'); + break; + } + } + + return(ret); +} + +/* + - regnode - emit a node + */ +static char * /* Location. */ +regnode(cp, op) +register struct comp *cp; +char op; +{ + register char *const ret = cp->regcode; + register char *ptr; + + if (!EMITTING(cp)) { + cp->regsize += 3; + return(ret); + } + + ptr = ret; + *ptr++ = op; + *ptr++ = '\0'; /* Null next pointer. */ + *ptr++ = '\0'; + cp->regcode = ptr; + + return(ret); +} + +/* + - regc - emit (if appropriate) a byte of code */ static void -reginsert(op, opnd) - char op; - char *opnd; +regc(cp, b) +register struct comp *cp; +char b; { - register char *src; - register char *dst; - register char *place; - - if (regcode == ®dummy) { - regsize += 3; - return; - } - src = regcode; - regcode += 3; - dst = regcode; - while (src > opnd) - *--dst = *--src; - - place = opnd; /* Op node, where operand used to be. */ - *place++ = op; - *place++ = '\0'; - *place++ = '\0'; + if (EMITTING(cp)) + *cp->regcode++ = b; + else + cp->regsize++; } - + /* - * - regtail - set the next-pointer at the end of a node chain + - reginsert - insert an operator in front of already-emitted operand + * + * Means relocating the operand. */ static void -regtail(p, val) - char *p; - char *val; +reginsert(cp, op, opnd) +register struct comp *cp; +char op; +char *opnd; { - register char *scan; - register char *temp; - register int offset; - - if (p == ®dummy) - return; - - /* Find last node. 
*/ - scan = p; - for (;;) { - temp = regnext(scan); - if (temp == NULL) - break; - scan = temp; - } - - if (OP(scan) == BACK) - offset = scan - val; - else - offset = val - scan; - *(scan + 1) = (offset >> 8) & 0377; - *(scan + 2) = offset & 0377; + register char *place; + + if (!EMITTING(cp)) { + cp->regsize += 3; + return; + } + + (void) memmove(opnd+3, opnd, (size_t)(cp->regcode - opnd)); + cp->regcode += 3; + + place = opnd; /* Op node, where operand used to be. */ + *place++ = op; + *place++ = '\0'; + *place++ = '\0'; } - + /* - * - regoptail - regtail on operand of first argument; nop if operandless + - regtail - set the next-pointer at the end of a node chain */ static void -regoptail(p, val) - char *p; - char *val; +regtail(cp, p, val) +register struct comp *cp; +char *p; +char *val; { - /* "Operandless" and "op != BRANCH" are synonymous in practice. */ - if (p == NULL || p == ®dummy || OP(p) != BRANCH) - return; - regtail(OPERAND(p), val); + register char *scan; + register char *temp; + register int offset; + + if (!EMITTING(cp)) + return; + + /* Find last node. */ + for (scan = p; (temp = regnext(scan)) != NULL; scan = temp) + continue; + + offset = (OP(scan) == BACK) ? scan - val : val - scan; + *(scan+1) = (offset>>8)&0177; + *(scan+2) = offset&0377; } - + /* - * regexec and friends + - regoptail - regtail on operand of first argument; nop if operandless */ - +static void +regoptail(cp, p, val) +register struct comp *cp; +char *p; +char *val; +{ + /* "Operandless" and "op != BRANCH" are synonymous in practice. */ + if (!EMITTING(cp) || OP(p) != BRANCH) + return; + regtail(cp, OPERAND(p), val); +} + /* - * Global work variables for regexec(). + * regexec and friends */ -static char *reginput; /* String-input pointer. */ -static char *regbol; /* Beginning of input, for ^ check. */ -static char **regstartp; /* Pointer to startp array. */ -static char **regendp; /* Ditto for endp. */ - + /* - * Forwards. + * Work-variable struct for regexec(). 
*/ -STATIC int regtry(); -STATIC int regmatch(); -STATIC int regrepeat(); - +struct exec { + char *reginput; /* String-input pointer. */ + char *regbol; /* Beginning of input, for ^ check. */ + const char **regstartp; /* Pointer to startp array. */ + const char **regendp; /* Ditto for endp. */ +}; + +/* + * Forwards. + */ +static int regtry(struct exec *ep, regexp *rp, char *string); +static int regmatch(struct exec *ep, char *prog); +static size_t regrepeat(struct exec *ep, char *node); + #ifdef DEBUG -int regnarrate = 0; -void regdump(); -STATIC char *regprop(); +int regnarrate = 0; +void regdump(); +static char *regprop(); #endif - + /* - * - regexec - match a regexp against a string + - regexec - match a regexp against a string */ int -regexec(prog, string) - register regexp *prog; - register char *string; +regexec(prog, str) +register regexp *prog; +const char *str; { - register char *s; - extern char *strchr(); - - /* Be paranoid... */ - if (prog == NULL || string == NULL) { - regerror("NULL parameter"); - return (0); - } - /* Check validity of program. */ - if (UCHARAT(prog->program) != MAGIC) { - regerror("corrupted program"); - return (0); - } - /* If there is a "must appear" string, look for it. */ - if (prog->regmust != NULL) { - s = string; - while ((s = strchr(s, prog->regmust[0])) != NULL) { - if (strncmp(s, prog->regmust, prog->regmlen) == 0) - break; /* Found it. */ - s++; + register char *string = (char *)str; /* avert const poisoning */ + register char *s; + struct exec ex; + + /* Be paranoid. */ + if (prog == NULL || string == NULL) { + regerror("NULL argument to regexec"); + return(0); } - if (s == NULL) /* Not present. */ - return (0); - } - /* Mark beginning of line for ^ . */ - regbol = string; - - /* Simplest case: anchored match need be tried only once. */ - if (prog->reganch) - return (regtry(prog, string)); - - /* Messy cases: unanchored match. */ - s = string; - if (prog->regstart != '\0') - /* We know what char it must start with. 
*/ - while ((s = strchr(s, prog->regstart)) != NULL) { - if (regtry(prog, s)) - return (1); - s++; + + /* Check validity of program. */ + if ((unsigned char)*prog->program != MAGIC) { + regerror("corrupted regexp"); + return(0); } - else - /* We don't -- general case. */ - do { - if (regtry(prog, s)) - return (1); - } while (*s++ != '\0'); - - /* Failure. */ - return (0); + + /* If there is a "must appear" string, look for it. */ + if ((prog->regmlen > 0) && + strstr(string, &prog->program[prog->regmust]) == NULL) + return(0); + + /* Mark beginning of line for ^ . */ + ex.regbol = string; + ex.regstartp = prog->startp; + ex.regendp = prog->endp; + + /* Simplest case: anchored match need be tried only once. */ + if (prog->reganch) + return(regtry(&ex, prog, string)); + + /* Messy cases: unanchored match. */ + if (prog->regstart != '\0') { + /* We know what char it must start with. */ + for (s = string; s != NULL; s = strchr(s+1, prog->regstart)) + if (regtry(&ex, prog, s)) + return(1); + return(0); + } else { + /* We don't -- general case. 
*/ + for (s = string; !regtry(&ex, prog, s); s++) + if (*s == '\0') + return(0); + return(1); + } + /* NOTREACHED */ } - + /* - * - regtry - try match at specific point + - regtry - try match at specific point */ static int /* 0 failure, 1 success */ -regtry(prog, string) - regexp *prog; - char *string; +regtry(ep, prog, string) +register struct exec *ep; +regexp *prog; +char *string; { - register int i; - register char **sp; - register char **ep; - - reginput = string; - regstartp = prog->startp; - regendp = prog->endp; - - sp = prog->startp; - ep = prog->endp; - for (i = NSUBEXP; i > 0; i--) { - *sp++ = NULL; - *ep++ = NULL; - } - if (regmatch(prog->program + 1)) { - prog->startp[0] = string; - prog->endp[0] = reginput; - return (1); - } else - return (0); + register int i; + register const char **stp; + register const char **enp; + + ep->reginput = string; + + stp = prog->startp; + enp = prog->endp; + for (i = NSUBEXP; i > 0; i--) { + *stp++ = NULL; + *enp++ = NULL; + } + if (regmatch(ep, prog->program + 1)) { + prog->startp[0] = string; + prog->endp[0] = ep->reginput; + return(1); + } else + return(0); } - + /* - * - regmatch - main matching routine + - regmatch - main matching routine * - * Conceptually the strategy is simple: check to see whether the current node - * matches, call self recursively to see whether the rest matches, and then - * act accordingly. In practice we make some effort to avoid recursion, in - * particular by going through "ordinary" nodes (that don't need to know - * whether the rest of the match failed) by a loop instead of by recursion. + * Conceptually the strategy is simple: check to see whether the current + * node matches, call self recursively to see whether the rest matches, + * and then act accordingly. In practice we make some effort to avoid + * recursion, in particular by going through "ordinary" nodes (that don't + * need to know whether the rest of the match failed) by a loop instead of + * by recursion. 
*/ static int /* 0 failure, 1 success */ -regmatch(prog) - char *prog; +regmatch(ep, prog) +register struct exec *ep; +char *prog; { - register char *scan; /* Current node. */ - char *next; /* Next node. */ - extern char *strchr(); - - scan = prog; + register char *scan; /* Current node. */ + char *next; /* Next node. */ + #ifdef DEBUG - if (scan != NULL && regnarrate) - fprintf(stderr, "%s(\n", regprop(scan)); + if (prog != NULL && regnarrate) + fprintf(stderr, "%s(\n", regprop(prog)); #endif - while (scan != NULL) { + for (scan = prog; scan != NULL; scan = next) { #ifdef DEBUG - if (regnarrate) - fprintf(stderr, "%s...\n", regprop(scan)); + if (regnarrate) + fprintf(stderr, "%s...\n", regprop(scan)); #endif - next = regnext(scan); - - switch (OP(scan)) { - case BOL: - if (reginput != regbol) - return (0); - break; - case EOL: - if (*reginput != '\0') - return (0); - break; + next = regnext(scan); + + switch (OP(scan)) { + case BOL: + if (ep->reginput != ep->regbol) + return(0); + break; + case EOL: + if (*ep->reginput != '\0') + return(0); + break; + case ANY: + if (*ep->reginput == '\0') + return(0); + ep->reginput++; + break; + case EXACTLY: { + register size_t len; + register char *const opnd = OPERAND(scan); + + /* Inline the first character, for speed. 
*/ + if (*opnd != *ep->reginput) + return(0); + len = strlen(opnd); + if (len > 1 && strncmp(opnd, ep->reginput, len) != 0) + return(0); + ep->reginput += len; + break; + } + case ANYOF: + if (*ep->reginput == '\0' || + strchr(OPERAND(scan), *ep->reginput) == NULL) + return(0); + ep->reginput++; + break; + case ANYBUT: + if (*ep->reginput == '\0' || + strchr(OPERAND(scan), *ep->reginput) != NULL) + return(0); + ep->reginput++; + break; + case NOTHING: + break; + case BACK: + break; + case OPEN+1: case OPEN+2: case OPEN+3: + case OPEN+4: case OPEN+5: case OPEN+6: + case OPEN+7: case OPEN+8: case OPEN+9: { + register const int no = OP(scan) - OPEN; + register char *const input = ep->reginput; + + if (regmatch(ep, next)) { + /* + * Don't set startp if some later + * invocation of the same parentheses + * already has. + */ + if (ep->regstartp[no] == NULL) + ep->regstartp[no] = input; + return(1); + } else + return(0); + break; + } + case CLOSE+1: case CLOSE+2: case CLOSE+3: + case CLOSE+4: case CLOSE+5: case CLOSE+6: + case CLOSE+7: case CLOSE+8: case CLOSE+9: { + register const int no = OP(scan) - CLOSE; + register char *const input = ep->reginput; + + if (regmatch(ep, next)) { + /* + * Don't set endp if some later + * invocation of the same parentheses + * already has. + */ + if (ep->regendp[no] == NULL) + ep->regendp[no] = input; + return(1); + } else + return(0); + break; + } + case BRANCH: { + register char *const save = ep->reginput; + + if (OP(next) != BRANCH) /* No choice. */ + next = OPERAND(scan); /* Avoid recursion. */ + else { + while (OP(scan) == BRANCH) { + if (regmatch(ep, OPERAND(scan))) + return(1); + ep->reginput = save; + scan = regnext(scan); + } + return(0); + /* NOTREACHED */ + } + break; + } + case STAR: case PLUS: { + register const char nextch = + (OP(next) == EXACTLY) ? *OPERAND(next) : '\0'; + register size_t no; + register char *const save = ep->reginput; + register const size_t min = (OP(scan) == STAR) ? 
0 : 1; + + for (no = regrepeat(ep, OPERAND(scan)) + 1; no > min; no--) { + ep->reginput = save + no - 1; + /* If it could work, try it. */ + if (nextch == '\0' || *ep->reginput == nextch) + if (regmatch(ep, next)) + return(1); + } + return(0); + break; + } + case END: + return(1); /* Success! */ + break; + default: + regerror("regexp corruption"); + return(0); + break; + } + } + + /* + * We get here only if there's trouble -- normally "case END" is + * the terminating point. + */ + regerror("corrupted pointers"); + return(0); +} + +/* + - regrepeat - report how many times something simple would match + */ +static size_t +regrepeat(ep, node) +register struct exec *ep; +char *node; +{ + register size_t count; + register char *scan; + register char ch; + + switch (OP(node)) { case ANY: - if (*reginput == '\0') - return (0); - reginput++; - break; - case EXACTLY:{ - register int len; - register char *opnd; - - opnd = OPERAND(scan); - /* Inline the first character, for speed. */ - if (*opnd != *reginput) - return (0); - len = strlen(opnd); - if (len > 1 && strncmp(opnd, reginput, len) != 0) - return (0); - reginput += len; - } - break; + return(strlen(ep->reginput)); + break; + case EXACTLY: + ch = *OPERAND(node); + count = 0; + for (scan = ep->reginput; *scan == ch; scan++) + count++; + return(count); + break; case ANYOF: - if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) - return (0); - reginput++; - break; + return(strspn(ep->reginput, OPERAND(node))); + break; case ANYBUT: - if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) - return (0); - reginput++; - break; - case NOTHING: - break; - case BACK: - break; - case OPEN + 1: - case OPEN + 2: - case OPEN + 3: - case OPEN + 4: - case OPEN + 5: - case OPEN + 6: - case OPEN + 7: - case OPEN + 8: - case OPEN + 9:{ - register int no; - register char *save; - - no = OP(scan) - OPEN; - save = reginput; - - if (regmatch(next)) { - /* - * Don't set startp if some later invocation of the same - 
* parentheses already has. - */ - if (regstartp[no] == NULL) - regstartp[no] = save; - return (1); - } else - return (0); - } - break; - case CLOSE + 1: - case CLOSE + 2: - case CLOSE + 3: - case CLOSE + 4: - case CLOSE + 5: - case CLOSE + 6: - case CLOSE + 7: - case CLOSE + 8: - case CLOSE + 9:{ - register int no; - register char *save; - - no = OP(scan) - CLOSE; - save = reginput; - - if (regmatch(next)) { - /* - * Don't set endp if some later invocation of the same - * parentheses already has. - */ - if (regendp[no] == NULL) - regendp[no] = save; - return (1); - } else - return (0); - } - break; - case BRANCH:{ - register char *save; - - if (OP(next) != BRANCH) /* No choice. */ - next = OPERAND(scan); /* Avoid recursion. */ - else { - do { - save = reginput; - if (regmatch(OPERAND(scan))) - return (1); - reginput = save; - scan = regnext(scan); - } while (scan != NULL && OP(scan) == BRANCH); - return (0); - /* NOTREACHED */ - } - } - break; - case STAR: - case PLUS:{ - register char nextch; - register int no; - register char *save; - register int min; - - /* - * Lookahead to avoid useless match attempts when we know - * what character comes next. - */ - nextch = '\0'; - if (OP(next) == EXACTLY) - nextch = *OPERAND(next); - min = (OP(scan) == STAR) ? 0 : 1; - save = reginput; - no = regrepeat(OPERAND(scan)); - while (no >= min) { - /* If it could work, try it. */ - if (nextch == '\0' || *reginput == nextch) - if (regmatch(next)) - return (1); - /* Couldn't or didn't -- back up. */ - no--; - reginput = save + no; - } - return (0); - } - break; - case END: - return (1); /* Success! */ - break; - default: - regerror("memory corruption"); - return (0); - break; + return(strcspn(ep->reginput, OPERAND(node))); + break; + default: /* Oh dear. Called inappropriately. */ + regerror("internal error: bad call of regrepeat"); + return(0); /* Best compromise. 
*/ + break; } - - scan = next; - } - - /* - * We get here only if there's trouble -- normally "case END" is the - * terminating point. - */ - regerror("corrupted pointers"); - return (0); + /* NOTREACHED */ } - + /* - * - regrepeat - repeatedly match something simple, report how many + - regnext - dig the "next" pointer out of a node */ -static int -regrepeat(p) - char *p; -{ - register int count = 0; - register char *scan; - register char *opnd; - - scan = reginput; - opnd = OPERAND(p); - switch (OP(p)) { - case ANY: - count = strlen(scan); - scan += count; - break; - case EXACTLY: - while (*opnd == *scan) { - count++; - scan++; - } - break; - case ANYOF: - while (*scan != '\0' && strchr(opnd, *scan) != NULL) { - count++; - scan++; - } - break; - case ANYBUT: - while (*scan != '\0' && strchr(opnd, *scan) == NULL) { - count++; - scan++; - } - break; - default: /* Oh dear. Called inappropriately. */ - regerror("internal foulup"); - count = 0; /* Best compromise. */ - break; - } - reginput = scan; - - return (count); -} - -/* - * - regnext - dig the "next" pointer out of a node - */ -static char * +static char * regnext(p) - register char *p; +register char *p; { - register int offset; - - if (p == ®dummy) - return (NULL); - - offset = NEXT(p); - if (offset == 0) - return (NULL); - - if (OP(p) == BACK) - return (p - offset); - else - return (p + offset); + register const int offset = NEXT(p); + + if (offset == 0) + return(NULL); + + return((OP(p) == BACK) ? p-offset : p+offset); } - + #ifdef DEBUG - -STATIC char *regprop(); - + +static char *regprop(); + /* - * - regdump - dump a regexp onto stdout in vaguely comprehensible form + - regdump - dump a regexp onto stdout in vaguely comprehensible form */ void regdump(r) - regexp *r; +regexp *r; { - register char *s; - register char op = EXACTLY; /* Arbitrary non-END op. */ - register char *next; - extern char *strchr(); - - - s = r->program + 1; - while (op != END) { /* While that wasn't END last time... 
*/ - op = OP(s); - printf("%2d%s", s - r->program, regprop(s)); /* Where, what. */ - next = regnext(s); - if (next == NULL) /* Next ptr. */ - printf("(0)"); - else - printf("(%d)", (s - r->program) + (next - s)); - s += 3; - if (op == ANYOF || op == ANYBUT || op == EXACTLY) { - /* Literal string, where present. */ - while (*s != '\0') { - putchar(*s); - s++; - } - s++; + register char *s; + register char op = EXACTLY; /* Arbitrary non-END op. */ + register char *next; + + + s = r->program + 1; + while (op != END) { /* While that wasn't END last time... */ + op = OP(s); + printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ + next = regnext(s); + if (next == NULL) /* Next ptr. */ + printf("(0)"); + else + printf("(%d)", (s-r->program)+(next-s)); + s += 3; + if (op == ANYOF || op == ANYBUT || op == EXACTLY) { + /* Literal string, where present. */ + while (*s != '\0') { + putchar(*s); + s++; + } + s++; + } + putchar('\n'); } - putchar('\n'); - } - - /* Header fields of interest. */ - if (r->regstart != '\0') - printf("start `%c' ", r->regstart); - if (r->reganch) - printf("anchored "); - if (r->regmust != NULL) - printf("must have \"%s\"", r->regmust); - printf("\n"); + + /* Header fields of interest. 
*/ + if (r->regstart != '\0') + printf("start `%c' ", r->regstart); + if (r->reganch) + printf("anchored "); + if (r->regmlen > 0) + printf("must have \"%s\"", &r->program[r->regmust]); + printf("\n"); } - + /* - * - regprop - printable representation of opcode + - regprop - printable representation of opcode */ -static char * +static char * regprop(op) - char *op; +char *op; { - register char *p; - static char buf[50]; - - (void) strcpy(buf, ":"); - - switch (OP(op)) { - case BOL: - p = "BOL"; - break; - case EOL: - p = "EOL"; - break; - case ANY: - p = "ANY"; - break; - case ANYOF: - p = "ANYOF"; - break; - case ANYBUT: - p = "ANYBUT"; - break; - case BRANCH: - p = "BRANCH"; - break; - case EXACTLY: - p = "EXACTLY"; - break; - case NOTHING: - p = "NOTHING"; - break; - case BACK: - p = "BACK"; - break; - case END: - p = "END"; - break; - case OPEN + 1: - case OPEN + 2: - case OPEN + 3: - case OPEN + 4: - case OPEN + 5: - case OPEN + 6: - case OPEN + 7: - case OPEN + 8: - case OPEN + 9: - sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN); - p = NULL; - break; - case CLOSE + 1: - case CLOSE + 2: - case CLOSE + 3: - case CLOSE + 4: - case CLOSE + 5: - case CLOSE + 6: - case CLOSE + 7: - case CLOSE + 8: - case CLOSE + 9: - sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE); - p = NULL; - break; - case STAR: - p = "STAR"; - break; - case PLUS: - p = "PLUS"; - break; - default: - regerror("corrupted opcode"); - break; - } - if (p != NULL) - (void) strcat(buf, p); - return (buf); -} -#endif - -/* - * The following is provided for those people who do not have strcspn() in - * their C libraries. They should get off their butts and do something about - * it; at least one public-domain implementation of those (highly useful) - * string routines has been published on Usenet. 
- */ -#ifdef STRCSPN -/* - * strcspn - find length of initial segment of s1 consisting entirely of - * characters not from s2 - */ - -static int -strcspn(s1, s2) - char *s1; - char *s2; -{ - register char *scan1; - register char *scan2; - register int count; - - count = 0; - for (scan1 = s1; *scan1 != '\0'; scan1++) { - for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ - if (*scan1 == *scan2++) - return (count); - count++; - } - return (count); + register char *p; + static char buf[50]; + + (void) strcpy(buf, ":"); + + switch (OP(op)) { + case BOL: + p = "BOL"; + break; + case EOL: + p = "EOL"; + break; + case ANY: + p = "ANY"; + break; + case ANYOF: + p = "ANYOF"; + break; + case ANYBUT: + p = "ANYBUT"; + break; + case BRANCH: + p = "BRANCH"; + break; + case EXACTLY: + p = "EXACTLY"; + break; + case NOTHING: + p = "NOTHING"; + break; + case BACK: + p = "BACK"; + break; + case END: + p = "END"; + break; + case OPEN+1: + case OPEN+2: + case OPEN+3: + case OPEN+4: + case OPEN+5: + case OPEN+6: + case OPEN+7: + case OPEN+8: + case OPEN+9: + sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); + p = NULL; + break; + case CLOSE+1: + case CLOSE+2: + case CLOSE+3: + case CLOSE+4: + case CLOSE+5: + case CLOSE+6: + case CLOSE+7: + case CLOSE+8: + case CLOSE+9: + sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); + p = NULL; + break; + case STAR: + p = "STAR"; + break; + case PLUS: + p = "PLUS"; + break; + default: + regerror("corrupted opcode"); + break; + } + if (p != NULL) + (void) strcat(buf, p); + return(buf); } #endif diff --git a/scsh/regexp/regexp.h b/scsh/regexp/regexp.h index ef07a9f..48af08c 100644 --- a/scsh/regexp/regexp.h +++ b/scsh/regexp/regexp.h @@ -6,16 +6,22 @@ */ #define NSUBEXP 10 typedef struct regexp { - char *startp[NSUBEXP]; - char *endp[NSUBEXP]; + const char *startp[NSUBEXP]; + const char *endp[NSUBEXP]; char regstart; /* Internal use only. */ char reganch; /* Internal use only. */ - char *regmust; /* Internal use only. 
*/ + int regmust; /* Internal use only. */ int regmlen; /* Internal use only. */ char program[1]; /* Unwarranted chumminess with compiler. */ } regexp; - -extern regexp *regcomp(); -extern int regexec(); -extern void regsub(); -extern void regerror(); + +extern regexp *regcomp(const char *re); +extern int regexec(regexp *rp, const char *s); +extern void regsub(const regexp *rp, const char *src, char *dst); +extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len); +extern size_t regsublen(const regexp *rp, const char *src); + +extern void regerror(char *message); +extern size_t regcomp_len(const char *exp); +extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len); + diff --git a/scsh/regexp/regsub.c b/scsh/regexp/regsub.c index 7fc003c..bc98845 100644 --- a/scsh/regexp/regsub.c +++ b/scsh/regexp/regsub.c @@ -1,83 +1,131 @@ /* - * regsub @(#)regsub.c 1.3 of 2 April 86 - * - * Copyright (c) 1986 by University of Toronto. Written by Henry Spencer. Not - * derived from licensed software. - * - * Permission is granted to anyone to use this software for any purpose on any - * computer system, and to redistribute it freely, subject to the following - * restrictions: - * - * 1. The author is not responsible for the consequences of use of this - * software, no matter how awful, even if they arise from defects in it. - * - * 2. The origin of this software must not be misrepresented, either by explicit - * claim or by omission. - * - * 3. Altered versions must be plainly marked as such, and must not be - * misrepresented as being the original software. 
+ * regsub */ #include -#ifdef AMIGA -#include "regexp.h" -#else +#include +#include +#include #include -#endif #include "regmagic.h" - -#ifndef CHARBITS -#define UCHARAT(p) ((int)*(unsigned char *)(p)) -#else -#define UCHARAT(p) ((int)*(p)&CHARBITS) -#endif - + /* - * - regsub - perform substitutions after a regexp match + - regsub - perform substitutions after a regexp match + */ + +void regsub(rp, source, dest) +const regexp *rp; +const char *source; +char *dest; +{ + regnsub(rp, source, dest, BUFSIZ); +} + + + +/* + - regnsub - perform bounds-checked substitutions after a regexp match */ void -regsub(prog, source, dest) - regexp *prog; - char *source; - char *dest; +regnsub(rp, source, dest, destlen) +const regexp *rp; +const char *source; +char *dest; +size_t destlen; { - register char *src; - register char *dst; - register char c; - register int no; - register int len; - extern char *strncpy(); - - if (prog == NULL || source == NULL || dest == NULL) { - regerror("NULL parm to regsub"); - return; + register regexp * const prog = (regexp *)rp; + register const char *src = (char *)source; + register char *dst = dest; + char *dstend = dest + destlen; + char *odst; + register char c; + register int no; + register size_t len; + + if (prog == NULL || source == NULL || dest == NULL) { + regerror("NULL parameter to regsub"); + return; + } + if ((unsigned char)*(prog->program) != MAGIC) { + regerror("damaged regexp"); + return; + } + + while ((c = *src++) != '\0') { + if (c == '&') + no = 0; + else if (c == '\\' && isdigit(*src)) + no = *src++ - '0'; + else + no = -1; + + if (no < 0) { /* Ordinary character. 
*/ + if (c == '\\' && (*src == '\\' || *src == '&')) + c = *src++; + *dst++ = c; + if (dst >= dstend) + { + regerror("output buffer too small"); + return; + } + } else if (prog->startp[no] != NULL && prog->endp[no] != NULL && + prog->endp[no] > prog->startp[no]) { + len = prog->endp[no] - prog->startp[no]; + odst = dst; + dst += len; + if (dst >= dstend) + { + regerror("output buffer too small"); + return; + } + (void) strncpy(odst, prog->startp[no], len); + if (*(dst-1) == '\0') { /* strncpy hit NUL. */ + regerror("damaged match string"); + return; + } + } + } + *dst++ = '\0'; +} + +size_t regsublen(rp, source) +const regexp *rp; +const char *source; +{ + register regexp * const prog = (regexp *)rp; + register char *src = (char *)source; + register char c; + register int no; + register int len = 0; + + if (prog == NULL || source == NULL) { + regerror("NULL parameter to regsublen"); + return -1; } - if (UCHARAT(prog->program) != MAGIC) { - regerror("damaged regexp fed to regsub"); - return; + + if ((unsigned char)*(prog->program) != MAGIC) { + regerror("damaged regexp"); + return -1; } - src = source; - dst = dest; while ((c = *src++) != '\0') { if (c == '&') no = 0; - else if (c == '\\' && '0' <= *src && *src <= '9') + else if (c == '\\' && isdigit(*src)) no = *src++ - '0'; else no = -1; - if (no < 0) { /* Ordinary character. */ if (c == '\\' && (*src == '\\' || *src == '&')) - c = *src++; - *dst++ = c; - } else if (prog->startp[no] != NULL && prog->endp[no] != NULL) { - len = prog->endp[no] - prog->startp[no]; - (void) strncpy(dst, prog->startp[no], len); - dst += len; - if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. 
*/ - regerror("damaged match string"); - return; + src++; + len++; + } else { + const char *s = prog->startp[no]; + const char *e = prog->endp[no]; + if ((s != NULL) && (e != NULL) && (e > s)) { + len += e-s; } } } - *dst++ = '\0'; + return len+1; } + + diff --git a/scsh/regexp/tests b/scsh/regexp/tests new file mode 100644 index 0000000..10aa6f9 --- /dev/null +++ b/scsh/regexp/tests @@ -0,0 +1,127 @@ +abc abc y & abc +abc xbc n - - +abc axc n - - +abc abx n - - +abc xabcy y & abc +abc ababc y & abc +ab*c abc y & abc +ab*bc abc y & abc +ab*bc abbc y & abbc +ab*bc abbbbc y & abbbbc +ab+bc abbc y & abbc +ab+bc abc n - - +ab+bc abq n - - +ab+bc abbbbc y & abbbbc +ab?bc abbc y & abbc +ab?bc abc y & abc +ab?bc abbbbc n - - +ab?c abc y & abc +^abc$ abc y & abc +^abc$ abcc n - - +^abc abcc y & abc +^abc$ aabc n - - +abc$ aabc y & abc +^ abc y & +$ abc y & +a.c abc y & abc +a.c axc y & axc +a.*c axyzc y & axyzc +a.*c axyzd n - - +a[bc]d abc n - - +a[bc]d abd y & abd +a[b-d]e abd n - - +a[b-d]e ace y & ace +a[b-d] aac y & ac +a[-b] a- y & a- +a[b-] a- y & a- +[k] ab n - - +a[b-a] - c - - +a[]b - c - - +a[ - c - - +a] a] y & a] +a[]]b a]b y & a]b +a[^bc]d aed y & aed +a[^bc]d abd n - - +a[^-b]c adc y & adc +a[^-b]c a-c n - - +a[^]b]c a]c n - - +a[^]b]c adc y & adc +ab|cd abc y & ab +ab|cd abcd y & ab +()ef def y &-\1 ef- +()* - c - - +*a - c - - +^* - c - - +$* - c - - +(*)b - c - - +$b b n - - +a\ - c - - +a\(b a(b y &-\1 a(b- +a\(*b ab y & ab +a\(*b a((b y & a((b +a\\b a\b y & a\b +abc) - c - - +(abc - c - - +((a)) abc y &-\1-\2 a-a-a +(a)b(c) abc y &-\1-\2 abc-a-c +a+b+c aabbabc y & abc +a** - c - - +a*? - c - - +(a*)* - c - - +(a*)+ - c - - +(a|)* - c - - +(a*|b)* - c - - +(a+|b)* ab y &-\1 ab-b +(a+|b)+ ab y &-\1 ab-b +(a+|b)? 
ab y &-\1 a-a +[^ab]* cde y & cde +(^)* - c - - +(ab|)* - c - - +)( - c - - + abc y & +abc n - - +a* y & +abcd abcd y &-\&-\\& abcd-&-\abcd +a(bc)d abcd y \1-\\1-\\\1 bc-\1-\bc +([abc])*d abbbcd y &-\1 abbbcd-c +([abc])*bcd abcd y &-\1 abcd-a +a|b|c|d|e e y & e +(a|b|c|d|e)f ef y &-\1 ef-e +((a*|b))* - c - - +abcd*efg abcdefg y & abcdefg +ab* xabyabbbz y & ab +ab* xayabbbz y & a +(ab|cd)e abcde y &-\1 cde-cd +[abhgefdc]ij hij y & hij +^(ab|cd)e abcde n x\1y xy +(abc|)ef abcdef y &-\1 ef- +(a|b)c*d abcd y &-\1 bcd-b +(ab|ab*)bc abc y &-\1 abc-a +a([bc]*)c* abc y &-\1 abc-bc +a([bc]*)(c*d) abcd y &-\1-\2 abcd-bc-d +a([bc]+)(c*d) abcd y &-\1-\2 abcd-bc-d +a([bc]*)(c+d) abcd y &-\1-\2 abcd-b-cd +a[bcd]*dcdcde adcdcde y & adcdcde +a[bcd]+dcdcde adcdcde n - - +(ab|a)b*c abc y &-\1 abc-ab +((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d +[ -~]* abc y & abc +[ -~ -~]* abc y & abc +[ -~ -~ -~]* abc y & abc +[ -~ -~ -~ -~]* abc y & abc +[ -~ -~ -~ -~ -~]* abc y & abc +[ -~ -~ -~ -~ -~ -~]* abc y & abc +[ -~ -~ -~ -~ -~ -~ -~]* abc y & abc +[a-zA-Z_][a-zA-Z0-9_]* alpha y & alpha +^a(bc+|b[eh])g|.h$ abh y &-\1 bh- +(bc+d$|ef*g.|h?i(j|k)) effgz y &-\1-\2 effgz-effgz- +(bc+d$|ef*g.|h?i(j|k)) ij y &-\1-\2 ij-ij-j +(bc+d$|ef*g.|h?i(j|k)) effg n - - +(bc+d$|ef*g.|h?i(j|k)) bcdd n - - +(bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz- +((((((((((a)))))))))) - c - - +(((((((((a))))))))) a y & a +multiple words of text uh-uh n - - +multiple words multiple words, yeah y & multiple words +(.*)c(.*) abcde y &-\1-\2 abcde-ab-de +\((.*), (.*)\) (a, b) y (\2, \1) (b, a) diff --git a/scsh/regexp/timer.c b/scsh/regexp/timer.c new file mode 100644 index 0000000..c104a4f --- /dev/null +++ b/scsh/regexp/timer.c @@ -0,0 +1,164 @@ +/* + * Simple timing program for regcomp(). + * Usage: timer ncomp nexec nsub + * or + * timer ncomp nexec nsub regexp string [ answer [ sub ] ] + * + * The second form is for timing repetitions of a single test case. 
+ * The first form's test data is a compiled-in copy of the "tests" file. + * Ncomp, nexec, nsub are how many times to do each regcomp, regexec, + * and regsub. The way to time an operation individually is to do something + * like "timer 1 50 1". + */ +#include + +struct try { + char *re, *str, *ans, *src, *dst; +} tests[] = { +#include "timer.t.h" +{ NULL, NULL, NULL, NULL, NULL } +}; + +#include + +int errreport = 0; /* Report errors via errseen? */ +char *errseen = NULL; /* Error message. */ + +char *progname; + +/* ARGSUSED */ +main(argc, argv) +int argc; +char *argv[]; +{ + int ncomp, nexec, nsub; + struct try one; + char dummy[512]; + + if (argc < 4) { + ncomp = 1; + nexec = 1; + nsub = 1; + } else { + ncomp = atoi(argv[1]); + nexec = atoi(argv[2]); + nsub = atoi(argv[3]); + } + + progname = argv[0]; + if (argc > 5) { + one.re = argv[4]; + one.str = argv[5]; + if (argc > 6) + one.ans = argv[6]; + else + one.ans = "y"; + if (argc > 7) { + one.src = argv[7]; + one.dst = "xxx"; + } else { + one.src = "x"; + one.dst = "x"; + } + errreport = 1; + try(one, ncomp, nexec, nsub); + } else + multiple(ncomp, nexec, nsub); + exit(0); +} + +void +regerror(s) +char *s; +{ + if (errreport) + errseen = s; + else + error(s, ""); +} + +#ifndef ERRAVAIL +error(s1, s2) +char *s1; +char *s2; +{ + fprintf(stderr, "regexp: "); + fprintf(stderr, s1, s2); + fprintf(stderr, "\n"); + exit(1); +} +#endif + +int lineno = 0; + +multiple(ncomp, nexec, nsub) +int ncomp, nexec, nsub; +{ + register int i; + extern char *strchr(); + + errreport = 1; + for (i = 0; tests[i].re != NULL; i++) { + lineno++; + try(tests[i], ncomp, nexec, nsub); + } +} + +try(fields, ncomp, nexec, nsub) +struct try fields; +int ncomp, nexec, nsub; +{ + regexp *r; + char dbuf[BUFSIZ]; + register int i; + + errseen = NULL; + r = regcomp(fields.re); + if (r == NULL) { + if (*fields.ans != 'c') + complain("regcomp failure in `%s'", fields.re); + return; + } + if (*fields.ans == 'c') { + complain("unexpected regcomp 
success in `%s'", fields.re); + free((char *)r); + return; + } + for (i = ncomp-1; i > 0; i--) { + free((char *)r); + r = regcomp(fields.re); + } + if (!regexec(r, fields.str)) { + if (*fields.ans != 'n') + complain("regexec failure in `%s'", ""); + free((char *)r); + return; + } + if (*fields.ans == 'n') { + complain("unexpected regexec success", ""); + free((char *)r); + return; + } + for (i = nexec-1; i > 0; i--) + (void) regexec(r, fields.str); + errseen = NULL; + for (i = nsub; i > 0; i--) + regsub(r, fields.src, dbuf); + if (errseen != NULL) { + complain("regsub complaint", ""); + free((char *)r); + return; + } + if (strcmp(dbuf, fields.dst) != 0) + complain("regsub result `%s' wrong", dbuf); + free((char *)r); +} + +complain(s1, s2) +char *s1; +char *s2; +{ + fprintf(stderr, "try: %d: ", lineno); + fprintf(stderr, s1, s2); + fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : ""); +} diff --git a/scsh/regexp/try.c b/scsh/regexp/try.c index 072c94e..9b6424b 100644 --- a/scsh/regexp/try.c +++ b/scsh/regexp/try.c @@ -1,236 +1,220 @@ /* - * Simple test program for regexp(3) stuff. Knows about debugging hooks. + * Simple test program for regexp(3) stuff. Knows about debugging hooks. + * Usage: try re [string [output [-]]] + * The re is compiled and dumped, regexeced against the string, the result + * is applied to output using regsub(). The - triggers a running narrative + * from regexec(). Dumping and narrative don't happen unless DEBUG. * - * Copyright (c) 1986 by University of Toronto. Written by Henry Spencer. Not - * derived from licensed software. - * - * Permission is granted to anyone to use this software for any purpose on any - * computer system, and to redistribute it freely, subject to the following - * restrictions: - * - * 1. The author is not responsible for the consequences of use of this - * software, no matter how awful, even if they arise from defects in it. - * - * 2. 
The origin of this software must not be misrepresented, either by explicit - * claim or by omission. - * - * 3. Altered versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * - * Usage: try re [string [output [-]]] The re is compiled and dumped, regexeced - * against the string, the result is applied to output using regsub(). The - - * triggers a running narrative from regexec(). Dumping and narrative don't - * happen unless DEBUG. - * - * If there are no arguments, stdin is assumed to be a stream of lines with five - * fields: a r.e., a string to match it against, a result code, a source - * string for regsub, and the proper result. Result codes are 'c' for - * compile failure, 'y' for match success, 'n' for match failure. Field - * separator is tab. + * If there are no arguments, stdin is assumed to be a stream of lines with + * five fields: a r.e., a string to match it against, a result code, a + * source string for regsub, and the proper result. Result codes are 'c' + * for compile failure, 'y' for match success, 'n' for match failure. + * Field separator is tab. */ #include #include - + #ifdef ERRAVAIL -char *progname; -extern char *mkprogname(); +char *progname; +extern char *mkprogname(); #endif - + #ifdef DEBUG -extern int regnarrate; +extern int regnarrate; #endif - -char buf[BUFSIZ]; - -int errreport = 0; /* Report errors via errseen? */ -char *errseen = NULL; /* Error message. */ -int status = 0; /* Exit status. */ - + +char buf[BUFSIZ]; + +int errreport = 0; /* Report errors via errseen? */ +char *errseen = NULL; /* Error message. */ +int status = 0; /* Exit status. 
*/ + /* ARGSUSED */ main(argc, argv) - int argc; - char *argv[]; +int argc; +char *argv[]; { - regexp *r; - int i; - + regexp *r; + int i; + #ifdef ERRAVAIL - progname = mkprogname(argv[0]); + progname = mkprogname(argv[0]); #endif - - if (argc == 1) { - multiple(); - exit(status); - } - r = regcomp(argv[1]); - if (r == NULL) - error("regcomp failure", ""); + + if (argc == 1) { + multiple(); + exit(status); + } + + r = regcomp(argv[1]); + if (r == NULL) + error("regcomp failure", ""); #ifdef DEBUG - regdump(r); - if (argc > 4) - regnarrate++; + regdump(r); + if (argc > 4) + regnarrate++; #endif - if (argc > 2) { - i = regexec(r, argv[2]); - printf("%d", i); - for (i = 1; i < NSUBEXP; i++) - if (r->startp[i] != NULL && r->endp[i] != NULL) - printf(" \\%d", i); - printf("\n"); - } - if (argc > 3) { - regsub(r, argv[3], buf); - printf("%s\n", buf); - } - exit(status); + if (argc > 2) { + i = regexec(r, argv[2]); + printf("%d", i); + for (i = 1; i < NSUBEXP; i++) + if (r->startp[i] != NULL && r->endp[i] != NULL) + printf(" \\%d", i); + printf("\n"); + } + if (argc > 3) { + regsub(r, argv[3], buf); + printf("%s\n", buf); + } + exit(status); } - + void regerror(s) - char *s; +char *s; { - if (errreport) - errseen = s; - else - error(s, ""); + if (errreport) + errseen = s; + else + error(s, ""); } - + #ifndef ERRAVAIL error(s1, s2) - char *s1; - char *s2; +char *s1; +char *s2; { - fprintf(stderr, "regexp: "); - fprintf(stderr, s1, s2); - fprintf(stderr, "\n"); - exit(1); + fprintf(stderr, "regexp: "); + fprintf(stderr, s1, s2); + fprintf(stderr, "\n"); + exit(1); } #endif - -int lineno; - -regexp badregexp; /* Implicit init to 0. */ - + +int lineno; + +regexp badregexp; /* Implicit init to 0. */ + multiple() { - char rbuf[BUFSIZ]; - char *field[5]; - char *scan; - int i; - regexp *r; - extern char *strchr(); - - errreport = 1; - lineno = 0; - while (fgets(rbuf, sizeof(rbuf), stdin) != NULL) { - rbuf[strlen(rbuf) - 1] = '\0'; /* Dispense with \n. 
*/ - lineno++; - scan = rbuf; - for (i = 0; i < 5; i++) { - field[i] = scan; - if (field[i] == NULL) { - complain("bad testfile format", ""); - exit(1); - } - scan = strchr(scan, '\t'); - if (scan != NULL) - *scan++ = '\0'; + char rbuf[BUFSIZ]; + char *field[5]; + char *scan; + int i; + regexp *r; + extern char *strchr(); + + errreport = 1; + lineno = 0; + while (fgets(rbuf, sizeof(rbuf), stdin) != NULL) { + rbuf[strlen(rbuf)-1] = '\0'; /* Dispense with \n. */ + lineno++; + scan = rbuf; + for (i = 0; i < 5; i++) { + field[i] = scan; + if (field[i] == NULL) { + complain("bad testfile format", ""); + exit(1); + } + scan = strchr(scan, '\t'); + if (scan != NULL) + *scan++ = '\0'; + } + try(field); } - try(field); - } - - /* And finish up with some internal testing... */ - lineno = 9990; - errseen = NULL; - if (regcomp((char *) NULL) != NULL || errseen == NULL) - complain("regcomp(NULL) doesn't complain", ""); - lineno = 9991; - errseen = NULL; - if (regexec((regexp *) NULL, "foo") || errseen == NULL) - complain("regexec(NULL, ...) doesn't complain", ""); - lineno = 9992; - r = regcomp("foo"); - if (r == NULL) { - complain("regcomp(\"foo\") fails", ""); - return; - } - lineno = 9993; - errseen = NULL; - if (regexec(r, (char *) NULL) || errseen == NULL) - complain("regexec(..., NULL) doesn't complain", ""); - lineno = 9994; - errseen = NULL; - regsub((regexp *) NULL, "foo", rbuf); - if (errseen == NULL) - complain("regsub(NULL, ..., ...) doesn't complain", ""); - lineno = 9995; - errseen = NULL; - regsub(r, (char *) NULL, rbuf); - if (errseen == NULL) - complain("regsub(..., NULL, ...) doesn't complain", ""); - lineno = 9996; - errseen = NULL; - regsub(r, "foo", (char *) NULL); - if (errseen == NULL) - complain("regsub(..., ..., NULL) doesn't complain", ""); - lineno = 9997; - errseen = NULL; - if (regexec(&badregexp, "foo") || errseen == NULL) - complain("regexec(nonsense, ...) 
doesn't complain", ""); - lineno = 9998; - errseen = NULL; - regsub(&badregexp, "foo", rbuf); - if (errseen == NULL) - complain("regsub(nonsense, ..., ...) doesn't complain", ""); + + /* And finish up with some internal testing... */ + lineno = 9990; + errseen = NULL; + if (regcomp((char *)NULL) != NULL || errseen == NULL) + complain("regcomp(NULL) doesn't complain", ""); + lineno = 9991; + errseen = NULL; + if (regexec((regexp *)NULL, "foo") || errseen == NULL) + complain("regexec(NULL, ...) doesn't complain", ""); + lineno = 9992; + r = regcomp("foo"); + if (r == NULL) { + complain("regcomp(\"foo\") fails", ""); + return; + } + lineno = 9993; + errseen = NULL; + if (regexec(r, (char *)NULL) || errseen == NULL) + complain("regexec(..., NULL) doesn't complain", ""); + lineno = 9994; + errseen = NULL; + regsub((regexp *)NULL, "foo", rbuf); + if (errseen == NULL) + complain("regsub(NULL, ..., ...) doesn't complain", ""); + lineno = 9995; + errseen = NULL; + regsub(r, (char *)NULL, rbuf); + if (errseen == NULL) + complain("regsub(..., NULL, ...) doesn't complain", ""); + lineno = 9996; + errseen = NULL; + regsub(r, "foo", (char *)NULL); + if (errseen == NULL) + complain("regsub(..., ..., NULL) doesn't complain", ""); + lineno = 9997; + errseen = NULL; + if (regexec(&badregexp, "foo") || errseen == NULL) + complain("regexec(nonsense, ...) doesn't complain", ""); + lineno = 9998; + errseen = NULL; + regsub(&badregexp, "foo", rbuf); + if (errseen == NULL) + complain("regsub(nonsense, ..., ...) 
doesn't complain", ""); } - + try(fields) - char **fields; +char **fields; { - regexp *r; - char dbuf[BUFSIZ]; - - errseen = NULL; - r = regcomp(fields[0]); - if (r == NULL) { - if (*fields[2] != 'c') - complain("regcomp failure in `%s'", fields[0]); - return; - } - if (*fields[2] == 'c') { - complain("unexpected regcomp success in `%s'", fields[0]); - free((char *) r); - return; - } - if (!regexec(r, fields[1])) { - if (*fields[2] != 'n') - complain("regexec failure in `%s'", ""); - free((char *) r); - return; - } - if (*fields[2] == 'n') { - complain("unexpected regexec success", ""); - free((char *) r); - return; - } - errseen = NULL; - regsub(r, fields[3], dbuf); - if (errseen != NULL) { - complain("regsub complaint", ""); - free((char *) r); - return; - } - if (strcmp(dbuf, fields[4]) != 0) - complain("regsub result `%s' wrong", dbuf); - free((char *) r); + regexp *r; + char dbuf[BUFSIZ]; + + errseen = NULL; + r = regcomp(fields[0]); + if (r == NULL) { + if (*fields[2] != 'c') + complain("regcomp failure in `%s'", fields[0]); + return; + } + if (*fields[2] == 'c') { + complain("unexpected regcomp success in `%s'", fields[0]); + free((char *)r); + return; + } + if (!regexec(r, fields[1])) { + if (*fields[2] != 'n') + complain("regexec failure in `%s'", fields[0]); + free((char *)r); + return; + } + if (*fields[2] == 'n') { + complain("unexpected regexec success", ""); + free((char *)r); + return; + } + errseen = NULL; + regsub(r, fields[3], dbuf); + if (errseen != NULL) { + complain("regsub complaint", ""); + free((char *)r); + return; + } + if (strcmp(dbuf, fields[4]) != 0) + complain("regsub result `%s' wrong", dbuf); + free((char *)r); } - + complain(s1, s2) - char *s1; - char *s2; +char *s1; +char *s2; { - fprintf(stderr, "try: %d: ", lineno); - fprintf(stderr, s1, s2); - fprintf(stderr, " (%s)\n", (errseen != NULL) ? 
errseen : ""); - status = 1; + fprintf(stderr, "try: %d: ", lineno); + fprintf(stderr, s1, s2); + fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : ""); + status = 1; }