updated regexp

This commit is contained in:
bdc 1996-09-24 01:29:51 +00:00
parent 31e3c2e522
commit a61145fa6a
12 changed files with 2732 additions and 1483 deletions

19
scsh/regexp/COPYRIGHT Normal file
View File

@ -0,0 +1,19 @@
Copyright (c) 1986, 1993, 1995 by University of Toronto.
Written by Henry Spencer. Not derived from licensed software.
Permission is granted to anyone to use this software for any
purpose on any computer system, and to redistribute it in any way,
subject to the following restrictions:
1. The author is not responsible for the consequences of use of
this software, no matter how awful, even if they arise
from defects in it.
2. The origin of this software must not be misrepresented, either
by explicit claim or by omission.
3. Altered versions must be plainly marked as such, and must not
be misrepresented (by explicit claim or omission) as being
the original software.
4. This notice must not be removed or altered.

View File

@ -5,92 +5,114 @@ CFLAGS1 = @CFLAGS1@
RANLIB = @RANLIB@
# Things you might want to put in ENV and LENV:
# -Dvoid=int compilers that don't do void
# -DCHARBITS=0377 compilers that don't do unsigned char
# -DSTATIC=extern compilers that don't like "static foo();" as forward decl
# -DSTRCSPN library does not have strcspn()
# -Dstrchr=index library does not have strchr()
# Things you might want to put in ENV:
# -DERRAVAIL have utzoo-compatible error() function and friends
# ENV=-DSTRCSPN
# LENV=-DSTRCSPN
ENV=
# Things you might want to put in TEST:
# -DDEBUG debugging hooks
# -I. regexp.h from current directory, not /usr/include
TEST=-I. -I$(srcdir)
# Things you might want to put in PROF:
# -Dstatic='/* */' make everything global so profiler can see it.
# -p profiler
PROF=
# -pg profiler
# PROF=
INCDEST=/contrib/share/include
LIBDEST=/contrib/system/lib
MANDEST=/contrib/share/man/man3
CFLAGS=$(CFLAGS1) $(ENV) $(TEST) $(PROF)
LDFLAGS=$(PROF)
# CC = cc
# CFLAGS1 = -O -Q
LIB=libregexp.a
OBJ=regexp.o regsub.o regerror.o
TMP=dtr.tmp
LINTFLAGS=$(LENV) $(TEST) -ha
# LDFLAGS=-i
default: r
CFLAGS = $(CFLAGS1) $(ENV) $(TEST) $(PROF)
try: try.o $(LIB)
$(CC) $(LDFLAGS) try.o $(LIB) -o try
OBJ=regexp.o regsub.o
LIBOBJ= $(OBJ) regerror.o
LSRC=regexp.c regsub.c regerror.c
DTR=README dMakefile regexp.3 regexp.h regexp.c regsub.c regerror.c \
regmagic.h try.c timer.c tests
DEST = ..
# we don't use the library anymore -bri
all: $(OBJ) # libregexp.a try
libregexp.a: $(LIBOBJ)
ar r libregexp.a $(LIBOBJ)
$(RANLIB) libregexp.a
install:
install -c libregexp.a $(LIBDEST)/libregexp.a
$(RANLIB) $(LIBDEST)/libregexp.a
install -c regexp.h $(INCDEST)/regexp.h
install -c regexp.3 $(MANDEST)/regexp.3
try: try.o $(OBJ)
$(CC) $(LDFLAGS) try.o $(OBJ) -o try
# Making timer will probably require putting stuff in $(PROF) and then
# recompiling everything; the following is just the final stage.
timer: timer.o $(OBJ)
$(CC) $(LDFLAGS) $(PROF) timer.o $(OBJ) -o timer
timer: timer.o $(LIB)
$(CC) $(LDFLAGS) timer.o $(LIB) -o timer
timer.o: timer.c timer.t.h
timer.t.h: tests
sed 's/ /","/g;s/\\/&&/g;s/.*/{"&"},/' tests >timer.t.h
# Regression test.
r: ./try tests
@echo 'No news is good news...'
./try <tests
lint: timer.t.h
@echo 'Complaints about multiply-declared regerror() are legit.'
lint $(LINTFLAGS) $(LSRC) try.c
lint $(LINTFLAGS) $(LSRC) timer.c
r: try tests
./try <tests # no news is good news...
$(LIB): $(OBJ)
ar cr $(LIB) $(OBJ)
$(RANLIB) libregexp.a
regexp.o: regexp.c regexp.h regmagic.h
regsub.o: regsub.c regexp.h regmagic.h
clean:
rm -f *.o *.out *~ *.a core mon.out timer.t.h dMakefile dtr try timer
dtr: r makedtr $(DTR)
makedtr $(DTR) >dtr
dMakefile: Makefile
sed '/^L*ENV=/s/ *-DERRAVAIL//' Makefile >dMakefile
mv: $(OBJ) regerror.o
mv $(OBJ) regerror.o $(DEST)
rm -f *.o core mon.out gmon.out timer.t.h dtr copy try timer r.*
rm -f residue rs.* re.1 rm.h re.h ch.soe ch.ps j badcom fig[012]
rm -f ch.sml fig[12].ps $(LIB)
rm -rf $(TMP)
# the rest of this is unlikely to be of use to you
BITS = r.1 rs.1 re.1 rm.h re.h
OPT=-p -ms
ch.soe: ch $(BITS)
soelim ch >$@
ch.sml: ch $(BITS) smlize splitfigs
splitfigs ch | soelim | smlize >$@
fig0 fig1 fig2: ch splitfigs
splitfigs ch >/dev/null
f: fig0 fig1 fig2 figs
groff -Tps -s $(OPT) figs | lpr
fig1.ps: fig0 fig1
( cat fig0 ; echo ".LP" ; cat fig1 ) | groff -Tps $(OPT) >$@
fig2.ps: fig0 fig2
( cat fig0 ; echo ".LP" ; cat fig2 ) | groff -Tps $(OPT) >$@
fp: fig1.ps fig2.ps
r.1: regexp.c splitter
splitter regexp.c
rs.1: regsub.c splitter
splitter regsub.c
re.1: regerror.c splitter
splitter regerror.c
rm.h: regmagic.h splitter
splitter regmagic.h
re.h: regexp.h splitter
splitter regexp.h
PLAIN=COPYRIGHT README Makefile regexp.3 try.c timer.c tests
FIX=regexp.h regexp.c regsub.c regerror.c regmagic.h
DTR=$(PLAIN) $(FIX)
dtr: r $(DTR)
rm -rf $(TMP)
mkdir $(TMP)
cp $(PLAIN) $(TMP)
for f in $(FIX) ; do normalize $$f >$(TMP)/$$f ; done
( cd $(TMP) ; makedtr $(DTR) ) >$@
rm -rf $(TMP)
ch.ps: ch Makefile $(BITS)
groff -Tps $(OPT) ch >$@
copy: ch.soe ch.sml fp
makedtr REMARKS ch.sml fig*.ps ch.soe >$@
go: copy dtr

View File

@ -1,55 +1,37 @@
This is a nearly-public-domain reimplementation of the V8 regexp(3) package.
This is a revision of my well-known regular-expression package, regexp(3).
It gives C programs the ability to use egrep-style regular expressions, and
does it in a much cleaner fashion than the analogous routines in SysV.
Copyright (c) 1986 by University of Toronto.
Written by Henry Spencer. Not derived from licensed software.
Permission is granted to anyone to use this software for any
purpose on any computer system, and to redistribute it freely,
subject to the following restrictions:
1. The author is not responsible for the consequences of use of
this software, no matter how awful, even if they arise
from defects in it.
2. The origin of this software must not be misrepresented, either
by explicit claim or by omission.
3. Altered versions must be plainly marked as such, and must not
be misrepresented as being the original software.
Barring a couple of small items in the BUGS list, this implementation is
believed 100% compatible with V8. It should even be binary-compatible,
sort of, since the only fields in a "struct regexp" that other people have
any business touching are declared in exactly the same way at the same
location in the struct (the beginning).
This implementation is *NOT* AT&T/Bell code, and is not derived from licensed
It is not, alas, fully POSIX.2-compliant; that is hard. (I'm working on
a full reimplementation that will do that.)
This version is the one which is examined and explained in one chapter of
"Software Solutions in C" (Dale Schumacher, ed.; AP Professional 1994;
ISBN 0-12-632360-7), plus a couple of insignificant updates, plus one
significant bug fix (done 10 Nov 1995).
Although this package was inspired by the Bell V8 regexp(3), this
implementation is *NOT* AT&T/Bell code, and is not derived from licensed
software. Even though U of T is a V8 licensee. This software is based on
a V8 manual page sent to me by Dennis Ritchie (the manual page enclosed
here is a complete rewrite and hence is not covered by AT&T copyright).
The software was nearly complete at the time of arrival of our V8 tape.
I haven't even looked at V8 yet, although a friend elsewhere at U of T has
been kind enough to run a few test programs using the V8 regexp(3) to resolve
a few fine points. I admit to some familiarity with regular-expression
implementations of the past, but the only one that this code traces any
ancestry to is the one published in Kernighan & Plauger (from which this
one draws ideas but not code).
Simplistically: put this stuff into a source directory, copy regexp.h into
/usr/include, inspect Makefile for compilation options that need changing
to suit your local environment, and then do "make r". This compiles the
regexp(3) functions, compiles a test program, and runs a large set of
regression tests. If there are no complaints, then put regexp.o, regsub.o,
and regerror.o into your C library, and regexp.3 into your manual-pages
directory.
Note that if you don't put regexp.h into /usr/include *before* compiling,
you'll have to add "-I." to CFLAGS before compiling.
I admit to some familiarity with regular-expression implementations of
the past, but the only one that this code traces any ancestry to is the
one published in Kernighan & Plauger's "Software Tools" (from which
this one draws ideas but not code).
Simplistically: put this stuff into a source directory, inspect Makefile
for compilation options that need changing to suit your local environment,
and then do "make". This compiles the regexp(3) functions, builds a
library containing them, compiles a test program, and runs a large set of
regression tests. If there are no complaints, then put regexp.h into
/usr/include, add regexp.o, regsub.o, and regerror.o into your C library
(or put libregexp.a into /usr/lib), and install regexp.3 (perhaps with slight
modifications) in your manual-pages directory.
The files are:
COPYRIGHT copyright notice
README this text
Makefile instructions to make everything
regexp.3 manual page
regexp.h header file, for /usr/include
@ -60,24 +42,15 @@ regmagic.h internal header file
try.c source for test program
timer.c source for timing program
tests test list for try and timer
This implementation uses nondeterministic automata rather than the
deterministic ones found in some other implementations, which makes it
simpler, smaller, and faster at compiling regular expressions, but slower
at executing them. In theory, anyway. This implementation does employ
some special-case optimizations to make the simpler cases (which do make
up the bulk of regular expressions actually used) run quickly. In general,
if you want blazing speed you're in the wrong place. Replacing the insides
of egrep with this stuff is probably a mistake; if you want your own egrep
you're going to have to do a lot more work. But if you want to use regular
expressions a little bit in something else, you're in luck. Note that many
existing text editors use nondeterministic regular-expression implementations,
so you're in good company.
This stuff should be pretty portable, given appropriate option settings.
If your chars have less than 8 bits, you're going to have to change the
internal representation of the automaton, although knowledge of the details
of this is fairly localized. There are no "reserved" char values except for
at executing them. Many users have found the speed perfectly adequate,
although replacing the insides of egrep with this code would be a mistake.
This stuff should be pretty portable, given an ANSI C compiler and
appropriate option settings. There are no "reserved" char values except for
NUL, and no special significance is attached to the top bit of chars.
The string(3) functions are used a fair bit, on the grounds that they are
probably faster than coding the operations in line. Some attempts at code

803
scsh/regexp/patch-msg Normal file
View File

@ -0,0 +1,803 @@
Date: Mon, 1 Jul 1996 23:22:47 GMT
From: Bill Sommerfeld <sommerfeld@orchard.medford.ma.us>
To: shivers@lcs.mit.edu, bdc@ai.mit.edu
Subject: scsh patch for precompiled regexps..
I meant to send this out months ago but I was just too hosed with work.
Here's what I have right now:
There are three pieces here:
diffs to the "core" scsh
diffs to Henry Spencer's latest regexp library
a copy of Henry Spencer's latest regexp library..
It appears to work (it passes the same regression tests as the C library..).
Let me know if I didn't include something needed for this to work..
- Bill
diff -rc scsh-0.4.2/scsh/re.scm scsh-0.4.2-regexp/scsh/re.scm
*** scsh-0.4.2/scsh/re.scm Fri Oct 27 04:58:56 1995
--- scsh-0.4.2-regexp/scsh/re.scm Sat Apr 6 21:07:41 1996
***************
*** 34,49 ****
;;; Bogus stub definitions for low-level match routines:
! (define regexp? string?)
! (define (make-regexp str) str)
! (define (regexp-exec regexp str . maybe-start)
(let ((start (optional-arg maybe-start 0))
(start-vec (make-vector 10))
(end-vec (make-vector 10)))
! (and (%regexp-match regexp str start start-vec end-vec)
! (make-regexp-match str start-vec end-vec))))
!
;;; Convert a string into a regex pattern that matches that string exactly --
;;; in other words, quote the special chars with backslashes.
--- 34,53 ----
;;; Bogus stub definitions for low-level match routines:
! (define-record iregexp
! string)
! (define regexp? iregexp?)
!
! (define (make-regexp str)
! (make-iregexp (compile-regexp str)))
!
! (define (regexp-exec r s . maybe-start)
(let ((start (optional-arg maybe-start 0))
(start-vec (make-vector 10))
(end-vec (make-vector 10)))
! (and (%regexp-exec-1 (iregexp:string r) s start start-vec end-vec)
! (make-regexp-match s start-vec end-vec))))
;;; Convert a string into a regex pattern that matches that string exactly --
;;; in other words, quote the special chars with backslashes.
***************
*** 58,75 ****
(cons #\\ result)
result))))))
! (define-foreign %regexp-match/errno (reg_match (string regexp)
! (string s)
! (integer start)
! (vector-desc start-vec)
! (vector-desc end-vec))
! static-string ; Error string or #f if all is ok.
! bool) ; match?
!
! (define (%regexp-match regexp string start start-vec end-vec)
! (receive (err match?) (%regexp-match/errno regexp string start
! start-vec end-vec)
! (if err (error err %regexp-match regexp string start) match?)))
;;; I do this one in C, I'm not sure why:
--- 62,79 ----
(cons #\\ result)
result))))))
! ;;;(define-foreign %regexp-match/errno (reg_match (string regexp)
! ;;; (string s)
! ;;; (integer start)
! ;;; (vector-desc start-vec)
! ;;; (vector-desc end-vec))
! ;;; static-string ; Error string or #f if all is ok.
! ;;; bool) ; match?
!
! ;;;(define (%regexp-match regexp string start start-vec end-vec)
! ;;; (receive (err match?) (%regexp-match/errno regexp string start
! ;;; start-vec end-vec)
! ;;; (if err (error err %regexp-match regexp string start) match?)))
;;; I do this one in C, I'm not sure why:
***************
*** 79,81 ****
--- 83,166 ----
(filter_stringvec (string regexp) ((C "char const ** ~a") cvec))
static-string ; error message -- #f if no error.
integer) ; number of files that pass the filter.
+
+ ;;; precompiled regexps.
+
+ (define-foreign %regexp-compiled-length (reg_comp_len (string regexp))
+ static-string
+ integer)
+
+ (define-foreign %regexp-compile (reg_comp_comp (string regexp)
+ (string-desc re-buf))
+ static-string)
+
+ (define (%regexp-exec-1 r s start sv ev)
+ (receive (err match?) (%regexp-exec r s start sv ev)
+ (if err (error err s start)
+ match?)))
+
+ (define-foreign %regexp-exec (reg_exec (string-desc regexp)
+ (string s)
+ (integer start)
+ (vector-desc start-vec)
+ (vector-desc end-vec))
+ static-string
+ bool)
+
+
+ (define (compile-regexp e)
+ (receive (err len)
+ (%regexp-compiled-length e)
+ (if err (error err e)
+ (let ((buf (make-string len)))
+ (%regexp-compile e buf)
+ buf))))
+
+
+
+ (define-foreign %regexp-subst (reg_subst (string-desc regexp)
+ (string m)
+ (string s)
+ (integer start)
+ (vector-desc start-vec)
+ (vector-desc end-vec)
+ (string-desc outbuf))
+ static-string
+ integer)
+
+ (define-foreign %regexp-subst-len (reg_subst_len (string-desc regexp)
+ (string m)
+ (string s)
+ (integer start)
+ (vector-desc start-vec)
+ (vector-desc end-vec))
+ static-string
+ integer)
+
+
+ (define (regexp-subst re match replacement)
+ (let ((cr (iregexp:string re))
+ (matchstr (regexp-match:string match))
+ (startvec (regexp-match:start match))
+ (endvec (regexp-match:end match)))
+ (receive (err outlen)
+ (%regexp-subst-len cr
+ matchstr
+ replacement
+ 0
+ startvec
+ endvec)
+ (if err (error err matchstr replacement)
+ (let ((outbuf (make-string outlen)))
+ (receive (err outlen)
+ (%regexp-subst cr
+ matchstr
+ replacement
+ 0
+ startvec
+ endvec
+ outbuf)
+ (if err (error err matchstr replacement)
+ (substring outbuf 0 outlen))))))))
+
+
\ No newline at end of file
diff -rc scsh-0.4.2/scsh/re1.c scsh-0.4.2-regexp/scsh/re1.c
*** scsh-0.4.2/scsh/re1.c Fri Oct 27 04:58:58 1995
--- scsh-0.4.2-regexp/scsh/re1.c Sat Apr 6 21:01:15 1996
***************
*** 19,24 ****
--- 19,150 ----
/* Stash error msg in global. */
void regerror(char *msg) {regexp_error = msg;}
+ /*
+ ** Return NULL normally, error string on error.
+ ** Stash number of bytes needed for compiled regexp into `*len'
+ */
+
+ char *reg_comp_len(const char *re, int *len)
+ {
+ int l;
+
+ regexp_error = NULL;
+ *len = regcomp_len(re);
+ return regexp_error;
+ }
+
+ /*
+ ** Return NULL normally, error string on error.
+ ** Compile regexp into string described by `cr'.
+ */
+
+ char *reg_comp_comp(const char *re, scheme_value cr)
+ {
+ int len = STRING_LENGTH(cr);
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ regexp_error = NULL;
+ r = regcomp_comp(re, r, len);
+ return regexp_error;
+ }
+
+ /* Return NULL normally, error string on error.
+ ** Stash match info in start_vec and end_vec.
+ ** Returns boolean match/no-match in hit.
+ */
+
+ char *reg_exec(scheme_value cr, const char *string, int start,
+ scheme_value start_vec, scheme_value end_vec, int *hit)
+ {
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
+ return "Illegal start vector";
+ }
+
+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
+ return "Illegal end vector";
+ }
+
+ regexp_error = 0;
+ *hit = 0;
+
+ if( regexec(r, string+start) ) {
+ int i;
+ for(i=0; i<NSUBEXP; i++) {
+ const char *s = r->startp[i];
+ const char *e = r->endp[i];
+ VECTOR_REF(start_vec,i) = s?ENTER_FIXNUM(s - string):SCHFALSE;
+ VECTOR_REF(end_vec,i) = e?ENTER_FIXNUM(e - string):SCHFALSE;
+ r->startp[i] = NULL;
+ r->endp[i] = NULL;
+ }
+ *hit = 1;
+ }
+ return regexp_error;
+ }
+
+ char *reg_subst(scheme_value cr, const char *match,
+ const char *src, int start,
+ scheme_value start_vec, scheme_value end_vec,
+ scheme_value outbuf, int *len)
+ {
+ int i;
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
+ return "Illegal start vector";
+ }
+
+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
+ return "Illegal end vector";
+ }
+
+ for (i=0; i<NSUBEXP; i++)
+ {
+ scheme_value se = VECTOR_REF(start_vec, i);
+ scheme_value ee = VECTOR_REF(end_vec, i);
+ r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL;
+ r->endp[i] = FIXNUMP(ee)? (match + EXTRACT_FIXNUM(ee)):NULL;
+ }
+
+ regexp_error = NULL;
+ regnsub (r, src, &STRING_REF(outbuf, 0), STRING_LENGTH(outbuf));
+ *len = strlen(&STRING_REF(outbuf, 0));
+ return regexp_error;
+ }
+
+ char *reg_subst_len(scheme_value cr, const char *match,
+ const char *src, int start,
+ scheme_value start_vec, scheme_value end_vec,
+ int *len)
+ {
+ int i;
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
+ return "Illegal start vector";
+ }
+
+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
+ return "Illegal end vector";
+ }
+
+ for (i=0; i<NSUBEXP; i++)
+ {
+ scheme_value se = VECTOR_REF(start_vec, i);
+ scheme_value ee = VECTOR_REF(end_vec, i);
+ r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL;
+ r->endp[i] = FIXNUMP(ee)? (match + EXTRACT_FIXNUM(ee)):NULL;
+ }
+
+ regexp_error = NULL;
+ *len = regsublen (r, src);
+ return regexp_error;
+ }
+
+
+ #if 0
/* Return NULL normally, error string on error.
** Stash match info in start_vec and end_vec.
** Returns boolean match/no-match in hit.
***************
*** 56,61 ****
--- 182,188 ----
Free(prog);
return regexp_error;
}
+ #endif
char *filter_stringvec(const char *re, char const **stringvec, int *nummatch)
diff -rc scsh-0.4.2/scsh/re1.h scsh-0.4.2-regexp/scsh/re1.h
*** scsh-0.4.2/scsh/re1.h Sun Oct 22 08:34:34 1995
--- scsh-0.4.2-regexp/scsh/re1.h Sat Apr 6 17:54:09 1996
***************
*** 1,6 ****
--- 1,21 ----
+ #if 0
char *reg_match(const char *re, const char *string, int start,
scheme_value start_vec, scheme_value end_vec,
int *hit);
+ #endif
char *filter_stringvec(const char *re, char const **stringvec,
int *nummatch);
+
+ char *reg_comp_len(const char *re, int *len);
+ char *reg_comp_comp(const char *re, scheme_value cr);
+
+ char *reg_exec(scheme_value cr, const char *string, int start,
+ scheme_value start_vec, scheme_value end_vec, int *hit);
+
+ char *reg_subst(scheme_value cr, const char *match,
+ const char *src, int start,
+ scheme_value start_vec, scheme_value end_vec,
+ scheme_value outbuf, int *len);
+
+
Only in scsh-0.4.2-regexp/scsh: re2.scm
diff -rc scsh-0.4.2/scsh/scsh-interfaces.scm scsh-0.4.2-regexp/scsh/scsh-interfaces.scm
*** scsh-0.4.2/scsh/scsh-interfaces.scm Tue Oct 31 19:19:30 1995
--- scsh-0.4.2-regexp/scsh/scsh-interfaces.scm Sat Apr 6 18:48:12 1996
***************
*** 413,418 ****
--- 413,419 ----
make-regexp
regexp?
regexp-exec
+ regexp-subst
regexp-quote))
regexp library changes:
*** Makefile 1996/04/06 19:24:49 1.1
--- Makefile 1996/04/06 20:46:26
***************
*** 5,11 ****
# Things you might want to put in TEST:
# -DDEBUG debugging hooks
# -I. regexp.h from current directory, not /usr/include
! TEST=-I.
# Things you might want to put in PROF:
# -pg profiler
--- 5,11 ----
# Things you might want to put in TEST:
# -DDEBUG debugging hooks
# -I. regexp.h from current directory, not /usr/include
! TEST=-I. -DDEBUG
# Things you might want to put in PROF:
# -pg profiler
*** regexp.c 1996/04/06 19:24:49 1.1
--- regexp.c 1996/04/06 22:34:55
***************
*** 105,110 ****
--- 105,111 ----
* Utility definitions.
*/
#define FAIL(m) { regerror(m); return(NULL); }
+ #define FAILN(m) { regerror(m); return(-1); }
#define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define META "^$.[()|?+*\\"
***************
*** 162,173 ****
const char *exp;
{
register regexp *r;
! register char *scan;
int flags;
struct comp co;
if (exp == NULL)
! FAIL("NULL argument to regcomp");
/* First pass: determine size, legality. */
co.regparse = (char *)exp;
--- 163,193 ----
const char *exp;
{
register regexp *r;
! size_t len;
!
! len = regcomp_len(exp);
! if (len <= 0)
! return NULL;
!
! /* Allocate space. */
! r = (regexp *)malloc(len);
!
! if (r == NULL)
! FAIL("out of space");
! return regcomp_comp(exp, r, len);
! }
!
!
! size_t
! regcomp_len(exp)
! const char *exp;
! {
int flags;
+ register regexp *r;
struct comp co;
if (exp == NULL)
! FAILN("NULL argument to regcomp");
/* First pass: determine size, legality. */
co.regparse = (char *)exp;
***************
*** 178,198 ****
co.regcode = co.regdummy;
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
! return(NULL);
/* Small enough for pointer-storage convention? */
if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
! FAIL("regexp too big");
! /* Allocate space. */
! r = (regexp *)malloc(sizeof(regexp) + (size_t)co.regsize);
! if (r == NULL)
! FAIL("out of space");
/* Second pass: emit code. */
co.regparse = (char *)exp;
co.regnpar = 1;
co.regcode = r->program;
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
return(NULL);
--- 198,228 ----
co.regcode = co.regdummy;
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
! return -1;
/* Small enough for pointer-storage convention? */
if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
! FAILN("regexp too big");
! return (sizeof(regexp) + (size_t)co.regsize);
! }
!
!
! regexp *
! regcomp_comp(exp, r, len)
! const char *exp;
! register regexp *r;
! size_t len;
! {
! register char *scan;
! int flags;
! struct comp co;
/* Second pass: emit code. */
co.regparse = (char *)exp;
co.regnpar = 1;
co.regcode = r->program;
+ co.regsize = len - sizeof(regexp);
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
return(NULL);
***************
*** 200,206 ****
/* Dig out information for optimizations. */
r->regstart = '\0'; /* Worst-case defaults. */
r->reganch = 0;
! r->regmust = NULL;
r->regmlen = 0;
scan = r->program+1; /* First BRANCH. */
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
--- 230,236 ----
/* Dig out information for optimizations. */
r->regstart = '\0'; /* Worst-case defaults. */
r->reganch = 0;
! r->regmust = 0;
r->regmlen = 0;
scan = r->program+1; /* First BRANCH. */
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
***************
*** 229,235 ****
longest = OPERAND(scan);
len = strlen(OPERAND(scan));
}
! r->regmust = longest;
r->regmlen = (int)len;
}
}
--- 259,265 ----
longest = OPERAND(scan);
len = strlen(OPERAND(scan));
}
! r->regmust = longest - r->program;
r->regmlen = (int)len;
}
}
***************
*** 648,655 ****
struct exec {
char *reginput; /* String-input pointer. */
char *regbol; /* Beginning of input, for ^ check. */
! char **regstartp; /* Pointer to startp array. */
! char **regendp; /* Ditto for endp. */
};
/*
--- 678,685 ----
struct exec {
char *reginput; /* String-input pointer. */
char *regbol; /* Beginning of input, for ^ check. */
! const char **regstartp; /* Pointer to startp array. */
! const char **regendp; /* Ditto for endp. */
};
/*
***************
*** 690,696 ****
}
/* If there is a "must appear" string, look for it. */
! if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL)
return(0);
/* Mark beginning of line for ^ . */
--- 720,727 ----
}
/* If there is a "must appear" string, look for it. */
! if ((prog->regmlen > 0) &&
! strstr(string, &prog->program[prog->regmust]) == NULL)
return(0);
/* Mark beginning of line for ^ . */
***************
*** 729,736 ****
char *string;
{
register int i;
! register char **stp;
! register char **enp;
ep->reginput = string;
--- 760,767 ----
char *string;
{
register int i;
! register const char **stp;
! register const char **enp;
ep->reginput = string;
***************
*** 1004,1011 ****
printf("start `%c' ", r->regstart);
if (r->reganch)
printf("anchored ");
! if (r->regmust != NULL)
! printf("must have \"%s\"", r->regmust);
printf("\n");
}
--- 1035,1042 ----
printf("start `%c' ", r->regstart);
if (r->reganch)
printf("anchored ");
! if (r->regmlen > 0)
! printf("must have \"%s\"", &r->program[r->regmust]);
printf("\n");
}
*** regexp.h 1996/04/06 19:24:49 1.1
--- regexp.h 1996/04/07 01:52:19
***************
*** 6,16 ****
*/
#define NSUBEXP 10
typedef struct regexp {
! char *startp[NSUBEXP];
! char *endp[NSUBEXP];
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
! char *regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
--- 6,16 ----
*/
#define NSUBEXP 10
typedef struct regexp {
! const char *startp[NSUBEXP];
! const char *endp[NSUBEXP];
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
! int regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
***************
*** 18,21 ****
--- 18,27 ----
extern regexp *regcomp(const char *re);
extern int regexec(regexp *rp, const char *s);
extern void regsub(const regexp *rp, const char *src, char *dst);
+ extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len);
+ extern size_t regsublen(const regexp *rp, const char *src);
+
extern void regerror(char *message);
+ extern size_t regcomp_len(const char *exp);
+ extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len);
+
*** regsub.c 1996/04/06 19:24:49 1.1
--- regsub.c 1996/04/07 02:10:29
***************
*** 11,25 ****
/*
- regsub - perform substitutions after a regexp match
*/
void
! regsub(rp, source, dest)
const regexp *rp;
const char *source;
char *dest;
{
register regexp * const prog = (regexp *)rp;
! register char *src = (char *)source;
register char *dst = dest;
register char c;
register int no;
register size_t len;
--- 11,42 ----
/*
- regsub - perform substitutions after a regexp match
*/
+
+ void regsub(rp, source, dest)
+ const regexp *rp;
+ const char *source;
+ char *dest;
+ {
+ regnsub(rp, source, dest, BUFSIZ);
+ }
+
+
+
+ /*
+ - regnsub - perform bounds-checked substitutions after a regexp match
+ */
void
! regnsub(rp, source, dest, destlen)
const regexp *rp;
const char *source;
char *dest;
+ size_t destlen;
{
register regexp * const prog = (regexp *)rp;
! register const char *src = (char *)source;
register char *dst = dest;
+ char *dstend = dest + destlen;
+ char *odst;
register char c;
register int no;
register size_t len;
***************
*** 45,55 ****
if (c == '\\' && (*src == '\\' || *src == '&'))
c = *src++;
*dst++ = c;
} else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
! prog->endp[no] > prog->startp[no]) {
len = prog->endp[no] - prog->startp[no];
! (void) strncpy(dst, prog->startp[no], len);
dst += len;
if (*(dst-1) == '\0') { /* strncpy hit NUL. */
regerror("damaged match string");
return;
--- 62,83 ----
if (c == '\\' && (*src == '\\' || *src == '&'))
c = *src++;
*dst++ = c;
+ if (dst >= dstend)
+ {
+ regerror("output buffer too small");
+ return;
+ }
} else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
! prog->endp[no] > prog->startp[no]) {
len = prog->endp[no] - prog->startp[no];
! odst = dst;
dst += len;
+ if (dst >= dstend)
+ {
+ regerror("output buffer too small");
+ return;
+ }
+ (void) strncpy(odst, prog->startp[no], len);
if (*(dst-1) == '\0') { /* strncpy hit NUL. */
regerror("damaged match string");
return;
***************
*** 58,60 ****
--- 86,131 ----
}
*dst++ = '\0';
}
+
+ size_t regsublen(rp, source)
+ const regexp *rp;
+ const char *source;
+ {
+ register regexp * const prog = (regexp *)rp;
+ register char *src = (char *)source;
+ register char c;
+ register int no;
+ register int len = 0;
+
+ if (prog == NULL || source == NULL) {
+ regerror("NULL parameter to regsublen");
+ return -1;
+ }
+
+ if ((unsigned char)*(prog->program) != MAGIC) {
+ regerror("damaged regexp");
+ return -1;
+ }
+ while ((c = *src++) != '\0') {
+ if (c == '&')
+ no = 0;
+ else if (c == '\\' && isdigit(*src))
+ no = *src++ - '0';
+ else
+ no = -1;
+ if (no < 0) { /* Ordinary character. */
+ if (c == '\\' && (*src == '\\' || *src == '&'))
+ src++;
+ len++;
+ } else {
+ const char *s = prog->startp[no];
+ const char *e = prog->endp[no];
+ if ((s != NULL) && (e != NULL) && (e > s)) {
+ len += e-s;
+ }
+ }
+ }
+ return len+1;
+ }
+
+
Original regexp code from henry:
[unpacked & deleted -Olin]

View File

@ -1,14 +1,18 @@
/*
* regerror
*/
#include <stdio.h>
void
#include <stdlib.h>
void
regerror(s)
char *s;
char *s;
{
#ifdef ERRAVAIL
error("regexp: %s", s);
error("regexp: %s", s);
#else
fprintf(stderr, "regexp(3): %s", s);
exit(1);
fprintf(stderr, "regexp(3): %s\n", s);
exit(EXIT_FAILURE);
#endif
/* NOTREACHED */
/* NOTREACHED */
}

186
scsh/regexp/regexp.3 Normal file
View File

@ -0,0 +1,186 @@
.TH REGEXP 3 "2 Sept 1995"
.SH NAME
regcomp, regexec, regsub, regerror \- regular expression handler
.SH SYNOPSIS
.ft B
.nf
#include <regexp.h>
regexp *regcomp(exp)
const char *exp;
int regexec(prog, string)
regexp *prog;
const char *string;
void regsub(prog, source, dest)
const regexp *prog;
const char *source;
char *dest;
void regerror(msg)
char *msg;
.SH DESCRIPTION
These functions implement
.IR egrep (1)-style
regular expressions and supporting facilities.
.PP
.I Regcomp
compiles a regular expression into a structure of type
.IR regexp ,
and returns a pointer to it.
The space has been allocated using
.IR malloc (3)
and may be released by
.IR free .
.PP
.I Regexec
matches a NUL-terminated \fIstring\fR against the compiled regular expression
in \fIprog\fR.
It returns 1 for success and 0 for failure, and adjusts the contents of
\fIprog\fR's \fIstartp\fR and \fIendp\fR (see below) accordingly.
.PP
The members of a
.I regexp
structure include at least the following (not necessarily in order):
.PP
.RS
char *startp[NSUBEXP];
.br
char *endp[NSUBEXP];
.RE
.PP
where
.I NSUBEXP
is defined (as 10) in the header file.
Once a successful \fIregexec\fR has been done using the \fIregexp\fR,
each \fIstartp\fR-\fIendp\fR pair describes one substring
within the \fIstring\fR,
with the \fIstartp\fR pointing to the first character of the substring and
the \fIendp\fR pointing to the first character following the substring.
The 0th substring is the substring of \fIstring\fR that matched the whole
regular expression.
The others are those substrings that matched parenthesized expressions
within the regular expression, with parenthesized expressions numbered
in left-to-right order of their opening parentheses.
.PP
.I Regsub
copies \fIsource\fR to \fIdest\fR, making substitutions according to the
most recent \fIregexec\fR performed using \fIprog\fR.
Each instance of `&' in \fIsource\fR is replaced by the substring
indicated by \fIstartp\fR[\fI0\fR] and
\fIendp\fR[\fI0\fR].
Each instance of `\e\fIn\fR', where \fIn\fR is a digit, is replaced by
the substring indicated by
\fIstartp\fR[\fIn\fR] and
\fIendp\fR[\fIn\fR].
To get a literal `&' or `\e\fIn\fR' into \fIdest\fR, prefix it with `\e';
to get a literal `\e' preceding `&' or `\e\fIn\fR', prefix it with
another `\e'.
.PP
.I Regerror
is called whenever an error is detected in \fIregcomp\fR, \fIregexec\fR,
or \fIregsub\fR.
The default \fIregerror\fR writes the string \fImsg\fR,
with a suitable indicator of origin,
on the standard
error output
and invokes \fIexit\fR(2).
.I Regerror
can be replaced by the user if other actions are desirable.
.SH "REGULAR EXPRESSION SYNTAX"
A regular expression is zero or more \fIbranches\fR, separated by `|'.
It matches anything that matches one of the branches.
.PP
A branch is zero or more \fIpieces\fR, concatenated.
It matches a match for the first, followed by a match for the second, etc.
.PP
A piece is an \fIatom\fR possibly followed by `*', `+', or `?'.
An atom followed by `*' matches a sequence of 0 or more matches of the atom.
An atom followed by `+' matches a sequence of 1 or more matches of the atom.
An atom followed by `?' matches a match of the atom, or the null string.
.PP
An atom is a regular expression in parentheses (matching a match for the
regular expression), a \fIrange\fR (see below), `.'
(matching any single character), `^' (matching the null string at the
beginning of the input string), `$' (matching the null string at the
end of the input string), a `\e' followed by a single character (matching
that character), or a single character with no other significance
(matching that character).
.PP
A \fIrange\fR is a sequence of characters enclosed in `[]'.
It normally matches any single character from the sequence.
If the sequence begins with `^',
it matches any single character \fInot\fR from the rest of the sequence.
If two characters in the sequence are separated by `\-', this is shorthand
for the full list of ASCII characters between them
(e.g. `[0-9]' matches any decimal digit).
To include a literal `]' in the sequence, make it the first character
(following a possible `^').
To include a literal `\-', make it the first or last character.
.SH AMBIGUITY
If a regular expression could match two different parts of the input string,
it will match the one which begins earliest.
If both begin in the same place but match different lengths, or match
the same length in different ways, life gets messier, as follows.
.PP
In general, the possibilities in a list of branches are considered in
left-to-right order, the possibilities for `*', `+', and `?' are
considered longest-first, nested constructs are considered from the
outermost in, and concatenated constructs are considered leftmost-first.
The match that will be chosen is the one that uses the earliest
possibility in the first choice that has to be made.
If there is more than one choice, the next will be made in the same manner
(earliest possibility) subject to the decision on the first choice.
And so forth.
.PP
For example, `(ab|a)b*c' could match `abc' in one of two ways.
The first choice is between `ab' and `a'; since `ab' is earlier, and does
lead to a successful overall match, it is chosen.
Since the `b' is already spoken for,
the `b*' must match its last possibility\(emthe empty string\(emsince
it must respect the earlier choice.
.PP
In the particular case where the regular expression does not use `|'
and does not apply `*', `+', or `?' to parenthesized subexpressions,
the net effect is that the longest possible
match will be chosen.
So `ab*', presented with `xabbbby', will match `abbbb'.
Note that if `ab*' is tried against `xabyabbbz', it
will match `ab' just after `x', due to the begins-earliest rule.
(In effect, the decision on where to start the match is the first choice
to be made, hence subsequent choices must respect it even if this leads them
to less-preferred alternatives.)
.SH SEE ALSO
egrep(1), expr(1)
.SH DIAGNOSTICS
\fIRegcomp\fR returns NULL for a failure
(\fIregerror\fR permitting),
where failures are syntax errors, exceeding implementation limits,
or applying `+' or `*' to a possibly-null operand.
.SH HISTORY
This is a revised version.
Both code and manual page were
originally written by Henry Spencer at University of Toronto.
They are intended to be compatible with the Bell V8 \fIregexp\fR(3),
but are not derived from Bell code.
.SH BUGS
Empty branches and empty regular expressions are not portable
to other, otherwise-similar, implementations.
.PP
The ban on
applying `*' or `+' to a possibly-null operand is an artifact of the
simplistic implementation.
.PP
The match-choice rules are complex.
A simple ``longest match'' rule would be preferable,
but is harder to implement.
.PP
Although there is a general similarity to POSIX.2 ``extended'' regular
expressions, neither the regular-expression syntax nor the programming
interface is an exact match.
.PP
Due to emphasis on
compactness and simplicity,
it's not strikingly fast.
It does give some attention to handling simple cases quickly.

File diff suppressed because it is too large Load Diff

View File

@ -6,16 +6,22 @@
*/
#define NSUBEXP 10
typedef struct regexp {
char *startp[NSUBEXP];
char *endp[NSUBEXP];
const char *startp[NSUBEXP];
const char *endp[NSUBEXP];
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
char *regmust; /* Internal use only. */
int regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
extern regexp *regcomp();
extern int regexec();
extern void regsub();
extern void regerror();
extern regexp *regcomp(const char *re);
extern int regexec(regexp *rp, const char *s);
extern void regsub(const regexp *rp, const char *src, char *dst);
extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len);
extern size_t regsublen(const regexp *rp, const char *src);
extern void regerror(char *message);
extern size_t regcomp_len(const char *exp);
extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len);

View File

@ -1,83 +1,131 @@
/*
* regsub @(#)regsub.c 1.3 of 2 April 86
*
* Copyright (c) 1986 by University of Toronto. Written by Henry Spencer. Not
* derived from licensed software.
*
* Permission is granted to anyone to use this software for any purpose on any
* computer system, and to redistribute it freely, subject to the following
* restrictions:
*
* 1. The author is not responsible for the consequences of use of this
* software, no matter how awful, even if they arise from defects in it.
*
* 2. The origin of this software must not be misrepresented, either by explicit
* claim or by omission.
*
* 3. Altered versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* regsub
*/
#include <stdio.h>
#ifdef AMIGA
#include "regexp.h"
#else
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <regexp.h>
#endif
#include "regmagic.h"
#ifndef CHARBITS
#define UCHARAT(p) ((int)*(unsigned char *)(p))
#else
#define UCHARAT(p) ((int)*(p)&CHARBITS)
#endif
/*
* - regsub - perform substitutions after a regexp match
- regsub - perform substitutions after a regexp match
*/
void regsub(rp, source, dest)
const regexp *rp;
const char *source;
char *dest;
{
regnsub(rp, source, dest, BUFSIZ);
}
/*
- regnsub - perform bounds-checked substitutions after a regexp match
*/
void
regsub(prog, source, dest)
regexp *prog;
char *source;
char *dest;
regnsub(rp, source, dest, destlen)
const regexp *rp;
const char *source;
char *dest;
size_t destlen;
{
register char *src;
register char *dst;
register char c;
register int no;
register int len;
extern char *strncpy();
if (prog == NULL || source == NULL || dest == NULL) {
regerror("NULL parm to regsub");
return;
register regexp * const prog = (regexp *)rp;
register const char *src = (char *)source;
register char *dst = dest;
char *dstend = dest + destlen;
char *odst;
register char c;
register int no;
register size_t len;
if (prog == NULL || source == NULL || dest == NULL) {
regerror("NULL parameter to regsub");
return;
}
if ((unsigned char)*(prog->program) != MAGIC) {
regerror("damaged regexp");
return;
}
while ((c = *src++) != '\0') {
if (c == '&')
no = 0;
else if (c == '\\' && isdigit(*src))
no = *src++ - '0';
else
no = -1;
if (no < 0) { /* Ordinary character. */
if (c == '\\' && (*src == '\\' || *src == '&'))
c = *src++;
*dst++ = c;
if (dst >= dstend)
{
regerror("output buffer too small");
return;
}
} else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
prog->endp[no] > prog->startp[no]) {
len = prog->endp[no] - prog->startp[no];
odst = dst;
dst += len;
if (dst >= dstend)
{
regerror("output buffer too small");
return;
}
(void) strncpy(odst, prog->startp[no], len);
if (*(dst-1) == '\0') { /* strncpy hit NUL. */
regerror("damaged match string");
return;
}
}
}
*dst++ = '\0';
}
size_t regsublen(rp, source)
const regexp *rp;
const char *source;
{
register regexp * const prog = (regexp *)rp;
register char *src = (char *)source;
register char c;
register int no;
register int len = 0;
if (prog == NULL || source == NULL) {
regerror("NULL parameter to regsublen");
return -1;
}
if (UCHARAT(prog->program) != MAGIC) {
regerror("damaged regexp fed to regsub");
return;
if ((unsigned char)*(prog->program) != MAGIC) {
regerror("damaged regexp");
return -1;
}
src = source;
dst = dest;
while ((c = *src++) != '\0') {
if (c == '&')
no = 0;
else if (c == '\\' && '0' <= *src && *src <= '9')
else if (c == '\\' && isdigit(*src))
no = *src++ - '0';
else
no = -1;
if (no < 0) { /* Ordinary character. */
if (c == '\\' && (*src == '\\' || *src == '&'))
c = *src++;
*dst++ = c;
} else if (prog->startp[no] != NULL && prog->endp[no] != NULL) {
len = prog->endp[no] - prog->startp[no];
(void) strncpy(dst, prog->startp[no], len);
dst += len;
if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */
regerror("damaged match string");
return;
src++;
len++;
} else {
const char *s = prog->startp[no];
const char *e = prog->endp[no];
if ((s != NULL) && (e != NULL) && (e > s)) {
len += e-s;
}
}
}
*dst++ = '\0';
return len+1;
}

127
scsh/regexp/tests Normal file
View File

@ -0,0 +1,127 @@
abc abc y & abc
abc xbc n - -
abc axc n - -
abc abx n - -
abc xabcy y & abc
abc ababc y & abc
ab*c abc y & abc
ab*bc abc y & abc
ab*bc abbc y & abbc
ab*bc abbbbc y & abbbbc
ab+bc abbc y & abbc
ab+bc abc n - -
ab+bc abq n - -
ab+bc abbbbc y & abbbbc
ab?bc abbc y & abbc
ab?bc abc y & abc
ab?bc abbbbc n - -
ab?c abc y & abc
^abc$ abc y & abc
^abc$ abcc n - -
^abc abcc y & abc
^abc$ aabc n - -
abc$ aabc y & abc
^ abc y &
$ abc y &
a.c abc y & abc
a.c axc y & axc
a.*c axyzc y & axyzc
a.*c axyzd n - -
a[bc]d abc n - -
a[bc]d abd y & abd
a[b-d]e abd n - -
a[b-d]e ace y & ace
a[b-d] aac y & ac
a[-b] a- y & a-
a[b-] a- y & a-
[k] ab n - -
a[b-a] - c - -
a[]b - c - -
a[ - c - -
a] a] y & a]
a[]]b a]b y & a]b
a[^bc]d aed y & aed
a[^bc]d abd n - -
a[^-b]c adc y & adc
a[^-b]c a-c n - -
a[^]b]c a]c n - -
a[^]b]c adc y & adc
ab|cd abc y & ab
ab|cd abcd y & ab
()ef def y &-\1 ef-
()* - c - -
*a - c - -
^* - c - -
$* - c - -
(*)b - c - -
$b b n - -
a\ - c - -
a\(b a(b y &-\1 a(b-
a\(*b ab y & ab
a\(*b a((b y & a((b
a\\b a\b y & a\b
abc) - c - -
(abc - c - -
((a)) abc y &-\1-\2 a-a-a
(a)b(c) abc y &-\1-\2 abc-a-c
a+b+c aabbabc y & abc
a** - c - -
a*? - c - -
(a*)* - c - -
(a*)+ - c - -
(a|)* - c - -
(a*|b)* - c - -
(a+|b)* ab y &-\1 ab-b
(a+|b)+ ab y &-\1 ab-b
(a+|b)? ab y &-\1 a-a
[^ab]* cde y & cde
(^)* - c - -
(ab|)* - c - -
)( - c - -
abc y &
abc n - -
a* y &
abcd abcd y &-\&-\\& abcd-&-\abcd
a(bc)d abcd y \1-\\1-\\\1 bc-\1-\bc
([abc])*d abbbcd y &-\1 abbbcd-c
([abc])*bcd abcd y &-\1 abcd-a
a|b|c|d|e e y & e
(a|b|c|d|e)f ef y &-\1 ef-e
((a*|b))* - c - -
abcd*efg abcdefg y & abcdefg
ab* xabyabbbz y & ab
ab* xayabbbz y & a
(ab|cd)e abcde y &-\1 cde-cd
[abhgefdc]ij hij y & hij
^(ab|cd)e abcde n x\1y xy
(abc|)ef abcdef y &-\1 ef-
(a|b)c*d abcd y &-\1 bcd-b
(ab|ab*)bc abc y &-\1 abc-a
a([bc]*)c* abc y &-\1 abc-bc
a([bc]*)(c*d) abcd y &-\1-\2 abcd-bc-d
a([bc]+)(c*d) abcd y &-\1-\2 abcd-bc-d
a([bc]*)(c+d) abcd y &-\1-\2 abcd-b-cd
a[bcd]*dcdcde adcdcde y & adcdcde
a[bcd]+dcdcde adcdcde n - -
(ab|a)b*c abc y &-\1 abc-ab
((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d
[ -~]* abc y & abc
[ -~ -~]* abc y & abc
[ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~ -~ -~ -~]* abc y & abc
[a-zA-Z_][a-zA-Z0-9_]* alpha y & alpha
^a(bc+|b[eh])g|.h$ abh y &-\1 bh-
(bc+d$|ef*g.|h?i(j|k)) effgz y &-\1-\2 effgz-effgz-
(bc+d$|ef*g.|h?i(j|k)) ij y &-\1-\2 ij-ij-j
(bc+d$|ef*g.|h?i(j|k)) effg n - -
(bc+d$|ef*g.|h?i(j|k)) bcdd n - -
(bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz-
((((((((((a)))))))))) - c - -
(((((((((a))))))))) a y & a
multiple words of text uh-uh n - -
multiple words multiple words, yeah y & multiple words
(.*)c(.*) abcde y &-\1-\2 abcde-ab-de
\((.*), (.*)\) (a, b) y (\2, \1) (b, a)

164
scsh/regexp/timer.c Normal file
View File

@ -0,0 +1,164 @@
/*
* Simple timing program for regcomp().
* Usage: timer ncomp nexec nsub
* or
* timer ncomp nexec nsub regexp string [ answer [ sub ] ]
*
* The second form is for timing repetitions of a single test case.
* The first form's test data is a compiled-in copy of the "tests" file.
* Ncomp, nexec, nsub are how many times to do each regcomp, regexec,
* and regsub. The way to time an operation individually is to do something
* like "timer 1 50 1".
*/
#include <stdio.h>
/*
 * One timing/test case.
 *   re  - regular expression handed to regcomp()
 *   str - string handed to regexec()
 *   ans - expected outcome; only the first character is examined:
 *         'c' = regcomp should fail, 'n' = regexec should fail,
 *         anything else = a match is expected
 *   src - substitution source handed to regsub()
 *   dst - text the regsub() output is compared against
 */
struct try {
char *re, *str, *ans, *src, *dst;
} tests[] = {
/* Table rows come from the generated header (see the file comment). */
#include "timer.t.h"
/* Sentinel: multiple() stops at the first entry whose re is NULL. */
{ NULL, NULL, NULL, NULL, NULL }
};
#include <regexp.h>
int errreport = 0; /* Report errors via errseen? */
char *errseen = NULL; /* Error message. */
char *progname;
/* ARGSUSED */
/*
 * main - pick the repetition counts, then run one case or the whole table.
 *
 * argv[1..3] supply ncomp, nexec and nsub; if fewer than three counts are
 * given, all three default to 1.  When a regexp and a string follow the
 * counts (argc > 5) that single case is run; otherwise every entry of
 * tests[] is run through multiple().  Always exits with status 0.
 */
int
main(argc, argv)
int argc;
char *argv[];
{
	int ncomp, nexec, nsub;
	struct try one;

	if (argc < 4) {
		ncomp = 1;
		nexec = 1;
		nsub = 1;
	} else {
		ncomp = atoi(argv[1]);
		nexec = atoi(argv[2]);
		nsub = atoi(argv[3]);
	}

	progname = argv[0];

	if (argc > 5) {
		one.re = argv[4];
		one.str = argv[5];

		/* Optional expected answer; a match is assumed by default. */
		if (argc > 6)
			one.ans = argv[6];
		else
			one.ans = "y";

		/*
		 * Optional substitution source.  No expected output can be
		 * given on the command line, so dst is a fixed placeholder.
		 */
		if (argc > 7) {
			one.src = argv[7];
			one.dst = "xxx";
		} else {
			one.src = "x";
			one.dst = "x";
		}

		/* Collect library errors in errseen instead of exiting. */
		errreport = 1;
		try(one, ncomp, nexec, nsub);
	} else
		multiple(ncomp, nexec, nsub);

	exit(0);
}
/*
 * regerror - replacement for the library's error hook.
 *
 * While errreport is non-zero the message is only remembered in errseen
 * so the caller can inspect it; otherwise it is passed on to error().
 */
void
regerror(s)
char *s;
{
	if (!errreport) {
		error(s, "");
		return;
	}
	errseen = s;
}
#ifndef ERRAVAIL
/*
 * error - local stand-in used when ERRAVAIL is not defined.
 *
 * Writes "regexp: ", then s1 used as a printf format with s2 as its
 * single argument, then a newline, all to stderr, and exits with
 * status 1.  Does not return.
 */
error(s1, s2)
char *s1;
char *s2;
{
	fputs("regexp: ", stderr);
	fprintf(stderr, s1, s2);
	putc('\n', stderr);
	exit(1);
}
#endif
int lineno = 0;
/*
 * multiple - run every compiled-in test case.
 *
 * Walks tests[] up to the entry whose re is NULL, handing each case to
 * try() with the given repetition counts.  lineno is advanced before
 * each case so complain() can report which entry failed.  Library
 * errors are routed to errseen (errreport = 1) rather than exiting.
 */
multiple(ncomp, nexec, nsub)
int ncomp, nexec, nsub;
{
	register int i;

	errreport = 1;
	for (i = 0; tests[i].re != NULL; i++) {
		lineno++;
		try(tests[i], ncomp, nexec, nsub);
	}
}
/*
 * try - run one test case, repeating each phase the requested number
 * of times.
 *
 *   fields - the case: regexp, subject string, expected answer,
 *            substitution source and expected substitution result
 *   ncomp  - total number of regcomp() calls (at least one is made)
 *   nexec  - total number of regexec() calls (at least one is made)
 *   nsub   - number of regsub() calls; may be zero or negative, in
 *            which case no substitution is done or checked
 *
 * Any mismatch between the outcome and fields.ans / fields.dst is
 * reported through complain().  The compiled regexp is freed on every
 * path.
 */
try(fields, ncomp, nexec, nsub)
struct try fields;
int ncomp, nexec, nsub;
{
	regexp *r;
	char dbuf[BUFSIZ];
	register int i;

	errseen = NULL;
	r = regcomp(fields.re);
	if (r == NULL) {
		if (*fields.ans != 'c')
			complain("regcomp failure in `%s'", fields.re);
		return;
	}
	if (*fields.ans == 'c') {
		complain("unexpected regcomp success in `%s'", fields.re);
		free((char *)r);
		return;
	}

	/* One compile has already happened; do the remaining ncomp-1. */
	for (i = ncomp-1; i > 0; i--) {
		free((char *)r);
		r = regcomp(fields.re);
	}

	if (!regexec(r, fields.str)) {
		if (*fields.ans != 'n')
			/* Name the failing regexp, as the regcomp messages do. */
			complain("regexec failure in `%s'", fields.re);
		free((char *)r);
		return;
	}
	if (*fields.ans == 'n') {
		complain("unexpected regexec success", "");
		free((char *)r);
		return;
	}

	/* One exec has already happened; do the remaining nexec-1. */
	for (i = nexec-1; i > 0; i--)
		(void) regexec(r, fields.str);

	errseen = NULL;
	dbuf[0] = '\0';		/* Defined contents even if regsub never runs. */
	for (i = nsub; i > 0; i--)
		regsub(r, fields.src, dbuf);
	if (errseen != NULL) {
		complain("regsub complaint", "");
		free((char *)r);
		return;
	}

	/* Only check the result if a substitution was actually performed. */
	if (nsub > 0 && strcmp(dbuf, fields.dst) != 0)
		complain("regsub result `%s' wrong", dbuf);
	free((char *)r);
}
/*
 * complain - report a test failure on stderr.
 *
 * Prints the current lineno, then s1 used as a printf format with s2 as
 * its single argument, then the pending library error message from
 * errseen in parentheses (empty when there is none).
 */
complain(s1, s2)
char *s1;
char *s2;
{
	char *detail;

	detail = "";
	if (errseen != NULL)
		detail = errseen;

	fprintf(stderr, "try: %d: ", lineno);
	fprintf(stderr, s1, s2);
	fprintf(stderr, " (%s)\n", detail);
}

View File

@ -1,236 +1,220 @@
/*
* Simple test program for regexp(3) stuff. Knows about debugging hooks.
* Simple test program for regexp(3) stuff. Knows about debugging hooks.
* Usage: try re [string [output [-]]]
* The re is compiled and dumped, regexeced against the string, the result
* is applied to output using regsub(). The - triggers a running narrative
* from regexec(). Dumping and narrative don't happen unless DEBUG.
*
* Copyright (c) 1986 by University of Toronto. Written by Henry Spencer. Not
* derived from licensed software.
*
* Permission is granted to anyone to use this software for any purpose on any
* computer system, and to redistribute it freely, subject to the following
* restrictions:
*
* 1. The author is not responsible for the consequences of use of this
* software, no matter how awful, even if they arise from defects in it.
*
* 2. The origin of this software must not be misrepresented, either by explicit
* claim or by omission.
*
* 3. Altered versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
*
* Usage: try re [string [output [-]]] The re is compiled and dumped, regexeced
* against the string, the result is applied to output using regsub(). The -
* triggers a running narrative from regexec(). Dumping and narrative don't
* happen unless DEBUG.
*
* If there are no arguments, stdin is assumed to be a stream of lines with five
* fields: a r.e., a string to match it against, a result code, a source
* string for regsub, and the proper result. Result codes are 'c' for
* compile failure, 'y' for match success, 'n' for match failure. Field
* separator is tab.
* If there are no arguments, stdin is assumed to be a stream of lines with
* five fields: a r.e., a string to match it against, a result code, a
* source string for regsub, and the proper result. Result codes are 'c'
* for compile failure, 'y' for match success, 'n' for match failure.
* Field separator is tab.
*/
#include <stdio.h>
#include <regexp.h>
#ifdef ERRAVAIL
char *progname;
extern char *mkprogname();
char *progname;
extern char *mkprogname();
#endif
#ifdef DEBUG
extern int regnarrate;
extern int regnarrate;
#endif
char buf[BUFSIZ];
int errreport = 0; /* Report errors via errseen? */
char *errseen = NULL; /* Error message. */
int status = 0; /* Exit status. */
char buf[BUFSIZ];
int errreport = 0; /* Report errors via errseen? */
char *errseen = NULL; /* Error message. */
int status = 0; /* Exit status. */
/* ARGSUSED */
main(argc, argv)
int argc;
char *argv[];
int argc;
char *argv[];
{
regexp *r;
int i;
regexp *r;
int i;
#ifdef ERRAVAIL
progname = mkprogname(argv[0]);
progname = mkprogname(argv[0]);
#endif
if (argc == 1) {
multiple();
exit(status);
}
r = regcomp(argv[1]);
if (r == NULL)
error("regcomp failure", "");
if (argc == 1) {
multiple();
exit(status);
}
r = regcomp(argv[1]);
if (r == NULL)
error("regcomp failure", "");
#ifdef DEBUG
regdump(r);
if (argc > 4)
regnarrate++;
regdump(r);
if (argc > 4)
regnarrate++;
#endif
if (argc > 2) {
i = regexec(r, argv[2]);
printf("%d", i);
for (i = 1; i < NSUBEXP; i++)
if (r->startp[i] != NULL && r->endp[i] != NULL)
printf(" \\%d", i);
printf("\n");
}
if (argc > 3) {
regsub(r, argv[3], buf);
printf("%s\n", buf);
}
exit(status);
if (argc > 2) {
i = regexec(r, argv[2]);
printf("%d", i);
for (i = 1; i < NSUBEXP; i++)
if (r->startp[i] != NULL && r->endp[i] != NULL)
printf(" \\%d", i);
printf("\n");
}
if (argc > 3) {
regsub(r, argv[3], buf);
printf("%s\n", buf);
}
exit(status);
}
void
regerror(s)
char *s;
char *s;
{
if (errreport)
errseen = s;
else
error(s, "");
if (errreport)
errseen = s;
else
error(s, "");
}
#ifndef ERRAVAIL
error(s1, s2)
char *s1;
char *s2;
char *s1;
char *s2;
{
fprintf(stderr, "regexp: ");
fprintf(stderr, s1, s2);
fprintf(stderr, "\n");
exit(1);
fprintf(stderr, "regexp: ");
fprintf(stderr, s1, s2);
fprintf(stderr, "\n");
exit(1);
}
#endif
int lineno;
regexp badregexp; /* Implicit init to 0. */
int lineno;
regexp badregexp; /* Implicit init to 0. */
multiple()
{
char rbuf[BUFSIZ];
char *field[5];
char *scan;
int i;
regexp *r;
extern char *strchr();
errreport = 1;
lineno = 0;
while (fgets(rbuf, sizeof(rbuf), stdin) != NULL) {
rbuf[strlen(rbuf) - 1] = '\0'; /* Dispense with \n. */
lineno++;
scan = rbuf;
for (i = 0; i < 5; i++) {
field[i] = scan;
if (field[i] == NULL) {
complain("bad testfile format", "");
exit(1);
}
scan = strchr(scan, '\t');
if (scan != NULL)
*scan++ = '\0';
char rbuf[BUFSIZ];
char *field[5];
char *scan;
int i;
regexp *r;
extern char *strchr();
errreport = 1;
lineno = 0;
while (fgets(rbuf, sizeof(rbuf), stdin) != NULL) {
rbuf[strlen(rbuf)-1] = '\0'; /* Dispense with \n. */
lineno++;
scan = rbuf;
for (i = 0; i < 5; i++) {
field[i] = scan;
if (field[i] == NULL) {
complain("bad testfile format", "");
exit(1);
}
scan = strchr(scan, '\t');
if (scan != NULL)
*scan++ = '\0';
}
try(field);
}
try(field);
}
/* And finish up with some internal testing... */
lineno = 9990;
errseen = NULL;
if (regcomp((char *) NULL) != NULL || errseen == NULL)
complain("regcomp(NULL) doesn't complain", "");
lineno = 9991;
errseen = NULL;
if (regexec((regexp *) NULL, "foo") || errseen == NULL)
complain("regexec(NULL, ...) doesn't complain", "");
lineno = 9992;
r = regcomp("foo");
if (r == NULL) {
complain("regcomp(\"foo\") fails", "");
return;
}
lineno = 9993;
errseen = NULL;
if (regexec(r, (char *) NULL) || errseen == NULL)
complain("regexec(..., NULL) doesn't complain", "");
lineno = 9994;
errseen = NULL;
regsub((regexp *) NULL, "foo", rbuf);
if (errseen == NULL)
complain("regsub(NULL, ..., ...) doesn't complain", "");
lineno = 9995;
errseen = NULL;
regsub(r, (char *) NULL, rbuf);
if (errseen == NULL)
complain("regsub(..., NULL, ...) doesn't complain", "");
lineno = 9996;
errseen = NULL;
regsub(r, "foo", (char *) NULL);
if (errseen == NULL)
complain("regsub(..., ..., NULL) doesn't complain", "");
lineno = 9997;
errseen = NULL;
if (regexec(&badregexp, "foo") || errseen == NULL)
complain("regexec(nonsense, ...) doesn't complain", "");
lineno = 9998;
errseen = NULL;
regsub(&badregexp, "foo", rbuf);
if (errseen == NULL)
complain("regsub(nonsense, ..., ...) doesn't complain", "");
/* And finish up with some internal testing... */
lineno = 9990;
errseen = NULL;
if (regcomp((char *)NULL) != NULL || errseen == NULL)
complain("regcomp(NULL) doesn't complain", "");
lineno = 9991;
errseen = NULL;
if (regexec((regexp *)NULL, "foo") || errseen == NULL)
complain("regexec(NULL, ...) doesn't complain", "");
lineno = 9992;
r = regcomp("foo");
if (r == NULL) {
complain("regcomp(\"foo\") fails", "");
return;
}
lineno = 9993;
errseen = NULL;
if (regexec(r, (char *)NULL) || errseen == NULL)
complain("regexec(..., NULL) doesn't complain", "");
lineno = 9994;
errseen = NULL;
regsub((regexp *)NULL, "foo", rbuf);
if (errseen == NULL)
complain("regsub(NULL, ..., ...) doesn't complain", "");
lineno = 9995;
errseen = NULL;
regsub(r, (char *)NULL, rbuf);
if (errseen == NULL)
complain("regsub(..., NULL, ...) doesn't complain", "");
lineno = 9996;
errseen = NULL;
regsub(r, "foo", (char *)NULL);
if (errseen == NULL)
complain("regsub(..., ..., NULL) doesn't complain", "");
lineno = 9997;
errseen = NULL;
if (regexec(&badregexp, "foo") || errseen == NULL)
complain("regexec(nonsense, ...) doesn't complain", "");
lineno = 9998;
errseen = NULL;
regsub(&badregexp, "foo", rbuf);
if (errseen == NULL)
complain("regsub(nonsense, ..., ...) doesn't complain", "");
}
try(fields)
char **fields;
char **fields;
{
regexp *r;
char dbuf[BUFSIZ];
errseen = NULL;
r = regcomp(fields[0]);
if (r == NULL) {
if (*fields[2] != 'c')
complain("regcomp failure in `%s'", fields[0]);
return;
}
if (*fields[2] == 'c') {
complain("unexpected regcomp success in `%s'", fields[0]);
free((char *) r);
return;
}
if (!regexec(r, fields[1])) {
if (*fields[2] != 'n')
complain("regexec failure in `%s'", "");
free((char *) r);
return;
}
if (*fields[2] == 'n') {
complain("unexpected regexec success", "");
free((char *) r);
return;
}
errseen = NULL;
regsub(r, fields[3], dbuf);
if (errseen != NULL) {
complain("regsub complaint", "");
free((char *) r);
return;
}
if (strcmp(dbuf, fields[4]) != 0)
complain("regsub result `%s' wrong", dbuf);
free((char *) r);
regexp *r;
char dbuf[BUFSIZ];
errseen = NULL;
r = regcomp(fields[0]);
if (r == NULL) {
if (*fields[2] != 'c')
complain("regcomp failure in `%s'", fields[0]);
return;
}
if (*fields[2] == 'c') {
complain("unexpected regcomp success in `%s'", fields[0]);
free((char *)r);
return;
}
if (!regexec(r, fields[1])) {
if (*fields[2] != 'n')
complain("regexec failure in `%s'", fields[0]);
free((char *)r);
return;
}
if (*fields[2] == 'n') {
complain("unexpected regexec success", "");
free((char *)r);
return;
}
errseen = NULL;
regsub(r, fields[3], dbuf);
if (errseen != NULL) {
complain("regsub complaint", "");
free((char *)r);
return;
}
if (strcmp(dbuf, fields[4]) != 0)
complain("regsub result `%s' wrong", dbuf);
free((char *)r);
}
complain(s1, s2)
char *s1;
char *s2;
char *s1;
char *s2;
{
fprintf(stderr, "try: %d: ", lineno);
fprintf(stderr, s1, s2);
fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : "");
status = 1;
fprintf(stderr, "try: %d: ", lineno);
fprintf(stderr, s1, s2);
fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : "");
status = 1;
}