1. Removed cruft (timer.c, try.c, regexp.{3,c,h}, regmagic.h, regsub.c)
   that shouldn't even be in this dir -- it was from a different Spencer
   package, and wasn't used; got copied in by accident at some point.
2. Removed *.ih, Makefile, and regex.h.
   These are derived files produced during the build.
3. Removed patch-msg, which is old & dead.
4. Updated rest of the source to a newer version of Spencer's Posix package
   (alpha3.7). Not all of these files changed, actually, but I guess
   the last-mod dates did, so CVS thinks they're being updated or something?
This commit is contained in:
shivers 1999-07-10 20:01:52 +00:00
parent c23ba5b0cb
commit 14fe107a7e
23 changed files with 781 additions and 3189 deletions

View File

@ -1,19 +1,20 @@
Copyright (c) 1986, 1993, 1995 by University of Toronto.
Written by Henry Spencer. Not derived from licensed software.
Copyright 1992, 1993, 1994, 1997 Henry Spencer. All rights reserved.
This software is not subject to any license of the American Telephone
and Telegraph Company or of the Regents of the University of California.
Permission is granted to anyone to use this software for any
purpose on any computer system, and to redistribute it in any way,
subject to the following restrictions:
Permission is granted to anyone to use this software for any purpose on
any computer system, and to alter it and redistribute it, subject
to the following restrictions:
1. The author is not responsible for the consequences of use of
this software, no matter how awful, even if they arise
from defects in it.
1. The author is not responsible for the consequences of use of this
software, no matter how awful, even if they arise from flaws in it.
2. The origin of this software must not be misrepresented, either
by explicit claim or by omission.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission. Since few users ever read sources,
credits must appear in the documentation.
3. Altered versions must be plainly marked as such, and must not
be misrepresented (by explicit claim or omission) as being
the original software.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software. Since few users
ever read sources, credits must appear in the documentation.
4. This notice must not be removed or altered.
4. This notice may not be removed or altered.

View File

View File

@ -1,118 +1,137 @@
srcdir = @srcdir@
VPATH = @srcdir@
CC = @CC@
CFLAGS1 = @CFLAGS1@
RANLIB = @RANLIB@
# Things you might want to put in ENV:
# -DERRAVAIL have utzoo-compatible error() function and friends
ENV=
# You probably want to take -DREDEBUG out of CFLAGS, and put something like
# -O in, *after* testing (-DREDEBUG strengthens testing by enabling a lot of
# internal assertion checking and some debugging facilities).
# Put -Dconst= in for a pre-ANSI compiler.
# Do not take -DPOSIX_MISTAKE out.
# REGCFLAGS isn't important to you (it's for my use in some special contexts).
#CFLAGS=-I. -DPOSIX_MISTAKE -DREDEBUG $(REGCFLAGS)
CFLAGS=-I. -DPOSIX_MISTAKE $(REGCFLAGS) $(CFLAGS1)
# Things you might want to put in TEST:
# -DDEBUG debugging hooks
# -I. regexp.h from current directory, not /usr/include
TEST=-I. -I$(srcdir)
# If you have a pre-ANSI compiler, put -o into MKHFLAGS. If you want
# the Berkeley __P macro, put -b in.
MKHFLAGS=
# Things you might want to put in PROF:
# -pg profiler
# PROF=
# Flags for linking but not compiling, if any.
LDFLAGS=
CFLAGS=$(CFLAGS1) $(ENV) $(TEST) $(PROF)
LDFLAGS=$(PROF)
# Extra libraries for linking, if any.
LIBS=
LIB=libregexp.a
OBJ=regexp.o regsub.o regerror.o
TMP=dtr.tmp
# Internal stuff, should not need changing.
OBJPRODN=regcomp.o regexec.o regerror.o regfree.o
OBJS=$(OBJPRODN) split.o debug.o main.o
H=cclass.h cname.h regex2.h utils.h
REGSRC=regcomp.c regerror.c regexec.c regfree.c
ALLSRC=$(REGSRC) engine.c debug.c main.c split.c
# Stuff that matters only if you're trying to lint the package.
LINTFLAGS=-I. -Dstatic= -Dconst= -DREDEBUG
LINTC=regcomp.c regexec.c regerror.c regfree.c debug.c main.c
JUNKLINT=possible pointer alignment|null effect
# arrangements to build forward-reference header files
.SUFFIXES: .ih .h
.c.ih:
sh ./mkh $(MKHFLAGS) -p $< >$@
default: r
try: try.o $(LIB)
$(CC) $(LDFLAGS) try.o $(LIB) -o try
lib: purge $(OBJPRODN)
rm -f libregex.a
ar crv libregex.a $(OBJPRODN)
# Making timer will probably require putting stuff in $(PROF) and then
# recompiling everything; the following is just the final stage.
timer: timer.o $(LIB)
$(CC) $(LDFLAGS) timer.o $(LIB) -o timer
purge:
rm -f *.o
timer.o: timer.c timer.t.h
# stuff to build regex.h
REGEXH=regex.h
REGEXHSRC=regex2.h $(REGSRC)
$(REGEXH): $(REGEXHSRC) mkh
sh ./mkh $(MKHFLAGS) -i _REGEX_H_ $(REGEXHSRC) >regex.tmp
cmp -s regex.tmp regex.h 2>/dev/null || cp regex.tmp regex.h
rm -f regex.tmp
timer.t.h: tests
sed 's/ /","/g;s/\\/&&/g;s/.*/{"&"},/' tests >timer.t.h
# dependencies
$(OBJPRODN) debug.o: utils.h regex.h regex2.h
regcomp.o: cclass.h cname.h regcomp.ih
regexec.o: engine.c engine.ih
regerror.o: regerror.ih
debug.o: debug.ih
main.o: main.ih
# Regression test.
r: try tests
./try <tests # no news is good news...
# tester
re: $(OBJS)
$(CC) $(CFLAGS) $(LDFLAGS) $(OBJS) $(LIBS) -o $@
$(LIB): $(OBJ)
ar cr $(LIB) $(OBJ)
$(RANLIB) libregexp.a
# regression test
r: re tests
./re <tests
./re -el <tests
./re -er <tests
regexp.o: regexp.c regexp.h regmagic.h
regsub.o: regsub.c regexp.h regmagic.h
# 57 variants, and other stuff, for development use -- not useful to you
ra: ./re tests
-./re <tests
-./re -el <tests
-./re -er <tests
clean:
rm -f *.o core mon.out gmon.out timer.t.h dtr copy try timer r.*
rm -f residue rs.* re.1 rm.h re.h ch.soe ch.ps j badcom fig[012]
rm -f ch.sml fig[12].ps $(LIB)
rm -rf $(TMP)
rx: ./re tests
./re -x <tests
./re -x -el <tests
./re -x -er <tests
# the rest of this is unlikely to be of use to you
t: ./re tests
-time ./re <tests
-time ./re -cs <tests
-time ./re -el <tests
-time ./re -cs -el <tests
BITS = r.1 rs.1 re.1 rm.h re.h
OPT=-p -ms
l: $(LINTC)
lint $(LINTFLAGS) -h $(LINTC) 2>&1 | egrep -v '$(JUNKLINT)' | tee lint
ch.soe: ch $(BITS)
soelim ch >$@
fullprint:
ti README WHATSNEW notes todo | list
ti *.h | list
list *.c
list regex.3 regex.7
ch.sml: ch $(BITS) smlize splitfigs
splitfigs ch | soelim | smlize >$@
print:
ti README WHATSNEW notes todo | list
ti *.h | list
list reg*.c engine.c
fig0 fig1 fig2: ch splitfigs
splitfigs ch >/dev/null
f: fig0 fig1 fig2 figs
groff -Tps -s $(OPT) figs | lpr
mf.tmp: Makefile
sed '/^REGEXH=/s/=.*/=regex.h/' Makefile | sed '/#DEL$$/d' >$@
fig1.ps: fig0 fig1
( cat fig0 ; echo ".LP" ; cat fig1 ) | groff -Tps $(OPT) >$@
DTRH=cclass.h cname.h regex2.h utils.h
PRE=COPYRIGHT README WHATSNEW
POST=mkh regex.3 regex.7 tests $(DTRH) $(ALLSRC) fake/*.[ch]
FILES=$(PRE) Makefile $(POST)
DTR=$(PRE) Makefile=mf.tmp $(POST)
dtr: $(FILES) mf.tmp
makedtr $(DTR) >$@
rm mf.tmp
fig2.ps: fig0 fig2
( cat fig0 ; echo ".LP" ; cat fig2 ) | groff -Tps $(OPT) >$@
cio: $(FILES)
cio $(FILES)
fp: fig1.ps fig2.ps
rdf: $(FILES)
rcsdiff -c $(FILES) 2>&1 | p
r.1: regexp.c splitter
splitter regexp.c
# various forms of cleanup
tidy:
rm -f junk* core core.* *.core dtr *.tmp lint
rs.1: regsub.c splitter
splitter regsub.c
clean: tidy
rm -f *.o *.s *.ih re libregex.a
re.1: regerror.c splitter
splitter regerror.c
rm.h: regmagic.h splitter
splitter regmagic.h
re.h: regexp.h splitter
splitter regexp.h
PLAIN=COPYRIGHT README Makefile regexp.3 try.c timer.c tests
FIX=regexp.h regexp.c regsub.c regerror.c regmagic.h
DTR=$(PLAIN) $(FIX)
dtr: r $(DTR)
rm -rf $(TMP)
mkdir $(TMP)
cp $(PLAIN) $(TMP)
for f in $(FIX) ; do normalize $$f >$(TMP)/$$f ; done
( cd $(TMP) ; makedtr $(DTR) ) >$@
rm -rf $(TMP)
ch.ps: ch Makefile $(BITS)
groff -Tps $(OPT) ch >$@
copy: ch.soe ch.sml fp
makedtr REMARKS ch.sml fig*.ps ch.soe >$@
go: copy dtr
# don't do this one unless you know what you're doing
spotless: clean
rm -f mkh regex.h

View File

@ -1,57 +1,32 @@
This is a revision of my well-known regular-expression package, regexp(3).
It gives C programs the ability to use egrep-style regular expressions, and
does it in a much cleaner fashion than the analogous routines in SysV.
It is not, alas, fully POSIX.2-compliant; that is hard. (I'm working on
a full reimplementation that will do that.)
alpha3.7 release.
Fri Nov 21 13:25:21 EST 1997
henry@zoo.toronto.edu
This version is the one which is examined and explained in one chapter of
"Software Solutions in C" (Dale Schumacher, ed.; AP Professional 1994;
ISBN 0-12-632360-7), plus a couple of insignificant updates, plus one
significant bug fix (done 10 Nov 1995).
See WHATSNEW for change listing.
Although this package was inspired by the Bell V8 regexp(3), this
implementation is *NOT* AT&T/Bell code, and is not derived from licensed
software. Even though U of T is a V8 licensee. This software is based on
a V8 manual page sent to me by Dennis Ritchie (the manual page enclosed
here is a complete rewrite and hence is not covered by AT&T copyright).
I admit to some familiarity with regular-expression implementations of
the past, but the only one that this code traces any ancestry to is the
one published in Kernighan & Plauger's "Software Tools" (from which
this one draws ideas but not code).
installation notes:
--------
Read the comments at the beginning of Makefile before running.
Simplistically: put this stuff into a source directory, inspect Makefile
for compilation options that need changing to suit your local environment,
and then do "make". This compiles the regexp(3) functions, builds a
library containing them, compiles a test program, and runs a large set of
regression tests. If there are no complaints, then put regexp.h into
/usr/include, add regexp.o, regsub.o, and regerror.o into your C library
(or put libre.a into /usr/lib), and install regexp.3 (perhaps with slight
modifications) in your manual-pages directory.
Utils.h contains some things that just might have to be modified on
some systems, as well as a nested include (ugh) of <assert.h>.
The files are:
The "fake" directory contains quick-and-dirty fakes for some header
files and routines that old systems may not have. Note also that
-DUSEBCOPY will make utils.h substitute bcopy() for memmove().
COPYRIGHT copyright notice
README this text
Makefile instructions to make everything
regexp.3 manual page
regexp.h header file, for /usr/include
regexp.c source for regcomp() and regexec()
regsub.c source for regsub()
regerror.c source for default regerror()
regmagic.h internal header file
try.c source for test program
timer.c source for timing program
tests test list for try and timer
After that, "make r" will build regcomp.o, regexec.o, regfree.o,
and regerror.o (the actual routines), bundle them together into a test
program, and run regression tests on them. No output is good output.
This implementation uses nondeterministic automata rather than the
deterministic ones found in some other implementations, which makes it
simpler, smaller, and faster at compiling regular expressions, but slower
at executing them. Many users have found the speed perfectly adequate,
although replacing the insides of egrep with this code would be a mistake.
"make lib" builds just the .o files for the actual routines (when
you're happy with testing and have adjusted CFLAGS for production),
and puts them together into libregex.a. You can pick up either the
library or *.o ("make lib" makes sure there are no other .o files left
around to confuse things).
This stuff should be pretty portable, given an ANSI C compiler and
appropriate option settings. There are no "reserved" char values except for
NUL, and no special significance is attached to the top bit of chars.
The string(3) functions are used a fair bit, on the grounds that they are
probably faster than coding the operations in line. Some attempts at code
tuning have been made, but this is invariably a bit machine-specific.
Main.c, debug.c, split.c are used for regression testing but are not part
of the RE routines themselves.
Regex.h goes in /usr/include. All other .h files are internal only.
--------

View File

@ -1,3 +1,7 @@
New in alpha3.7: A bit of cleanup aimed at maximizing portability,
possibly at slight cost in efficiency. "ul" suffixes and "unsigned long"
no longer appear, in particular.
New in alpha3.6: A couple more portability glitches fixed.
New in alpha3.5: Active development of this code has been stopped --

View File

@ -1,14 +0,0 @@
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === debug.c === */
void regprint(regex_t *r, FILE *d);
static void s_print(register struct re_guts *g, FILE *d);
static char *regchar(int ch);
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */

View File

@ -1,35 +0,0 @@
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === engine.c === */
static int matcher(register struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
static char *dissect(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
static char *backref(register struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
static char *fast(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
static char *slow(register struct match *m, char *start, char *stop, sopno startst, sopno stopst);
static states step(register struct re_guts *g, sopno start, sopno stop, register states bef, int ch, register states aft);
#define BOL (OUT+1)
#define EOL (BOL+1)
#define BOLEOL (BOL+2)
#define NOTHING (BOL+3)
#define BOW (BOL+4)
#define EOW (BOL+5)
#define CODEMAX (BOL+5) /* highest code used */
#define NONCHAR(c) ((c) > CHAR_MAX)
#define NNONCHAR (CODEMAX-CHAR_MAX)
#ifdef REDEBUG
static void print(struct match *m, char *caption, states st, int ch, FILE *d);
#endif
#ifdef REDEBUG
static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst);
#endif
#ifdef REDEBUG
static char *pchar(int ch);
#endif
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */

View File

@ -1,19 +0,0 @@
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === main.c === */
void regress(FILE *in);
void try(char *f0, char *f1, char *f2, char *f3, char *f4, int opts);
int options(int type, char *s);
int opt(int c, char *s);
void fixstr(register char *p);
char *check(char *str, regmatch_t sub, char *should);
static char *eprint(int err);
static int efind(char *name);
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */

View File

@ -1,803 +0,0 @@
Date: Mon, 1 Jul 1996 23:22:47 GMT
From: Bill Sommerfeld <sommerfeld@orchard.medford.ma.us>
To: shivers@lcs.mit.edu, bdc@ai.mit.edu
Subject: scsh patch for precompiled regexps..
I meant to send this out months ago but I was just too hosed with work.
Here's what I have right now:
There are three pieces here:
diffs to the "core" scsh
diffs to Henry Spencer's latest regexp library
a copy of Henry Spencer's latest regexp library..
It appears to work (it passes the same regression tests as the C library..).
Let me know if I didn't include something needed for this to work..
- Bill
diff -rc scsh-0.4.2/scsh/re.scm scsh-0.4.2-regexp/scsh/re.scm
*** scsh-0.4.2/scsh/re.scm Fri Oct 27 04:58:56 1995
--- scsh-0.4.2-regexp/scsh/re.scm Sat Apr 6 21:07:41 1996
***************
*** 34,49 ****
;;; Bogus stub definitions for low-level match routines:
! (define regexp? string?)
! (define (make-regexp str) str)
! (define (regexp-exec regexp str . maybe-start)
(let ((start (optional-arg maybe-start 0))
(start-vec (make-vector 10))
(end-vec (make-vector 10)))
! (and (%regexp-match regexp str start start-vec end-vec)
! (make-regexp-match str start-vec end-vec))))
!
;;; Convert a string into a regex pattern that matches that string exactly --
;;; in other words, quote the special chars with backslashes.
--- 34,53 ----
;;; Bogus stub definitions for low-level match routines:
! (define-record iregexp
! string)
! (define regexp? iregexp?)
!
! (define (make-regexp str)
! (make-iregexp (compile-regexp str)))
!
! (define (regexp-exec r s . maybe-start)
(let ((start (optional-arg maybe-start 0))
(start-vec (make-vector 10))
(end-vec (make-vector 10)))
! (and (%regexp-exec-1 (iregexp:string r) s start start-vec end-vec)
! (make-regexp-match s start-vec end-vec))))
;;; Convert a string into a regex pattern that matches that string exactly --
;;; in other words, quote the special chars with backslashes.
***************
*** 58,75 ****
(cons #\\ result)
result))))))
! (define-foreign %regexp-match/errno (reg_match (string regexp)
! (string s)
! (integer start)
! (vector-desc start-vec)
! (vector-desc end-vec))
! static-string ; Error string or #f if all is ok.
! bool) ; match?
!
! (define (%regexp-match regexp string start start-vec end-vec)
! (receive (err match?) (%regexp-match/errno regexp string start
! start-vec end-vec)
! (if err (error err %regexp-match regexp string start) match?)))
;;; I do this one in C, I'm not sure why:
--- 62,79 ----
(cons #\\ result)
result))))))
! ;;;(define-foreign %regexp-match/errno (reg_match (string regexp)
! ;;; (string s)
! ;;; (integer start)
! ;;; (vector-desc start-vec)
! ;;; (vector-desc end-vec))
! ;;; static-string ; Error string or #f if all is ok.
! ;;; bool) ; match?
!
! ;;;(define (%regexp-match regexp string start start-vec end-vec)
! ;;; (receive (err match?) (%regexp-match/errno regexp string start
! ;;; start-vec end-vec)
! ;;; (if err (error err %regexp-match regexp string start) match?)))
;;; I do this one in C, I'm not sure why:
***************
*** 79,81 ****
--- 83,166 ----
(filter_stringvec (string regexp) ((C "char const ** ~a") cvec))
static-string ; error message -- #f if no error.
integer) ; number of files that pass the filter.
+
+ ;;; precompiled regexps.
+
+ (define-foreign %regexp-compiled-length (reg_comp_len (string regexp))
+ static-string
+ integer)
+
+ (define-foreign %regexp-compile (reg_comp_comp (string regexp)
+ (string-desc re-buf))
+ static-string)
+
+ (define (%regexp-exec-1 r s start sv ev)
+ (receive (err match?) (%regexp-exec r s start sv ev)
+ (if err (error err s start)
+ match?)))
+
+ (define-foreign %regexp-exec (reg_exec (string-desc regexp)
+ (string s)
+ (integer start)
+ (vector-desc start-vec)
+ (vector-desc end-vec))
+ static-string
+ bool)
+
+
+ (define (compile-regexp e)
+ (receive (err len)
+ (%regexp-compiled-length e)
+ (if err (error err e)
+ (let ((buf (make-string len)))
+ (%regexp-compile e buf)
+ buf))))
+
+
+
+ (define-foreign %regexp-subst (reg_subst (string-desc regexp)
+ (string m)
+ (string s)
+ (integer start)
+ (vector-desc start-vec)
+ (vector-desc end-vec)
+ (string-desc outbuf))
+ static-string
+ integer)
+
+ (define-foreign %regexp-subst-len (reg_subst_len (string-desc regexp)
+ (string m)
+ (string s)
+ (integer start)
+ (vector-desc start-vec)
+ (vector-desc end-vec))
+ static-string
+ integer)
+
+
+ (define (regexp-subst re match replacement)
+ (let ((cr (iregexp:string re))
+ (matchstr (regexp-match:string match))
+ (startvec (regexp-match:start match))
+ (endvec (regexp-match:end match)))
+ (receive (err outlen)
+ (%regexp-subst-len cr
+ matchstr
+ replacement
+ 0
+ startvec
+ endvec)
+ (if err (error err matchstr replacement)
+ (let ((outbuf (make-string outlen)))
+ (receive (err outlen)
+ (%regexp-subst cr
+ matchstr
+ replacement
+ 0
+ startvec
+ endvec
+ outbuf)
+ (if err (error err matchstr replacement)
+ (substring outbuf 0 outlen))))))))
+
+
\ No newline at end of file
diff -rc scsh-0.4.2/scsh/re1.c scsh-0.4.2-regexp/scsh/re1.c
*** scsh-0.4.2/scsh/re1.c Fri Oct 27 04:58:58 1995
--- scsh-0.4.2-regexp/scsh/re1.c Sat Apr 6 21:01:15 1996
***************
*** 19,24 ****
--- 19,150 ----
/* Stash error msg in global. */
void regerror(char *msg) {regexp_error = msg;}
+ /*
+ ** Return NULL normally, error string on error.
+ ** Stash number of bytes needed for compiled regexp into `*len'
+ */
+
+ char *reg_comp_len(const char *re, int *len)
+ {
+ int l;
+
+ regexp_error = NULL;
+ *len = regcomp_len(re);
+ return regexp_error;
+ }
+
+ /*
+ ** Return NULL normally, error string on error.
+ ** Compile regexp into string described by `cr'.
+ */
+
+ char *reg_comp_comp(const char *re, scheme_value cr)
+ {
+ int len = STRING_LENGTH(cr);
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ regexp_error = NULL;
+ r = regcomp_comp(re, r, len);
+ return regexp_error;
+ }
+
+ /* Return NULL normally, error string on error.
+ ** Stash match info in start_vec and end_vec.
+ ** Returns boolean match/no-match in hit.
+ */
+
+ char *reg_exec(scheme_value cr, const char *string, int start,
+ scheme_value start_vec, scheme_value end_vec, int *hit)
+ {
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
+ return "Illegal start vector";
+ }
+
+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
+ return "Illegal end vector";
+ }
+
+ regexp_error = 0;
+ *hit = 0;
+
+ if( regexec(r, string+start) ) {
+ int i;
+ for(i=0; i<NSUBEXP; i++) {
+ const char *s = r->startp[i];
+ const char *e = r->endp[i];
+ VECTOR_REF(start_vec,i) = s?ENTER_FIXNUM(s - string):SCHFALSE;
+ VECTOR_REF(end_vec,i) = e?ENTER_FIXNUM(e - string):SCHFALSE;
+ r->startp[i] = NULL;
+ r->endp[i] = NULL;
+ }
+ *hit = 1;
+ }
+ return regexp_error;
+ }
+
+ char *reg_subst(scheme_value cr, const char *match,
+ const char *src, int start,
+ scheme_value start_vec, scheme_value end_vec,
+ scheme_value outbuf, int *len)
+ {
+ int i;
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
+ return "Illegal start vector";
+ }
+
+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
+ return "Illegal end vector";
+ }
+
+ for (i=0; i<NSUBEXP; i++)
+ {
+ scheme_value se = VECTOR_REF(start_vec, i);
+ scheme_value ee = VECTOR_REF(end_vec, i);
+ r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL;
+ r->endp[i] = FIXNUMP(ee)? (match + EXTRACT_FIXNUM(ee)):NULL;
+ }
+
+ regexp_error = NULL;
+ regnsub (r, src, &STRING_REF(outbuf, 0), STRING_LENGTH(outbuf));
+ *len = strlen(&STRING_REF(outbuf, 0));
+ return regexp_error;
+ }
+
+ char *reg_subst_len(scheme_value cr, const char *match,
+ const char *src, int start,
+ scheme_value start_vec, scheme_value end_vec,
+ int *len)
+ {
+ int i;
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
+ return "Illegal start vector";
+ }
+
+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
+ return "Illegal end vector";
+ }
+
+ for (i=0; i<NSUBEXP; i++)
+ {
+ scheme_value se = VECTOR_REF(start_vec, i);
+ scheme_value ee = VECTOR_REF(end_vec, i);
+ r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL;
+ r->endp[i] = FIXNUMP(ee)? (match + EXTRACT_FIXNUM(ee)):NULL;
+ }
+
+ regexp_error = NULL;
+ *len = regsublen (r, src);
+ return regexp_error;
+ }
+
+
+ #if 0
/* Return NULL normally, error string on error.
** Stash match info in start_vec and end_vec.
** Returns boolean match/no-match in hit.
***************
*** 56,61 ****
--- 182,188 ----
Free(prog);
return regexp_error;
}
+ #endif
char *filter_stringvec(const char *re, char const **stringvec, int *nummatch)
diff -rc scsh-0.4.2/scsh/re1.h scsh-0.4.2-regexp/scsh/re1.h
*** scsh-0.4.2/scsh/re1.h Sun Oct 22 08:34:34 1995
--- scsh-0.4.2-regexp/scsh/re1.h Sat Apr 6 17:54:09 1996
***************
*** 1,6 ****
--- 1,21 ----
+ #if 0
char *reg_match(const char *re, const char *string, int start,
scheme_value start_vec, scheme_value end_vec,
int *hit);
+ #endif
char *filter_stringvec(const char *re, char const **stringvec,
int *nummatch);
+
+ char *reg_comp_len(const char *re, int *len);
+ char *reg_comp_comp(const char *re, scheme_value cr);
+
+ char *reg_exec(scheme_value cr, const char *string, int start,
+ scheme_value start_vec, scheme_value end_vec, int *hit);
+
+ char *reg_subst(scheme_value cr, const char *match,
+ const char *src, int start,
+ scheme_value start_vec, scheme_value end_vec,
+ scheme_value outbuf, int *len);
+
+
Only in scsh-0.4.2-regexp/scsh: re2.scm
diff -rc scsh-0.4.2/scsh/scsh-interfaces.scm scsh-0.4.2-regexp/scsh/scsh-interfaces.scm
*** scsh-0.4.2/scsh/scsh-interfaces.scm Tue Oct 31 19:19:30 1995
--- scsh-0.4.2-regexp/scsh/scsh-interfaces.scm Sat Apr 6 18:48:12 1996
***************
*** 413,418 ****
--- 413,419 ----
make-regexp
regexp?
regexp-exec
+ regexp-subst
regexp-quote))
regexp library changes:
*** Makefile 1996/04/06 19:24:49 1.1
--- Makefile 1996/04/06 20:46:26
***************
*** 5,11 ****
# Things you might want to put in TEST:
# -DDEBUG debugging hooks
# -I. regexp.h from current directory, not /usr/include
! TEST=-I.
# Things you might want to put in PROF:
# -pg profiler
--- 5,11 ----
# Things you might want to put in TEST:
# -DDEBUG debugging hooks
# -I. regexp.h from current directory, not /usr/include
! TEST=-I. -DDEBUG
# Things you might want to put in PROF:
# -pg profiler
*** regexp.c 1996/04/06 19:24:49 1.1
--- regexp.c 1996/04/06 22:34:55
***************
*** 105,110 ****
--- 105,111 ----
* Utility definitions.
*/
#define FAIL(m) { regerror(m); return(NULL); }
+ #define FAILN(m) { regerror(m); return(-1); }
#define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define META "^$.[()|?+*\\"
***************
*** 162,173 ****
const char *exp;
{
register regexp *r;
! register char *scan;
int flags;
struct comp co;
if (exp == NULL)
! FAIL("NULL argument to regcomp");
/* First pass: determine size, legality. */
co.regparse = (char *)exp;
--- 163,193 ----
const char *exp;
{
register regexp *r;
! size_t len;
!
! len = regcomp_len(exp);
! if (len <= 0)
! return NULL;
!
! /* Allocate space. */
! r = (regexp *)malloc(len);
!
! if (r == NULL)
! FAIL("out of space");
! return regcomp_comp(exp, r, len);
! }
!
!
! size_t
! regcomp_len(exp)
! const char *exp;
! {
int flags;
+ register regexp *r;
struct comp co;
if (exp == NULL)
! FAILN("NULL argument to regcomp");
/* First pass: determine size, legality. */
co.regparse = (char *)exp;
***************
*** 178,198 ****
co.regcode = co.regdummy;
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
! return(NULL);
/* Small enough for pointer-storage convention? */
if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
! FAIL("regexp too big");
! /* Allocate space. */
! r = (regexp *)malloc(sizeof(regexp) + (size_t)co.regsize);
! if (r == NULL)
! FAIL("out of space");
/* Second pass: emit code. */
co.regparse = (char *)exp;
co.regnpar = 1;
co.regcode = r->program;
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
return(NULL);
--- 198,228 ----
co.regcode = co.regdummy;
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
! return -1;
/* Small enough for pointer-storage convention? */
if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
! FAILN("regexp too big");
! return (sizeof(regexp) + (size_t)co.regsize);
! }
!
!
! regexp *
! regcomp_comp(exp, r, len)
! const char *exp;
! register regexp *r;
! size_t len;
! {
! register char *scan;
! int flags;
! struct comp co;
/* Second pass: emit code. */
co.regparse = (char *)exp;
co.regnpar = 1;
co.regcode = r->program;
+ co.regsize = len - sizeof(regexp);
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
return(NULL);
***************
*** 200,206 ****
/* Dig out information for optimizations. */
r->regstart = '\0'; /* Worst-case defaults. */
r->reganch = 0;
! r->regmust = NULL;
r->regmlen = 0;
scan = r->program+1; /* First BRANCH. */
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
--- 230,236 ----
/* Dig out information for optimizations. */
r->regstart = '\0'; /* Worst-case defaults. */
r->reganch = 0;
! r->regmust = 0;
r->regmlen = 0;
scan = r->program+1; /* First BRANCH. */
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
***************
*** 229,235 ****
longest = OPERAND(scan);
len = strlen(OPERAND(scan));
}
! r->regmust = longest;
r->regmlen = (int)len;
}
}
--- 259,265 ----
longest = OPERAND(scan);
len = strlen(OPERAND(scan));
}
! r->regmust = longest - r->program;
r->regmlen = (int)len;
}
}
***************
*** 648,655 ****
struct exec {
char *reginput; /* String-input pointer. */
char *regbol; /* Beginning of input, for ^ check. */
! char **regstartp; /* Pointer to startp array. */
! char **regendp; /* Ditto for endp. */
};
/*
--- 678,685 ----
struct exec {
char *reginput; /* String-input pointer. */
char *regbol; /* Beginning of input, for ^ check. */
! const char **regstartp; /* Pointer to startp array. */
! const char **regendp; /* Ditto for endp. */
};
/*
***************
*** 690,696 ****
}
/* If there is a "must appear" string, look for it. */
! if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL)
return(0);
/* Mark beginning of line for ^ . */
--- 720,727 ----
}
/* If there is a "must appear" string, look for it. */
! if ((prog->regmlen > 0) &&
! strstr(string, &prog->program[prog->regmust]) == NULL)
return(0);
/* Mark beginning of line for ^ . */
***************
*** 729,736 ****
char *string;
{
register int i;
! register char **stp;
! register char **enp;
ep->reginput = string;
--- 760,767 ----
char *string;
{
register int i;
! register const char **stp;
! register const char **enp;
ep->reginput = string;
***************
*** 1004,1011 ****
printf("start `%c' ", r->regstart);
if (r->reganch)
printf("anchored ");
! if (r->regmust != NULL)
! printf("must have \"%s\"", r->regmust);
printf("\n");
}
--- 1035,1042 ----
printf("start `%c' ", r->regstart);
if (r->reganch)
printf("anchored ");
! if (r->regmlen > 0)
! printf("must have \"%s\"", &r->program[r->regmust]);
printf("\n");
}
*** regexp.h 1996/04/06 19:24:49 1.1
--- regexp.h 1996/04/07 01:52:19
***************
*** 6,16 ****
*/
#define NSUBEXP 10
typedef struct regexp {
! char *startp[NSUBEXP];
! char *endp[NSUBEXP];
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
! char *regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
--- 6,16 ----
*/
#define NSUBEXP 10
typedef struct regexp {
! const char *startp[NSUBEXP];
! const char *endp[NSUBEXP];
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
! int regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
***************
*** 18,21 ****
--- 18,27 ----
extern regexp *regcomp(const char *re);
extern int regexec(regexp *rp, const char *s);
extern void regsub(const regexp *rp, const char *src, char *dst);
+ extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len);
+ extern size_t regsublen(const regexp *rp, const char *src);
+
extern void regerror(char *message);
+ extern size_t regcomp_len(const char *exp);
+ extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len);
+
*** regsub.c 1996/04/06 19:24:49 1.1
--- regsub.c 1996/04/07 02:10:29
***************
*** 11,25 ****
/*
- regsub - perform substitutions after a regexp match
*/
void
! regsub(rp, source, dest)
const regexp *rp;
const char *source;
char *dest;
{
register regexp * const prog = (regexp *)rp;
! register char *src = (char *)source;
register char *dst = dest;
register char c;
register int no;
register size_t len;
--- 11,42 ----
/*
- regsub - perform substitutions after a regexp match
*/
+
+ void regsub(rp, source, dest)
+ const regexp *rp;
+ const char *source;
+ char *dest;
+ {
+ regnsub(rp, source, dest, BUFSIZ);
+ }
+
+
+
+ /*
+ - regnsub - perform bounds-checked substitutions after a regexp match
+ */
void
! regnsub(rp, source, dest, destlen)
const regexp *rp;
const char *source;
char *dest;
+ size_t destlen;
{
register regexp * const prog = (regexp *)rp;
! register const char *src = (char *)source;
register char *dst = dest;
+ char *dstend = dest + destlen;
+ char *odst;
register char c;
register int no;
register size_t len;
***************
*** 45,55 ****
if (c == '\\' && (*src == '\\' || *src == '&'))
c = *src++;
*dst++ = c;
} else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
! prog->endp[no] > prog->startp[no]) {
len = prog->endp[no] - prog->startp[no];
! (void) strncpy(dst, prog->startp[no], len);
dst += len;
if (*(dst-1) == '\0') { /* strncpy hit NUL. */
regerror("damaged match string");
return;
--- 62,83 ----
if (c == '\\' && (*src == '\\' || *src == '&'))
c = *src++;
*dst++ = c;
+ if (dst >= dstend)
+ {
+ regerror("output buffer too small");
+ return;
+ }
} else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
! prog->endp[no] > prog->startp[no]) {
len = prog->endp[no] - prog->startp[no];
! odst = dst;
dst += len;
+ if (dst >= dstend)
+ {
+ regerror("output buffer too small");
+ return;
+ }
+ (void) strncpy(odst, prog->startp[no], len);
if (*(dst-1) == '\0') { /* strncpy hit NUL. */
regerror("damaged match string");
return;
***************
*** 58,60 ****
--- 86,131 ----
}
*dst++ = '\0';
}
+
+ size_t regsublen(rp, source)
+ const regexp *rp;
+ const char *source;
+ {
+ register regexp * const prog = (regexp *)rp;
+ register char *src = (char *)source;
+ register char c;
+ register int no;
+ register int len = 0;
+
+ if (prog == NULL || source == NULL) {
+ regerror("NULL parameter to regsublen");
+ return -1;
+ }
+
+ if ((unsigned char)*(prog->program) != MAGIC) {
+ regerror("damaged regexp");
+ return -1;
+ }
+ while ((c = *src++) != '\0') {
+ if (c == '&')
+ no = 0;
+ else if (c == '\\' && isdigit(*src))
+ no = *src++ - '0';
+ else
+ no = -1;
+ if (no < 0) { /* Ordinary character. */
+ if (c == '\\' && (*src == '\\' || *src == '&'))
+ src++;
+ len++;
+ } else {
+ const char *s = prog->startp[no];
+ const char *e = prog->endp[no];
+ if ((s != NULL) && (e != NULL) && (e > s)) {
+ len += e-s;
+ }
+ }
+ }
+ return len+1;
+ }
+
+
Original regexp code from henry:
[unpacked & deleted -Olin]

View File

@ -1,51 +0,0 @@
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === regcomp.c === */
static void p_ere(register struct parse *p, int stop);
static void p_ere_exp(register struct parse *p);
static void p_str(register struct parse *p);
static void p_bre(register struct parse *p, register int end1, register int end2);
static int p_simp_re(register struct parse *p, int starordinary);
static int p_count(register struct parse *p);
static void p_bracket(register struct parse *p);
static void p_b_term(register struct parse *p, register cset *cs);
static void p_b_cclass(register struct parse *p, register cset *cs);
static void p_b_eclass(register struct parse *p, register cset *cs);
static char p_b_symbol(register struct parse *p);
static char p_b_coll_elem(register struct parse *p, int endc);
static char othercase(int ch);
static void bothcases(register struct parse *p, int ch);
static void ordinary(register struct parse *p, register int ch);
static void nonnewline(register struct parse *p);
static void repeat(register struct parse *p, sopno start, int from, int to);
static int seterr(register struct parse *p, int e);
static cset *allocset(register struct parse *p);
static void freeset(register struct parse *p, register cset *cs);
static int freezeset(register struct parse *p, register cset *cs);
static int firstch(register struct parse *p, register cset *cs);
static int nch(register struct parse *p, register cset *cs);
static void mcadd(register struct parse *p, register cset *cs, register char *cp);
static void mcsub(register cset *cs, register char *cp);
static int mcin(register cset *cs, register char *cp);
static char *mcfind(register cset *cs, register char *cp);
static void mcinvert(register struct parse *p, register cset *cs);
static void mccase(register struct parse *p, register cset *cs);
static int isinsets(register struct re_guts *g, int c);
static int samesets(register struct re_guts *g, int c1, int c2);
static void categorize(struct parse *p, register struct re_guts *g);
static sopno dupl(register struct parse *p, sopno start, sopno finish);
static void doemit(register struct parse *p, sop op, size_t opnd);
static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
static void dofwd(register struct parse *p, sopno pos, sop value);
static void enlarge(register struct parse *p, sopno size);
static void stripsnug(register struct parse *p, register struct re_guts *g);
static void findmust(register struct parse *p, register struct re_guts *g);
static sopno pluscount(register struct parse *p, register struct re_guts *g);
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */

View File

@ -1,18 +1,126 @@
/*
* regerror
*/
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <regex.h>
void
regerror(s)
char *s;
#include "utils.h"
#include "regerror.ih"
/*
= #define REG_OKAY 0
= #define REG_NOMATCH 1
= #define REG_BADPAT 2
= #define REG_ECOLLATE 3
= #define REG_ECTYPE 4
= #define REG_EESCAPE 5
= #define REG_ESUBREG 6
= #define REG_EBRACK 7
= #define REG_EPAREN 8
= #define REG_EBRACE 9
= #define REG_BADBR 10
= #define REG_ERANGE 11
= #define REG_ESPACE 12
= #define REG_BADRPT 13
= #define REG_EMPTY 14
= #define REG_ASSERT 15
= #define REG_INVARG 16
= #define REG_ATOI 255 // convert name to number (!)
= #define REG_ITOA 0400 // convert number to name (!)
*/
static struct rerr {
int code;
char *name;
char *explain;
} rerrs[] = {
REG_OKAY, "REG_OKAY", "no errors detected",
REG_NOMATCH, "REG_NOMATCH", "regexec() failed to match",
REG_BADPAT, "REG_BADPAT", "invalid regular expression",
REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element",
REG_ECTYPE, "REG_ECTYPE", "invalid character class",
REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)",
REG_ESUBREG, "REG_ESUBREG", "invalid backreference number",
REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced",
REG_EPAREN, "REG_EPAREN", "parentheses not balanced",
REG_EBRACE, "REG_EBRACE", "braces not balanced",
REG_BADBR, "REG_BADBR", "invalid repetition count(s)",
REG_ERANGE, "REG_ERANGE", "invalid character range",
REG_ESPACE, "REG_ESPACE", "out of memory",
REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid",
REG_EMPTY, "REG_EMPTY", "empty (sub)expression",
REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug",
REG_INVARG, "REG_INVARG", "invalid argument to regex routine",
-1, "", "*** unknown regexp error code ***",
};
/*
- regerror - the interface to error numbers
= extern size_t regerror(int, const regex_t *, char *, size_t);
*/
/* ARGSUSED */
size_t
regerror(errcode, preg, errbuf, errbuf_size)
int errcode;
const regex_t *preg;
char *errbuf;
size_t errbuf_size;
{
#ifdef ERRAVAIL
error("regexp: %s", s);
#else
fprintf(stderr, "regexp(3): %s\n", s);
exit(EXIT_FAILURE);
#endif
/* NOTREACHED */
register struct rerr *r;
register size_t len;
register int target = errcode &~ REG_ITOA;
register char *s;
char convbuf[50];
if (errcode == REG_ATOI)
s = regatoi(preg, convbuf);
else {
for (r = rerrs; r->code >= 0; r++)
if (r->code == target)
break;
if (errcode&REG_ITOA) {
if (r->code >= 0)
(void) strcpy(convbuf, r->name);
else
sprintf(convbuf, "REG_0x%x", target);
assert(strlen(convbuf) < sizeof(convbuf));
s = convbuf;
} else
s = r->explain;
}
len = strlen(s) + 1;
if (errbuf_size > 0) {
if (errbuf_size > len)
(void) strcpy(errbuf, s);
else {
(void) strncpy(errbuf, s, errbuf_size-1);
errbuf[errbuf_size-1] = '\0';
}
}
return(len);
}
/*
- regatoi - internal routine to implement REG_ATOI
== static char *regatoi(const regex_t *preg, char *localbuf);
*/
static char *
regatoi(const regex_t *preg, char *localbuf)
{
	register struct rerr *entry = rerrs;

	/*
	 * Scan the error table for an entry whose symbolic name equals the
	 * string found in preg->re_endp.  The table ends with a sentinel
	 * whose code is negative, so stopping there means "no such name".
	 */
	while (entry->code >= 0 && strcmp(entry->name, preg->re_endp) != 0)
		entry++;

	if (entry->code < 0)
		return("0");	/* unknown name: report it as the string "0" */

	/* Known name: render its numeric code into the caller's buffer. */
	sprintf(localbuf, "%d", entry->code);
	return(localbuf);
}

View File

@ -1,12 +0,0 @@
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === regerror.c === */
static char *regatoi(const regex_t *preg, char *localbuf);
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */

View File

@ -1,74 +0,0 @@
#ifndef _REGEX_H_
#define _REGEX_H_ /* never again */
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === regex2.h === */
typedef off_t regoff_t;
typedef struct {
int re_magic;
size_t re_nsub; /* number of parenthesized subexpressions */
const char *re_endp; /* end pointer for REG_PEND */
struct re_guts *re_g; /* none of your business :-) */
} regex_t;
typedef struct {
regoff_t rm_so; /* start of match */
regoff_t rm_eo; /* end of match */
} regmatch_t;
/* === regcomp.c === */
extern int regcomp(regex_t *, const char *, int);
#define REG_BASIC 0000
#define REG_EXTENDED 0001
#define REG_ICASE 0002
#define REG_NOSUB 0004
#define REG_NEWLINE 0010
#define REG_NOSPEC 0020
#define REG_PEND 0040
#define REG_DUMP 0200
/* === regerror.c === */
#define REG_OKAY 0
#define REG_NOMATCH 1
#define REG_BADPAT 2
#define REG_ECOLLATE 3
#define REG_ECTYPE 4
#define REG_EESCAPE 5
#define REG_ESUBREG 6
#define REG_EBRACK 7
#define REG_EPAREN 8
#define REG_EBRACE 9
#define REG_BADBR 10
#define REG_ERANGE 11
#define REG_ESPACE 12
#define REG_BADRPT 13
#define REG_EMPTY 14
#define REG_ASSERT 15
#define REG_INVARG 16
#define REG_ATOI 255 /* convert name to number (!) */
#define REG_ITOA 0400 /* convert number to name (!) */
extern size_t regerror(int, const regex_t *, char *, size_t);
/* === regexec.c === */
extern int regexec(const regex_t *, const char *, size_t, regmatch_t [], int);
#define REG_NOTBOL 00001
#define REG_NOTEOL 00002
#define REG_STARTEND 00004
#define REG_TRACE 00400 /* tracing of execution */
#define REG_LARGE 01000 /* force large representation */
#define REG_BACKR 02000 /* force use of backref code */
/* === regfree.c === */
extern void regfree(regex_t *);
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */
#endif

View File

@ -36,36 +36,36 @@
* In state representations, an operator's bit is on to signify a state
* immediately *preceding* "execution" of that operator.
*/
typedef unsigned long sop; /* strip operator */
typedef long sop; /* strip operator */
typedef long sopno;
#define OPRMASK 0xf8000000
#define OPDMASK 0x07ffffff
#define OPSHIFT ((unsigned)27)
#define OPRMASK 0x7c000000
#define OPDMASK 0x03ffffff
#define OPSHIFT (26)
#define OP(n) ((n)&OPRMASK)
#define OPND(n) ((n)&OPDMASK)
#define SOP(op, opnd) ((op)|(opnd))
/* operators meaning operand */
/* (back, fwd are offsets) */
#define OEND (1ul<<OPSHIFT) /* endmarker - */
#define OCHAR (2ul<<OPSHIFT) /* character unsigned char */
#define OBOL (3ul<<OPSHIFT) /* left anchor - */
#define OEOL (4ul<<OPSHIFT) /* right anchor - */
#define OANY (5ul<<OPSHIFT) /* . - */
#define OANYOF (6ul<<OPSHIFT) /* [...] set number */
#define OBACK_ (7ul<<OPSHIFT) /* begin \d paren number */
#define O_BACK (8ul<<OPSHIFT) /* end \d paren number */
#define OPLUS_ (9ul<<OPSHIFT) /* + prefix fwd to suffix */
#define O_PLUS (10ul<<OPSHIFT) /* + suffix back to prefix */
#define OQUEST_ (11ul<<OPSHIFT) /* ? prefix fwd to suffix */
#define O_QUEST (12ul<<OPSHIFT) /* ? suffix back to prefix */
#define OLPAREN (13ul<<OPSHIFT) /* ( fwd to ) */
#define ORPAREN (14ul<<OPSHIFT) /* ) back to ( */
#define OCH_ (15ul<<OPSHIFT) /* begin choice fwd to OOR2 */
#define OOR1 (16ul<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */
#define OOR2 (17ul<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */
#define O_CH (18ul<<OPSHIFT) /* end choice back to OOR1 */
#define OBOW (19ul<<OPSHIFT) /* begin word - */
#define OEOW (20ul<<OPSHIFT) /* end word - */
#define OEND (1<<OPSHIFT) /* endmarker - */
#define OCHAR (2<<OPSHIFT) /* character unsigned char */
#define OBOL (3<<OPSHIFT) /* left anchor - */
#define OEOL (4<<OPSHIFT) /* right anchor - */
#define OANY (5<<OPSHIFT) /* . - */
#define OANYOF (6<<OPSHIFT) /* [...] set number */
#define OBACK_ (7<<OPSHIFT) /* begin \d paren number */
#define O_BACK (8<<OPSHIFT) /* end \d paren number */
#define OPLUS_ (9<<OPSHIFT) /* + prefix fwd to suffix */
#define O_PLUS (10<<OPSHIFT) /* + suffix back to prefix */
#define OQUEST_ (11<<OPSHIFT) /* ? prefix fwd to suffix */
#define O_QUEST (12<<OPSHIFT) /* ? suffix back to prefix */
#define OLPAREN (13<<OPSHIFT) /* ( fwd to ) */
#define ORPAREN (14<<OPSHIFT) /* ) back to ( */
#define OCH_ (15<<OPSHIFT) /* begin choice fwd to OOR2 */
#define OOR1 (16<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */
#define OOR2 (17<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */
#define O_CH (18<<OPSHIFT) /* end choice back to OOR1 */
#define OBOW (19<<OPSHIFT) /* begin word - */
#define OEOW (20<<OPSHIFT) /* end word - */
/*
* Structure for [] character-set representation. Character sets are

View File

@ -19,27 +19,27 @@
static int nope = 0; /* for use in asserts; shuts lint up */
/* macros for manipulating states, small version */
#define states long
#define states1 states /* for later use in regexec() decision */
#define states unsigned
#define states1 unsigned /* for later use in regexec() decision */
#define CLEAR(v) ((v) = 0)
#define SET0(v, n) ((v) &= ~(1ul << (n)))
#define SET1(v, n) ((v) |= 1ul << (n))
#define ISSET(v, n) ((v) & (1ul << (n)))
#define SET0(v, n) ((v) &= ~((unsigned)1 << (n)))
#define SET1(v, n) ((v) |= (unsigned)1 << (n))
#define ISSET(v, n) ((v) & ((unsigned)1 << (n)))
#define ASSIGN(d, s) ((d) = (s))
#define EQ(a, b) ((a) == (b))
#define STATEVARS int dummy /* dummy version */
#define STATESETUP(m, n) /* nothing */
#define STATETEARDOWN(m) /* nothing */
#define SETUP(v) ((v) = 0)
#define onestate long
#define INIT(o, n) ((o) = (unsigned long)1 << (n))
#define onestate unsigned
#define INIT(o, n) ((o) = (unsigned)1 << (n))
#define INC(o) ((o) <<= 1)
#define ISSTATEIN(v, o) ((v) & (o))
/* some abbreviations; note that some of these know variable names! */
/* do "if I'm here, I can also be there" etc without branches */
#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n))
#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n))
#define ISSETBACK(v, n) ((v) & ((unsigned long)here >> (n)))
#define FWD(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) << (n))
#define BACK(dst, src, n) ((dst) |= ((unsigned)(src)&(here)) >> (n))
#define ISSETBACK(v, n) ((v) & ((unsigned)here >> (n)))
/* function names */
#define SNAMES /* engine.c looks after details */

View File

@ -1,186 +0,0 @@
.TH REGEXP 3 "2 Sept 1995"
.SH NAME
regcomp, regexec, regsub, regerror \- regular expression handler
.SH SYNOPSIS
.ft B
.nf
#include <regexp.h>
regexp *regcomp(exp)
const char *exp;
int regexec(prog, string)
regexp *prog;
const char *string;
void regsub(prog, source, dest)
const regexp *prog;
const char *source;
char *dest;
void regerror(msg)
char *msg;
.SH DESCRIPTION
These functions implement
.IR egrep (1)-style
regular expressions and supporting facilities.
.PP
.I Regcomp
compiles a regular expression into a structure of type
.IR regexp ,
and returns a pointer to it.
The space has been allocated using
.IR malloc (3)
and may be released by
.IR free .
.PP
.I Regexec
matches a NUL-terminated \fIstring\fR against the compiled regular expression
in \fIprog\fR.
It returns 1 for success and 0 for failure, and adjusts the contents of
\fIprog\fR's \fIstartp\fR and \fIendp\fR (see below) accordingly.
.PP
The members of a
.I regexp
structure include at least the following (not necessarily in order):
.PP
.RS
char *startp[NSUBEXP];
.br
char *endp[NSUBEXP];
.RE
.PP
where
.I NSUBEXP
is defined (as 10) in the header file.
Once a successful \fIregexec\fR has been done using the \fIregexp\fR,
each \fIstartp\fR-\fIendp\fR pair describes one substring
within the \fIstring\fR,
with the \fIstartp\fR pointing to the first character of the substring and
the \fIendp\fR pointing to the first character following the substring.
The 0th substring is the substring of \fIstring\fR that matched the whole
regular expression.
The others are those substrings that matched parenthesized expressions
within the regular expression, with parenthesized expressions numbered
in left-to-right order of their opening parentheses.
.PP
.I Regsub
copies \fIsource\fR to \fIdest\fR, making substitutions according to the
most recent \fIregexec\fR performed using \fIprog\fR.
Each instance of `&' in \fIsource\fR is replaced by the substring
indicated by \fIstartp\fR[\fI0\fR] and
\fIendp\fR[\fI0\fR].
Each instance of `\e\fIn\fR', where \fIn\fR is a digit, is replaced by
the substring indicated by
\fIstartp\fR[\fIn\fR] and
\fIendp\fR[\fIn\fR].
To get a literal `&' or `\e\fIn\fR' into \fIdest\fR, prefix it with `\e';
to get a literal `\e' preceding `&' or `\e\fIn\fR', prefix it with
another `\e'.
.PP
.I Regerror
is called whenever an error is detected in \fIregcomp\fR, \fIregexec\fR,
or \fIregsub\fR.
The default \fIregerror\fR writes the string \fImsg\fR,
with a suitable indicator of origin,
on the standard
error output
and invokes \fIexit\fR(2).
.I Regerror
can be replaced by the user if other actions are desirable.
.SH "REGULAR EXPRESSION SYNTAX"
A regular expression is zero or more \fIbranches\fR, separated by `|'.
It matches anything that matches one of the branches.
.PP
A branch is zero or more \fIpieces\fR, concatenated.
It matches a match for the first, followed by a match for the second, etc.
.PP
A piece is an \fIatom\fR possibly followed by `*', `+', or `?'.
An atom followed by `*' matches a sequence of 0 or more matches of the atom.
An atom followed by `+' matches a sequence of 1 or more matches of the atom.
An atom followed by `?' matches a match of the atom, or the null string.
.PP
An atom is a regular expression in parentheses (matching a match for the
regular expression), a \fIrange\fR (see below), `.'
(matching any single character), `^' (matching the null string at the
beginning of the input string), `$' (matching the null string at the
end of the input string), a `\e' followed by a single character (matching
that character), or a single character with no other significance
(matching that character).
.PP
A \fIrange\fR is a sequence of characters enclosed in `[]'.
It normally matches any single character from the sequence.
If the sequence begins with `^',
it matches any single character \fInot\fR from the rest of the sequence.
If two characters in the sequence are separated by `\-', this is shorthand
for the full list of ASCII characters between them
(e.g. `[0-9]' matches any decimal digit).
To include a literal `]' in the sequence, make it the first character
(following a possible `^').
To include a literal `\-', make it the first or last character.
.SH AMBIGUITY
If a regular expression could match two different parts of the input string,
it will match the one which begins earliest.
If both begin in the same place but match different lengths, or match
the same length in different ways, life gets messier, as follows.
.PP
In general, the possibilities in a list of branches are considered in
left-to-right order, the possibilities for `*', `+', and `?' are
considered longest-first, nested constructs are considered from the
outermost in, and concatenated constructs are considered leftmost-first.
The match that will be chosen is the one that uses the earliest
possibility in the first choice that has to be made.
If there is more than one choice, the next will be made in the same manner
(earliest possibility) subject to the decision on the first choice.
And so forth.
.PP
For example, `(ab|a)b*c' could match `abc' in one of two ways.
The first choice is between `ab' and `a'; since `ab' is earlier, and does
lead to a successful overall match, it is chosen.
Since the `b' is already spoken for,
the `b*' must match its last possibility\(emthe empty string\(emsince
it must respect the earlier choice.
.PP
In the particular case where the regular expression does not use `|'
and does not apply `*', `+', or `?' to parenthesized subexpressions,
the net effect is that the longest possible
match will be chosen.
So `ab*', presented with `xabbbby', will match `abbbb'.
Note that if `ab*' is tried against `xabyabbbz', it
will match `ab' just after `x', due to the begins-earliest rule.
(In effect, the decision on where to start the match is the first choice
to be made, hence subsequent choices must respect it even if this leads them
to less-preferred alternatives.)
.SH SEE ALSO
egrep(1), expr(1)
.SH DIAGNOSTICS
\fIRegcomp\fR returns NULL for a failure
(\fIregerror\fR permitting),
where failures are syntax errors, exceeding implementation limits,
or applying `+' or `*' to a possibly-null operand.
.SH HISTORY
This is a revised version.
Both code and manual page were
originally written by Henry Spencer at University of Toronto.
They are intended to be compatible with the Bell V8 \fIregexp\fR(3),
but are not derived from Bell code.
.SH BUGS
Empty branches and empty regular expressions are not portable
to other, otherwise-similar, implementations.
.PP
The ban on
applying `*' or `+' to a possibly-null operand is an artifact of the
simplistic implementation.
.PP
The match-choice rules are complex.
A simple ``longest match'' rule would be preferable,
but is harder to implement.
.PP
Although there is a general similarity to POSIX.2 ``extended'' regular
expressions, neither the regular-expression syntax nor the programming
interface is an exact match.
.PP
Due to emphasis on
compactness and simplicity,
it's not strikingly fast.
It does give some attention to handling simple cases quickly.

File diff suppressed because it is too large Load Diff

View File

@ -1,27 +0,0 @@
/*
* Definitions etc. for regexp(3) routines.
*
* Caveat: this is V8 regexp(3) [actually, a reimplementation thereof],
* not the System V one.
*/
#define NSUBEXP 10
/*
 * Compiled regular expression as returned by regcomp().
 * startp/endp are the caller-visible match results; the remaining
 * members are marked for internal use by the matcher.
 */
typedef struct regexp {
const char *startp[NSUBEXP]; /* after a successful regexec: first char of substring n (0 = whole match) */
const char *endp[NSUBEXP]; /* first char following substring n; regsub code treats NULL as "no match" */
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
int regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
extern regexp *regcomp(const char *re);
extern int regexec(regexp *rp, const char *s);
extern void regsub(const regexp *rp, const char *src, char *dst);
extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len);
extern size_t regsublen(const regexp *rp, const char *src);
extern void regerror(char *message);
extern size_t regcomp_len(const char *exp);
extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len);

View File

@ -1,5 +0,0 @@
/*
* The first byte of the regexp internal "program" is actually this magic
* number; the start node begins in the second byte.
*/
#define MAGIC 0234

View File

@ -1,131 +0,0 @@
/*
* regsub
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <regexp.h>
#include "regmagic.h"
/*
- regsub - perform substitutions after a regexp match
*/
/*
 * Classic three-argument entry point.  All of the work is delegated to
 * regnsub(), with BUFSIZ passed as the size of the dest buffer.
 */
void
regsub(const regexp *rp, const char *source, char *dest)
{
	regnsub(rp, source, dest, BUFSIZ);
}
/*
- regnsub - perform bounds-checked substitutions after a regexp match
*/
/*
 * rp      - compiled regexp whose startp/endp describe the last match
 * source  - substitution template: `&' stands for the whole match,
 *           `\n' (n a digit) for subexpression n, `\&' and `\\' for the
 *           literal characters
 * dest    - output buffer
 * destlen - size of dest in bytes, terminating NUL included
 *
 * Problems are reported through regerror(); if regerror() returns, the
 * function returns immediately and dest is left without a terminator.
 */
void
regnsub(const regexp *rp, const char *source, char *dest, size_t destlen)
{
	const char *src = source;
	char *dst = dest;
	char *dstend;
	char c;
	int no;
	size_t len;

	if (rp == NULL || source == NULL || dest == NULL) {
		regerror("NULL parameter to regsub");
		return;
	}
	if ((unsigned char)*(rp->program) != MAGIC) {
		regerror("damaged regexp");
		return;
	}
	/*
	 * A zero-length buffer cannot even hold the terminator.  Refuse it
	 * here so that nothing below ever stores through dest[0].
	 */
	if (destlen == 0) {
		regerror("output buffer too small");
		return;
	}
	dstend = dest + destlen;

	/* Invariant at the top of each iteration: dst < dstend. */
	while ((c = *src++) != '\0') {
		if (c == '&')
			no = 0;
		else if (c == '\\' && isdigit((unsigned char)*src))
			no = *src++ - '0';	/* cast above: isdigit() needs a non-negative value */
		else
			no = -1;

		if (no < 0) {	/* Ordinary character. */
			if (c == '\\' && (*src == '\\' || *src == '&'))
				c = *src++;
			*dst++ = c;
			/* Must still leave room for the terminating NUL. */
			if (dst >= dstend) {
				regerror("output buffer too small");
				return;
			}
		} else if (rp->startp[no] != NULL && rp->endp[no] != NULL &&
			   rp->endp[no] > rp->startp[no]) {
			len = (size_t)(rp->endp[no] - rp->startp[no]);
			/*
			 * Compare lengths rather than advancing dst first, so
			 * no out-of-range pointer is ever formed.  ">=" keeps
			 * one byte in reserve for the terminating NUL.
			 */
			if (len >= (size_t)(dstend - dst)) {
				regerror("output buffer too small");
				return;
			}
			(void) strncpy(dst, rp->startp[no], len);
			dst += len;
			if (*(dst-1) == '\0') {	/* strncpy hit NUL. */
				regerror("damaged match string");
				return;
			}
		}
	}
	*dst = '\0';
}
/*
 - regsublen - compute the buffer size regnsub() needs for a substitution
 *
 * rp     - compiled regexp whose startp/endp describe the last match
 * source - substitution template, same syntax as for regsub()
 *
 * Returns the number of bytes the expanded template occupies, counting
 * the terminating NUL.  On a NULL argument or a damaged regexp the
 * problem is reported through regerror() and, if that returns,
 * (size_t)-1 is returned.
 */
size_t
regsublen(const regexp *rp, const char *source)
{
	const char *src = source;
	size_t len = 0;		/* size_t, to match the return type */
	char c;
	int no;

	if (rp == NULL || source == NULL) {
		regerror("NULL parameter to regsublen");
		return (size_t)-1;
	}
	if ((unsigned char)*(rp->program) != MAGIC) {
		regerror("damaged regexp");
		return (size_t)-1;
	}

	while ((c = *src++) != '\0') {
		if (c == '&')
			no = 0;
		else if (c == '\\' && isdigit((unsigned char)*src))
			no = *src++ - '0';	/* cast above: isdigit() needs a non-negative value */
		else
			no = -1;

		if (no < 0) {	/* Ordinary character. */
			/* An escaped `\' or `&' contributes a single output byte. */
			if (c == '\\' && (*src == '\\' || *src == '&'))
				src++;
			len++;
		} else {
			const char *s = rp->startp[no];
			const char *e = rp->endp[no];

			/* Unset or empty subexpressions expand to nothing. */
			if (s != NULL && e != NULL && e > s)
				len += (size_t)(e - s);
		}
	}
	return len + 1;		/* +1 for the terminating NUL */
}

View File

@ -1,127 +1,477 @@
abc abc y & abc
abc xbc n - -
abc axc n - -
abc abx n - -
abc xabcy y & abc
abc ababc y & abc
ab*c abc y & abc
ab*bc abc y & abc
ab*bc abbc y & abbc
ab*bc abbbbc y & abbbbc
ab+bc abbc y & abbc
ab+bc abc n - -
ab+bc abq n - -
ab+bc abbbbc y & abbbbc
ab?bc abbc y & abbc
ab?bc abc y & abc
ab?bc abbbbc n - -
ab?c abc y & abc
^abc$ abc y & abc
^abc$ abcc n - -
^abc abcc y & abc
^abc$ aabc n - -
abc$ aabc y & abc
^ abc y &
$ abc y &
a.c abc y & abc
a.c axc y & axc
a.*c axyzc y & axyzc
a.*c axyzd n - -
a[bc]d abc n - -
a[bc]d abd y & abd
a[b-d]e abd n - -
a[b-d]e ace y & ace
a[b-d] aac y & ac
a[-b] a- y & a-
a[b-] a- y & a-
[k] ab n - -
a[b-a] - c - -
a[]b - c - -
a[ - c - -
a] a] y & a]
a[]]b a]b y & a]b
a[^bc]d aed y & aed
a[^bc]d abd n - -
a[^-b]c adc y & adc
a[^-b]c a-c n - -
a[^]b]c a]c n - -
a[^]b]c adc y & adc
ab|cd abc y & ab
ab|cd abcd y & ab
()ef def y &-\1 ef-
()* - c - -
*a - c - -
^* - c - -
$* - c - -
(*)b - c - -
$b b n - -
a\ - c - -
a\(b a(b y &-\1 a(b-
a\(*b ab y & ab
a\(*b a((b y & a((b
a\\b a\b y & a\b
abc) - c - -
(abc - c - -
((a)) abc y &-\1-\2 a-a-a
(a)b(c) abc y &-\1-\2 abc-a-c
a+b+c aabbabc y & abc
a** - c - -
a*? - c - -
(a*)* - c - -
(a*)+ - c - -
(a|)* - c - -
(a*|b)* - c - -
(a+|b)* ab y &-\1 ab-b
(a+|b)+ ab y &-\1 ab-b
(a+|b)? ab y &-\1 a-a
[^ab]* cde y & cde
(^)* - c - -
(ab|)* - c - -
)( - c - -
abc y &
abc n - -
a* y &
abcd abcd y &-\&-\\& abcd-&-\abcd
a(bc)d abcd y \1-\\1-\\\1 bc-\1-\bc
([abc])*d abbbcd y &-\1 abbbcd-c
([abc])*bcd abcd y &-\1 abcd-a
a|b|c|d|e e y & e
(a|b|c|d|e)f ef y &-\1 ef-e
((a*|b))* - c - -
abcd*efg abcdefg y & abcdefg
ab* xabyabbbz y & ab
ab* xayabbbz y & a
(ab|cd)e abcde y &-\1 cde-cd
[abhgefdc]ij hij y & hij
^(ab|cd)e abcde n x\1y xy
(abc|)ef abcdef y &-\1 ef-
(a|b)c*d abcd y &-\1 bcd-b
(ab|ab*)bc abc y &-\1 abc-a
a([bc]*)c* abc y &-\1 abc-bc
a([bc]*)(c*d) abcd y &-\1-\2 abcd-bc-d
a([bc]+)(c*d) abcd y &-\1-\2 abcd-bc-d
a([bc]*)(c+d) abcd y &-\1-\2 abcd-b-cd
a[bcd]*dcdcde adcdcde y & adcdcde
a[bcd]+dcdcde adcdcde n - -
(ab|a)b*c abc y &-\1 abc-ab
((a)(b)c)(d) abcd y \1-\2-\3-\4 abc-a-b-d
[ -~]* abc y & abc
[ -~ -~]* abc y & abc
[ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~ -~ -~]* abc y & abc
[ -~ -~ -~ -~ -~ -~ -~]* abc y & abc
[a-zA-Z_][a-zA-Z0-9_]* alpha y & alpha
^a(bc+|b[eh])g|.h$ abh y &-\1 bh-
(bc+d$|ef*g.|h?i(j|k)) effgz y &-\1-\2 effgz-effgz-
(bc+d$|ef*g.|h?i(j|k)) ij y &-\1-\2 ij-ij-j
(bc+d$|ef*g.|h?i(j|k)) effg n - -
(bc+d$|ef*g.|h?i(j|k)) bcdd n - -
(bc+d$|ef*g.|h?i(j|k)) reffgz y &-\1-\2 effgz-effgz-
((((((((((a)))))))))) - c - -
(((((((((a))))))))) a y & a
multiple words of text uh-uh n - -
multiple words multiple words, yeah y & multiple words
(.*)c(.*) abcde y &-\1-\2 abcde-ab-de
\((.*), (.*)\) (a, b) y (\2, \1) (b, a)
# regular expression test set
# Lines are at least three fields, separated by one or more tabs. "" stands
# for an empty field. First field is an RE. Second field is flags. If
# C flag given, regcomp() is expected to fail, and the third field is the
# error name (minus the leading REG_).
#
# Otherwise it is expected to succeed, and the third field is the string to
# try matching it against. If there is no fourth field, the match is
# expected to fail. If there is a fourth field, it is the substring that
# the RE is expected to match. If there is a fifth field, it is a comma-
# separated list of what the subexpressions should match, with - indicating
# no match for that one. In both the fourth and fifth fields, a (sub)field
# starting with @ indicates that the (sub)expression is expected to match
# a null string followed by the stuff after the @; this provides a way to
# test where null strings match. The character `N' in REs and strings
# is newline, `S' is space, `T' is tab, `Z' is NUL.
#
# The full list of flags:
# - placeholder, does nothing
# b RE is a BRE, not an ERE
# & try it as both an ERE and a BRE
# C regcomp() error expected, third field is error name
# i REG_ICASE
# m ("mundane") REG_NOSPEC
# s REG_NOSUB (not really testable)
# n REG_NEWLINE
# ^ REG_NOTBOL
# $ REG_NOTEOL
# # REG_STARTEND (see below)
# p REG_PEND
#
# For REG_STARTEND, the start/end offsets are those of the substring
# enclosed in ().
# basics
a & a a
abc & abc abc
abc|de - abc abc
a|b|c - abc a
# parentheses and perversions thereof
a(b)c - abc abc
a\(b\)c b abc abc
a( C EPAREN
a( b a( a(
a\( - a( a(
a\( bC EPAREN
a\(b bC EPAREN
a(b C EPAREN
a(b b a(b a(b
# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly)
a) - a) a)
) - ) )
# end gagging (in a just world, those *should* give EPAREN)
a) b a) a)
a\) bC EPAREN
\) bC EPAREN
a()b - ab ab
a\(\)b b ab ab
# anchoring and REG_NEWLINE
^abc$ & abc abc
a^b - a^b
a^b b a^b a^b
a$b - a$b
a$b b a$b a$b
^ & abc @abc
$ & abc @
^$ & "" @
$^ - "" @
\($\)\(^\) b "" @
# stop retching, those are legitimate (although disgusting)
^^ - "" @
$$ - "" @
b$ & abNc
b$ &n abNc b
^b$ & aNbNc
^b$ &n aNbNc b
^$ &n aNNb @Nb
^$ n abc
^$ n abcN @
$^ n aNNb @Nb
\($\)\(^\) bn aNNb @Nb
^^ n^ aNNb @Nb
$$ n aNNb @NN
^a ^ a
a$ $ a
^a ^n aNb
^b ^n aNb b
a$ $n bNa
b$ $n bNa b
a*(^b$)c* - b b
a*\(^b$\)c* b b b
# certain syntax errors and non-errors
| C EMPTY
| b | |
* C BADRPT
* b * *
+ C BADRPT
? C BADRPT
"" &C EMPTY
() - abc @abc
\(\) b abc @abc
a||b C EMPTY
|ab C EMPTY
ab| C EMPTY
(|a)b C EMPTY
(a|)b C EMPTY
(*a) C BADRPT
(+a) C BADRPT
(?a) C BADRPT
({1}a) C BADRPT
\(\{1\}a\) bC BADRPT
(a|*b) C BADRPT
(a|+b) C BADRPT
(a|?b) C BADRPT
(a|{1}b) C BADRPT
^* C BADRPT
^* b * *
^+ C BADRPT
^? C BADRPT
^{1} C BADRPT
^\{1\} bC BADRPT
# metacharacters, backslashes
a.c & abc abc
a[bc]d & abd abd
a\*c & a*c a*c
a\\b & a\b a\b
a\\\*b & a\*b a\*b
a\bc & abc abc
a\ &C EESCAPE
a\\bc & a\bc a\bc
\{ bC BADRPT
a\[b & a[b a[b
a[b &C EBRACK
# trailing $ is a peculiar special case for the BRE code
a$ & a a
a$ & a$
a\$ & a
a\$ & a$ a$
a\\$ & a
a\\$ & a$
a\\$ & a\$
a\\$ & a\ a\
# back references, ugh
a\(b\)\2c bC ESUBREG
a\(b\1\)c bC ESUBREG
a\(b*\)c\1d b abbcbbd abbcbbd bb
a\(b*\)c\1d b abbcbd
a\(b*\)c\1d b abbcbbbd
^\(.\)\1 b abc
a\([bc]\)\1d b abcdabbd abbd b
a\(\([bc]\)\2\)*d b abbccd abbccd
a\(\([bc]\)\2\)*d b abbcbd
# actually, this next one probably ought to fail, but the spec is unclear
a\(\(b\)*\2\)*d b abbbd abbbd
# here is a case that no NFA implementation does right
\(ab*\)[ab]*\1 b ababaaa ababaaa a
# check out normal matching in the presence of back refs
\(a\)\1bcd b aabcd aabcd
\(a\)\1bc*d b aabcd aabcd
\(a\)\1bc*d b aabd aabd
\(a\)\1bc*d b aabcccd aabcccd
\(a\)\1bc*[ce]d b aabcccd aabcccd
^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd
# ordinary repetitions
ab*c & abc abc
ab+c - abc abc
ab?c - abc abc
a\(*\)b b a*b a*b
a\(**\)b b ab ab
a\(***\)b bC BADRPT
*a b *a *a
**a b a a
***a bC BADRPT
# the dreaded bounded repetitions
{ & { {
{abc & {abc {abc
{1 C BADRPT
{1} C BADRPT
a{b & a{b a{b
a{1}b - ab ab
a\{1\}b b ab ab
a{1,}b - ab ab
a\{1,\}b b ab ab
a{1,2}b - aab aab
a\{1,2\}b b aab aab
a{1 C EBRACE
a\{1 bC EBRACE
a{1a C EBRACE
a\{1a bC EBRACE
a{1a} C BADBR
a\{1a\} bC BADBR
a{,2} - a{,2} a{,2}
a\{,2\} bC BADBR
a{,} - a{,} a{,}
a\{,\} bC BADBR
a{1,x} C BADBR
a\{1,x\} bC BADBR
a{1,x C EBRACE
a\{1,x bC EBRACE
a{300} C BADBR
a\{300\} bC BADBR
a{1,0} C BADBR
a\{1,0\} bC BADBR
ab{0,0}c - abcac ac
ab\{0,0\}c b abcac ac
ab{0,1}c - abcac abc
ab\{0,1\}c b abcac abc
ab{0,3}c - abbcac abbc
ab\{0,3\}c b abbcac abbc
ab{1,1}c - acabc abc
ab\{1,1\}c b acabc abc
ab{1,3}c - acabc abc
ab\{1,3\}c b acabc abc
ab{2,2}c - abcabbc abbc
ab\{2,2\}c b abcabbc abbc
ab{2,4}c - abcabbc abbc
ab\{2,4\}c b abcabbc abbc
((a{1,10}){1,10}){1,10} - a a a,a
# multiple repetitions
a** &C BADRPT
a++ C BADRPT
a?? C BADRPT
a*+ C BADRPT
a*? C BADRPT
a+* C BADRPT
a+? C BADRPT
a?* C BADRPT
a?+ C BADRPT
a{1}{1} C BADRPT
a*{1} C BADRPT
a+{1} C BADRPT
a?{1} C BADRPT
a{1}* C BADRPT
a{1}+ C BADRPT
a{1}? C BADRPT
a*{b} - a{b} a{b}
a\{1\}\{1\} bC BADRPT
a*\{1\} bC BADRPT
a\{1\}* bC BADRPT
# brackets, and numerous perversions thereof
a[b]c & abc abc
a[ab]c & abc abc
a[^ab]c & adc adc
a[]b]c & a]c a]c
a[[b]c & a[c a[c
a[-b]c & a-c a-c
a[^]b]c & adc adc
a[^-b]c & adc adc
a[b-]c & a-c a-c
a[b &C EBRACK
a[] &C EBRACK
a[1-3]c & a2c a2c
a[3-1]c &C ERANGE
a[1-3-5]c &C ERANGE
a[[.-.]--]c & a-c a-c
a[1- &C ERANGE
a[[. &C EBRACK
a[[.x &C EBRACK
a[[.x. &C EBRACK
a[[.x.] &C EBRACK
a[[.x.]] & ax ax
a[[.x,.]] &C ECOLLATE
a[[.one.]]b & a1b a1b
a[[.notdef.]]b &C ECOLLATE
a[[.].]]b & a]b a]b
a[[:alpha:]]c & abc abc
a[[:notdef:]]c &C ECTYPE
a[[: &C EBRACK
a[[:alpha &C EBRACK
a[[:alpha:] &C EBRACK
a[[:alpha,:] &C ECTYPE
a[[:]:]]b &C ECTYPE
a[[:-:]]b &C ECTYPE
a[[:alph:]] &C ECTYPE
a[[:alphabet:]] &C ECTYPE
[[:alnum:]]+ - -%@a0X- a0X
[[:alpha:]]+ - -%@aX0- aX
[[:blank:]]+ - aSSTb SST
[[:cntrl:]]+ - aNTb NT
[[:digit:]]+ - a019b 019
[[:graph:]]+ - Sa%bS a%b
[[:lower:]]+ - AabC ab
[[:print:]]+ - NaSbN aSb
[[:punct:]]+ - S%-&T %-&
[[:space:]]+ - aSNTb SNT
[[:upper:]]+ - aBCd BC
[[:xdigit:]]+ - p0f3Cq 0f3C
a[[=b=]]c & abc abc
a[[= &C EBRACK
a[[=b &C EBRACK
a[[=b= &C EBRACK
a[[=b=] &C EBRACK
a[[=b,=]] &C ECOLLATE
a[[=one=]]b & a1b a1b
# complexities
a(((b)))c - abc abc
a(b|(c))d - abd abd
a(b*|c)d - abbd abbd
# just gotta have one DFA-buster, of course
a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
# and an inline expansion in case somebody gets tricky
a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
# and in case somebody just slips in an NFA...
a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights
# fish for anomalies as the number of states passes 32
12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789
123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890
1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901
12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012
123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123
# and one really big one, beyond any plausible word width
1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890
# fish for problems as brackets go past 8
[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm
[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo
[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq
[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq
# subtleties of matching
abc & xabcy abc
a\(b\)?c\1d b acd
aBc i Abc Abc
a[Bc]*d i abBCcd abBCcd
0[[:upper:]]1 &i 0a1 0a1
0[[:lower:]]1 &i 0A1 0A1
a[^b]c &i abc
a[^b]c &i aBc
a[^b]c &i adc adc
[a]b[c] - abc abc
[a]b[a] - aba aba
[abc]b[abc] - abc abc
[abc]b[abd] - abd abd
a(b?c)+d - accd accd
(wee|week)(knights|night) - weeknights weeknights
(we|wee|week|frob)(knights|night|day) - weeknights weeknights
a[bc]d - xyzaaabcaababdacd abd
a[ab]c - aaabc abc
abc s abc abc
a* & b @b
# Let's have some fun -- try to match a C comment.
# first the obvious, which looks okay at first glance...
/\*.*\*/ - /*x*/ /*x*/
# but...
/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/
# okay, we must not match */ inside; try to do that...
/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/
/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/
# but...
/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/
# and a still fancier version, which does it right (I think)...
/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/
/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/
/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/
/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/
/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/
/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/
# subexpressions
.* - abc abc -
a(b)(c)d - abcd abcd b,c
a(((b)))c - abc abc b,b,b
a(b|(c))d - abd abd b,-
a(b*|c|e)d - abbd abbd bb
a(b*|c|e)d - acd acd c
a(b*|c|e)d - ad ad @d
a(b?)c - abc abc b
a(b?)c - ac ac @c
a(b+)c - abc abc b
a(b+)c - abbbc abbbc bbb
a(b*)c - ac ac @c
(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de
# the regression tester only asks for 9 subexpressions
a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j
a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k
a([bc]?)c - abc abc b
a([bc]?)c - ac ac @c
a([bc]+)c - abc abc b
a([bc]+)c - abcc abcc bc
a([bc]+)bc - abcbc abcbc bc
a(bb+|b)b - abb abb b
a(bbb+|bb+|b)b - abb abb b
a(bbb+|bb+|b)b - abbb abbb bb
a(bbb+|bb+|b)bb - abbb abbb b
(.*).* - abcdef abcdef abcdef
(a*)* - bc @b @b
# do we get the right subexpression when it is used more than once?
a(b|c)*d - ad ad -
a(b|c)*d - abcd abcd c
a(b|c)+d - abd abd b
a(b|c)+d - abcd abcd c
a(b|c?)+d - ad ad @d
a(b|c?)+d - abcd abcd @d
a(b|c){0,0}d - ad ad -
a(b|c){0,1}d - ad ad -
a(b|c){0,1}d - abd abd b
a(b|c){0,2}d - ad ad -
a(b|c){0,2}d - abcd abcd c
a(b|c){0,}d - ad ad -
a(b|c){0,}d - abcd abcd c
a(b|c){1,1}d - abd abd b
a(b|c){1,1}d - acd acd c
a(b|c){1,2}d - abd abd b
a(b|c){1,2}d - abcd abcd c
a(b|c){1,}d - abd abd b
a(b|c){1,}d - abcd abcd c
a(b|c){2,2}d - acbd acbd b
a(b|c){2,2}d - abcd abcd c
a(b|c){2,4}d - abcd abcd c
a(b|c){2,4}d - abcbd abcbd b
a(b|c){2,4}d - abcbcd abcbcd c
a(b|c){2,}d - abcd abcd c
a(b|c){2,}d - abcbd abcbd b
a(b+|((c)*))+d - abd abd @d,@d,-
a(b+|((c)*))+d - abcd abcd @d,@d,-
# check out the STARTEND option
[abc] &# a(b)c b
[abc] &# a(d)c
[abc] &# a(bc)d b
[abc] &# a(dc)d c
. &# a()c
b.*c &# b(bc)c bc
b.* &# b(bc)c bc
.*c &# b(bc)c bc
# plain strings, with the NOSPEC flag
abc m abc abc
abc m xabcy abc
abc m xyz
a*b m aba*b a*b
a*b m ab
"" mC EMPTY
# cases involving NULs
aZb & a a
aZb &p a
aZb &p# (aZb) aZb
aZ*b &p# (ab) ab
a.b &# (aZb) aZb
a.* &# (aZb)c aZb
# word boundaries (ick)
[[:<:]]a & a a
[[:<:]]a & ba
[[:<:]]a & -a a
a[[:>:]] & a a
a[[:>:]] & ab
a[[:>:]] & a- a
[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc
[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc
[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc
[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc
[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_
[[:<:]]a_b[[:>:]] & x_a_b
# past problems, and suspected problems
(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1
abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop
abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv
(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11
CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11
Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz
a?b - ab ab
-\{0,1\}[0-9]*$ b -5 -5
a*a*a*a*a*a*a* & aaaaaa aaaaaa

View File

@ -1,164 +0,0 @@
/*
* Simple timing program for regcomp().
* Usage: timer ncomp nexec nsub
* or
* timer ncomp nexec nsub regexp string [ answer [ sub ] ]
*
* The second form is for timing repetitions of a single test case.
* The first form's test data is a compiled-in copy of the "tests" file.
* Ncomp, nexec, nsub are how many times to do each regcomp, regexec,
* and regsub. The way to time an operation individually is to do something
* like "timer 1 50 1".
*/
#include <stdio.h>
/*
 * One timing case.  The table is filled in from the generated header
 * "timer.t.h" and closed by an all-NULL entry; multiple() stops at the
 * first entry whose re is NULL.
 */
struct try {
	char *re;	/* pattern handed to regcomp() */
	char *str;	/* subject string handed to regexec() */
	char *ans;	/* expected outcome; leading 'c' = regcomp must fail,
			 * leading 'n' = regexec must fail */
	char *src;	/* source string handed to regsub() */
	char *dst;	/* output regsub() is expected to produce */
} tests[] = {
#include "timer.t.h"
	{ NULL, NULL, NULL, NULL, NULL }	/* end-of-table sentinel */
};
#include <regexp.h>
/*
 * Error-reporting state shared by regerror(), try() and complain().
 */
int errreport = 0; /* Nonzero: regerror() stores its message in errseen instead of calling error(). */
char *errseen = NULL; /* Last message stored by regerror(), or NULL if none since the last reset. */
/* Assigned argv[0] in main(); not read anywhere else in the visible code. */
char *progname;
/*
 * main - parse the repetition counts and run either one test case taken
 * from the command line or the whole compiled-in table.
 *
 *	timer ncomp nexec nsub
 *	timer ncomp nexec nsub regexp string [ answer [ sub ] ]
 *
 * With fewer than three count arguments every count defaults to 1.
 * Always exits with status 0.
 */
/* ARGSUSED */
main(argc, argv)
int argc;
char *argv[];
{
	struct try one;
	char dummy[512];	/* never referenced in this function */
	int ncomp = 1;
	int nexec = 1;
	int nsub = 1;

	if (argc >= 4) {
		ncomp = atoi(argv[1]);
		nexec = atoi(argv[2]);
		nsub = atoi(argv[3]);
	}

	progname = argv[0];

	/* No regexp/string pair on the command line: time the whole table. */
	if (argc <= 5) {
		multiple(ncomp, nexec, nsub);
		exit(0);
	}

	/* Single-case form: build the case from the remaining arguments. */
	one.re = argv[4];
	one.str = argv[5];
	one.ans = (argc > 6) ? argv[6] : "y";
	one.src = (argc > 7) ? argv[7] : "x";
	one.dst = (argc > 7) ? "xxx" : "x";

	errreport = 1;
	try(one, ncomp, nexec, nsub);
	exit(0);
}
void
regerror(s)
char *s;
{
if (errreport)
errseen = s;
else
error(s, "");
}
#ifndef ERRAVAIL
/*
 * error - fallback used when ERRAVAIL is not defined.
 *
 * Writes "regexp: ", then s1 used as a printf format with s2 as its single
 * argument, then a newline, all to stderr; finally exits with status 1.
 */
error(s1, s2)
char *s1;
char *s2;
{
	fputs("regexp: ", stderr);
	fprintf(stderr, s1, s2);
	fputs("\n", stderr);
	exit(1);
}
#endif
int lineno = 0;
multiple(ncomp, nexec, nsub)
int ncomp, nexec, nsub;
{
register int i;
extern char *strchr();
errreport = 1;
for (i = 0; tests[i].re != NULL; i++) {
lineno++;
try(tests[i], ncomp, nexec, nsub);
}
}
/*
 * try - run one test case, repeating each phase the requested number of
 * times, and complain() about any result that contradicts fields.ans.
 *
 * fields	the test case (pattern, subject, expected answer, regsub
 *		source and expected regsub output)
 * ncomp	how many times to run regcomp(); it is always run at least once
 * nexec	how many times to run regexec(); it is always run at least once
 * nsub		how many times to run regsub(); zero or less skips regsub
 *		and the check of its output
 *
 * The first character of fields.ans selects the expectation: 'c' means
 * regcomp() must fail, 'n' means regexec() must fail, anything else means
 * both must succeed.
 */
try(fields, ncomp, nexec, nsub)
struct try fields;
int ncomp, nexec, nsub;
{
	regexp *r;
	char dbuf[BUFSIZ];
	register int i;

	errseen = NULL;
	r = regcomp(fields.re);
	if (r == NULL) {
		if (*fields.ans != 'c')
			complain("regcomp failure in `%s'", fields.re);
		return;
	}
	if (*fields.ans == 'c') {
		complain("unexpected regcomp success in `%s'", fields.re);
		free((char *)r);
		return;
	}

	/* One compile is already done; repeat the remaining ncomp-1. */
	for (i = ncomp-1; i > 0; i--) {
		free((char *)r);
		r = regcomp(fields.re);
	}

	if (!regexec(r, fields.str)) {
		if (*fields.ans != 'n') {
			/* Name the pattern, as the format's %s promises. */
			complain("regexec failure in `%s'", fields.re);
		}
		free((char *)r);
		return;
	}
	if (*fields.ans == 'n') {
		complain("unexpected regexec success", "");
		free((char *)r);
		return;
	}

	/* One match is already done; repeat the remaining nexec-1. */
	for (i = nexec-1; i > 0; i--)
		(void) regexec(r, fields.str);

	errseen = NULL;
	for (i = nsub; i > 0; i--)
		regsub(r, fields.src, dbuf);
	if (errseen != NULL) {
		complain("regsub complaint", "");
		free((char *)r);
		return;
	}
	/*
	 * dbuf is only written by regsub(); with nsub <= 0 it was never
	 * filled in, so there is nothing valid to compare.
	 */
	if (nsub > 0 && strcmp(dbuf, fields.dst) != 0)
		complain("regsub result `%s' wrong", dbuf);
	free((char *)r);
}
complain(s1, s2)
char *s1;
char *s2;
{
fprintf(stderr, "try: %d: ", lineno);
fprintf(stderr, s1, s2);
fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : "");
}

View File

@ -1,220 +0,0 @@
/*
* Simple test program for regexp(3) stuff. Knows about debugging hooks.
* Usage: try re [string [output [-]]]
* The re is compiled and dumped, regexeced against the string, the result
* is applied to output using regsub(). The - triggers a running narrative
* from regexec(). Dumping and narrative don't happen unless DEBUG.
*
* If there are no arguments, stdin is assumed to be a stream of lines with
* five fields: a r.e., a string to match it against, a result code, a
* source string for regsub, and the proper result. Result codes are 'c'
* for compile failure, 'y' for match success, 'n' for match failure.
* Field separator is tab.
*/
#include <stdio.h>
#include <regexp.h>
/* With ERRAVAIL, main() sets progname from mkprogname(argv[0]). */
#ifdef ERRAVAIL
char *progname;
extern char *mkprogname();
#endif
/* With DEBUG, main() increments regnarrate when more than four arguments are given. */
#ifdef DEBUG
extern int regnarrate;
#endif
/* Receives the regsub() output that main() prints. */
char buf[BUFSIZ];
int errreport = 0; /* Nonzero: regerror() stores its message in errseen instead of calling error(). */
char *errseen = NULL; /* Last message stored by regerror(), or NULL if none since the last reset. */
int status = 0; /* Exit status; complain() sets it to 1. */
/*
 * main - with no arguments, run the test stream on stdin via multiple().
 * Otherwise compile argv[1]; if a string is given, match against it and
 * print the regexec() result followed by " \N" for every subexpression N
 * whose start and end pointers are both set; if an output template is
 * given, run regsub() on it and print the result.
 *
 * Exits with the global status.
 */
/* ARGSUSED */
main(argc, argv)
int argc;
char *argv[];
{
	regexp *prog;
	int matched;
	int sub;

#ifdef ERRAVAIL
	progname = mkprogname(argv[0]);
#endif

	if (argc == 1) {
		multiple();
		exit(status);
	}

	prog = regcomp(argv[1]);
	if (prog == NULL)
		error("regcomp failure", "");

#ifdef DEBUG
	regdump(prog);
	if (argc > 4)
		regnarrate++;
#endif

	if (argc > 2) {
		matched = regexec(prog, argv[2]);
		printf("%d", matched);
		for (sub = 1; sub < NSUBEXP; sub++) {
			if (prog->startp[sub] == NULL || prog->endp[sub] == NULL)
				continue;
			printf(" \\%d", sub);
		}
		printf("\n");
	}

	if (argc > 3) {
		regsub(prog, argv[3], buf);
		printf("%s\n", buf);
	}

	exit(status);
}
void
regerror(s)
char *s;
{
if (errreport)
errseen = s;
else
error(s, "");
}
#ifndef ERRAVAIL
/*
 * error - fallback used when ERRAVAIL is not defined.
 *
 * Writes "regexp: ", then s1 used as a printf format with s2 as its single
 * argument, then a newline, all to stderr; finally exits with status 1.
 */
error(s1, s2)
char *s1;
char *s2;
{
	fputs("regexp: ", stderr);
	fprintf(stderr, s1, s2);
	fputs("\n", stderr);
	exit(1);
}
#endif
/* Input line number (or 999x code of an internal check); printed by complain(). */
int lineno;
regexp badregexp; /* Implicit init to 0. */

/*
 * multiple - read test cases from stdin, one per line, and run each
 * through try(); then run a fixed set of internal checks that the regexp
 * routines complain about NULL and nonsense arguments.
 *
 * Each input line holds five tab-separated fields: the r.e., the string
 * to match, the result code, the regsub source and the expected regsub
 * result.  A line with fewer than five fields is reported and the
 * program exits with status 1.
 */
multiple()
{
	char rbuf[BUFSIZ];
	char *field[5];
	char *scan;
	int i;
	int len;
	regexp *r;
	extern char *strchr();

	errreport = 1;
	lineno = 0;
	while (fgets(rbuf, sizeof(rbuf), stdin) != NULL) {
		/*
		 * Dispense with \n, but only if it is really there: a final
		 * line without a newline, or one longer than the buffer,
		 * must not lose its last character, and an empty string
		 * must not be indexed at -1.
		 */
		len = strlen(rbuf);
		if (len > 0 && rbuf[len-1] == '\n')
			rbuf[len-1] = '\0';
		lineno++;

		/* Split the line in place at the tabs. */
		scan = rbuf;
		for (i = 0; i < 5; i++) {
			field[i] = scan;
			if (field[i] == NULL) {
				/* Ran out of tabs before the fifth field. */
				complain("bad testfile format", "");
				exit(1);
			}
			scan = strchr(scan, '\t');
			if (scan != NULL)
				*scan++ = '\0';
		}
		try(field);
	}

	/*
	 * And finish up with some internal testing...  lineno is set to a
	 * distinct 999x value before each check so that a complaint
	 * identifies which one failed.
	 */
	lineno = 9990;
	errseen = NULL;
	if (regcomp((char *)NULL) != NULL || errseen == NULL)
		complain("regcomp(NULL) doesn't complain", "");

	lineno = 9991;
	errseen = NULL;
	if (regexec((regexp *)NULL, "foo") || errseen == NULL)
		complain("regexec(NULL, ...) doesn't complain", "");

	lineno = 9992;
	r = regcomp("foo");
	if (r == NULL) {
		complain("regcomp(\"foo\") fails", "");
		return;
	}

	lineno = 9993;
	errseen = NULL;
	if (regexec(r, (char *)NULL) || errseen == NULL)
		complain("regexec(..., NULL) doesn't complain", "");

	lineno = 9994;
	errseen = NULL;
	regsub((regexp *)NULL, "foo", rbuf);
	if (errseen == NULL)
		complain("regsub(NULL, ..., ...) doesn't complain", "");

	lineno = 9995;
	errseen = NULL;
	regsub(r, (char *)NULL, rbuf);
	if (errseen == NULL)
		complain("regsub(..., NULL, ...) doesn't complain", "");

	lineno = 9996;
	errseen = NULL;
	regsub(r, "foo", (char *)NULL);
	if (errseen == NULL)
		complain("regsub(..., ..., NULL) doesn't complain", "");

	lineno = 9997;
	errseen = NULL;
	if (regexec(&badregexp, "foo") || errseen == NULL)
		complain("regexec(nonsense, ...) doesn't complain", "");

	lineno = 9998;
	errseen = NULL;
	regsub(&badregexp, "foo", rbuf);
	if (errseen == NULL)
		complain("regsub(nonsense, ..., ...) doesn't complain", "");

	/* Release the program compiled for the checks above. */
	free((char *)r);
}
/*
 * try - run one test case and complain() about any unexpected outcome.
 *
 * fields[0]	the regular expression
 * fields[1]	the string to match it against
 * fields[2]	result code: leading 'c' = regcomp must fail, leading 'n' =
 *		regexec must fail, anything else = both must succeed
 * fields[3]	source string for regsub
 * fields[4]	expected regsub output
 */
try(fields)
char **fields;
{
	regexp *prog;
	char dbuf[BUFSIZ];
	char *re = fields[0];
	char expect = *fields[2];

	errseen = NULL;
	prog = regcomp(re);
	if (prog == NULL) {
		/* Nothing was allocated, so there is nothing to free. */
		if (expect != 'c')
			complain("regcomp failure in `%s'", re);
		return;
	}

	/* Every path from here on ends by releasing the compiled program. */
	if (expect == 'c') {
		complain("unexpected regcomp success in `%s'", re);
	} else if (!regexec(prog, fields[1])) {
		if (expect != 'n')
			complain("regexec failure in `%s'", re);
	} else if (expect == 'n') {
		complain("unexpected regexec success", "");
	} else {
		errseen = NULL;
		regsub(prog, fields[3], dbuf);
		if (errseen != NULL)
			complain("regsub complaint", "");
		else if (strcmp(dbuf, fields[4]) != 0)
			complain("regsub result `%s' wrong", dbuf);
	}
	free((char *)prog);
}
complain(s1, s2)
char *s1;
char *s2;
{
fprintf(stderr, "try: %d: ", lineno);
fprintf(stderr, s1, s2);
fprintf(stderr, " (%s)\n", (errseen != NULL) ? errseen : "");
status = 1;
}