scsh-0.5/scsh/regexp/patch-msg

804 lines
21 KiB
Plaintext

Date: Mon, 1 Jul 1996 23:22:47 GMT
From: Bill Sommerfeld <sommerfeld@orchard.medford.ma.us>
To: shivers@lcs.mit.edu, bdc@ai.mit.edu
Subject: scsh patch for precompiled regexps..
I meant to send this out months ago, but I was just too hosed with work.
Here's what I have right now:
There are three pieces here:
1. diffs to the "core" scsh
2. diffs to Henry Spencer's latest regexp library
3. a copy of Henry Spencer's latest regexp library
It appears to work (it passes the same regression tests as the C library).
Let me know if I didn't include something needed for this to work.
- Bill
diff -rc scsh-0.4.2/scsh/re.scm scsh-0.4.2-regexp/scsh/re.scm
*** scsh-0.4.2/scsh/re.scm Fri Oct 27 04:58:56 1995
--- scsh-0.4.2-regexp/scsh/re.scm Sat Apr 6 21:07:41 1996
***************
*** 34,49 ****
;;; Bogus stub definitions for low-level match routines:
! (define regexp? string?)
! (define (make-regexp str) str)
! (define (regexp-exec regexp str . maybe-start)
(let ((start (optional-arg maybe-start 0))
(start-vec (make-vector 10))
(end-vec (make-vector 10)))
! (and (%regexp-match regexp str start start-vec end-vec)
! (make-regexp-match str start-vec end-vec))))
!
;;; Convert a string into a regex pattern that matches that string exactly --
;;; in other words, quote the special chars with backslashes.
--- 34,53 ----
;;; Bogus stub definitions for low-level match routines:
! (define-record iregexp
! string)
! (define regexp? iregexp?)
!
! (define (make-regexp str)
! (make-iregexp (compile-regexp str)))
!
! (define (regexp-exec r s . maybe-start)
(let ((start (optional-arg maybe-start 0))
(start-vec (make-vector 10))
(end-vec (make-vector 10)))
! (and (%regexp-exec-1 (iregexp:string r) s start start-vec end-vec)
! (make-regexp-match s start-vec end-vec))))
;;; Convert a string into a regex pattern that matches that string exactly --
;;; in other words, quote the special chars with backslashes.
***************
*** 58,75 ****
(cons #\\ result)
result))))))
! (define-foreign %regexp-match/errno (reg_match (string regexp)
! (string s)
! (integer start)
! (vector-desc start-vec)
! (vector-desc end-vec))
! static-string ; Error string or #f if all is ok.
! bool) ; match?
!
! (define (%regexp-match regexp string start start-vec end-vec)
! (receive (err match?) (%regexp-match/errno regexp string start
! start-vec end-vec)
! (if err (error err %regexp-match regexp string start) match?)))
;;; I do this one in C, I'm not sure why:
--- 62,79 ----
(cons #\\ result)
result))))))
! ;;;(define-foreign %regexp-match/errno (reg_match (string regexp)
! ;;; (string s)
! ;;; (integer start)
! ;;; (vector-desc start-vec)
! ;;; (vector-desc end-vec))
! ;;; static-string ; Error string or #f if all is ok.
! ;;; bool) ; match?
!
! ;;;(define (%regexp-match regexp string start start-vec end-vec)
! ;;; (receive (err match?) (%regexp-match/errno regexp string start
! ;;; start-vec end-vec)
! ;;; (if err (error err %regexp-match regexp string start) match?)))
;;; I do this one in C, I'm not sure why:
***************
*** 79,81 ****
--- 83,166 ----
(filter_stringvec (string regexp) ((C "char const ** ~a") cvec))
static-string ; error message -- #f if no error.
integer) ; number of files that pass the filter.
+
+ ;;; precompiled regexps.
+
+ (define-foreign %regexp-compiled-length (reg_comp_len (string regexp))
+ static-string
+ integer)
+
+ (define-foreign %regexp-compile (reg_comp_comp (string regexp)
+ (string-desc re-buf))
+ static-string)
+
+ (define (%regexp-exec-1 r s start sv ev)
+ (receive (err match?) (%regexp-exec r s start sv ev)
+ (if err (error err s start)
+ match?)))
+
+ (define-foreign %regexp-exec (reg_exec (string-desc regexp)
+ (string s)
+ (integer start)
+ (vector-desc start-vec)
+ (vector-desc end-vec))
+ static-string
+ bool)
+
+
+ (define (compile-regexp e)
+ (receive (err len)
+ (%regexp-compiled-length e)
+ (if err (error err e)
+ (let ((buf (make-string len)))
+ (%regexp-compile e buf)
+ buf))))
+
+
+
+ (define-foreign %regexp-subst (reg_subst (string-desc regexp)
+ (string m)
+ (string s)
+ (integer start)
+ (vector-desc start-vec)
+ (vector-desc end-vec)
+ (string-desc outbuf))
+ static-string
+ integer)
+
+ (define-foreign %regexp-subst-len (reg_subst_len (string-desc regexp)
+ (string m)
+ (string s)
+ (integer start)
+ (vector-desc start-vec)
+ (vector-desc end-vec))
+ static-string
+ integer)
+
+
+ (define (regexp-subst re match replacement)
+ (let ((cr (iregexp:string re))
+ (matchstr (regexp-match:string match))
+ (startvec (regexp-match:start match))
+ (endvec (regexp-match:end match)))
+ (receive (err outlen)
+ (%regexp-subst-len cr
+ matchstr
+ replacement
+ 0
+ startvec
+ endvec)
+ (if err (error err matchstr replacement)
+ (let ((outbuf (make-string outlen)))
+ (receive (err outlen)
+ (%regexp-subst cr
+ matchstr
+ replacement
+ 0
+ startvec
+ endvec
+ outbuf)
+ (if err (error err matchstr replacement)
+ (substring outbuf 0 outlen))))))))
+
+
\ No newline at end of file
diff -rc scsh-0.4.2/scsh/re1.c scsh-0.4.2-regexp/scsh/re1.c
*** scsh-0.4.2/scsh/re1.c Fri Oct 27 04:58:58 1995
--- scsh-0.4.2-regexp/scsh/re1.c Sat Apr 6 21:01:15 1996
***************
*** 19,24 ****
--- 19,150 ----
/* Stash error msg in global. */
void regerror(char *msg) {regexp_error = msg;}
+ /*
+ ** Return NULL normally, error string on error.
+ ** Stash number of bytes needed for compiled regexp into `*len'
+ */
+
+ char *reg_comp_len(const char *re, int *len)
+ {
+ int l;
+
+ regexp_error = NULL;
+ *len = regcomp_len(re);
+ return regexp_error;
+ }
+
+ /*
+ ** Return NULL normally, error string on error.
+ ** Compile regexp into string described by `cr'.
+ */
+
+ char *reg_comp_comp(const char *re, scheme_value cr)
+ {
+ int len = STRING_LENGTH(cr);
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ regexp_error = NULL;
+ r = regcomp_comp(re, r, len);
+ return regexp_error;
+ }
+
+ /* Return NULL normally, error string on error.
+ ** Stash match info in start_vec and end_vec.
+ ** Returns boolean match/no-match in hit.
+ */
+
+ char *reg_exec(scheme_value cr, const char *string, int start,
+ scheme_value start_vec, scheme_value end_vec, int *hit)
+ {
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
+ return "Illegal start vector";
+ }
+
+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
+ return "Illegal end vector";
+ }
+
+ regexp_error = 0;
+ *hit = 0;
+
+ if( regexec(r, string+start) ) {
+ int i;
+ for(i=0; i<NSUBEXP; i++) {
+ const char *s = r->startp[i];
+ const char *e = r->endp[i];
+ VECTOR_REF(start_vec,i) = s?ENTER_FIXNUM(s - string):SCHFALSE;
+ VECTOR_REF(end_vec,i) = e?ENTER_FIXNUM(e - string):SCHFALSE;
+ r->startp[i] = NULL;
+ r->endp[i] = NULL;
+ }
+ *hit = 1;
+ }
+ return regexp_error;
+ }
+
+ char *reg_subst(scheme_value cr, const char *match,
+ const char *src, int start,
+ scheme_value start_vec, scheme_value end_vec,
+ scheme_value outbuf, int *len)
+ {
+ int i;
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
+ return "Illegal start vector";
+ }
+
+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
+ return "Illegal end vector";
+ }
+
+ for (i=0; i<NSUBEXP; i++)
+ {
+ scheme_value se = VECTOR_REF(start_vec, i);
+ scheme_value ee = VECTOR_REF(end_vec, i);
+ r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL;
+ r->endp[i] = FIXNUMP(ee)? (match + EXTRACT_FIXNUM(ee)):NULL;
+ }
+
+ regexp_error = NULL;
+ regnsub (r, src, &STRING_REF(outbuf, 0), STRING_LENGTH(outbuf));
+ *len = strlen(&STRING_REF(outbuf, 0));
+ return regexp_error;
+ }
+
+ char *reg_subst_len(scheme_value cr, const char *match,
+ const char *src, int start,
+ scheme_value start_vec, scheme_value end_vec,
+ int *len)
+ {
+ int i;
+ regexp *r = (regexp *)&STRING_REF(cr, 0);
+
+ if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
+ return "Illegal start vector";
+ }
+
+ if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
+ return "Illegal end vector";
+ }
+
+ for (i=0; i<NSUBEXP; i++)
+ {
+ scheme_value se = VECTOR_REF(start_vec, i);
+ scheme_value ee = VECTOR_REF(end_vec, i);
+ r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL;
+ r->endp[i] = FIXNUMP(ee)? (match + EXTRACT_FIXNUM(ee)):NULL;
+ }
+
+ regexp_error = NULL;
+ *len = regsublen (r, src);
+ return regexp_error;
+ }
+
+
+ #if 0
/* Return NULL normally, error string on error.
** Stash match info in start_vec and end_vec.
** Returns boolean match/no-match in hit.
***************
*** 56,61 ****
--- 182,188 ----
Free(prog);
return regexp_error;
}
+ #endif
char *filter_stringvec(const char *re, char const **stringvec, int *nummatch)
diff -rc scsh-0.4.2/scsh/re1.h scsh-0.4.2-regexp/scsh/re1.h
*** scsh-0.4.2/scsh/re1.h Sun Oct 22 08:34:34 1995
--- scsh-0.4.2-regexp/scsh/re1.h Sat Apr 6 17:54:09 1996
***************
*** 1,6 ****
--- 1,21 ----
+ #if 0
char *reg_match(const char *re, const char *string, int start,
scheme_value start_vec, scheme_value end_vec,
int *hit);
+ #endif
char *filter_stringvec(const char *re, char const **stringvec,
int *nummatch);
+
+ char *reg_comp_len(const char *re, int *len);
+ char *reg_comp_comp(const char *re, scheme_value cr);
+
+ char *reg_exec(scheme_value cr, const char *string, int start,
+ scheme_value start_vec, scheme_value end_vec, int *hit);
+
+ char *reg_subst(scheme_value cr, const char *match,
+ const char *src, int start,
+ scheme_value start_vec, scheme_value end_vec,
+ scheme_value outbuf, int *len);
+
+
Only in scsh-0.4.2-regexp/scsh: re2.scm
diff -rc scsh-0.4.2/scsh/scsh-interfaces.scm scsh-0.4.2-regexp/scsh/scsh-interfaces.scm
*** scsh-0.4.2/scsh/scsh-interfaces.scm Tue Oct 31 19:19:30 1995
--- scsh-0.4.2-regexp/scsh/scsh-interfaces.scm Sat Apr 6 18:48:12 1996
***************
*** 413,418 ****
--- 413,419 ----
make-regexp
regexp?
regexp-exec
+ regexp-subst
regexp-quote))
Changes to Henry Spencer's regexp library:
*** Makefile 1996/04/06 19:24:49 1.1
--- Makefile 1996/04/06 20:46:26
***************
*** 5,11 ****
# Things you might want to put in TEST:
# -DDEBUG debugging hooks
# -I. regexp.h from current directory, not /usr/include
! TEST=-I.
# Things you might want to put in PROF:
# -pg profiler
--- 5,11 ----
# Things you might want to put in TEST:
# -DDEBUG debugging hooks
# -I. regexp.h from current directory, not /usr/include
! TEST=-I. -DDEBUG
# Things you might want to put in PROF:
# -pg profiler
*** regexp.c 1996/04/06 19:24:49 1.1
--- regexp.c 1996/04/06 22:34:55
***************
*** 105,110 ****
--- 105,111 ----
* Utility definitions.
*/
#define FAIL(m) { regerror(m); return(NULL); }
+ #define FAILN(m) { regerror(m); return(-1); }
#define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define META "^$.[()|?+*\\"
***************
*** 162,173 ****
const char *exp;
{
register regexp *r;
! register char *scan;
int flags;
struct comp co;
if (exp == NULL)
! FAIL("NULL argument to regcomp");
/* First pass: determine size, legality. */
co.regparse = (char *)exp;
--- 163,193 ----
const char *exp;
{
register regexp *r;
! size_t len;
!
! len = regcomp_len(exp);
! if (len <= 0)
! return NULL;
!
! /* Allocate space. */
! r = (regexp *)malloc(len);
!
! if (r == NULL)
! FAIL("out of space");
! return regcomp_comp(exp, r, len);
! }
!
!
! size_t
! regcomp_len(exp)
! const char *exp;
! {
int flags;
+ register regexp *r;
struct comp co;
if (exp == NULL)
! FAILN("NULL argument to regcomp");
/* First pass: determine size, legality. */
co.regparse = (char *)exp;
***************
*** 178,198 ****
co.regcode = co.regdummy;
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
! return(NULL);
/* Small enough for pointer-storage convention? */
if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
! FAIL("regexp too big");
! /* Allocate space. */
! r = (regexp *)malloc(sizeof(regexp) + (size_t)co.regsize);
! if (r == NULL)
! FAIL("out of space");
/* Second pass: emit code. */
co.regparse = (char *)exp;
co.regnpar = 1;
co.regcode = r->program;
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
return(NULL);
--- 198,228 ----
co.regcode = co.regdummy;
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
! return -1;
/* Small enough for pointer-storage convention? */
if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */
! FAILN("regexp too big");
! return (sizeof(regexp) + (size_t)co.regsize);
! }
!
!
! regexp *
! regcomp_comp(exp, r, len)
! const char *exp;
! register regexp *r;
! size_t len;
! {
! register char *scan;
! int flags;
! struct comp co;
/* Second pass: emit code. */
co.regparse = (char *)exp;
co.regnpar = 1;
co.regcode = r->program;
+ co.regsize = len - sizeof(regexp);
regc(&co, MAGIC);
if (reg(&co, 0, &flags) == NULL)
return(NULL);
***************
*** 200,206 ****
/* Dig out information for optimizations. */
r->regstart = '\0'; /* Worst-case defaults. */
r->reganch = 0;
! r->regmust = NULL;
r->regmlen = 0;
scan = r->program+1; /* First BRANCH. */
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
--- 230,236 ----
/* Dig out information for optimizations. */
r->regstart = '\0'; /* Worst-case defaults. */
r->reganch = 0;
! r->regmust = 0;
r->regmlen = 0;
scan = r->program+1; /* First BRANCH. */
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
***************
*** 229,235 ****
longest = OPERAND(scan);
len = strlen(OPERAND(scan));
}
! r->regmust = longest;
r->regmlen = (int)len;
}
}
--- 259,265 ----
longest = OPERAND(scan);
len = strlen(OPERAND(scan));
}
! r->regmust = longest - r->program;
r->regmlen = (int)len;
}
}
***************
*** 648,655 ****
struct exec {
char *reginput; /* String-input pointer. */
char *regbol; /* Beginning of input, for ^ check. */
! char **regstartp; /* Pointer to startp array. */
! char **regendp; /* Ditto for endp. */
};
/*
--- 678,685 ----
struct exec {
char *reginput; /* String-input pointer. */
char *regbol; /* Beginning of input, for ^ check. */
! const char **regstartp; /* Pointer to startp array. */
! const char **regendp; /* Ditto for endp. */
};
/*
***************
*** 690,696 ****
}
/* If there is a "must appear" string, look for it. */
! if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL)
return(0);
/* Mark beginning of line for ^ . */
--- 720,727 ----
}
/* If there is a "must appear" string, look for it. */
! if ((prog->regmlen > 0) &&
! strstr(string, &prog->program[prog->regmust]) == NULL)
return(0);
/* Mark beginning of line for ^ . */
***************
*** 729,736 ****
char *string;
{
register int i;
! register char **stp;
! register char **enp;
ep->reginput = string;
--- 760,767 ----
char *string;
{
register int i;
! register const char **stp;
! register const char **enp;
ep->reginput = string;
***************
*** 1004,1011 ****
printf("start `%c' ", r->regstart);
if (r->reganch)
printf("anchored ");
! if (r->regmust != NULL)
! printf("must have \"%s\"", r->regmust);
printf("\n");
}
--- 1035,1042 ----
printf("start `%c' ", r->regstart);
if (r->reganch)
printf("anchored ");
! if (r->regmlen > 0)
! printf("must have \"%s\"", &r->program[r->regmust]);
printf("\n");
}
*** regexp.h 1996/04/06 19:24:49 1.1
--- regexp.h 1996/04/07 01:52:19
***************
*** 6,16 ****
*/
#define NSUBEXP 10
typedef struct regexp {
! char *startp[NSUBEXP];
! char *endp[NSUBEXP];
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
! char *regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
--- 6,16 ----
*/
#define NSUBEXP 10
typedef struct regexp {
! const char *startp[NSUBEXP];
! const char *endp[NSUBEXP];
char regstart; /* Internal use only. */
char reganch; /* Internal use only. */
! int regmust; /* Internal use only. */
int regmlen; /* Internal use only. */
char program[1]; /* Unwarranted chumminess with compiler. */
} regexp;
***************
*** 18,21 ****
--- 18,27 ----
extern regexp *regcomp(const char *re);
extern int regexec(regexp *rp, const char *s);
extern void regsub(const regexp *rp, const char *src, char *dst);
+ extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len);
+ extern size_t regsublen(const regexp *rp, const char *src);
+
extern void regerror(char *message);
+ extern size_t regcomp_len(const char *exp);
+ extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len);
+
*** regsub.c 1996/04/06 19:24:49 1.1
--- regsub.c 1996/04/07 02:10:29
***************
*** 11,25 ****
/*
- regsub - perform substitutions after a regexp match
*/
void
! regsub(rp, source, dest)
const regexp *rp;
const char *source;
char *dest;
{
register regexp * const prog = (regexp *)rp;
! register char *src = (char *)source;
register char *dst = dest;
register char c;
register int no;
register size_t len;
--- 11,42 ----
/*
- regsub - perform substitutions after a regexp match
*/
+
+ void regsub(rp, source, dest)
+ const regexp *rp;
+ const char *source;
+ char *dest;
+ {
+ regnsub(rp, source, dest, BUFSIZ);
+ }
+
+
+
+ /*
+ - regnsub - perform bounds-checked substitutions after a regexp match
+ */
void
! regnsub(rp, source, dest, destlen)
const regexp *rp;
const char *source;
char *dest;
+ size_t destlen;
{
register regexp * const prog = (regexp *)rp;
! register const char *src = (char *)source;
register char *dst = dest;
+ char *dstend = dest + destlen;
+ char *odst;
register char c;
register int no;
register size_t len;
***************
*** 45,55 ****
if (c == '\\' && (*src == '\\' || *src == '&'))
c = *src++;
*dst++ = c;
} else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
! prog->endp[no] > prog->startp[no]) {
len = prog->endp[no] - prog->startp[no];
! (void) strncpy(dst, prog->startp[no], len);
dst += len;
if (*(dst-1) == '\0') { /* strncpy hit NUL. */
regerror("damaged match string");
return;
--- 62,83 ----
if (c == '\\' && (*src == '\\' || *src == '&'))
c = *src++;
*dst++ = c;
+ if (dst >= dstend)
+ {
+ regerror("output buffer too small");
+ return;
+ }
} else if (prog->startp[no] != NULL && prog->endp[no] != NULL &&
! prog->endp[no] > prog->startp[no]) {
len = prog->endp[no] - prog->startp[no];
! odst = dst;
dst += len;
+ if (dst >= dstend)
+ {
+ regerror("output buffer too small");
+ return;
+ }
+ (void) strncpy(odst, prog->startp[no], len);
if (*(dst-1) == '\0') { /* strncpy hit NUL. */
regerror("damaged match string");
return;
***************
*** 58,60 ****
--- 86,131 ----
}
*dst++ = '\0';
}
+
+ size_t regsublen(rp, source)
+ const regexp *rp;
+ const char *source;
+ {
+ register regexp * const prog = (regexp *)rp;
+ register char *src = (char *)source;
+ register char c;
+ register int no;
+ register int len = 0;
+
+ if (prog == NULL || source == NULL) {
+ regerror("NULL parameter to regsublen");
+ return -1;
+ }
+
+ if ((unsigned char)*(prog->program) != MAGIC) {
+ regerror("damaged regexp");
+ return -1;
+ }
+ while ((c = *src++) != '\0') {
+ if (c == '&')
+ no = 0;
+ else if (c == '\\' && isdigit(*src))
+ no = *src++ - '0';
+ else
+ no = -1;
+ if (no < 0) { /* Ordinary character. */
+ if (c == '\\' && (*src == '\\' || *src == '&'))
+ src++;
+ len++;
+ } else {
+ const char *s = prog->startp[no];
+ const char *e = prog->endp[no];
+ if ((s != NULL) && (e != NULL) && (e > s)) {
+ len += e-s;
+ }
+ }
+ }
+ return len+1;
+ }
+
+
Original regexp code from Henry Spencer:
[unpacked & deleted -Olin]