Date: Mon, 1 Jul 1996 23:22:47 GMT From: Bill Sommerfeld To: shivers@lcs.mit.edu, bdc@ai.mit.edu Subject: scsh patch for precompiled regexps.. I meant to send this out months ago but I was just too hosed with work. Here's what I have right now: There are three pieces here: diffs to the "core" scsh diffs to Henry Spencer's latest regexp library a copy of Henry Spencer's latest regexp library.. It appears to work (it passes the same regression tests as the C library..). Let me know if I didn't include something needed for this to work.. - Bill diff -rc scsh-0.4.2/scsh/re.scm scsh-0.4.2-regexp/scsh/re.scm *** scsh-0.4.2/scsh/re.scm Fri Oct 27 04:58:56 1995 --- scsh-0.4.2-regexp/scsh/re.scm Sat Apr 6 21:07:41 1996 *************** *** 34,49 **** ;;; Bogus stub definitions for low-level match routines: ! (define regexp? string?) ! (define (make-regexp str) str) ! (define (regexp-exec regexp str . maybe-start) (let ((start (optional-arg maybe-start 0)) (start-vec (make-vector 10)) (end-vec (make-vector 10))) ! (and (%regexp-match regexp str start start-vec end-vec) ! (make-regexp-match str start-vec end-vec)))) ! ;;; Convert a string into a regex pattern that matches that string exactly -- ;;; in other words, quote the special chars with backslashes. --- 34,53 ---- ;;; Bogus stub definitions for low-level match routines: ! (define-record iregexp ! string) ! (define regexp? iregexp?) ! ! (define (make-regexp str) ! (make-iregexp (compile-regexp str))) ! ! (define (regexp-exec r s . maybe-start) (let ((start (optional-arg maybe-start 0)) (start-vec (make-vector 10)) (end-vec (make-vector 10))) ! (and (%regexp-exec-1 (iregexp:string r) s start start-vec end-vec) ! (make-regexp-match s start-vec end-vec)))) ;;; Convert a string into a regex pattern that matches that string exactly -- ;;; in other words, quote the special chars with backslashes. *************** *** 58,75 **** (cons #\\ result) result)))))) ! (define-foreign %regexp-match/errno (reg_match (string regexp) ! 
(string s) ! (integer start) ! (vector-desc start-vec) ! (vector-desc end-vec)) ! static-string ; Error string or #f if all is ok. ! bool) ; match? ! ! (define (%regexp-match regexp string start start-vec end-vec) ! (receive (err match?) (%regexp-match/errno regexp string start ! start-vec end-vec) ! (if err (error err %regexp-match regexp string start) match?))) ;;; I do this one in C, I'm not sure why: --- 62,79 ---- (cons #\\ result) result)))))) ! ;;;(define-foreign %regexp-match/errno (reg_match (string regexp) ! ;;; (string s) ! ;;; (integer start) ! ;;; (vector-desc start-vec) ! ;;; (vector-desc end-vec)) ! ;;; static-string ; Error string or #f if all is ok. ! ;;; bool) ; match? ! ! ;;;(define (%regexp-match regexp string start start-vec end-vec) ! ;;; (receive (err match?) (%regexp-match/errno regexp string start ! ;;; start-vec end-vec) ! ;;; (if err (error err %regexp-match regexp string start) match?))) ;;; I do this one in C, I'm not sure why: *************** *** 79,81 **** --- 83,166 ---- (filter_stringvec (string regexp) ((C "char const ** ~a") cvec)) static-string ; error message -- #f if no error. integer) ; number of files that pass the filter. + + ;;; precompiled regexps. + + (define-foreign %regexp-compiled-length (reg_comp_len (string regexp)) + static-string + integer) + + (define-foreign %regexp-compile (reg_comp_comp (string regexp) + (string-desc re-buf)) + static-string) + + (define (%regexp-exec-1 r s start sv ev) + (receive (err match?) 
(%regexp-exec r s start sv ev) + (if err (error err s start) + match?))) + + (define-foreign %regexp-exec (reg_exec (string-desc regexp) + (string s) + (integer start) + (vector-desc start-vec) + (vector-desc end-vec)) + static-string + bool) + + + (define (compile-regexp e) + (receive (err len) + (%regexp-compiled-length e) + (if err (error err e) + (let ((buf (make-string len))) + (%regexp-compile e buf) + buf)))) + + + + (define-foreign %regexp-subst (reg_subst (string-desc regexp) + (string m) + (string s) + (integer start) + (vector-desc start-vec) + (vector-desc end-vec) + (string-desc outbuf)) + static-string + integer) + + (define-foreign %regexp-subst-len (reg_subst_len (string-desc regexp) + (string m) + (string s) + (integer start) + (vector-desc start-vec) + (vector-desc end-vec)) + static-string + integer) + + + (define (regexp-subst re match replacement) + (let ((cr (iregexp:string re)) + (matchstr (regexp-match:string match)) + (startvec (regexp-match:start match)) + (endvec (regexp-match:end match))) + (receive (err outlen) + (%regexp-subst-len cr + matchstr + replacement + 0 + startvec + endvec) + (if err (error err matchstr replacement) + (let ((outbuf (make-string outlen))) + (receive (err outlen) + (%regexp-subst cr + matchstr + replacement + 0 + startvec + endvec + outbuf) + (if err (error err matchstr replacement) + (substring outbuf 0 outlen)))))))) + + \ No newline at end of file diff -rc scsh-0.4.2/scsh/re1.c scsh-0.4.2-regexp/scsh/re1.c *** scsh-0.4.2/scsh/re1.c Fri Oct 27 04:58:58 1995 --- scsh-0.4.2-regexp/scsh/re1.c Sat Apr 6 21:01:15 1996 *************** *** 19,24 **** --- 19,150 ---- /* Stash error msg in global. */ void regerror(char *msg) {regexp_error = msg;} + /* + ** Return NULL normally, error string on error. 
+ ** Stash number of bytes needed for compiled regexp into `*len' + */ + + char *reg_comp_len(const char *re, int *len) + { + int l; + + regexp_error = NULL; + *len = regcomp_len(re); + return regexp_error; + } + + /* + ** Return NULL normally, error string on error. + ** Compile regexp into string described by `cr'. + */ + + char *reg_comp_comp(const char *re, scheme_value cr) + { + int len = STRING_LENGTH(cr); + regexp *r = (regexp *)&STRING_REF(cr, 0); + + regexp_error = NULL; + r = regcomp_comp(re, r, len); + return regexp_error; + } + + /* Return NULL normally, error string on error. + ** Stash match info in start_vec and end_vec. + ** Returns boolean match/no-match in hit. + */ + + char *reg_exec(scheme_value cr, const char *string, int start, + scheme_value start_vec, scheme_value end_vec, int *hit) + { + regexp *r = (regexp *)&STRING_REF(cr, 0); + + if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { + return "Illegal start vector"; + } + + if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { + return "Illegal end vector"; + } + + regexp_error = 0; + *hit = 0; + + if( regexec(r, string+start) ) { + int i; + for(i=0; i<NSUBEXP; i++) { + const char *s = r->startp[i]; + const char *e = r->endp[i]; + VECTOR_REF(start_vec,i) = s?ENTER_FIXNUM(s - string):SCHFALSE; + VECTOR_REF(end_vec,i) = e?ENTER_FIXNUM(e - string):SCHFALSE; + r->startp[i] = NULL; + r->endp[i] = NULL; + } + *hit = 1; + } + return regexp_error; + } + + char *reg_subst(scheme_value cr, const char *match, + const char *src, int start, + scheme_value start_vec, scheme_value end_vec, + scheme_value outbuf, int *len) + { + int i; + regexp *r = (regexp *)&STRING_REF(cr, 0); + + if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { + return "Illegal start vector"; + } + + if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { + return "Illegal end vector"; + } + + for (i=0; i<NSUBEXP; i++) { + scheme_value se = VECTOR_REF(start_vec, i); + scheme_value ee = VECTOR_REF(end_vec, i); + r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL; + r->endp[i] = FIXNUMP(ee)? 
(match + EXTRACT_FIXNUM(ee)):NULL; + } + + regexp_error = NULL; + regnsub (r, src, &STRING_REF(outbuf, 0), STRING_LENGTH(outbuf)); + *len = strlen(&STRING_REF(outbuf, 0)); + return regexp_error; + } + + char *reg_subst_len(scheme_value cr, const char *match, + const char *src, int start, + scheme_value start_vec, scheme_value end_vec, + int *len) + { + int i; + regexp *r = (regexp *)&STRING_REF(cr, 0); + + if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { + return "Illegal start vector"; + } + + if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { + return "Illegal end vector"; + } + + for (i=0; i<NSUBEXP; i++) { + scheme_value se = VECTOR_REF(start_vec, i); + scheme_value ee = VECTOR_REF(end_vec, i); + r->startp[i] = FIXNUMP(se)?(match + EXTRACT_FIXNUM(se)):NULL; + r->endp[i] = FIXNUMP(ee)? (match + EXTRACT_FIXNUM(ee)):NULL; + } + + regexp_error = NULL; + *len = regsublen (r, src); + return regexp_error; + } + + + #if 0 /* Return NULL normally, error string on error. ** Stash match info in start_vec and end_vec. ** Returns boolean match/no-match in hit. *************** *** 56,61 **** --- 182,188 ---- Free(prog); return regexp_error; } + #endif char *filter_stringvec(const char *re, char const **stringvec, int *nummatch) diff -rc scsh-0.4.2/scsh/re1.h scsh-0.4.2-regexp/scsh/re1.h *** scsh-0.4.2/scsh/re1.h Sun Oct 22 08:34:34 1995 --- scsh-0.4.2-regexp/scsh/re1.h Sat Apr 6 17:54:09 1996 *************** *** 1,6 **** --- 1,21 ---- + #if 0 char *reg_match(const char *re, const char *string, int start, scheme_value start_vec, scheme_value end_vec, int *hit); + #endif char *filter_stringvec(const char *re, char const **stringvec, int *nummatch); + + char *reg_comp_len(const char *re, int *len); + char *reg_comp_comp(const char *re, scheme_value cr); + + char *reg_exec(scheme_value cr, const char *string, int start, + scheme_value start_vec, scheme_value end_vec, int *hit); + + char *reg_subst(scheme_value cr, const char *match, + const char *src, int start, + scheme_value start_vec, scheme_value end_vec, + scheme_value outbuf, int *len); + + Only in scsh-0.4.2-regexp/scsh: re2.scm diff -rc 
scsh-0.4.2/scsh/scsh-interfaces.scm scsh-0.4.2-regexp/scsh/scsh-interfaces.scm *** scsh-0.4.2/scsh/scsh-interfaces.scm Tue Oct 31 19:19:30 1995 --- scsh-0.4.2-regexp/scsh/scsh-interfaces.scm Sat Apr 6 18:48:12 1996 *************** *** 413,418 **** --- 413,419 ---- make-regexp regexp? regexp-exec + regexp-subst regexp-quote)) regexp library changes: *** Makefile 1996/04/06 19:24:49 1.1 --- Makefile 1996/04/06 20:46:26 *************** *** 5,11 **** # Things you might want to put in TEST: # -DDEBUG debugging hooks # -I. regexp.h from current directory, not /usr/include ! TEST=-I. # Things you might want to put in PROF: # -pg profiler --- 5,11 ---- # Things you might want to put in TEST: # -DDEBUG debugging hooks # -I. regexp.h from current directory, not /usr/include ! TEST=-I. -DDEBUG # Things you might want to put in PROF: # -pg profiler *** regexp.c 1996/04/06 19:24:49 1.1 --- regexp.c 1996/04/06 22:34:55 *************** *** 105,110 **** --- 105,111 ---- * Utility definitions. */ #define FAIL(m) { regerror(m); return(NULL); } + #define FAILN(m) { regerror(m); return(-1); } #define ISREPN(c) ((c) == '*' || (c) == '+' || (c) == '?') #define META "^$.[()|?+*\\" *************** *** 162,173 **** const char *exp; { register regexp *r; ! register char *scan; int flags; struct comp co; if (exp == NULL) ! FAIL("NULL argument to regcomp"); /* First pass: determine size, legality. */ co.regparse = (char *)exp; --- 163,193 ---- const char *exp; { register regexp *r; ! size_t len; ! ! len = regcomp_len(exp); ! if (len <= 0) ! return NULL; ! ! /* Allocate space. */ ! r = (regexp *)malloc(len); ! ! if (r == NULL) ! FAIL("out of space"); ! return regcomp_comp(exp, r, len); ! } ! ! ! size_t ! regcomp_len(exp) ! const char *exp; ! { int flags; + register regexp *r; struct comp co; if (exp == NULL) ! FAILN("NULL argument to regcomp"); /* First pass: determine size, legality. 
*/ co.regparse = (char *)exp; *************** *** 178,198 **** co.regcode = co.regdummy; regc(&co, MAGIC); if (reg(&co, 0, &flags) == NULL) ! return(NULL); /* Small enough for pointer-storage convention? */ if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */ ! FAIL("regexp too big"); ! /* Allocate space. */ ! r = (regexp *)malloc(sizeof(regexp) + (size_t)co.regsize); ! if (r == NULL) ! FAIL("out of space"); /* Second pass: emit code. */ co.regparse = (char *)exp; co.regnpar = 1; co.regcode = r->program; regc(&co, MAGIC); if (reg(&co, 0, &flags) == NULL) return(NULL); --- 198,228 ---- co.regcode = co.regdummy; regc(&co, MAGIC); if (reg(&co, 0, &flags) == NULL) ! return -1; /* Small enough for pointer-storage convention? */ if (co.regsize >= 0x7fffL) /* Probably could be 0xffffL. */ ! FAILN("regexp too big"); ! return (sizeof(regexp) + (size_t)co.regsize); ! } ! ! ! regexp * ! regcomp_comp(exp, r, len) ! const char *exp; ! register regexp *r; ! size_t len; ! { ! register char *scan; ! int flags; ! struct comp co; /* Second pass: emit code. */ co.regparse = (char *)exp; co.regnpar = 1; co.regcode = r->program; + co.regsize = len - sizeof(regexp); regc(&co, MAGIC); if (reg(&co, 0, &flags) == NULL) return(NULL); *************** *** 200,206 **** /* Dig out information for optimizations. */ r->regstart = '\0'; /* Worst-case defaults. */ r->reganch = 0; ! r->regmust = NULL; r->regmlen = 0; scan = r->program+1; /* First BRANCH. */ if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ --- 230,236 ---- /* Dig out information for optimizations. */ r->regstart = '\0'; /* Worst-case defaults. */ r->reganch = 0; ! r->regmust = 0; r->regmlen = 0; scan = r->program+1; /* First BRANCH. */ if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ *************** *** 229,235 **** longest = OPERAND(scan); len = strlen(OPERAND(scan)); } ! r->regmust = longest; r->regmlen = (int)len; } } --- 259,265 ---- longest = OPERAND(scan); len = strlen(OPERAND(scan)); } ! 
r->regmust = longest - r->program; r->regmlen = (int)len; } } *************** *** 648,655 **** struct exec { char *reginput; /* String-input pointer. */ char *regbol; /* Beginning of input, for ^ check. */ ! char **regstartp; /* Pointer to startp array. */ ! char **regendp; /* Ditto for endp. */ }; /* --- 678,685 ---- struct exec { char *reginput; /* String-input pointer. */ char *regbol; /* Beginning of input, for ^ check. */ ! const char **regstartp; /* Pointer to startp array. */ ! const char **regendp; /* Ditto for endp. */ }; /* *************** *** 690,696 **** } /* If there is a "must appear" string, look for it. */ ! if (prog->regmust != NULL && strstr(string, prog->regmust) == NULL) return(0); /* Mark beginning of line for ^ . */ --- 720,727 ---- } /* If there is a "must appear" string, look for it. */ ! if ((prog->regmlen > 0) && ! strstr(string, &prog->program[prog->regmust]) == NULL) return(0); /* Mark beginning of line for ^ . */ *************** *** 729,736 **** char *string; { register int i; ! register char **stp; ! register char **enp; ep->reginput = string; --- 760,767 ---- char *string; { register int i; ! register const char **stp; ! register const char **enp; ep->reginput = string; *************** *** 1004,1011 **** printf("start `%c' ", r->regstart); if (r->reganch) printf("anchored "); ! if (r->regmust != NULL) ! printf("must have \"%s\"", r->regmust); printf("\n"); } --- 1035,1042 ---- printf("start `%c' ", r->regstart); if (r->reganch) printf("anchored "); ! if (r->regmlen > 0) ! printf("must have \"%s\"", &r->program[r->regmust]); printf("\n"); } *** regexp.h 1996/04/06 19:24:49 1.1 --- regexp.h 1996/04/07 01:52:19 *************** *** 6,16 **** */ #define NSUBEXP 10 typedef struct regexp { ! char *startp[NSUBEXP]; ! char *endp[NSUBEXP]; char regstart; /* Internal use only. */ char reganch; /* Internal use only. */ ! char *regmust; /* Internal use only. */ int regmlen; /* Internal use only. 
*/ char program[1]; /* Unwarranted chumminess with compiler. */ } regexp; --- 6,16 ---- */ #define NSUBEXP 10 typedef struct regexp { ! const char *startp[NSUBEXP]; ! const char *endp[NSUBEXP]; char regstart; /* Internal use only. */ char reganch; /* Internal use only. */ ! int regmust; /* Internal use only. */ int regmlen; /* Internal use only. */ char program[1]; /* Unwarranted chumminess with compiler. */ } regexp; *************** *** 18,21 **** --- 18,27 ---- extern regexp *regcomp(const char *re); extern int regexec(regexp *rp, const char *s); extern void regsub(const regexp *rp, const char *src, char *dst); + extern void regnsub(const regexp *rp, const char *src, char *dst, size_t len); + extern size_t regsublen(const regexp *rp, const char *src); + extern void regerror(char *message); + extern size_t regcomp_len(const char *exp); + extern regexp *regcomp_comp(const char *exp, struct regexp *r, size_t len); + *** regsub.c 1996/04/06 19:24:49 1.1 --- regsub.c 1996/04/07 02:10:29 *************** *** 11,25 **** /* - regsub - perform substitutions after a regexp match */ void ! regsub(rp, source, dest) const regexp *rp; const char *source; char *dest; { register regexp * const prog = (regexp *)rp; ! register char *src = (char *)source; register char *dst = dest; register char c; register int no; register size_t len; --- 11,42 ---- /* - regsub - perform substitutions after a regexp match */ + + void regsub(rp, source, dest) + const regexp *rp; + const char *source; + char *dest; + { + regnsub(rp, source, dest, BUFSIZ); + } + + + + /* + - regnsub - perform bounds-checked substitutions after a regexp match + */ void ! regnsub(rp, source, dest, destlen) const regexp *rp; const char *source; char *dest; + size_t destlen; { register regexp * const prog = (regexp *)rp; ! 
register const char *src = (char *)source; register char *dst = dest; + char *dstend = dest + destlen; + char *odst; register char c; register int no; register size_t len; *************** *** 45,55 **** if (c == '\\' && (*src == '\\' || *src == '&')) c = *src++; *dst++ = c; } else if (prog->startp[no] != NULL && prog->endp[no] != NULL && ! prog->endp[no] > prog->startp[no]) { len = prog->endp[no] - prog->startp[no]; ! (void) strncpy(dst, prog->startp[no], len); dst += len; if (*(dst-1) == '\0') { /* strncpy hit NUL. */ regerror("damaged match string"); return; --- 62,83 ---- if (c == '\\' && (*src == '\\' || *src == '&')) c = *src++; *dst++ = c; + if (dst >= dstend) + { + regerror("output buffer too small"); + return; + } } else if (prog->startp[no] != NULL && prog->endp[no] != NULL && ! prog->endp[no] > prog->startp[no]) { len = prog->endp[no] - prog->startp[no]; ! odst = dst; dst += len; + if (dst >= dstend) + { + regerror("output buffer too small"); + return; + } + (void) strncpy(odst, prog->startp[no], len); if (*(dst-1) == '\0') { /* strncpy hit NUL. */ regerror("damaged match string"); return; *************** *** 58,60 **** --- 86,131 ---- } *dst++ = '\0'; } + + size_t regsublen(rp, source) + const regexp *rp; + const char *source; + { + register regexp * const prog = (regexp *)rp; + register char *src = (char *)source; + register char c; + register int no; + register int len = 0; + + if (prog == NULL || source == NULL) { + regerror("NULL parameter to regsublen"); + return -1; + } + + if ((unsigned char)*(prog->program) != MAGIC) { + regerror("damaged regexp"); + return -1; + } + while ((c = *src++) != '\0') { + if (c == '&') + no = 0; + else if (c == '\\' && isdigit(*src)) + no = *src++ - '0'; + else + no = -1; + if (no < 0) { /* Ordinary character. 
*/ + if (c == '\\' && (*src == '\\' || *src == '&')) + src++; + len++; + } else { + const char *s = prog->startp[no]; + const char *e = prog->endp[no]; + if ((s != NULL) && (e != NULL) && (e > s)) { + len += e-s; + } + } + } + return len+1; + } + + Original regexp code from henry: [unpacked & deleted -Olin]