diff --git a/scsh/re.scm b/scsh/re.scm index 0def23b..ba202f1 100644 --- a/scsh/re.scm +++ b/scsh/re.scm @@ -7,46 +7,161 @@ "" "" ) -(define-record regexp-match - string - start ; 10 elt vec - end) ; 10 elt vec +;;; Match data for regexp matches. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; Need to do error case for these three procs. +(define-record regexp-match + string ; The string against which we matched. + start ; 10 elt vec + end) ; 10 elt vec (define (match:start match . maybe-index) - (vector-ref (regexp-match:start match) - (:optional maybe-index 0))) + (let ((i (:optional maybe-index 0))) + (or (vector-ref (regexp-match:start match) i) + (error match:start "No sub-match found." match i)))) (define (match:end match . maybe-index) - (vector-ref (regexp-match:end match) - (:optional maybe-index 0))) + (let ((i (:optional maybe-index 0))) + (or (vector-ref (regexp-match:end match) i) + (error match:start "No sub-match found." match i)))) (define (match:substring match . maybe-index) - (let ((i (:optional maybe-index 0))) - (substring (regexp-match:string match) - (match:start match i) - (match:end match i)))) - -(define (string-match pattern string . maybe-start) - (apply regexp-exec (make-regexp pattern) string maybe-start)) + (let* ((i (:optional maybe-index 0)) + (start (vector-ref (regexp-match:start match) i))) + (if start + (substring (regexp-match:string match) + start + (vector-ref (regexp-match:end match) i)) + (error match:substring "No sub-match found." match i)))) -;;; Bogus stub definitions for low-level match routines: +;;; Compiling regexps +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(define regexp? string?) -(define (make-regexp str) str) +(define-record %regexp + string ; The string form of the regexp. + bytes ; The compiled representation, stuffed into a Scheme string. + ((disclose self) (list "Regexp" (%regexp:string self)))) + +(define regexp? %regexp?) + + +(define (make-regexp pattern) + (receive (err len) (%regexp-compiled-length pattern) + (if err (error err make-regexp pattern) + (let ((buf (make-string len))) + (%regexp-compile pattern buf) + (make-%regexp pattern buf))))) + +(define-foreign %regexp-compiled-length (re_byte_len (string pattern)) + static-string ; Error msg or #f + integer) ; number of bytes needed to compile REGEXP. + +(define-foreign %regexp-compile (re_compile (string pattern) + (string-desc bytes)) + static-string) ; Error msg or #f + + +;;; Executing compiled regexps +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (regexp-exec regexp str . maybe-start) (let ((start (:optional maybe-start 0)) (start-vec (make-vector 10)) (end-vec (make-vector 10))) - (and (%regexp-match regexp str start start-vec end-vec) - (make-regexp-match str start-vec end-vec)))) + (receive (err match?) + (%regexp-exec (%regexp:bytes regexp) str start start-vec end-vec) + (if err (error err regexp-exec regexp str start) + (and match? + (make-regexp-match str start-vec end-vec)))))) + +(define-foreign %regexp-exec (re_exec (string-desc compiled-regexp) + (string s) + (integer start) + (vector-desc start-vec) + (vector-desc end-vec)) + static-string ; Error msg or #f + bool) ; Matched? + + +;;; Compile&match regexps in one go +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; I could do this with the separate compile and execute procedures, +;;; but I go straight to C just for fun. + +(define (string-match pattern string . maybe-start) + (let ((start (:optional maybe-start 0)) + (start-vec (make-vector 10)) + (end-vec (make-vector 10))) + (receive (err match?) (%string-match pattern string start + start-vec end-vec) + (if err (error err string-match pattern string start) + (and match? (make-regexp-match string start-vec end-vec)))))) + +(define-foreign %string-match (re_match (string pattern) + (string s) + (integer start) + (vector-desc start-vec) + (vector-desc end-vec)) + static-string ; Error string or #f if all is ok. + bool) ; match? + + + +;;; Substitutions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define-foreign %regexp-subst (re_subst (string-desc compiled-regexp) + (string match) + (string str) + (integer start) + (vector-desc start-vec) + (vector-desc end-vec) + (string-desc outbuf)) + static-string ; Error msg or #f + integer) + +(define-foreign %regexp-subst-len (re_subst_len (string-desc compiled-regexp) + (string match) + (string str) + (integer start) + (vector-desc start-vec) + (vector-desc end-vec)) + static-string ; Error msg or #f + integer) + +;;; What does this do? + +(define (regexp-subst re match replacement) + (let ((cr (%regexp:bytes re)) + (str (regexp-match:string match)) + (start-vec (regexp-match:start match)) + (end-vec (regexp-match:end match))) + (receive (err out-len) (%regexp-subst-len cr str replacement 0 + start-vec end-vec) + (if err (error err regexp-subst str replacement) ; More data here + (let ((out-buf (make-string out-len))) + (receive (err out-len) (%regexp-subst cr str replacement 0 + start-vec end-vec out-buf) + (if err (error err regexp-subst str replacement) + (substring out-buf 0 out-len)))))))) + +;;; Miscellaneous +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; I do this one in C, I'm not sure why: +;;; It is used by MATCH-FILES. + +(define-foreign %filter-C-strings! + (filter_stringvec (string pattern) ((C "char const ** ~a") cvec)) + static-string ; error message -- #f if no error. + integer) ; number of files that pass the filter. ;;; Convert a string into a regex pattern that matches that string exactly -- ;;; in other words, quote the special chars with backslashes. + (define (regexp-quote string) (let lp ((i (- (string-length string) 1)) (result '())) @@ -57,25 +172,3 @@ (if (memv c '(#\[ #\] #\. #\* #\? #\( #\) #\| #\\ #\$ #\^ #\+)) (cons #\\ result) result)))))) - -(define-foreign %regexp-match/errno (reg_match (string regexp) - (string s) - (integer start) - (vector-desc start-vec) - (vector-desc end-vec)) - static-string ; Error string or #f if all is ok. - bool) ; match? - -(define (%regexp-match regexp string start start-vec end-vec) - (receive (err match?) (%regexp-match/errno regexp string start - start-vec end-vec) - (if err (error err %regexp-match regexp string start) match?))) - - -;;; I do this one in C, I'm not sure why: -;;; Used by MATCH-FILES. - -(define-foreign %filter-C-strings! - (filter_stringvec (string regexp) ((C "char const ** ~a") cvec)) - static-string ; error message -- #f if no error. - integer) ; number of files that pass the filter. diff --git a/scsh/re1.c b/scsh/re1.c index fd7a7eb..3a4cf49 100644 --- a/scsh/re1.c +++ b/scsh/re1.c @@ -9,23 +9,136 @@ /* Make sure our exports match up w/the implementation: */ #include "re1.h" -#ifndef NULL -#define NULL 0 -#endif - /* Not multi-threaded reentrant. */ static char *regexp_error; /* Stash error msg in global. */ void regerror(char *msg) {regexp_error = msg;} +/* +** Return NULL normally, error string on error. +** Stash number of bytes needed for compiled regexp into `*len' +*/ + +char *re_byte_len(const char *re, int *len) +{ + int l; + + regexp_error = 0; + *len = regcomp_len(re); + return regexp_error; + } + +/* +** Return NULL normally, error string on error. +** Compile regexp into string described by `cr'. +*/ + +char *re_compile(const char *re, scheme_value cr) +{ + int len = STRING_LENGTH(cr); + regexp *r = (regexp *) &STRING_REF(cr, 0); + + regexp_error = 0; + r = regcomp_comp(re, r, len); + return regexp_error; + } + /* Return NULL normally, error string on error. ** Stash match info in start_vec and end_vec. ** Returns boolean match/no-match in hit. */ -char *reg_match(const char *re, const char *string, int start, - scheme_value start_vec, scheme_value end_vec, int *hit) +char *re_exec(scheme_value cr, const char *string, int start, + scheme_value start_vec, scheme_value end_vec, int *hit) +{ + regexp *r = (regexp *) &STRING_REF(cr, 0); + + *hit = 0; + + if( VECTOR_LENGTH(start_vec) != NSUBEXP ) /* These tests should */ + return "Illegal start vector"; /* never trigger. */ + if( VECTOR_LENGTH(end_vec) != NSUBEXP ) + return "Illegal end vector"; + + regexp_error = 0; + + if( regexec(r, string+start) ) { + int i; + for(i=0; istartp[i]; + const char *e = r->endp[i]; + VECTOR_REF(start_vec,i) = s ? ENTER_FIXNUM(s - string) : SCHFALSE; + VECTOR_REF(end_vec,i) = e ? ENTER_FIXNUM(e - string) : SCHFALSE; + r->startp[i] = 0; /* Why did Sommerfeld */ + r->endp[i] = 0; /* put these here? */ + } + *hit = 1; + } + + return regexp_error; + } + + +char *re_subst(scheme_value cr, const char *match, + const char *src, int start, + scheme_value start_vec, scheme_value end_vec, + scheme_value outbuf, int *len) +{ + int i; + regexp *r = (regexp *) &STRING_REF(cr, 0); + + if( VECTOR_LENGTH(start_vec) != NSUBEXP ) /* These tests should */ + return "Illegal start vector"; /* never trigger. */ + if( VECTOR_LENGTH(end_vec) != NSUBEXP ) + return "Illegal end vector"; + + for (i=0; istartp[i] = FIXNUMP(se) ? (match + EXTRACT_FIXNUM(se)) : 0; + r->endp[i] = FIXNUMP(ee) ? (match + EXTRACT_FIXNUM(ee)) : 0; + } + + regexp_error = 0; + regnsub(r, src, &STRING_REF(outbuf, 0), STRING_LENGTH(outbuf)); + *len = strlen(&STRING_REF(outbuf, 0)); + return regexp_error; + } + +char *re_subst_len(scheme_value cr, const char *match, + const char *src, int start, + scheme_value start_vec, scheme_value end_vec, + int *len) +{ + int i; + regexp *r = (regexp *) &STRING_REF(cr, 0); + + if( VECTOR_LENGTH(start_vec) != NSUBEXP ) /* These tests should */ + return "Illegal start vector"; /* never trigger. */ + if( VECTOR_LENGTH(end_vec) != NSUBEXP ) + return "Illegal end vector"; + + for (i=0; istartp[i] = FIXNUMP(se) ? (match + EXTRACT_FIXNUM(se)) : 0; + r->endp[i] = FIXNUMP(ee) ? (match + EXTRACT_FIXNUM(ee)) : 0; + } + + regexp_error = 0; + *len = regsublen(r, src); + return regexp_error; + } + + +/* Return NULL normally, error string on error. +** Stash match info in start_vec and end_vec. +** Returns boolean match/no-match in hit. +*/ + +char *re_match(const char *re, const char *string, int start, + scheme_value start_vec, scheme_value end_vec, int *hit) { regexp *prog; @@ -34,12 +147,12 @@ char *reg_match(const char *re, const char *string, int start, prog = regcomp(re); if( !prog ) return regexp_error; - if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { + if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { /* These two tests */ Free(prog); return "Illegal start vector"; } - if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { + if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { /* should never trigger. */ Free(prog); return "Illegal end vector"; } @@ -47,8 +160,10 @@ char *reg_match(const char *re, const char *string, int start, if( regexec(prog, string+start) ) { int i; for(i=0; istartp[i] - string); - VECTOR_REF(end_vec,i) = ENTER_FIXNUM(prog->endp[i] - string); + const char *s = prog->startp[i]; + const char *e = prog->endp[i]; + VECTOR_REF(start_vec,i) = s ? ENTER_FIXNUM(s - string) : SCHFALSE; + VECTOR_REF(end_vec,i) = e ? ENTER_FIXNUM(e - string) : SCHFALSE; } *hit = 1; } diff --git a/scsh/re1.h b/scsh/re1.h index 7a1762d..3938f7e 100644 --- a/scsh/re1.h +++ b/scsh/re1.h @@ -1,6 +1,24 @@ -char *reg_match(const char *re, const char *string, int start, - scheme_value start_vec, scheme_value end_vec, - int *hit); +/* Exports from re1.c */ + +char *re_byte_len(const char *re, int *len); +char *re_compile(const char *re, scheme_value target); + +char *re_exec(scheme_value cr, const char *string, int start, + scheme_value start_vec, scheme_value end_vec, int *hit); + +char *re_match(const char *re, const char *string, int start, + scheme_value start_vec, scheme_value end_vec, + int *hit); + +char *re_subst_len(scheme_value cr, const char *match, + const char *src, int start, + scheme_value start_vec, scheme_value end_vec, + int *len); + +char *re_subst(scheme_value cr, const char *match, + const char *src, int start, + scheme_value start_vec, scheme_value end_vec, + scheme_value outbuf, int *len); char *filter_stringvec(const char *re, char const **stringvec, int *nummatch);