Added regexp compilation
This commit is contained in:
parent
e84440fecd
commit
f948e51831
179
scsh/re.scm
179
scsh/re.scm
|
@ -7,46 +7,161 @@
|
||||||
"" ""
|
"" ""
|
||||||
)
|
)
|
||||||
|
|
||||||
(define-record regexp-match
|
;;; Match data for regexp matches.
|
||||||
string
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
start ; 10 elt vec
|
|
||||||
end) ; 10 elt vec
|
|
||||||
|
|
||||||
;;; Need to do error case for these three procs.
|
(define-record regexp-match
|
||||||
|
string ; The string against which we matched.
|
||||||
|
start ; 10 elt vec
|
||||||
|
end) ; 10 elt vec
|
||||||
|
|
||||||
(define (match:start match . maybe-index)
|
(define (match:start match . maybe-index)
|
||||||
(vector-ref (regexp-match:start match)
|
(let ((i (:optional maybe-index 0)))
|
||||||
(:optional maybe-index 0)))
|
(or (vector-ref (regexp-match:start match) i)
|
||||||
|
(error match:start "No sub-match found." match i))))
|
||||||
|
|
||||||
(define (match:end match . maybe-index)
|
(define (match:end match . maybe-index)
|
||||||
(vector-ref (regexp-match:end match)
|
(let ((i (:optional maybe-index 0)))
|
||||||
(:optional maybe-index 0)))
|
(or (vector-ref (regexp-match:end match) i)
|
||||||
|
(error match:start "No sub-match found." match i))))
|
||||||
|
|
||||||
(define (match:substring match . maybe-index)
|
(define (match:substring match . maybe-index)
|
||||||
(let ((i (:optional maybe-index 0)))
|
(let* ((i (:optional maybe-index 0))
|
||||||
(substring (regexp-match:string match)
|
(start (vector-ref (regexp-match:start match) i)))
|
||||||
(match:start match i)
|
(if start
|
||||||
(match:end match i))))
|
(substring (regexp-match:string match)
|
||||||
|
start
|
||||||
(define (string-match pattern string . maybe-start)
|
(vector-ref (regexp-match:end match) i))
|
||||||
(apply regexp-exec (make-regexp pattern) string maybe-start))
|
(error match:substring "No sub-match found." match i))))
|
||||||
|
|
||||||
|
|
||||||
;;; Bogus stub definitions for low-level match routines:
|
;;; Compiling regexps
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
(define regexp? string?)
|
(define-record %regexp
|
||||||
(define (make-regexp str) str)
|
string ; The string form of the regexp.
|
||||||
|
bytes ; The compiled representation, stuffed into a Scheme string.
|
||||||
|
((disclose self) (list "Regexp" (%regexp:string self))))
|
||||||
|
|
||||||
|
(define regexp? %regexp?)
|
||||||
|
|
||||||
|
|
||||||
|
(define (make-regexp pattern)
|
||||||
|
(receive (err len) (%regexp-compiled-length pattern)
|
||||||
|
(if err (error err make-regexp pattern)
|
||||||
|
(let ((buf (make-string len)))
|
||||||
|
(%regexp-compile pattern buf)
|
||||||
|
(make-%regexp pattern buf)))))
|
||||||
|
|
||||||
|
(define-foreign %regexp-compiled-length (re_byte_len (string pattern))
|
||||||
|
static-string ; Error msg or #f
|
||||||
|
integer) ; number of bytes needed to compile REGEXP.
|
||||||
|
|
||||||
|
(define-foreign %regexp-compile (re_compile (string pattern)
|
||||||
|
(string-desc bytes))
|
||||||
|
static-string) ; Error msg or #f
|
||||||
|
|
||||||
|
|
||||||
|
;;; Executing compiled regexps
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
(define (regexp-exec regexp str . maybe-start)
|
(define (regexp-exec regexp str . maybe-start)
|
||||||
(let ((start (:optional maybe-start 0))
|
(let ((start (:optional maybe-start 0))
|
||||||
(start-vec (make-vector 10))
|
(start-vec (make-vector 10))
|
||||||
(end-vec (make-vector 10)))
|
(end-vec (make-vector 10)))
|
||||||
(and (%regexp-match regexp str start start-vec end-vec)
|
(receive (err match?)
|
||||||
(make-regexp-match str start-vec end-vec))))
|
(%regexp-exec (%regexp:bytes regexp) str start start-vec end-vec)
|
||||||
|
(if err (error err regexp-exec regexp str start)
|
||||||
|
(and match?
|
||||||
|
(make-regexp-match str start-vec end-vec))))))
|
||||||
|
|
||||||
|
(define-foreign %regexp-exec (re_exec (string-desc compiled-regexp)
|
||||||
|
(string s)
|
||||||
|
(integer start)
|
||||||
|
(vector-desc start-vec)
|
||||||
|
(vector-desc end-vec))
|
||||||
|
static-string ; Error msg or #f
|
||||||
|
bool) ; Matched?
|
||||||
|
|
||||||
|
|
||||||
|
;;; Compile&match regexps in one go
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;;; I could do this with the separate compile and execute procedures,
|
||||||
|
;;; but I go straight to C just for fun.
|
||||||
|
|
||||||
|
(define (string-match pattern string . maybe-start)
|
||||||
|
(let ((start (:optional maybe-start 0))
|
||||||
|
(start-vec (make-vector 10))
|
||||||
|
(end-vec (make-vector 10)))
|
||||||
|
(receive (err match?) (%string-match pattern string start
|
||||||
|
start-vec end-vec)
|
||||||
|
(if err (error err string-match pattern string start)
|
||||||
|
(and match? (make-regexp-match string start-vec end-vec))))))
|
||||||
|
|
||||||
|
(define-foreign %string-match (re_match (string pattern)
|
||||||
|
(string s)
|
||||||
|
(integer start)
|
||||||
|
(vector-desc start-vec)
|
||||||
|
(vector-desc end-vec))
|
||||||
|
static-string ; Error string or #f if all is ok.
|
||||||
|
bool) ; match?
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
;;; Substitutions
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
(define-foreign %regexp-subst (re_subst (string-desc compiled-regexp)
|
||||||
|
(string match)
|
||||||
|
(string str)
|
||||||
|
(integer start)
|
||||||
|
(vector-desc start-vec)
|
||||||
|
(vector-desc end-vec)
|
||||||
|
(string-desc outbuf))
|
||||||
|
static-string ; Error msg or #f
|
||||||
|
integer)
|
||||||
|
|
||||||
|
(define-foreign %regexp-subst-len (re_subst_len (string-desc compiled-regexp)
|
||||||
|
(string match)
|
||||||
|
(string str)
|
||||||
|
(integer start)
|
||||||
|
(vector-desc start-vec)
|
||||||
|
(vector-desc end-vec))
|
||||||
|
static-string ; Error msg or #f
|
||||||
|
integer)
|
||||||
|
|
||||||
|
;;; What does this do?
|
||||||
|
|
||||||
|
(define (regexp-subst re match replacement)
|
||||||
|
(let ((cr (%regexp:bytes re))
|
||||||
|
(str (regexp-match:string match))
|
||||||
|
(start-vec (regexp-match:start match))
|
||||||
|
(end-vec (regexp-match:end match)))
|
||||||
|
(receive (err out-len) (%regexp-subst-len cr str replacement 0
|
||||||
|
start-vec end-vec)
|
||||||
|
(if err (error err regexp-subst str replacement) ; More data here
|
||||||
|
(let ((out-buf (make-string out-len)))
|
||||||
|
(receive (err out-len) (%regexp-subst cr str replacement 0
|
||||||
|
start-vec end-vec out-buf)
|
||||||
|
(if err (error err regexp-subst str replacement)
|
||||||
|
(substring out-buf 0 out-len))))))))
|
||||||
|
|
||||||
|
;;; Miscellaneous
|
||||||
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
|
|
||||||
|
;;; I do this one in C, I'm not sure why:
|
||||||
|
;;; It is used by MATCH-FILES.
|
||||||
|
|
||||||
|
(define-foreign %filter-C-strings!
|
||||||
|
(filter_stringvec (string pattern) ((C "char const ** ~a") cvec))
|
||||||
|
static-string ; error message -- #f if no error.
|
||||||
|
integer) ; number of files that pass the filter.
|
||||||
|
|
||||||
|
|
||||||
;;; Convert a string into a regex pattern that matches that string exactly --
|
;;; Convert a string into a regex pattern that matches that string exactly --
|
||||||
;;; in other words, quote the special chars with backslashes.
|
;;; in other words, quote the special chars with backslashes.
|
||||||
|
|
||||||
(define (regexp-quote string)
|
(define (regexp-quote string)
|
||||||
(let lp ((i (- (string-length string) 1))
|
(let lp ((i (- (string-length string) 1))
|
||||||
(result '()))
|
(result '()))
|
||||||
|
@ -57,25 +172,3 @@
|
||||||
(if (memv c '(#\[ #\] #\. #\* #\? #\( #\) #\| #\\ #\$ #\^ #\+))
|
(if (memv c '(#\[ #\] #\. #\* #\? #\( #\) #\| #\\ #\$ #\^ #\+))
|
||||||
(cons #\\ result)
|
(cons #\\ result)
|
||||||
result))))))
|
result))))))
|
||||||
|
|
||||||
(define-foreign %regexp-match/errno (reg_match (string regexp)
|
|
||||||
(string s)
|
|
||||||
(integer start)
|
|
||||||
(vector-desc start-vec)
|
|
||||||
(vector-desc end-vec))
|
|
||||||
static-string ; Error string or #f if all is ok.
|
|
||||||
bool) ; match?
|
|
||||||
|
|
||||||
(define (%regexp-match regexp string start start-vec end-vec)
|
|
||||||
(receive (err match?) (%regexp-match/errno regexp string start
|
|
||||||
start-vec end-vec)
|
|
||||||
(if err (error err %regexp-match regexp string start) match?)))
|
|
||||||
|
|
||||||
|
|
||||||
;;; I do this one in C, I'm not sure why:
|
|
||||||
;;; Used by MATCH-FILES.
|
|
||||||
|
|
||||||
(define-foreign %filter-C-strings!
|
|
||||||
(filter_stringvec (string regexp) ((C "char const ** ~a") cvec))
|
|
||||||
static-string ; error message -- #f if no error.
|
|
||||||
integer) ; number of files that pass the filter.
|
|
||||||
|
|
135
scsh/re1.c
135
scsh/re1.c
|
@ -9,23 +9,136 @@
|
||||||
/* Make sure our exports match up w/the implementation: */
|
/* Make sure our exports match up w/the implementation: */
|
||||||
#include "re1.h"
|
#include "re1.h"
|
||||||
|
|
||||||
#ifndef NULL
|
|
||||||
#define NULL 0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Not multi-threaded reentrant. */
|
/* Not multi-threaded reentrant. */
|
||||||
static char *regexp_error;
|
static char *regexp_error;
|
||||||
|
|
||||||
/* Stash error msg in global. */
|
/* Stash error msg in global. */
|
||||||
void regerror(char *msg) {regexp_error = msg;}
|
void regerror(char *msg) {regexp_error = msg;}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Return NULL normally, error string on error.
|
||||||
|
** Stash number of bytes needed for compiled regexp into `*len'
|
||||||
|
*/
|
||||||
|
|
||||||
|
char *re_byte_len(const char *re, int *len)
|
||||||
|
{
|
||||||
|
int l;
|
||||||
|
|
||||||
|
regexp_error = 0;
|
||||||
|
*len = regcomp_len(re);
|
||||||
|
return regexp_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
** Return NULL normally, error string on error.
|
||||||
|
** Compile regexp into string described by `cr'.
|
||||||
|
*/
|
||||||
|
|
||||||
|
char *re_compile(const char *re, scheme_value cr)
|
||||||
|
{
|
||||||
|
int len = STRING_LENGTH(cr);
|
||||||
|
regexp *r = (regexp *) &STRING_REF(cr, 0);
|
||||||
|
|
||||||
|
regexp_error = 0;
|
||||||
|
r = regcomp_comp(re, r, len);
|
||||||
|
return regexp_error;
|
||||||
|
}
|
||||||
|
|
||||||
/* Return NULL normally, error string on error.
|
/* Return NULL normally, error string on error.
|
||||||
** Stash match info in start_vec and end_vec.
|
** Stash match info in start_vec and end_vec.
|
||||||
** Returns boolean match/no-match in hit.
|
** Returns boolean match/no-match in hit.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
char *reg_match(const char *re, const char *string, int start,
|
char *re_exec(scheme_value cr, const char *string, int start,
|
||||||
scheme_value start_vec, scheme_value end_vec, int *hit)
|
scheme_value start_vec, scheme_value end_vec, int *hit)
|
||||||
|
{
|
||||||
|
regexp *r = (regexp *) &STRING_REF(cr, 0);
|
||||||
|
|
||||||
|
*hit = 0;
|
||||||
|
|
||||||
|
if( VECTOR_LENGTH(start_vec) != NSUBEXP ) /* These tests should */
|
||||||
|
return "Illegal start vector"; /* never trigger. */
|
||||||
|
if( VECTOR_LENGTH(end_vec) != NSUBEXP )
|
||||||
|
return "Illegal end vector";
|
||||||
|
|
||||||
|
regexp_error = 0;
|
||||||
|
|
||||||
|
if( regexec(r, string+start) ) {
|
||||||
|
int i;
|
||||||
|
for(i=0; i<NSUBEXP; i++) {
|
||||||
|
const char *s = r->startp[i];
|
||||||
|
const char *e = r->endp[i];
|
||||||
|
VECTOR_REF(start_vec,i) = s ? ENTER_FIXNUM(s - string) : SCHFALSE;
|
||||||
|
VECTOR_REF(end_vec,i) = e ? ENTER_FIXNUM(e - string) : SCHFALSE;
|
||||||
|
r->startp[i] = 0; /* Why did Sommerfeld */
|
||||||
|
r->endp[i] = 0; /* put these here? */
|
||||||
|
}
|
||||||
|
*hit = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return regexp_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
char *re_subst(scheme_value cr, const char *match,
|
||||||
|
const char *src, int start,
|
||||||
|
scheme_value start_vec, scheme_value end_vec,
|
||||||
|
scheme_value outbuf, int *len)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
regexp *r = (regexp *) &STRING_REF(cr, 0);
|
||||||
|
|
||||||
|
if( VECTOR_LENGTH(start_vec) != NSUBEXP ) /* These tests should */
|
||||||
|
return "Illegal start vector"; /* never trigger. */
|
||||||
|
if( VECTOR_LENGTH(end_vec) != NSUBEXP )
|
||||||
|
return "Illegal end vector";
|
||||||
|
|
||||||
|
for (i=0; i<NSUBEXP; i++) {
|
||||||
|
scheme_value se = VECTOR_REF(start_vec, i);
|
||||||
|
scheme_value ee = VECTOR_REF(end_vec, i);
|
||||||
|
r->startp[i] = FIXNUMP(se) ? (match + EXTRACT_FIXNUM(se)) : 0;
|
||||||
|
r->endp[i] = FIXNUMP(ee) ? (match + EXTRACT_FIXNUM(ee)) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
regexp_error = 0;
|
||||||
|
regnsub(r, src, &STRING_REF(outbuf, 0), STRING_LENGTH(outbuf));
|
||||||
|
*len = strlen(&STRING_REF(outbuf, 0));
|
||||||
|
return regexp_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *re_subst_len(scheme_value cr, const char *match,
|
||||||
|
const char *src, int start,
|
||||||
|
scheme_value start_vec, scheme_value end_vec,
|
||||||
|
int *len)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
regexp *r = (regexp *) &STRING_REF(cr, 0);
|
||||||
|
|
||||||
|
if( VECTOR_LENGTH(start_vec) != NSUBEXP ) /* These tests should */
|
||||||
|
return "Illegal start vector"; /* never trigger. */
|
||||||
|
if( VECTOR_LENGTH(end_vec) != NSUBEXP )
|
||||||
|
return "Illegal end vector";
|
||||||
|
|
||||||
|
for (i=0; i<NSUBEXP; i++) {
|
||||||
|
scheme_value se = VECTOR_REF(start_vec, i);
|
||||||
|
scheme_value ee = VECTOR_REF(end_vec, i);
|
||||||
|
r->startp[i] = FIXNUMP(se) ? (match + EXTRACT_FIXNUM(se)) : 0;
|
||||||
|
r->endp[i] = FIXNUMP(ee) ? (match + EXTRACT_FIXNUM(ee)) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
regexp_error = 0;
|
||||||
|
*len = regsublen(r, src);
|
||||||
|
return regexp_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Return NULL normally, error string on error.
|
||||||
|
** Stash match info in start_vec and end_vec.
|
||||||
|
** Returns boolean match/no-match in hit.
|
||||||
|
*/
|
||||||
|
|
||||||
|
char *re_match(const char *re, const char *string, int start,
|
||||||
|
scheme_value start_vec, scheme_value end_vec, int *hit)
|
||||||
{
|
{
|
||||||
regexp *prog;
|
regexp *prog;
|
||||||
|
|
||||||
|
@ -34,12 +147,12 @@ char *reg_match(const char *re, const char *string, int start,
|
||||||
prog = regcomp(re);
|
prog = regcomp(re);
|
||||||
if( !prog ) return regexp_error;
|
if( !prog ) return regexp_error;
|
||||||
|
|
||||||
if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {
|
if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { /* These two tests */
|
||||||
Free(prog);
|
Free(prog);
|
||||||
return "Illegal start vector";
|
return "Illegal start vector";
|
||||||
}
|
}
|
||||||
|
|
||||||
if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {
|
if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { /* should never trigger. */
|
||||||
Free(prog);
|
Free(prog);
|
||||||
return "Illegal end vector";
|
return "Illegal end vector";
|
||||||
}
|
}
|
||||||
|
@ -47,8 +160,10 @@ char *reg_match(const char *re, const char *string, int start,
|
||||||
if( regexec(prog, string+start) ) {
|
if( regexec(prog, string+start) ) {
|
||||||
int i;
|
int i;
|
||||||
for(i=0; i<NSUBEXP; i++) {
|
for(i=0; i<NSUBEXP; i++) {
|
||||||
VECTOR_REF(start_vec,i) = ENTER_FIXNUM(prog->startp[i] - string);
|
const char *s = prog->startp[i];
|
||||||
VECTOR_REF(end_vec,i) = ENTER_FIXNUM(prog->endp[i] - string);
|
const char *e = prog->endp[i];
|
||||||
|
VECTOR_REF(start_vec,i) = s ? ENTER_FIXNUM(s - string) : SCHFALSE;
|
||||||
|
VECTOR_REF(end_vec,i) = e ? ENTER_FIXNUM(e - string) : SCHFALSE;
|
||||||
}
|
}
|
||||||
*hit = 1;
|
*hit = 1;
|
||||||
}
|
}
|
||||||
|
|
24
scsh/re1.h
24
scsh/re1.h
|
@ -1,6 +1,24 @@
|
||||||
char *reg_match(const char *re, const char *string, int start,
|
/* Exports from re1.c */
|
||||||
scheme_value start_vec, scheme_value end_vec,
|
|
||||||
int *hit);
|
char *re_byte_len(const char *re, int *len);
|
||||||
|
char *re_compile(const char *re, scheme_value target);
|
||||||
|
|
||||||
|
char *re_exec(scheme_value cr, const char *string, int start,
|
||||||
|
scheme_value start_vec, scheme_value end_vec, int *hit);
|
||||||
|
|
||||||
|
char *re_match(const char *re, const char *string, int start,
|
||||||
|
scheme_value start_vec, scheme_value end_vec,
|
||||||
|
int *hit);
|
||||||
|
|
||||||
|
char *re_subst_len(scheme_value cr, const char *match,
|
||||||
|
const char *src, int start,
|
||||||
|
scheme_value start_vec, scheme_value end_vec,
|
||||||
|
int *len);
|
||||||
|
|
||||||
|
char *re_subst(scheme_value cr, const char *match,
|
||||||
|
const char *src, int start,
|
||||||
|
scheme_value start_vec, scheme_value end_vec,
|
||||||
|
scheme_value outbuf, int *len);
|
||||||
|
|
||||||
char *filter_stringvec(const char *re, char const **stringvec,
|
char *filter_stringvec(const char *re, char const **stringvec,
|
||||||
int *nummatch);
|
int *nummatch);
|
||||||
|
|
Loading…
Reference in New Issue