1999-09-23 10:27:41 -04:00
|
|
|
/* Scheme48 interface to Henry Spencer's Posix regular expression package.
** Copyright (c) 1993, 1994, 1998 by Olin Shivers.
*/
|
|
|
|
|
|
|
|
/* Todo:
** not_eol not_bol support on searchers
** error code -> err msg
** regex freeing
** regexp-string -> regex_t caching
** make filter_stringvec return an error code.
*/
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include "../regexp/regex.h"
|
|
|
|
#include "../cstuff.h"
|
|
|
|
|
|
|
|
/* Make sure our exports match up w/the implementation: */
|
|
|
|
#include "re1.h"
|
|
|
|
|
|
|
|
/*
|
|
|
|
** Compile regexp into a malloc'd struct.
|
|
|
|
** The flag sm_p is true if we want to compile for submatches.
|
|
|
|
** On success, store pointer to struct into cr and return 0.
|
|
|
|
** On failure, free the struct, store NULL into cr,
|
|
|
|
** and return a non-zero error code.
|
|
|
|
*/
|
|
|
|
|
1999-09-23 13:46:46 -04:00
|
|
|
int compile_re(s48_value re_str, int sm_p, regex_t **cr)
|
1999-09-23 10:27:41 -04:00
|
|
|
{
|
1999-09-23 13:46:46 -04:00
|
|
|
// JMG: char *s = &STRING_REF(re_str, 0);
|
|
|
|
char *s = s48_extract_string(re_str);
|
|
|
|
int len = S48_STRING_LENGTH(re_str);
|
1999-09-23 10:27:41 -04:00
|
|
|
int err;
|
|
|
|
regex_t *re = Alloc(regex_t);
|
|
|
|
|
|
|
|
if( !re ) return -1;
|
|
|
|
|
|
|
|
re->re_endp = s + len;
|
|
|
|
err = regcomp(re, s, REG_EXTENDED | REG_PEND
|
|
|
|
| (sm_p ? 0 : REG_NOSUB));
|
|
|
|
if( err ) {Free(re); *cr=0;}
|
|
|
|
else *cr=re;
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Do a regex search of RE through string STR, beginning at STR[START].
|
|
|
|
** - STR is passed as a Scheme value as it is allowed to contain nul bytes.
|
|
|
|
**
|
|
|
|
** - trans_vec contains the translation from the user's "virtual" submatches to
|
|
|
|
** the actual submatches the engine will report:
|
|
|
|
** - trans_vec[i] = #F means user submatch #i is a dead submatch.
|
|
|
|
** - trans_vec[i] = j means user submatch #i corresponds to paren #j in re.
|
|
|
|
**
|
|
|
|
** Indexing fence-posts are a little complicated due to the fact that you
|
|
|
|
** get an extra match elt back from the matcher -- match 0 is not a
|
|
|
|
** paren-based *sub*match, but rather the match info for the whole thing.
|
|
|
|
**
|
|
|
|
** So, here is how it works:
|
|
|
|
** length(start_vec) = length(end_vec) = length(trans_vec) + 1
|
|
|
|
** because trans_vec doesn't have a translation for submatch 0, which
|
|
|
|
** is SRE submatch #0 => Posix submatch #0. For SRE submatch #i (1, 2, ...),
|
|
|
|
** we want the submatch associated with Posix paren # trans_vec[i-1].
|
|
|
|
**
|
|
|
|
** - MAX_PSM is the maximum paren in which we have submatch interest -- the
|
|
|
|
** max element in TRANS_VEC. Any parens after paren #MAX_PSM are just for
|
|
|
|
** grouping, not for marking submatches. We only have to allocate MAX_PSM+1
|
|
|
|
** elements in the submatch vector we pass into the search engine. If
|
|
|
|
** MAX_PSM = -1, then we don't even want the whole-match match bounds, which
|
|
|
|
** is really good -- the search engine can really fly in this case.
|
|
|
|
**
|
|
|
|
** If we match, map re's submatches over to the exported start_vec and
|
|
|
|
** end_vec match vectors using trans_vec.
|
|
|
|
**
|
|
|
|
** Return 0 on success; #f if no match; non-zero integer error code otherwise.
|
|
|
|
*/
|
|
|
|
|
1999-09-23 13:46:46 -04:00
|
|
|
s48_value re_search(const regex_t *re, s48_value str, int start,
|
|
|
|
s48_value trans_vec, int max_psm,
|
|
|
|
s48_value start_vec, s48_value end_vec)
|
1999-09-23 10:27:41 -04:00
|
|
|
{
|
1999-09-23 13:46:46 -04:00
|
|
|
// JMG: char *s = &STRING_REF(str,0); /* Passed as a s48_value because */
|
|
|
|
char *s = s48_extract_string(str);
|
|
|
|
int len = S48_STRING_LENGTH(str); /* it might contain nul bytes. */
|
1999-09-23 10:27:41 -04:00
|
|
|
|
1999-09-23 13:46:46 -04:00
|
|
|
int vlen = S48_VECTOR_LENGTH(start_vec);
|
1999-09-23 10:27:41 -04:00
|
|
|
int retval;
|
|
|
|
|
|
|
|
regmatch_t static_pmatch[10], *pm;
|
|
|
|
|
|
|
|
/* If max_psm+1 > 10, we can't use static_pmatch. */
|
|
|
|
if( max_psm < 10 ) pm = static_pmatch;
|
|
|
|
else {
|
|
|
|
pm = Malloc(regmatch_t, max_psm+1);/* Add 1 for the whole-match info. */
|
1999-09-23 13:46:46 -04:00
|
|
|
if( !pm ) return s48_enter_fixnum(-1);
|
1999-09-23 10:27:41 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
pm[0].rm_so = start;
|
|
|
|
pm[0].rm_eo = len;
|
|
|
|
|
|
|
|
retval = regexec(re, s, max_psm+1, pm, REG_STARTEND); /* Do it. */
|
|
|
|
|
|
|
|
/* We matched and have match-bound info, so translate it over. */
|
|
|
|
if( !retval && max_psm >= 0 ) {
|
|
|
|
int i;
|
|
|
|
|
1999-09-23 13:46:46 -04:00
|
|
|
//JMG: S48_VECTOR_REF(start_vec,0) = s48_enter_fixnum(pm[0].rm_so); /* whole-match */
|
|
|
|
//S48_VECTOR_REF(end_vec,0) = s48_enter_fixnum(pm[0].rm_eo);
|
|
|
|
|
|
|
|
S48_VECTOR_SET(start_vec,0, s48_enter_fixnum(pm[0].rm_so));
|
|
|
|
S48_VECTOR_SET(end_vec,0, s48_enter_fixnum(pm[0].rm_eo));
|
1999-09-23 10:27:41 -04:00
|
|
|
|
|
|
|
for( i=vlen-1; --i >= 0; ) { /* submatches */
|
1999-09-23 13:46:46 -04:00
|
|
|
s48_value j_scm = S48_VECTOR_REF(trans_vec,i);
|
|
|
|
if( j_scm != S48_FALSE ) {
|
|
|
|
int j = s48_extract_fixnum(j_scm);
|
1999-09-23 10:27:41 -04:00
|
|
|
int k = pm[j].rm_so,
|
|
|
|
l = pm[j].rm_eo;
|
1999-09-23 13:46:46 -04:00
|
|
|
// JMG S48_VECTOR_REF(start_vec,i+1) = (k != -1) ? s48_enter_fixnum(k) : S48_FALSE;
|
|
|
|
//S48_VECTOR_REF(end_vec, i+1) = (l != -1) ? s48_enter_fixnum(l) : S48_FALSE;
|
|
|
|
S48_VECTOR_SET(start_vec,i+1, (k != -1) ? s48_enter_fixnum(k) : S48_FALSE);
|
|
|
|
S48_VECTOR_SET(end_vec, i+1, (l != -1) ? s48_enter_fixnum(l) : S48_FALSE);
|
1999-09-23 10:27:41 -04:00
|
|
|
}
|
|
|
|
}
|
1999-09-23 13:46:46 -04:00
|
|
|
}
|
1999-09-23 10:27:41 -04:00
|
|
|
|
|
|
|
if( max_psm >= 10 ) Free(pm);
|
|
|
|
|
1999-09-23 13:46:46 -04:00
|
|
|
if( retval==REG_NOMATCH ) return S48_FALSE;
|
|
|
|
if( ! retval ) return S48_TRUE;
|
|
|
|
return s48_enter_fixnum(retval);
|
1999-09-23 10:27:41 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Filter a vector of strings by regexp RE_STR.
|
|
|
|
** Stringvec is a NULL-terminated vector of strings;
|
|
|
|
** filter it in-place, copying the survivors back to compact them.
|
|
|
|
** Put the number of survivors in nummatch.
|
|
|
|
*/
|
|
|
|
|
1999-09-23 13:46:46 -04:00
|
|
|
int filter_stringvec(s48_value re_str, char const **stringvec)
|
1999-09-23 10:27:41 -04:00
|
|
|
{
|
1999-09-23 13:46:46 -04:00
|
|
|
int re_len = S48_STRING_LENGTH(re_str);/* Passed as a s48_value because */
|
|
|
|
//JMG: char *re_chars = &STRING_REF(re_str,0);/* it might contain nul bytes. */
|
|
|
|
char *re_chars = s48_extract_string (re_str);/* it might contain nul bytes. */
|
|
|
|
|
|
|
|
regex_t re;
|
1999-09-23 10:27:41 -04:00
|
|
|
|
|
|
|
char const **p, **q;
|
|
|
|
|
|
|
|
/* REG_NOSUB -- We just want to know if it matches or not. */
|
|
|
|
re.re_endp = re_chars + re_len;
|
|
|
|
if( regcomp(&re, re_chars, REG_EXTENDED | REG_PEND | REG_NOSUB) ) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
for(p=q=stringvec; *p; p++) {
|
|
|
|
char const *s = *p;
|
|
|
|
if( ! regexec(&re, s, 0, 0, 0) ) *q++ = s;
|
|
|
|
}
|
|
|
|
|
|
|
|
regfree(&re);
|
|
|
|
return q-stringvec;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const char *re_errint2str(int errcode, const regex_t *re)
|
|
|
|
{
|
|
|
|
int size = regerror(errcode, re, 0, 0);
|
|
|
|
char *s = Malloc(char,size);
|
|
|
|
if(s) regerror(errcode, re, s, size);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Release a compiled regex: regfree its internal storage, then Free the
** regex_t struct itself (so RE must have been heap-allocated, as
** compile_re does with Alloc).
**
** A null RE is accepted and ignored, mirroring free(NULL); compile_re
** stores NULL into its out-parameter on failure, so callers may pass that
** result straight through.
*/
void free_re(regex_t *re)
{
    if( !re ) return;

    regfree(re);
    Free(re);
}
|