/* Scheme48 interface to Henry Spencer's Posix regular expression package. ** Copyright (c) 1993, 1994, 1998 by Olin Shivers. */ /* Todo: ** not_eol not_bol support on searchers ** error code -> err msg ** regex freeing ** regexp-string -> regex_t caching ** make filter_stringvec return an error code. */ #include #include #include "../regexp/regex.h" #include "../cstuff.h" /* Make sure our exports match up w/the implementation: */ #include "re1.h" /* ** Compile regexp into a malloc'd struct. ** The flag sm_p is true if we want to compile for submatches. ** On success, store pointer to struct into cr and return 0. ** On failure, free the struct, store NULL into cr, ** and return a non-zero error code. */ int compile_re(scheme_value re_str, int sm_p, regex_t **cr) { char *s = &STRING_REF(re_str, 0); int len = STRING_LENGTH(re_str); int err; regex_t *re = Alloc(regex_t); if( !re ) return -1; re->re_endp = s + len; err = regcomp(re, s, REG_EXTENDED | REG_PEND | (sm_p ? 0 : REG_NOSUB)); if( err ) {Free(re); *cr=0;} else *cr=re; return err; } /* Do a regex search of RE through string STR, beginning at STR[START]. ** - STR is passed as a Scheme value as it is allowed to contain nul bytes. ** ** - trans_vec contains the translation from the user's "virtual" submatches to ** the actual submatches the engine will report: ** - trans_vec[i] = #F means user submatch #i is a dead submatch. ** - trans_vec[i] = j means user submatch #i corresponds to paren #j in re. ** ** Indexing fence-posts are a little complicated due to the fact that you ** get an extra match elt back from the matcher -- match 0 is not a ** paren-based *sub*match, but rather the match info for the whole thing. ** ** So, here is how it works: ** length(start_vec) = length(end_vec) = length(trans_vec) + 1 ** because trans_vec doesn't have a translation for submatch 0, which ** is SRE submatch #0 => Posix submatch #0. For SRE submatch #i (1, 2, ...), ** we want the submatch associated with Posix paren # trans_vec[i-1]. ** ** - MAX_PSM is the maximum paren in which we have submatch interest -- the ** max element in TRANS_VEC. Any parens after paren #MAX_PSM are just for ** grouping, not for marking submatches. We only have to allocate MAX_PSM+1 ** elements in the submatch vector we pass into the search engine. If ** MAX_PSM = -1, then we don't even want the whole-match match bounds, which ** is really good -- the search engine can really fly in this case. ** ** If we match, map re's submatches over to the exported start_vec and ** end_vec match vectors using trans_vec. ** ** Return 0 on success; #f if no match; non-zero integer error code otherwise. */ scheme_value re_search(const regex_t *re, scheme_value str, int start, scheme_value trans_vec, int max_psm, scheme_value start_vec, scheme_value end_vec) { char *s = &STRING_REF(str,0); /* Passed as a scheme_value because */ int len = STRING_LENGTH(str); /* it might contain nul bytes. */ int vlen = VECTOR_LENGTH(start_vec); int retval; regmatch_t static_pmatch[10], *pm; /* If max_psm+1 > 10, we can't use static_pmatch. */ if( max_psm < 10 ) pm = static_pmatch; else { pm = Malloc(regmatch_t, max_psm+1);/* Add 1 for the whole-match info. */ if( !pm ) return ENTER_FIXNUM(-1); } pm[0].rm_so = start; pm[0].rm_eo = len; retval = regexec(re, s, max_psm+1, pm, REG_STARTEND); /* Do it. */ /* We matched and have match-bound info, so translate it over. */ if( !retval && max_psm >= 0 ) { int i; VECTOR_REF(start_vec,0) = ENTER_FIXNUM(pm[0].rm_so); /* whole-match */ VECTOR_REF(end_vec,0) = ENTER_FIXNUM(pm[0].rm_eo); for( i=vlen-1; --i >= 0; ) { /* submatches */ scheme_value j_scm = VECTOR_REF(trans_vec,i); if( j_scm != SCHFALSE ) { int j = EXTRACT_FIXNUM(j_scm); int k = pm[j].rm_so, l = pm[j].rm_eo; VECTOR_REF(start_vec,i+1) = (k != -1) ? ENTER_FIXNUM(k) : SCHFALSE; VECTOR_REF(end_vec, i+1) = (l != -1) ? ENTER_FIXNUM(l) : SCHFALSE; } } } if( max_psm >= 10 ) Free(pm); if( retval==REG_NOMATCH ) return SCHFALSE; if( ! retval ) return SCHTRUE; return ENTER_FIXNUM(retval); } /* Filter a vector of strings by regexp RE_STR. ** Stringvec is a NULL-terminated vector of strings; ** filter it in-place, copying the survivors back to compact them. ** Put the number of survivors in nummatch. */ int filter_stringvec(scheme_value re_str, char const **stringvec) { int re_len = STRING_LENGTH(re_str);/* Passed as a scheme_value because */ char *re_chars = &STRING_REF(re_str,0);/* it might contain nul bytes. */ regex_t re; char const **p, **q; /* REG_NOSUB -- We just want to know if it matches or not. */ re.re_endp = re_chars + re_len; if( regcomp(&re, re_chars, REG_EXTENDED | REG_PEND | REG_NOSUB) ) { return 0; } for(p=q=stringvec; *p; p++) { char const *s = *p; if( ! regexec(&re, s, 0, 0, 0) ) *q++ = s; } regfree(&re); return q-stringvec; } const char *re_errint2str(int errcode, const regex_t *re) { int size = regerror(errcode, re, 0, 0); char *s = Malloc(char,size); if(s) regerror(errcode, re, s, size); return s; } void free_re(regex_t *re) { regfree(re); Free(re); }