scsh-0.6/scsh/rx/re1.c

/* Scheme48 interface to Henry Spencer's Posix regular expression package.
** Copyright (c) 1993, 1994, 1998 by Olin Shivers.
*/

/* Todo:
**   not_eol not_bol support on searchers
**   error code -> err msg
**   regex freeing
**   regexp-string -> regex_t caching
**   make filter_stringvec return an error code.
*/

#include <stdlib.h>
#include <sys/types.h>
#include "../regexp/regex.h"
#include "../cstuff.h"

/* Make sure our exports match up w/the implementation: */
#include "re1.h"

/*
** Compile regexp into a malloc'd struct.
** The flag sm_p is true if we want to compile for submatches.
** On success, store pointer to struct into cr and return 0.
** On failure, free the struct, store NULL into cr, 
**   and return a non-zero error code.
*/

int compile_re(s48_value re_str, int sm_p, regex_t **cr)
{
    char *s = s48_extract_string(re_str);
    int len = S48_STRING_LENGTH(re_str);
    int err;
    regex_t *re = Alloc(regex_t);

    if( !re ) {
        /* Allocation failed: still honour the documented contract that
        ** *cr is NULL on every failure path, so the caller never sees an
        ** indeterminate pointer. */
        *cr = 0;
        return -1;
    }

    /* REG_PEND: the end of the pattern is given by re_endp (start + length)
    ** instead of a terminating nul byte. */
    re->re_endp = s + len;

    /* Without submatch interest, compile with REG_NOSUB. */
    err = regcomp(re, s, REG_EXTENDED | REG_PEND
                         | (sm_p ? 0 : REG_NOSUB));
    if( err ) {
        Free(re);
        *cr = 0;
    }
    else {
        *cr = re;
    }

    return err;
}

/* Do a regex search of RE through string STR, beginning at STR[START].
** - STR is passed as a Scheme value as it is allowed to contain nul bytes.
**
** - trans_vec contains the translation from the user's "virtual" submatches to
**   the actual submatches the engine will report:
**   - trans_vec[i] = #F means user submatch #i is a dead submatch.
**   - trans_vec[i] = j  means user submatch #i corresponds to paren #j in re.
**
**   Indexing fence-posts are a little complicated due to the fact that you
**   get an extra match elt back from the matcher -- match 0 is not a 
**   paren-based *sub*match, but rather the match info for the whole thing.
**
**   So, here is how it works: 
**     length(start_vec) = length(end_vec) = length(trans_vec) + 1
**   because trans_vec doesn't have a translation for submatch 0, which
**   is SRE submatch #0 => Posix submatch #0. For SRE submatch #i (1, 2, ...),
**   we want the submatch associated with Posix paren # trans_vec[i-1].
**
** - MAX_PSM is the maximum paren in which we have submatch interest -- the
**   max element in TRANS_VEC. Any parens after paren #MAX_PSM are just for
**   grouping, not for marking submatches. We only have to allocate MAX_PSM+1
**   elements in the submatch vector we pass into the search engine. If
**   MAX_PSM = -1, then we don't even want the whole-match match bounds, which
**   is really good -- the search engine can really fly in this case.
**
**   If we match, map re's submatches over to the exported start_vec and 
**   end_vec match vectors using trans_vec.
**
** Return #t on a match; #f if no match; non-zero integer error code otherwise
** (-1 if the submatch buffer cannot be allocated).
*/

s48_value re_search(const regex_t *re, s48_value str, int start,
		       s48_value trans_vec, int max_psm,
		       s48_value start_vec, s48_value end_vec)
{
    /* STR is a Scheme value because it might contain nul bytes; the search
    ** window is therefore passed as explicit offsets (see REG_STARTEND). */
    char *text = s48_extract_string(str);
    int text_len = S48_STRING_LENGTH(str);
    int n_slots = S48_VECTOR_LENGTH(start_vec);
    regmatch_t local_pm[10];
    regmatch_t *pm = local_pm;
    int status;

    /* The on-stack buffer covers max_psm+1 <= 10 entries.  Anything larger
    ** needs max_psm+1 allocated entries (the +1 is the whole-match slot). */
    if (max_psm >= 10) {
        pm = Malloc(regmatch_t, max_psm + 1);
        if (!pm)
            return s48_enter_fixnum(-1);
    }

    /* pm[0] carries the search bounds into regexec. */
    pm[0].rm_so = start;
    pm[0].rm_eo = text_len;
    status = regexec(re, text, max_psm + 1, pm, REG_STARTEND);

    /* Matched, and the caller wants bounds: copy them into the result
    ** vectors. */
    if (status == 0 && max_psm >= 0) {
        int i;

        /* Slot 0 is the whole match. */
        S48_VECTOR_SET(start_vec, 0, s48_enter_fixnum(pm[0].rm_so));
        S48_VECTOR_SET(end_vec, 0, s48_enter_fixnum(pm[0].rm_eo));

        /* Result slot i+1 is fed by paren trans_vec[i].  A #f entry in
        ** trans_vec is a dead submatch; its result slot is left untouched. */
        for (i = n_slots - 2; i >= 0; i--) {
            s48_value paren_scm = S48_VECTOR_REF(trans_vec, i);
            s48_value bound;
            int paren, so, eo;

            if (paren_scm == S48_FALSE)
                continue;

            paren = s48_extract_fixnum(paren_scm);
            so = pm[paren].rm_so;
            eo = pm[paren].rm_eo;

            /* An offset of -1 is stored as #f. */
            bound = (so != -1) ? s48_enter_fixnum(so) : S48_FALSE;
            S48_VECTOR_SET(start_vec, i + 1, bound);
            bound = (eo != -1) ? s48_enter_fixnum(eo) : S48_FALSE;
            S48_VECTOR_SET(end_vec, i + 1, bound);
        }
    }

    /* Only the allocated buffer needs releasing. */
    if (pm != local_pm)
        Free(pm);

    if (status == 0)
        return S48_TRUE;
    if (status == REG_NOMATCH)
        return S48_FALSE;
    return s48_enter_fixnum(status);
}


/* Filter a vector of strings by regexp RE_STR.
** Stringvec is a NULL-terminated vector of strings;
** filter it in-place, copying the survivors back to compact them.
** Return the number of survivors (0 if the regexp fails to compile).
*/

int filter_stringvec(s48_value re_str, char const **stringvec)
{
    /* The pattern is a Scheme value because it might contain nul bytes;
    ** its extent is given to regcomp via re_endp (REG_PEND). */
    int pattern_len = S48_STRING_LENGTH(re_str);
    char *pattern = s48_extract_string(re_str);
    regex_t compiled;
    char const **in;
    char const **out = stringvec;

    compiled.re_endp = pattern + pattern_len;

    /* REG_NOSUB: only a yes/no answer is needed for each string. */
    if (regcomp(&compiled, pattern, REG_EXTENDED | REG_PEND | REG_NOSUB) != 0)
        return 0;

    /* Walk the vector up to its NULL terminator, writing each matching
    ** string back at the next free position near the front. */
    for (in = stringvec; *in != 0; in++) {
        char const *candidate = *in;

        if (regexec(&compiled, candidate, 0, 0, 0) == 0) {
            *out = candidate;
            out++;
        }
    }

    regfree(&compiled);
    return out - stringvec;
}


/* Turn regex error code ERRCODE (as reported for RE) into message text.
** The text is written into a buffer obtained from Malloc; the result is
** NULL if that allocation fails.
** NOTE(review): nothing here releases the buffer -- presumably the caller
** is responsible for it; confirm against the callers.
*/
const char *re_errint2str(int errcode, const regex_t *re)
{
    /* First call, with no buffer: ask how many bytes the message needs. */
    int needed = regerror(errcode, re, 0, 0);
    char *msg = Malloc(char, needed);

    if (!msg)
        return msg;

    /* Second call: fill the buffer with the message. */
    regerror(errcode, re, msg, needed);
    return msg;
}


/* Release a regex_t: first the compiled state held inside it (regfree),
** then the struct itself (Free).
** A NULL pointer is accepted and ignored, so the NULL that compile_re
** stores on failure can be passed here safely.
*/
void free_re(regex_t *re)
{
    if( !re ) return;

    regfree(re);
    Free(re);
}
new from 0.5.2 1999-09-23 10:27:41 -04:00			`/* Scheme48 interface to Henry Spencer's Posix regular expression package.`
			`** Copyright (c) 1993, 1994, 1998 by Olin Shivers.`
			`*/`

			`/* Todo:`
			`** not_eol not_bol support on searchers`
			`** error code -> err msg`
			`** regex freeing`
			`** regexp-string -> regex_t caching`
			`** make filter_stringvec return an error code.`
			`*/`

			`#include <stdlib.h>`
			`#include <sys/types.h>`
			`#include "../regexp/regex.h"`
			`#include "../cstuff.h"`

			`/* Make sure our exports match up w/the implementation: */`
			`#include "re1.h"`

			`/*`
			`** Compile regexp into a malloc'd struct.`
			`** The flag sm_p is true if we want to compile for submatches.`
			`** On success, store pointer to struct into cr and return 0.`
			`** On failure, free the struct, store NULL into cr,`
			`** and return a non-zero error code.`
			`*/`

updates from 0.5.2 1999-09-23 13:46:46 -04:00			`int compile_re(s48_value re_str, int sm_p, regex_t **cr)`
new from 0.5.2 1999-09-23 10:27:41 -04:00			`{`
updates from 0.5.2 1999-09-23 13:46:46 -04:00			`// JMG: char *s = &STRING_REF(re_str, 0);`
			`char *s = s48_extract_string(re_str);`
			`int len = S48_STRING_LENGTH(re_str);`
new from 0.5.2 1999-09-23 10:27:41 -04:00			`int err;`
			`regex_t *re = Alloc(regex_t);`

			`if( !re ) return -1;`

			`re->re_endp = s + len;`
			`err = regcomp(re, s, REG_EXTENDED \| REG_PEND`
			`\| (sm_p ? 0 : REG_NOSUB));`
			`if( err ) {Free(re); *cr=0;}`
			`else *cr=re;`

			`return err;`
			`}`

			`/* Do a regex search of RE through string STR, beginning at STR[START].`
			`** - STR is passed as a Scheme value as it is allowed to contain nul bytes.`
			`**`
			`** - trans_vec contains the translation from the user's "virtual" submatches to`
			`** the actual submatches the engine will report:`
			`** - trans_vec[i] = #F means user submatch #i is a dead submatch.`
			`** - trans_vec[i] = j means user submatch #i corresponds to paren #j in re.`
			`**`
			`** Indexing fence-posts are a little complicated due to the fact that you`
			`** get an extra match elt back from the matcher -- match 0 is not a`
			`** paren-based *sub*match, but rather the match info for the whole thing.`
			`**`
			`** So, here is how it works:`
			`** length(start_vec) = length(end_vec) = length(trans_vec) + 1`
			`** because trans_vec doesn't have a translation for submatch 0, which`
			`** is SRE submatch #0 => Posix submatch #0. For SRE submatch #i (1, 2, ...),`
			`** we want the submatch associated with Posix paren # trans_vec[i-1].`
			`**`
			`** - MAX_PSM is the maximum paren in which we have submatch interest -- the`
			`** max element in TRANS_VEC. Any parens after paren #MAX_PSM are just for`
			`** grouping, not for marking submatches. We only have to allocate MAX_PSM+1`
			`** elements in the submatch vector we pass into the search engine. If`
			`** MAX_PSM = -1, then we don't even want the whole-match match bounds, which`
			`** is really good -- the search engine can really fly in this case.`
			`**`
			`** If we match, map re's submatches over to the exported start_vec and`
			`** end_vec match vectors using trans_vec.`
			`**`
			`** Return 0 on success; #f if no match; non-zero integer error code otherwise.`
			`*/`

updates from 0.5.2 1999-09-23 13:46:46 -04:00			`s48_value re_search(const regex_t *re, s48_value str, int start,`
			`s48_value trans_vec, int max_psm,`
			`s48_value start_vec, s48_value end_vec)`
new from 0.5.2 1999-09-23 10:27:41 -04:00			`{`
updates from 0.5.2 1999-09-23 13:46:46 -04:00			`// JMG: char *s = &STRING_REF(str,0); /* Passed as a s48_value because */`
			`char *s = s48_extract_string(str);`
			`int len = S48_STRING_LENGTH(str); /* it might contain nul bytes. */`
new from 0.5.2 1999-09-23 10:27:41 -04:00
updates from 0.5.2 1999-09-23 13:46:46 -04:00			`int vlen = S48_VECTOR_LENGTH(start_vec);`
new from 0.5.2 1999-09-23 10:27:41 -04:00			`int retval;`

			`regmatch_t static_pmatch[10], *pm;`

			`/* If max_psm+1 > 10, we can't use static_pmatch. */`
			`if( max_psm < 10 ) pm = static_pmatch;`
			`else {`
			`pm = Malloc(regmatch_t, max_psm+1);/* Add 1 for the whole-match info. */`
updates from 0.5.2 1999-09-23 13:46:46 -04:00			`if( !pm ) return s48_enter_fixnum(-1);`
new from 0.5.2 1999-09-23 10:27:41 -04:00			`}`

			`pm[0].rm_so = start;`
			`pm[0].rm_eo = len;`

			`retval = regexec(re, s, max_psm+1, pm, REG_STARTEND); /* Do it. */`

			`/* We matched and have match-bound info, so translate it over. */`
			`if( !retval && max_psm >= 0 ) {`
			`int i;`

updates from 0.5.2 1999-09-23 13:46:46 -04:00			`//JMG: S48_VECTOR_REF(start_vec,0) = s48_enter_fixnum(pm[0].rm_so); /* whole-match */`
			`//S48_VECTOR_REF(end_vec,0) = s48_enter_fixnum(pm[0].rm_eo);`

			`S48_VECTOR_SET(start_vec,0, s48_enter_fixnum(pm[0].rm_so));`
			`S48_VECTOR_SET(end_vec,0, s48_enter_fixnum(pm[0].rm_eo));`
new from 0.5.2 1999-09-23 10:27:41 -04:00
			`for( i=vlen-1; --i >= 0; ) { /* submatches */`
updates from 0.5.2 1999-09-23 13:46:46 -04:00			`s48_value j_scm = S48_VECTOR_REF(trans_vec,i);`
			`if( j_scm != S48_FALSE ) {`
			`int j = s48_extract_fixnum(j_scm);`
new from 0.5.2 1999-09-23 10:27:41 -04:00			`int k = pm[j].rm_so,`
			`l = pm[j].rm_eo;`
updates from 0.5.2 1999-09-23 13:46:46 -04:00			`// JMG S48_VECTOR_REF(start_vec,i+1) = (k != -1) ? s48_enter_fixnum(k) : S48_FALSE;`
			`//S48_VECTOR_REF(end_vec, i+1) = (l != -1) ? s48_enter_fixnum(l) : S48_FALSE;`
			`S48_VECTOR_SET(start_vec,i+1, (k != -1) ? s48_enter_fixnum(k) : S48_FALSE);`
			`S48_VECTOR_SET(end_vec, i+1, (l != -1) ? s48_enter_fixnum(l) : S48_FALSE);`
new from 0.5.2 1999-09-23 10:27:41 -04:00			`}`
			`}`
updates from 0.5.2 1999-09-23 13:46:46 -04:00			`}`
new from 0.5.2 1999-09-23 10:27:41 -04:00
			`if( max_psm >= 10 ) Free(pm);`

updates from 0.5.2 1999-09-23 13:46:46 -04:00			`if( retval==REG_NOMATCH ) return S48_FALSE;`
			`if( ! retval ) return S48_TRUE;`
			`return s48_enter_fixnum(retval);`
new from 0.5.2 1999-09-23 10:27:41 -04:00			`}`



			`/* Filter a vector of strings by regexp RE_STR.`
			`** Stringvec is a NULL-terminated vector of strings;`
			`** filter it in-place, copying the survivors back to compact them.`
			`** Put the number of survivors in nummatch.`
			`*/`

updates from 0.5.2 1999-09-23 13:46:46 -04:00			`int filter_stringvec(s48_value re_str, char const **stringvec)`
new from 0.5.2 1999-09-23 10:27:41 -04:00			`{`
updates from 0.5.2 1999-09-23 13:46:46 -04:00			`int re_len = S48_STRING_LENGTH(re_str);/* Passed as a s48_value because */`
			`//JMG: char *re_chars = &STRING_REF(re_str,0);/* it might contain nul bytes. */`
			`char *re_chars = s48_extract_string (re_str);/* it might contain nul bytes. */`

			`regex_t re;`
new from 0.5.2 1999-09-23 10:27:41 -04:00
			`char const **p, **q;`

			`/* REG_NOSUB -- We just want to know if it matches or not. */`
			`re.re_endp = re_chars + re_len;`
			`if( regcomp(&re, re_chars, REG_EXTENDED \| REG_PEND \| REG_NOSUB) ) {`
			`return 0;`
			`}`

			`for(p=q=stringvec; *p; p++) {`
			`char const *s = *p;`
			`if( ! regexec(&re, s, 0, 0, 0) ) *q++ = s;`
			`}`

			`regfree(&re);`
			`return q-stringvec;`
			`}`


			`const char *re_errint2str(int errcode, const regex_t *re)`
			`{`
			`int size = regerror(errcode, re, 0, 0);`
			`char *s = Malloc(char,size);`
			`if(s) regerror(errcode, re, s, size);`
			`return s;`
			`}`


			`void free_re(regex_t *re)`
			`{`
			`regfree(re);`
			`Free(re);`
			`}`