Added regexp compilation
This commit is contained in:
		
							parent
							
								
									e84440fecd
								
							
						
					
					
						commit
						f948e51831
					
				
							
								
								
									
										179
									
								
								scsh/re.scm
								
								
								
								
							
							
						
						
									
										179
									
								
								scsh/re.scm
								
								
								
								
							|  | @ -7,46 +7,161 @@ | |||
|   "" "" | ||||
|   ) | ||||
| 
 | ||||
| (define-record regexp-match | ||||
|   string | ||||
|   start	; 10 elt vec | ||||
|   end)  ; 10 elt vec | ||||
| ;;; Match data for regexp matches. | ||||
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||||
| 
 | ||||
| ;;; Need to do error case for these three procs. | ||||
| (define-record regexp-match | ||||
|   string	; The string against which we matched. | ||||
|   start		; 10 elt vec | ||||
|   end)		; 10 elt vec | ||||
| 
 | ||||
| (define (match:start match . maybe-index) | ||||
|   (vector-ref (regexp-match:start match) | ||||
| 	      (:optional maybe-index 0))) | ||||
|   (let ((i (:optional maybe-index 0))) | ||||
|     (or (vector-ref (regexp-match:start match) i) | ||||
| 	(error match:start "No sub-match found." match i)))) | ||||
| 
 | ||||
| (define (match:end match . maybe-index) | ||||
|   (vector-ref (regexp-match:end match) | ||||
| 	      (:optional maybe-index 0))) | ||||
|   (let ((i (:optional maybe-index 0))) | ||||
|     (or (vector-ref (regexp-match:end match) i) | ||||
| 	(error match:start "No sub-match found." match i)))) | ||||
| 
 | ||||
| (define (match:substring match . maybe-index) | ||||
|   (let ((i (:optional maybe-index 0))) | ||||
|     (substring (regexp-match:string match) | ||||
| 	       (match:start match i) | ||||
| 	       (match:end match i)))) | ||||
| 
 | ||||
| (define (string-match pattern string . maybe-start) | ||||
|   (apply regexp-exec (make-regexp pattern) string maybe-start)) | ||||
|   (let* ((i (:optional maybe-index 0)) | ||||
| 	 (start (vector-ref (regexp-match:start match) i))) | ||||
|     (if start | ||||
| 	(substring (regexp-match:string match) | ||||
| 		   start | ||||
| 		   (vector-ref (regexp-match:end match) i)) | ||||
| 	(error match:substring "No sub-match found." match i)))) | ||||
| 
 | ||||
| 
 | ||||
| ;;; Bogus stub definitions for low-level match routines: | ||||
| ;;; Compiling regexps | ||||
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||||
| 
 | ||||
| (define regexp? string?) | ||||
| (define (make-regexp str) str) | ||||
| (define-record %regexp | ||||
|   string	; The string form of the regexp. | ||||
|   bytes		; The compiled representation, stuffed into a Scheme string. | ||||
|   ((disclose self) (list "Regexp" (%regexp:string self)))) | ||||
| 
 | ||||
| (define regexp? %regexp?) | ||||
| 
 | ||||
| 
 | ||||
| (define (make-regexp pattern) | ||||
|   (receive (err len) (%regexp-compiled-length pattern) | ||||
|     (if err (error err make-regexp pattern) | ||||
| 	(let ((buf (make-string len))) | ||||
| 	  (%regexp-compile pattern buf) | ||||
| 	  (make-%regexp pattern buf))))) | ||||
| 
 | ||||
| (define-foreign %regexp-compiled-length (re_byte_len (string pattern)) | ||||
|   static-string	; Error msg or #f | ||||
|   integer)	; number of bytes needed to compile REGEXP. | ||||
| 
 | ||||
| (define-foreign %regexp-compile (re_compile (string pattern) | ||||
| 					    (string-desc bytes)) | ||||
|   static-string) ; Error msg or #f | ||||
| 
 | ||||
| 
 | ||||
| ;;; Executing compiled regexps | ||||
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||||
| 
 | ||||
| (define (regexp-exec regexp str . maybe-start) | ||||
|   (let ((start (:optional maybe-start 0)) | ||||
| 	(start-vec (make-vector 10)) | ||||
| 	(end-vec (make-vector 10))) | ||||
|     (and (%regexp-match regexp str start start-vec end-vec) | ||||
| 	 (make-regexp-match str start-vec end-vec)))) | ||||
|     (receive (err match?) | ||||
| 	(%regexp-exec (%regexp:bytes regexp) str start start-vec end-vec) | ||||
|       (if err (error err regexp-exec regexp str start) | ||||
| 	  (and match? | ||||
| 	       (make-regexp-match str start-vec end-vec)))))) | ||||
| 
 | ||||
| (define-foreign %regexp-exec (re_exec (string-desc compiled-regexp) | ||||
| 				      (string s) | ||||
| 				      (integer start) | ||||
| 				      (vector-desc start-vec) | ||||
| 				      (vector-desc end-vec)) | ||||
|   static-string	; Error msg or #f | ||||
|   bool)		; Matched? | ||||
| 
 | ||||
| 
 | ||||
| ;;; Compile&match regexps in one go | ||||
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||||
| 
 | ||||
| ;;; I could do this with the separate compile and execute procedures,  | ||||
| ;;; but I go straight to C just for fun. | ||||
| 
 | ||||
| (define (string-match pattern string . maybe-start) | ||||
|   (let ((start (:optional maybe-start 0)) | ||||
| 	(start-vec (make-vector 10)) | ||||
| 	(end-vec (make-vector 10))) | ||||
|     (receive (err match?) (%string-match pattern string start | ||||
| 					 start-vec end-vec) | ||||
|       (if err (error err string-match pattern string start) | ||||
| 	  (and match? (make-regexp-match string start-vec end-vec)))))) | ||||
| 
 | ||||
| (define-foreign %string-match (re_match (string pattern) | ||||
| 					(string s) | ||||
| 					(integer start) | ||||
| 					(vector-desc start-vec) | ||||
| 					(vector-desc end-vec)) | ||||
|   static-string ; Error string or #f if all is ok. | ||||
|   bool)		; match? | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| ;;; Substitutions | ||||
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||||
| 
 | ||||
| (define-foreign %regexp-subst (re_subst (string-desc compiled-regexp) | ||||
| 					(string match) | ||||
| 					(string str) | ||||
| 					(integer start) | ||||
| 					(vector-desc start-vec) | ||||
| 					(vector-desc end-vec) | ||||
| 					(string-desc outbuf)) | ||||
|   static-string	; Error msg or #f | ||||
|   integer) | ||||
| 
 | ||||
| (define-foreign %regexp-subst-len (re_subst_len (string-desc compiled-regexp) | ||||
| 						(string match) | ||||
| 						(string str) | ||||
| 						(integer start) | ||||
| 						(vector-desc start-vec) | ||||
| 						(vector-desc end-vec)) | ||||
|   static-string	; Error msg or #f | ||||
|   integer) | ||||
| 
 | ||||
| ;;; What does this do? | ||||
| 
 | ||||
| (define (regexp-subst re match replacement) | ||||
|   (let ((cr (%regexp:bytes re)) | ||||
| 	(str       (regexp-match:string match)) | ||||
| 	(start-vec (regexp-match:start  match)) | ||||
| 	(end-vec   (regexp-match:end    match))) | ||||
|     (receive (err out-len) (%regexp-subst-len cr str replacement 0 | ||||
| 					      start-vec end-vec) | ||||
|       (if err (error err regexp-subst str replacement) ; More data here | ||||
| 	  (let ((out-buf (make-string out-len))) | ||||
| 	    (receive (err out-len) (%regexp-subst cr str replacement 0 | ||||
| 						  start-vec end-vec out-buf) | ||||
| 	      (if err (error err regexp-subst str replacement) | ||||
| 		  (substring out-buf 0 out-len)))))))) | ||||
| 
 | ||||
| ;;; Miscellaneous | ||||
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | ||||
| 
 | ||||
| ;;; I do this one in C, I'm not sure why: | ||||
| ;;; It is used by MATCH-FILES. | ||||
| 
 | ||||
| (define-foreign %filter-C-strings! | ||||
|   (filter_stringvec (string pattern) ((C "char const ** ~a") cvec)) | ||||
|   static-string	; error message -- #f if no error. | ||||
|   integer)	; number of files that pass the filter. | ||||
| 
 | ||||
| 
 | ||||
| ;;; Convert a string into a regex pattern that matches that string exactly -- | ||||
| ;;; in other words, quote the special chars with backslashes. | ||||
| 
 | ||||
| (define (regexp-quote string) | ||||
|   (let lp ((i (- (string-length string) 1)) | ||||
| 	   (result '())) | ||||
|  | @ -57,25 +172,3 @@ | |||
| 	      (if (memv c '(#\[ #\] #\. #\* #\? #\( #\) #\| #\\ #\$ #\^ #\+)) | ||||
| 		  (cons #\\ result) | ||||
| 		  result)))))) | ||||
| 
 | ||||
| (define-foreign %regexp-match/errno (reg_match (string regexp) | ||||
| 					       (string s) | ||||
| 					       (integer start) | ||||
| 					       (vector-desc start-vec) | ||||
| 					       (vector-desc end-vec)) | ||||
|   static-string ; Error string or #f if all is ok. | ||||
|   bool)		; match? | ||||
| 
 | ||||
| (define (%regexp-match regexp string start start-vec end-vec) | ||||
|   (receive (err match?) (%regexp-match/errno regexp string start | ||||
| 					     start-vec end-vec) | ||||
|     (if err (error err %regexp-match regexp string start) match?))) | ||||
| 
 | ||||
| 
 | ||||
| ;;; I do this one in C, I'm not sure why: | ||||
| ;;; Used by MATCH-FILES. | ||||
| 
 | ||||
| (define-foreign %filter-C-strings! | ||||
|   (filter_stringvec (string regexp) ((C "char const ** ~a") cvec)) | ||||
|   static-string	; error message -- #f if no error. | ||||
|   integer)	; number of files that pass the filter. | ||||
|  |  | |||
							
								
								
									
										135
									
								
								scsh/re1.c
								
								
								
								
							
							
						
						
									
										135
									
								
								scsh/re1.c
								
								
								
								
							|  | @ -9,23 +9,136 @@ | |||
| /* Make sure our exports match up w/the implementation: */ | ||||
| #include "re1.h" | ||||
| 
 | ||||
| #ifndef NULL | ||||
| #define NULL 0 | ||||
| #endif | ||||
| 
 | ||||
| /* Not multi-threaded reentrant. */ | ||||
| static char *regexp_error; | ||||
| 
 | ||||
| /* Stash error msg in global. */ | ||||
| void regerror(char *msg) {regexp_error = msg;} | ||||
| 
 | ||||
| /*
 | ||||
| ** Return NULL normally, error string on error. | ||||
| ** Stash number of bytes needed for compiled regexp into `*len' | ||||
| */ | ||||
| 
 | ||||
| char *re_byte_len(const char *re, int *len) | ||||
| { | ||||
|     int l; | ||||
| 
 | ||||
|     regexp_error = 0; | ||||
|     *len = regcomp_len(re);  | ||||
|     return regexp_error; | ||||
|     } | ||||
| 
 | ||||
| /*
 | ||||
| ** Return NULL normally, error string on error. | ||||
| ** Compile regexp into string described by `cr'. | ||||
| */ | ||||
| 
 | ||||
| char *re_compile(const char *re, scheme_value cr)  | ||||
| { | ||||
|     int len = STRING_LENGTH(cr); | ||||
|     regexp *r = (regexp *) &STRING_REF(cr, 0); | ||||
| 
 | ||||
|     regexp_error = 0; | ||||
|     r = regcomp_comp(re, r, len);  | ||||
|     return regexp_error; | ||||
|     } | ||||
| 
 | ||||
| /* Return NULL normally, error string on error.
 | ||||
| ** Stash match info in start_vec and end_vec. | ||||
| ** Returns boolean match/no-match in hit. | ||||
| */ | ||||
| 
 | ||||
| char *reg_match(const char *re, const char *string, int start, | ||||
| 		scheme_value start_vec, scheme_value end_vec,  int *hit) | ||||
| char *re_exec(scheme_value cr, const char *string, int start, | ||||
| 	      scheme_value start_vec, scheme_value end_vec,  int *hit) | ||||
| { | ||||
|     regexp *r = (regexp *) &STRING_REF(cr, 0); | ||||
| 
 | ||||
|     *hit = 0; | ||||
| 
 | ||||
|     if( VECTOR_LENGTH(start_vec) != NSUBEXP )	/* These tests should */ | ||||
| 	return "Illegal start vector";		/* never trigger.     */ | ||||
|     if( VECTOR_LENGTH(end_vec) != NSUBEXP ) | ||||
| 	return "Illegal end vector"; | ||||
| 
 | ||||
|     regexp_error = 0; | ||||
|      | ||||
|     if( regexec(r, string+start) ) { | ||||
| 	int i; | ||||
| 	for(i=0; i<NSUBEXP; i++) { | ||||
| 	    const char *s = r->startp[i]; | ||||
| 	    const char *e = r->endp[i]; | ||||
| 	    VECTOR_REF(start_vec,i) = s ? ENTER_FIXNUM(s - string) : SCHFALSE; | ||||
| 	    VECTOR_REF(end_vec,i)   = e ? ENTER_FIXNUM(e - string) : SCHFALSE; | ||||
| 	    r->startp[i] = 0;	/* Why did Sommerfeld */ | ||||
| 	    r->endp[i]   = 0;	/* put these here? */ | ||||
| 	    } | ||||
| 	*hit = 1; | ||||
| 	} | ||||
| 
 | ||||
|     return regexp_error; | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
| char *re_subst(scheme_value cr, const char *match, | ||||
| 	       const char *src, int start, | ||||
| 	       scheme_value start_vec, scheme_value end_vec, | ||||
| 	       scheme_value outbuf, int *len) | ||||
| { | ||||
|     int i; | ||||
|     regexp *r = (regexp *) &STRING_REF(cr, 0); | ||||
| 
 | ||||
|     if( VECTOR_LENGTH(start_vec) != NSUBEXP )	/* These tests should */ | ||||
| 	return "Illegal start vector";		/* never trigger.     */ | ||||
|     if( VECTOR_LENGTH(end_vec) != NSUBEXP ) | ||||
| 	return "Illegal end vector"; | ||||
| 
 | ||||
|     for (i=0; i<NSUBEXP; i++) { | ||||
| 	scheme_value se = VECTOR_REF(start_vec, i); | ||||
| 	scheme_value ee = VECTOR_REF(end_vec, i); | ||||
| 	r->startp[i] = FIXNUMP(se) ? (match + EXTRACT_FIXNUM(se)) : 0; | ||||
| 	r->endp[i]   = FIXNUMP(ee) ? (match + EXTRACT_FIXNUM(ee)) : 0; | ||||
|         } | ||||
|      | ||||
|     regexp_error = 0; | ||||
|     regnsub(r, src, &STRING_REF(outbuf, 0), STRING_LENGTH(outbuf)); | ||||
|     *len = strlen(&STRING_REF(outbuf, 0)); | ||||
|     return regexp_error; | ||||
|     } | ||||
| 
 | ||||
| char *re_subst_len(scheme_value cr, const char *match, | ||||
| 		   const char *src, int start, | ||||
| 		   scheme_value start_vec, scheme_value end_vec, | ||||
| 		   int *len) | ||||
| { | ||||
|     int i; | ||||
|     regexp *r = (regexp *) &STRING_REF(cr, 0); | ||||
| 
 | ||||
|     if( VECTOR_LENGTH(start_vec) != NSUBEXP )	/* These tests should */ | ||||
| 	return "Illegal start vector";		/* never trigger.     */ | ||||
|     if( VECTOR_LENGTH(end_vec) != NSUBEXP ) | ||||
| 	return "Illegal end vector"; | ||||
| 
 | ||||
|     for (i=0; i<NSUBEXP; i++) { | ||||
| 	scheme_value se = VECTOR_REF(start_vec, i); | ||||
| 	scheme_value ee = VECTOR_REF(end_vec, i); | ||||
| 	r->startp[i] = FIXNUMP(se) ? (match + EXTRACT_FIXNUM(se)) : 0; | ||||
| 	r->endp[i]   = FIXNUMP(ee) ? (match + EXTRACT_FIXNUM(ee)) : 0; | ||||
|         } | ||||
|      | ||||
|     regexp_error = 0; | ||||
|     *len = regsublen(r, src); | ||||
|     return regexp_error; | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
| /* Return NULL normally, error string on error.
 | ||||
| ** Stash match info in start_vec and end_vec. | ||||
| ** Returns boolean match/no-match in hit. | ||||
| */ | ||||
| 
 | ||||
| char *re_match(const char *re, const char *string, int start, | ||||
| 	       scheme_value start_vec, scheme_value end_vec,  int *hit) | ||||
| { | ||||
|     regexp *prog; | ||||
| 
 | ||||
|  | @ -34,12 +147,12 @@ char *reg_match(const char *re, const char *string, int start, | |||
|     prog = regcomp(re); | ||||
|     if( !prog ) return regexp_error; | ||||
| 
 | ||||
|     if( VECTOR_LENGTH(start_vec) != NSUBEXP ) { | ||||
|     if( VECTOR_LENGTH(start_vec) != NSUBEXP ) {	    /* These two tests */ | ||||
| 	Free(prog); | ||||
| 	return "Illegal start vector"; | ||||
| 	} | ||||
|      | ||||
|     if( VECTOR_LENGTH(end_vec) != NSUBEXP ) { | ||||
|     if( VECTOR_LENGTH(end_vec) != NSUBEXP ) {	    /* should never trigger. */ | ||||
| 	Free(prog); | ||||
| 	return "Illegal end vector"; | ||||
| 	} | ||||
|  | @ -47,8 +160,10 @@ char *reg_match(const char *re, const char *string, int start, | |||
|     if( regexec(prog, string+start) ) { | ||||
| 	int i; | ||||
| 	for(i=0; i<NSUBEXP; i++) { | ||||
| 	    VECTOR_REF(start_vec,i) = ENTER_FIXNUM(prog->startp[i] - string); | ||||
| 	    VECTOR_REF(end_vec,i)   = ENTER_FIXNUM(prog->endp[i]   - string); | ||||
| 	    const char *s = prog->startp[i]; | ||||
| 	    const char *e = prog->endp[i]; | ||||
| 	    VECTOR_REF(start_vec,i) = s ? ENTER_FIXNUM(s - string) : SCHFALSE; | ||||
| 	    VECTOR_REF(end_vec,i)   = e ? ENTER_FIXNUM(e - string) : SCHFALSE; | ||||
| 	    } | ||||
| 	*hit = 1; | ||||
| 	} | ||||
|  |  | |||
							
								
								
									
										24
									
								
								scsh/re1.h
								
								
								
								
							
							
						
						
									
										24
									
								
								scsh/re1.h
								
								
								
								
							|  | @ -1,6 +1,24 @@ | |||
| char *reg_match(const char *re, const char *string, int start, | ||||
| 		scheme_value start_vec, scheme_value end_vec, | ||||
| 		int *hit); | ||||
| /* Exports from re1.c */ | ||||
| 
 | ||||
| char *re_byte_len(const char *re, int *len); | ||||
| char *re_compile(const char *re, scheme_value target); | ||||
| 
 | ||||
| char *re_exec(scheme_value cr, const char *string, int start, | ||||
| 	      scheme_value start_vec, scheme_value end_vec,  int *hit); | ||||
| 
 | ||||
| char *re_match(const char *re, const char *string, int start, | ||||
| 	       scheme_value start_vec, scheme_value end_vec, | ||||
| 	       int *hit); | ||||
| 
 | ||||
| char *re_subst_len(scheme_value cr, const char *match, | ||||
| 		   const char *src, int start, | ||||
| 		   scheme_value start_vec, scheme_value end_vec, | ||||
| 		   int *len); | ||||
| 
 | ||||
| char *re_subst(scheme_value cr, const char *match, | ||||
| 	       const char *src, int start, | ||||
| 	       scheme_value start_vec, scheme_value end_vec, | ||||
| 	       scheme_value outbuf, int *len); | ||||
| 
 | ||||
| char *filter_stringvec(const char *re, char const **stringvec, | ||||
| 		       int *nummatch); | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 shivers
						shivers