From Scheme 48 0.57, extended by an optional START parameter for
REGEXP-MATCH.
This commit is contained in:
parent
f96ad831bb
commit
34c5cd67bd
|
@ -0,0 +1,159 @@
|
|||
; Copyright (c) 1993-2001 by Richard Kelsey and Jonathan Rees. See file COPYING.
|
||||
|
||||
; Exports:
|
||||
; make-regexp
|
||||
; regexp?
|
||||
; regexp-match
|
||||
; regexp-match?
|
||||
; regexp-match-start
|
||||
; regexp-match-end
|
||||
; regexp-option
|
||||
|
||||
; The compiled version of the expression is produced when needed.
|
||||
|
||||
(define-record-type regexp :regexp
|
||||
(really-make-regexp pattern compiled
|
||||
extended? ignore-case? submatches? newline?)
|
||||
regexp?
|
||||
(pattern regexp-pattern) ; immutable string
|
||||
(compiled real-regexp-compiled set-regexp-compiled!) ; #f or a c-record
|
||||
(extended? regexp-extended?) ; four flags
|
||||
(ignore-case? regexp-ignore-case?)
|
||||
(submatches? regexp-submatches?)
|
||||
(newline? regexp-newline?))
|
||||
|
||||
; Drop the compiled version when resuming. We may be resuming on a different
|
||||
; architecture, or version of the library, or whatever.
|
||||
|
||||
(define-record-resumer :regexp
|
||||
(lambda (regexp)
|
||||
(set-regexp-compiled! regexp #f)))
|
||||
|
||||
; There are four options when making a regular expression.
|
||||
|
||||
(define-enumerated-type regexp-option :regexp-option
|
||||
regexp-option?
|
||||
regexp-options
|
||||
regexp-option-name
|
||||
regexp-option-index
|
||||
(extended ignore-case submatches newline))
|
||||
|
||||
; Loop down finding which options are present and checking for duplicates.
|
||||
; This is not specific to regular expressions.
|
||||
;
|
||||
; It would be nice if this could handle values as well, as in
|
||||
; (make-regexp "sldkjf" (regexp-option size 10))
|
||||
|
||||
(define (decode-boolean-options options all-options predicate indexer)
|
||||
(let ((map (make-vector (vector-length all-options) #f)))
|
||||
(let loop ((options options))
|
||||
(if (null? options)
|
||||
(vector->list map)
|
||||
(let ((option (car options)))
|
||||
(if (predicate option)
|
||||
(let ((index (indexer option)))
|
||||
(if (vector-ref map index)
|
||||
'duplicates
|
||||
(begin
|
||||
(vector-set! map index #t)
|
||||
(loop (cdr options)))))
|
||||
'bad-value))))))
|
||||
|
||||
; The only thing we do here is to decode the options and make sure that the
|
||||
; pattern is immutable, as it won't be used until later.
|
||||
|
||||
(define (make-regexp pattern . options)
|
||||
(let ((options (decode-boolean-options options
|
||||
regexp-options
|
||||
regexp-option?
|
||||
regexp-option-index)))
|
||||
(if (and (string? pattern)
|
||||
(pair? options))
|
||||
(let* ((pattern (immutable-copy-string pattern))
|
||||
(regexp (apply really-make-regexp pattern #f options)))
|
||||
(add-finalizer! regexp free-compiled-regexp)
|
||||
regexp)
|
||||
(apply call-error "invalid argument(s)"
|
||||
make-regexp
|
||||
pattern
|
||||
options))))
|
||||
|
||||
; Free up the C-heap storage used for the compiled regexp.
|
||||
|
||||
(define (free-compiled-regexp regexp)
|
||||
(let ((compiled (real-regexp-compiled regexp)))
|
||||
(if compiled
|
||||
(call-imported-binding posix-free-regexp compiled))))
|
||||
|
||||
; We compile the pattern if that hasn't already been done, raising an error
|
||||
; if anything goes wrong.
|
||||
|
||||
(define (regexp-compiled regexp)
|
||||
(or (real-regexp-compiled regexp)
|
||||
(let ((compiled (call-imported-binding posix-compile-regexp
|
||||
(regexp-pattern regexp)
|
||||
(regexp-extended? regexp)
|
||||
(regexp-ignore-case? regexp)
|
||||
(regexp-submatches? regexp)
|
||||
(regexp-newline? regexp))))
|
||||
(if (not (integer? compiled))
|
||||
(begin
|
||||
(set-regexp-compiled! regexp compiled)
|
||||
compiled)
|
||||
(let ((message (call-imported-binding posix-regexp-error-message
|
||||
(regexp-pattern regexp)
|
||||
(regexp-extended? regexp)
|
||||
(regexp-ignore-case? regexp)
|
||||
(regexp-submatches? regexp)
|
||||
(regexp-newline? regexp))))
|
||||
(error (if message
|
||||
(string-append "Posix regexp: " message)
|
||||
"inconsistent results from Posix regexp compiler")
|
||||
regexp))))))
|
||||
|
||||
; Call the pattern matcher. We return #F if the match fails. On a successful
|
||||
; match, we either return #T or a list of match records, depending on the value
|
||||
; of SUBMATCHES?.
|
||||
|
||||
(define (regexp-match regexp string submatches? starts-line? ends-line? . rest)
|
||||
(if (and (regexp? regexp)
|
||||
(string? string))
|
||||
(let ((start (if (null? rest)
|
||||
0
|
||||
(car rest))))
|
||||
(call-imported-binding posix-regexp-match
|
||||
(regexp-compiled regexp)
|
||||
string
|
||||
submatches?
|
||||
starts-line?
|
||||
ends-line?
|
||||
start))
|
||||
(call-error "invalid argument"
|
||||
regexp-match
|
||||
regexp string starts-line? ends-line?)))
|
||||
|
||||
; These are made by the C code. The SUBMATCHES field is not used by us,
|
||||
; but is used by the functional interface.
|
||||
|
||||
(define-record-type match :match
|
||||
(make-match start end submatches)
|
||||
match?
|
||||
(start match-start)
|
||||
(end match-end)
|
||||
(submatches match-submatches))
|
||||
|
||||
(define-record-discloser :match
|
||||
(lambda (rem)
|
||||
(list 'match
|
||||
(match-start rem)
|
||||
(match-end rem))))
|
||||
|
||||
(define-exported-binding "posix-regexp-match-type" :match)
|
||||
|
||||
; The various C functions we use.
|
||||
|
||||
(import-definition posix-compile-regexp)
|
||||
(import-definition posix-regexp-match)
|
||||
(import-definition posix-regexp-error-message)
|
||||
(import-definition posix-free-regexp)
|
||||
|
|
@ -0,0 +1,222 @@
|
|||
/* Copyright (c) 1993-2000 by Richard Kelsey and Jonathan Rees.
|
||||
See file COPYING. */
|
||||
|
||||
/*
|
||||
* Scheme 48/POSIX regex interface
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <regex.h> /* POSIX.2 */
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "scheme48.h"
|
||||
|
||||
extern void s48_init_posix_regex(void);
|
||||
static s48_value posix_compile_regexp(s48_value pattern,
|
||||
s48_value extended_p,
|
||||
s48_value ignore_case_p,
|
||||
s48_value submatches_p,
|
||||
s48_value newline_p),
|
||||
posix_regexp_match(s48_value sch_regex,
|
||||
s48_value string,
|
||||
s48_value submatches_p,
|
||||
s48_value bol_p,
|
||||
s48_value eol_p,
|
||||
s48_value sch_start),
|
||||
posix_regexp_error_message(s48_value pattern,
|
||||
s48_value extended_p,
|
||||
s48_value ignore_case_p,
|
||||
s48_value submatches_p,
|
||||
s48_value newline_p),
|
||||
posix_free_regexp(s48_value sch_regex);
|
||||
|
||||
/*
|
||||
* Record type imported from Scheme.
|
||||
*/
|
||||
static s48_value posix_regexp_match_type_binding = S48_FALSE;
|
||||
|
||||
/*
|
||||
* Install all exported functions in Scheme48.
|
||||
*/
|
||||
void
|
||||
s48_init_posix_regexp(void)
|
||||
{
|
||||
/* Export our stuff. */
|
||||
S48_EXPORT_FUNCTION(posix_compile_regexp);
|
||||
S48_EXPORT_FUNCTION(posix_regexp_match);
|
||||
S48_EXPORT_FUNCTION(posix_regexp_error_message);
|
||||
S48_EXPORT_FUNCTION(posix_free_regexp);
|
||||
|
||||
/* Protect and import the regex-match type. */
|
||||
S48_GC_PROTECT_GLOBAL(posix_regexp_match_type_binding);
|
||||
posix_regexp_match_type_binding =
|
||||
s48_get_imported_binding("posix-regexp-match-type");
|
||||
}
|
||||
|
||||
/*
|
||||
* Interface to regcomp. We encode the flags, make the return value, and
|
||||
* then call regcomp() to fill it in.
|
||||
*/
|
||||
static s48_value
|
||||
posix_compile_regexp(s48_value pattern,
|
||||
s48_value extended_p, s48_value ignore_case_p,
|
||||
s48_value submatches_p, s48_value newline_p)
|
||||
{
|
||||
s48_value sch_regex;
|
||||
int status;
|
||||
S48_DECLARE_GC_PROTECT(1);
|
||||
int flags = S48_EXTRACT_BOOLEAN(extended_p) ? REG_EXTENDED : 0 |
|
||||
S48_EXTRACT_BOOLEAN(ignore_case_p) ? REG_ICASE : 0 |
|
||||
S48_EXTRACT_BOOLEAN(submatches_p) ? 0 : REG_NOSUB |
|
||||
S48_EXTRACT_BOOLEAN(newline_p) ? REG_NEWLINE : 0;
|
||||
|
||||
S48_GC_PROTECT_1(pattern);
|
||||
|
||||
S48_CHECK_STRING(pattern);
|
||||
|
||||
sch_regex = S48_MAKE_VALUE(regex_t);
|
||||
|
||||
status = regcomp(S48_UNSAFE_EXTRACT_VALUE_POINTER(sch_regex, regex_t),
|
||||
S48_UNSAFE_EXTRACT_STRING(pattern),
|
||||
flags);
|
||||
|
||||
S48_GC_UNPROTECT();
|
||||
|
||||
if (status == 0)
|
||||
return sch_regex;
|
||||
else
|
||||
return S48_UNSAFE_ENTER_FIXNUM(status); /* not that it can do them much good */
|
||||
}
|
||||
|
||||
/*
|
||||
* Interface to regexec.
|
||||
*
|
||||
* Returns #f if there is no match, #t if there is a match and submatches_p
|
||||
* is false, and a list of regex-match records otherwise.
|
||||
*
|
||||
* Most of this is making the buffer for the match structs and then translating
|
||||
* them into Scheme match records.
|
||||
*/
|
||||
static s48_value
|
||||
posix_regexp_match(s48_value sch_regex, s48_value string,
|
||||
s48_value submatches_p,
|
||||
s48_value bol_p, s48_value eol_p,
|
||||
s48_value sch_start)
|
||||
{
|
||||
int status;
|
||||
s48_value result;
|
||||
int start, len;
|
||||
/* re_nsub doesn't include the full pattern */
|
||||
size_t nmatch = 1 + S48_EXTRACT_VALUE_POINTER(sch_regex, regex_t)->re_nsub;
|
||||
regmatch_t *pmatch,
|
||||
pmatch_buffer[32];
|
||||
int flags = S48_EXTRACT_BOOLEAN(bol_p) ? 0 : REG_NOTBOL |
|
||||
S48_EXTRACT_BOOLEAN(eol_p) ? 0 : REG_NOTEOL;
|
||||
|
||||
start = s48_extract_fixnum(sch_start);
|
||||
len = S48_STRING_LENGTH(string);
|
||||
if ((start < 0) || (start > len))
|
||||
s48_raise_range_error(sch_start,
|
||||
s48_enter_fixnum(0),
|
||||
s48_enter_fixnum(len));
|
||||
|
||||
if (nmatch <= 32)
|
||||
pmatch = pmatch_buffer;
|
||||
else {
|
||||
pmatch = (regmatch_t *) malloc(nmatch * sizeof(regmatch_t));
|
||||
if (pmatch == NULL)
|
||||
s48_raise_out_of_memory_error(); }
|
||||
|
||||
status = regexec(S48_EXTRACT_VALUE_POINTER(sch_regex, regex_t),
|
||||
S48_UNSAFE_EXTRACT_STRING(string) + start,
|
||||
nmatch, pmatch, flags);
|
||||
|
||||
if (status == REG_NOMATCH)
|
||||
result = S48_FALSE;
|
||||
else if (! S48_EXTRACT_BOOLEAN(submatches_p))
|
||||
result = S48_TRUE;
|
||||
else {
|
||||
s48_value match = S48_FALSE;
|
||||
s48_value matches = S48_NULL;
|
||||
int i;
|
||||
S48_DECLARE_GC_PROTECT(2);
|
||||
|
||||
S48_GC_PROTECT_2(match, matches);
|
||||
|
||||
for(i = nmatch - 1; i > -1; i--) {
|
||||
if (pmatch[i].rm_so == -1)
|
||||
match = S48_FALSE;
|
||||
else {
|
||||
match = s48_make_record(posix_regexp_match_type_binding);
|
||||
S48_UNSAFE_RECORD_SET(match, 0,
|
||||
s48_enter_fixnum(pmatch[i].rm_so + start));
|
||||
S48_UNSAFE_RECORD_SET(match, 1,
|
||||
s48_enter_fixnum(pmatch[i].rm_eo + start));
|
||||
S48_UNSAFE_RECORD_SET(match, 2, S48_FALSE); } /* submatches */
|
||||
matches = s48_cons(match, matches); }
|
||||
|
||||
S48_GC_UNPROTECT();
|
||||
|
||||
result = matches; }
|
||||
|
||||
if (nmatch > 32)
|
||||
free(pmatch);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* Interface to regcomp.
|
||||
*
|
||||
* This takes the same arguments as `compile_regexp' but returns the error
|
||||
* message, if any, that `regcomp()' returns. For some reason `regerror()'
|
||||
* requires both the status code and the compiled pattern buffer returned
|
||||
* by `regcomp()'. `compile_regexp' only returned the status so we have to
|
||||
* redo the compilation.
|
||||
*
|
||||
*/
|
||||
static s48_value
|
||||
posix_regexp_error_message(s48_value pattern,
|
||||
s48_value extended_p, s48_value ignore_case_p,
|
||||
s48_value submatches_p, s48_value newline_p)
|
||||
{
|
||||
regex_t compiled_regex;
|
||||
int status;
|
||||
int flags = S48_EXTRACT_BOOLEAN(extended_p) ? REG_EXTENDED : 0 |
|
||||
S48_EXTRACT_BOOLEAN(ignore_case_p) ? REG_ICASE : 0 |
|
||||
S48_EXTRACT_BOOLEAN(submatches_p) ? 0 : REG_NOSUB |
|
||||
S48_EXTRACT_BOOLEAN(newline_p) ? REG_NEWLINE : 0;
|
||||
|
||||
S48_CHECK_STRING(pattern);
|
||||
|
||||
status = regcomp(&compiled_regex, S48_UNSAFE_EXTRACT_STRING(pattern), flags);
|
||||
|
||||
if (status == 0)
|
||||
return S48_FALSE;
|
||||
else {
|
||||
size_t buffer_size;
|
||||
s48_value buffer;
|
||||
|
||||
buffer_size = regerror(status, &compiled_regex, NULL, 0);
|
||||
/* For string lengths C counts the nul, Scheme doesn't. */
|
||||
buffer = s48_make_string(buffer_size - 1, ' ');
|
||||
regerror(status,
|
||||
&compiled_regex,
|
||||
S48_UNSAFE_EXTRACT_STRING(buffer),
|
||||
buffer_size);
|
||||
|
||||
return buffer; }
|
||||
}
|
||||
|
||||
/*
|
||||
* Stub for regfree().
|
||||
*/
|
||||
|
||||
static s48_value
|
||||
posix_free_regexp(s48_value sch_regex)
|
||||
{
|
||||
regfree(S48_EXTRACT_VALUE_POINTER(sch_regex, regex_t));
|
||||
|
||||
return S48_UNSPECIFIC;
|
||||
}
|
Loading…
Reference in New Issue