153 lines
5.3 KiB
Scheme
153 lines
5.3 KiB
Scheme
;;; Regular expression matching for scsh
|
|
;;; Copyright (c) 1994 by Olin Shivers.
|
|
|
|
(foreign-source
|
|
"/* Make sure foreign-function stubs interface to the C funs correctly: */"
|
|
"#include <sys/types.h>"
|
|
"#include \"../regexp/regex.h\""
|
|
"#include \"re1.h\""
|
|
"" ""
|
|
)
|
|
|
|
;;; Match data for regexp matches.
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
(define-record regexp-match
|
|
string ; The string against which we matched
|
|
start ; vector of starting indices
|
|
end) ; vector of ending indices
|
|
|
|
(define (match:start match . maybe-index)
|
|
(vector-ref (regexp-match:start match)
|
|
(:optional maybe-index 0)))
|
|
|
|
(define (match:end match . maybe-index)
|
|
(vector-ref (regexp-match:end match)
|
|
(:optional maybe-index 0)))
|
|
|
|
(define (match:substring match . maybe-index)
|
|
(let* ((i (:optional maybe-index 0))
|
|
(start (vector-ref (regexp-match:start match) i)))
|
|
(and start (substring (regexp-match:string match)
|
|
start
|
|
(vector-ref (regexp-match:end match) i)))))
|
|
|
|
;;; Compiling regexps
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; There's no legal Posix string expressing the empty match (e.g., (|))
|
|
;;; that will never match anything. So when we have one of these, we set
|
|
;;; the STRING field to #f. The matchers will spot this case and handle it
|
|
;;; specially.
|
|
|
|
;;; We compile the string two ways, on demand -- one for cre-search, and
|
|
;;; one for cre-search?.
|
|
|
|
(define-record cre ; A compiled regular expression
|
|
string ; The Posix string form of the regexp or #F.
|
|
max-paren ; Max paren in STRING needed for submatches.
|
|
(bytes #f) ; Pointer to the compiled form, in the C heap, or #F.
|
|
(bytes/nm #f) ; Same as BYTES, but compiled with no-submatch.
|
|
tvec ; Translation vector for the submatches
|
|
((disclose self) (list "cre" (cre:string self))))
|
|
|
|
(define (new-cre str tvec) (make-cre str (max-live-posix-submatch tvec) tvec))
|
|
|
|
(define (max-live-posix-submatch tvec)
|
|
(vfold (lambda (sm mlpsm) (if sm (max mlpsm sm) mlpsm)) 0 tvec))
|
|
|
|
(define (compile-posix-re->c-struct re-string sm?)
|
|
(receive (errcode c-struct) (%compile-re re-string sm?)
|
|
(if (zero? errcode) c-struct
|
|
(error errcode (%regerror-msg errcode c-struct)
|
|
compile-posix-re->c-struct re-string sm?))))
|
|
|
|
(define-foreign %compile-re (compile_re (string-desc pattern) (bool submatches?))
|
|
integer ; 0 or error code
|
|
(C regex_t*))
|
|
|
|
|
|
;;; Searching with compiled regexps
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; cre-search returns match info; cre-search? is just a predicate.
|
|
|
|
(define (cre-search cre start-vec end-vec str start)
|
|
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
|
|
(and re-str
|
|
(let* ((C-bytes (or (cre:bytes cre)
|
|
(let ((C-bytes (compile-posix-re->c-struct re-str #t)))
|
|
(set-cre:bytes cre C-bytes)
|
|
(register-re-c-struct cre C-bytes)
|
|
C-bytes)))
|
|
(retcode (%cre-search C-bytes str start
|
|
(cre:tvec cre)
|
|
(cre:max-paren cre)
|
|
start-vec end-vec)))
|
|
(if (integer? retcode)
|
|
(error retcode (%regerror-msg retcode C-bytes)
|
|
cre-search cre start-vec end-vec str start)
|
|
(and retcode (make-regexp-match str start-vec end-vec)))))))
|
|
|
|
(define (cre-search? cre str start)
|
|
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
|
|
(and re-str
|
|
(let* ((C-bytes (or (cre:bytes/nm cre)
|
|
(let ((C-bytes (compile-posix-re->c-struct re-str #f)))
|
|
(set-cre:bytes/nm cre C-bytes)
|
|
(register-re-c-struct cre C-bytes)
|
|
C-bytes)))
|
|
(retcode (%cre-search C-bytes str start '#() -1 '#() '#())))
|
|
(if (integer? retcode)
|
|
(error retcode (%regerror-msg retcode C-bytes)
|
|
cre-search? cre str start)
|
|
retcode)))))
|
|
|
|
(define-foreign %cre-search
|
|
(re_search ((C "const regex_t *~a") compiled-regexp)
|
|
(string-desc str)
|
|
(integer start)
|
|
(vector-desc tvec) (integer max-psm)
|
|
(vector-desc svec) (vector-desc evec))
|
|
desc) ; 0 success, #f no-match, or non-zero int error code.
|
|
|
|
|
|
;;; Generate an error msg from an error code.
|
|
|
|
(define-foreign %regerror-msg (re_errint2str (integer errcode)
|
|
((C "const regex_t *~a") re))
|
|
string)
|
|
|
|
|
|
;;; Reclaiming compiled regexp storage
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;; Avert your eyes from the unsightly crock.
|
|
;;;
|
|
;;; S48 0.36 doesn't have finalizers, so we don't have a way to free
|
|
;;; the C regexp_t structure when its CRE record is gc'd. So our current
|
|
;;; lame approximation is to keep track of all the CRE's with a list of
|
|
;;; (cre-weak-pointer . regex_t*)
|
|
;;; pairs. From time to time, we should walk the list. If we deref the
|
|
;;; weak pointer and discover the CRE's been GC'd, we free the regex_t
|
|
;;; struct.
|
|
;;;
|
|
;;; Note this code is completely thread unsafe.
|
|
|
|
;;; Free the space used by a compiled regexp.
|
|
(define-foreign %free-re (free_re ((C regex_t*) re)) ignore)
|
|
|
|
(define *master-cre-list* '())
|
|
|
|
;;; Whenever we make a new CRE, use this proc to add it to the master list.
|
|
(define (register-re-c-struct cre c-bytes)
|
|
(set! *master-cre-list* (cons (cons (make-weak-pointer cre) c-bytes)
|
|
*master-cre-list*)))
|
|
|
|
(define (clean-up-cres)
|
|
(set! *master-cre-list*
|
|
(fold (lambda (elt lis)
|
|
(if (weak-pointer-ref (car elt)) ; Still alive
|
|
(cons elt lis)
|
|
(begin (%free-re (cdr elt))
|
|
lis)))
|
|
'()
|
|
*master-cre-list*)))
|