scsh-0.5/scsh/rx/re-low.scm

153 lines
5.3 KiB
Scheme

;;; Regular expression matching for scsh
;;; Copyright (c) 1994 by Olin Shivers.
(foreign-source
"/* Make sure foreign-function stubs interface to the C funs correctly: */"
"#include <sys/types.h>"
"#include \"../regexp/regex.h\""
"#include \"re1.h\""
"" ""
)
;;; Match data for regexp matches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define-record regexp-match
string ; The string against which we matched
start ; vector of starting indices
end) ; vector of ending indices
(define (match:start match . maybe-index)
(vector-ref (regexp-match:start match)
(:optional maybe-index 0)))
(define (match:end match . maybe-index)
(vector-ref (regexp-match:end match)
(:optional maybe-index 0)))
(define (match:substring match . maybe-index)
(let* ((i (:optional maybe-index 0))
(start (vector-ref (regexp-match:start match) i)))
(and start (substring (regexp-match:string match)
start
(vector-ref (regexp-match:end match) i)))))
;;; Compiling regexps
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; There's no legal Posix string expressing the empty match (e.g., (|))
;;; that will never match anything. So when we have one of these, we set
;;; the STRING field to #f. The matchers will spot this case and handle it
;;; specially.
;;; We compile the string two ways, on demand -- one for cre-search, and
;;; one for cre-search?.
(define-record cre ; A compiled regular expression
string ; The Posix string form of the regexp or #F.
max-paren ; Max paren in STRING needed for submatches.
(bytes #f) ; Pointer to the compiled form, in the C heap, or #F.
(bytes/nm #f) ; Same as BYTES, but compiled with no-submatch.
tvec ; Translation vector for the submatches
((disclose self) (list "cre" (cre:string self))))
(define (new-cre str tvec) (make-cre str (max-live-posix-submatch tvec) tvec))
(define (max-live-posix-submatch tvec)
(vfold (lambda (sm mlpsm) (if sm (max mlpsm sm) mlpsm)) 0 tvec))
(define (compile-posix-re->c-struct re-string sm?)
(receive (errcode c-struct) (%compile-re re-string sm?)
(if (zero? errcode) c-struct
(error errcode (%regerror-msg errcode c-struct)
compile-posix-re->c-struct re-string sm?))))
(define-foreign %compile-re (compile_re (string-desc pattern) (bool submatches?))
integer ; 0 or error code
(C regex_t*))
;;; Searching with compiled regexps
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; cre-search returns match info; cre-search? is just a predicate.
(define (cre-search cre start-vec end-vec str start)
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
(and re-str
(let* ((C-bytes (or (cre:bytes cre)
(let ((C-bytes (compile-posix-re->c-struct re-str #t)))
(set-cre:bytes cre C-bytes)
(register-re-c-struct cre C-bytes)
C-bytes)))
(retcode (%cre-search C-bytes str start
(cre:tvec cre)
(cre:max-paren cre)
start-vec end-vec)))
(if (integer? retcode)
(error retcode (%regerror-msg retcode C-bytes)
cre-search cre start-vec end-vec str start)
(and retcode (make-regexp-match str start-vec end-vec)))))))
(define (cre-search? cre str start)
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
(and re-str
(let* ((C-bytes (or (cre:bytes/nm cre)
(let ((C-bytes (compile-posix-re->c-struct re-str #f)))
(set-cre:bytes/nm cre C-bytes)
(register-re-c-struct cre C-bytes)
C-bytes)))
(retcode (%cre-search C-bytes str start '#() -1 '#() '#())))
(if (integer? retcode)
(error retcode (%regerror-msg retcode C-bytes)
cre-search? cre str start)
retcode)))))
(define-foreign %cre-search
(re_search ((C "const regex_t *~a") compiled-regexp)
(string-desc str)
(integer start)
(vector-desc tvec) (integer max-psm)
(vector-desc svec) (vector-desc evec))
desc) ; 0 success, #f no-match, or non-zero int error code.
;;; Generate an error msg from an error code.
(define-foreign %regerror-msg (re_errint2str (integer errcode)
((C "const regex_t *~a") re))
string)
;;; Reclaiming compiled regexp storage
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; Avert your eyes from the unsightly crock.
;;;
;;; S48 0.36 doesn't have finalizers, so we don't have a way to free
;;; the C regexp_t structure when its CRE record is gc'd. So our current
;;; lame approximation is to keep track of all the CRE's with a list of
;;; (cre-weak-pointer . regex_t*)
;;; pairs. From time to time, we should walk the list. If we deref the
;;; weak pointer and discover the CRE's been GC'd, we free the regex_t
;;; struct.
;;;
;;; Note this code is completely thread unsafe.
;;; Free the space used by a compiled regexp.
(define-foreign %free-re (free_re ((C regex_t*) re)) ignore)
(define *master-cre-list* '())
;;; Whenever we make a new CRE, use this proc to add it to the master list.
(define (register-re-c-struct cre c-bytes)
(set! *master-cre-list* (cons (cons (make-weak-pointer cre) c-bytes)
*master-cre-list*)))
(define (clean-up-cres)
(set! *master-cre-list*
(fold (lambda (elt lis)
(if (weak-pointer-ref (car elt)) ; Still alive
(cons elt lis)
(begin (%free-re (cdr elt))
lis)))
'()
*master-cre-list*)))