;;; Regular expression matching for scsh ;;; Copyright (c) 1994 by Olin Shivers. (foreign-source "/* Make sure foreign-function stubs interface to the C funs correctly: */" "#include " "#include \"../regexp/regex.h\"" "#include \"re1.h\"" "" "" ) ;;; Match data for regexp matches. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define-record regexp-match string ; The string against which we matched start ; vector of starting indices end) ; vector of ending indices (define (match:start match . maybe-index) (vector-ref (regexp-match:start match) (:optional maybe-index 0))) (define (match:end match . maybe-index) (vector-ref (regexp-match:end match) (:optional maybe-index 0))) (define (match:substring match . maybe-index) (let* ((i (:optional maybe-index 0)) (start (vector-ref (regexp-match:start match) i))) (and start (substring (regexp-match:string match) start (vector-ref (regexp-match:end match) i))))) ;;; Compiling regexps ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; There's no legal Posix string expressing the empty match (e.g., (|)) ;;; that will never match anything. So when we have one of these, we set ;;; the STRING field to #f. The matchers will spot this case and handle it ;;; specially. ;;; We compile the string two ways, on demand -- one for cre-search, and ;;; one for cre-search?. (define-record cre ; A compiled regular expression string ; The Posix string form of the regexp or #F. max-paren ; Max paren in STRING needed for submatches. (bytes #f) ; Pointer to the compiled form, in the C heap, or #F. (bytes/nm #f) ; Same as BYTES, but compiled with no-submatch. tvec ; Translation vector for the submatches ((disclose self) (list "cre" (cre:string self)))) (define (new-cre str tvec) (make-cre str (max-live-posix-submatch tvec) tvec)) (define (max-live-posix-submatch tvec) (vfold (lambda (sm mlpsm) (if sm (max mlpsm sm) mlpsm)) 0 tvec)) (define (compile-posix-re->c-struct re-string sm?) (receive (errcode c-struct) (%compile-re re-string sm?) (if (zero? errcode) c-struct (error errcode (%regerror-msg errcode c-struct) compile-posix-re->c-struct re-string sm?)))) (define-foreign %compile-re (compile_re (string-desc pattern) (bool submatches?)) integer ; 0 or error code (C regex_t*)) ;;; Searching with compiled regexps ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; cre-search returns match info; cre-search? is just a predicate. (define (cre-search cre start-vec end-vec str start) (let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match. (and re-str (let* ((C-bytes (or (cre:bytes cre) (let ((C-bytes (compile-posix-re->c-struct re-str #t))) (set-cre:bytes cre C-bytes) (register-re-c-struct cre C-bytes) C-bytes))) (retcode (%cre-search C-bytes str start (cre:tvec cre) (cre:max-paren cre) start-vec end-vec))) (if (integer? retcode) (error retcode (%regerror-msg retcode C-bytes) cre-search cre start-vec end-vec str start) (and retcode (make-regexp-match str start-vec end-vec))))))) (define (cre-search? cre str start) (let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match. (and re-str (let* ((C-bytes (or (cre:bytes/nm cre) (let ((C-bytes (compile-posix-re->c-struct re-str #f))) (set-cre:bytes/nm cre C-bytes) (register-re-c-struct cre C-bytes) C-bytes))) (retcode (%cre-search C-bytes str start '#() -1 '#() '#()))) (if (integer? retcode) (error retcode (%regerror-msg retcode C-bytes) cre-search? cre str start) retcode))))) (define-foreign %cre-search (re_search ((C "const regex_t *~a") compiled-regexp) (string-desc str) (integer start) (vector-desc tvec) (integer max-psm) (vector-desc svec) (vector-desc evec)) desc) ; 0 success, #f no-match, or non-zero int error code. ;;; Generate an error msg from an error code. (define-foreign %regerror-msg (re_errint2str (integer errcode) ((C "const regex_t *~a") re)) string) ;;; Reclaiming compiled regexp storage ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Avert your eyes from the unsightly crock. ;;; ;;; S48 0.36 doesn't have finalizers, so we don't have a way to free ;;; the C regexp_t structure when its CRE record is gc'd. So our current ;;; lame approximation is to keep track of all the CRE's with a list of ;;; (cre-weak-pointer . regex_t*) ;;; pairs. From time to time, we should walk the list. If we deref the ;;; weak pointer and discover the CRE's been GC'd, we free the regex_t ;;; struct. ;;; ;;; Note this code is completely thread unsafe. ;;; Free the space used by a compiled regexp. (define-foreign %free-re (free_re ((C regex_t*) re)) ignore) (define *master-cre-list* '()) ;;; Whenever we make a new CRE, use this proc to add it to the master list. (define (register-re-c-struct cre c-bytes) (set! *master-cre-list* (cons (cons (make-weak-pointer cre) c-bytes) *master-cre-list*))) (define (clean-up-cres) (set! *master-cre-list* (fold (lambda (elt lis) (if (weak-pointer-ref (car elt)) ; Still alive (cons elt lis) (begin (%free-re (cdr elt)) lis))) '() *master-cre-list*)))