;;; Regular expression matching for scsh ;;; Copyright (c) 1994 by Olin Shivers. (foreign-init-name "re_low") (foreign-source "/* Make sure foreign-function stubs interface to the C funs correctly: */" "#include " "#include \"../regexp/regex.h\"" "#include \"re1.h\"" "" "" ) ;;; Match data for regexp matches. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define-record regexp-match string ; The string against which we matched start ; vector of starting indices end) ; vector of ending indices (define (match:start match . maybe-index) (vector-ref (regexp-match:start match) (:optional maybe-index 0))) (define (match:end match . maybe-index) (vector-ref (regexp-match:end match) (:optional maybe-index 0))) (define (match:substring match . maybe-index) (let* ((i (:optional maybe-index 0)) (start (vector-ref (regexp-match:start match) i))) (and start (substring (regexp-match:string match) start (vector-ref (regexp-match:end match) i))))) ;;; Compiling regexps ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; There's no legal Posix string expressing the empty match (e.g., (|)) ;;; that will never match anything. So when we have one of these, we set ;;; the STRING field to #f. The matchers will spot this case and handle it ;;; specially. ;;; We compile the string two ways, on demand -- one for cre-search, and ;;; one for cre-search?. ;(define-record cre ; A compiled regular expression ; string ; The Posix string form of the regexp or #F. ; max-paren ; Max paren in STRING needed for submatches. ; (bytes #f) ; Pointer to the compiled form, in the C heap, or #F. ; (bytes/nm #f) ; Same as BYTES, but compiled with no-submatch. ; tvec ; Translation vector for the submatches ; ((disclose self) (list "cre" (cre:string self)))) (define-record-type cre :cre (really-make-cre string max-paren bytes bytes/nm tvec debug) cre? (string cre:string set-cre:string) (max-paren cre:max-paren set-cre:max-paren) (bytes cre:bytes set-cre:bytes) (bytes/nm cre:bytes/nm set-cre:bytes/nm) (tvec cre:tvec set-cre:tvec) (debug cre:debug set-cre:debug)) (define-record-discloser :cre (lambda (self) (list "cre" (cre:string self)))) (define (make-cre str max-paren tvec) (really-make-cre str max-paren #f #f tvec #f)) (define (new-cre str tvec) (make-cre str (max-live-posix-submatch tvec) tvec)) (define (max-live-posix-submatch tvec) (vfold (lambda (sm mlpsm) (if sm (max mlpsm sm) mlpsm)) 0 tvec)) (define (compile-posix-re->c-struct re-string sm?) (receive (errcode c-struct) (%compile-re re-string sm?) (if (zero? errcode) c-struct (error errcode (%regerror-msg errcode c-struct) compile-posix-re->c-struct re-string sm?)))) (define-foreign %compile-re (compile_re (string-desc pattern) (bool submatches?)) integer ; 0 or error code (C regex_t*)) ;;; Searching with compiled regexps ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; cre-search returns match info; cre-search? is just a predicate. (define (cre-search cre start-vec end-vec str start) (let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match. (and re-str (let* ((C-bytes (or (cre:bytes cre) (let ((C-bytes (compile-posix-re->c-struct re-str #t))) (set-cre:bytes cre C-bytes) (register-re-c-struct:bytes cre) C-bytes))) (retcode (%cre-search C-bytes str start (cre:tvec cre) (cre:max-paren cre) start-vec end-vec))) (if (integer? retcode) (error retcode (%regerror-msg retcode C-bytes) cre-search cre start-vec end-vec str start) (and retcode (make-regexp-match str start-vec end-vec))))))) (define (cre-search? cre str start) (let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match. (and re-str (let* ((C-bytes (or (cre:bytes/nm cre) (let ((C-bytes (compile-posix-re->c-struct re-str #f))) (set-cre:bytes/nm cre C-bytes) (register-re-c-struct:bytes/nm cre) C-bytes))) (retcode (%cre-search C-bytes str start '#() -1 '#() '#()))) (if (integer? retcode) (error retcode (%regerror-msg retcode C-bytes) cre-search? cre str start) retcode))))) (define-foreign %cre-search (re_search ((C "const regex_t *~a") compiled-regexp) (string-desc str) (integer start) (vector-desc tvec) (integer max-psm) (vector-desc svec) (vector-desc evec)) desc) ; 0 success, #f no-match, or non-zero int error code. ;;; Generate an error msg from an error code. (define-foreign %regerror-msg (re_errint2str (integer errcode) ((C "const regex_t *~a") re)) string) ;;; Reclaiming compiled regexp storage ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define-foreign %free-re (free_re ((C regex_t*) re)) ignore) ;;; Whenever we make a new CRE, add the appropriate finalizer, ;;; so the C regex_t structure can be freeed (define (free-bytes the-cre) (if (cre:bytes the-cre) (%free-re (cre:bytes the-cre)) (warn "free-bytes called on #f"))) (define (free-bytes/nm the-cre) (if (cre:bytes the-cre) (%free-re (cre:bytes/nm the-cre)) (warn "free-bytes/nm called on #f"))) (define (register-re-c-struct:bytes cre) (add-finalizer! cre free-bytes)) (define (register-re-c-struct:bytes/nm cre) (add-finalizer! cre free-bytes/nm)) (define (register-re-c-struct cre c-bytes) (error "function register-re-c-struct no longer supported")) (define (clean-up-cres) (warn "function clean-up-cres no longer supported"))