Use Scheme 48 regexp code instead of ours.

This commit is contained in:
sperber 2001-08-09 13:53:18 +00:00
parent 34c5cd67bd
commit 6213213e14
5 changed files with 125 additions and 153 deletions

View File

@ -124,7 +124,7 @@ SCSHOBJS = \
scsh/machine/libansi.o \ scsh/machine/libansi.o \
scsh/network1.o \ scsh/network1.o \
scsh/putenv.o \ scsh/putenv.o \
scsh/rx/re1.o \ scsh/rx/regexp1.o \
scsh/select.o scsh/select1.o \ scsh/select.o scsh/select1.o \
scsh/sleep1.o \ scsh/sleep1.o \
scsh/syscalls.o scsh/syscalls1.o \ scsh/syscalls.o scsh/syscalls1.o \
@ -132,11 +132,11 @@ SCSHOBJS = \
scsh/time.o scsh/time1.o \ scsh/time.o scsh/time1.o \
scsh/tty.o scsh/tty1.o \ scsh/tty.o scsh/tty1.o \
scsh/userinfo1.o \ scsh/userinfo1.o \
scsh/sighandlers1.o \ scsh/sighandlers1.o
scsh/regexp/libregex.a
SCSH_INITIALIZERS = s48_init_syslog s48_init_userinfo s48_init_sighandlers \ SCSH_INITIALIZERS = s48_init_syslog s48_init_posix_regexp \
s48_init_re_low s48_init_syscalls2 s48_init_network s48_init_flock s48_init_userinfo s48_init_sighandlers \
s48_init_syscalls2 s48_init_network s48_init_flock
UNIX_OBJS = c/unix/misc.o c/unix/io.o c/unix/fd-io.o c/unix/event.o UNIX_OBJS = c/unix/misc.o c/unix/io.o c/unix/fd-io.o c/unix/event.o
@ -233,10 +233,7 @@ scsh/syscalls1.o scsh/syscalls.o: scsh/syscalls1.h
scsh/time1.o scsh/time.o: scsh/time1.h scsh/time1.o scsh/time.o: scsh/time1.h
scsh/tty1.o scsh/tty.o: scsh/tty1.h scsh/tty1.o scsh/tty.o: scsh/tty1.h
# Not really, but making regexp/libregex.a makes the regexp/regex.h file that scsh/rx/regexp1.o: c/scheme48.h
# re1.c actually does need.
# TODO: This is broken at the moment: regex.h is not made after checkout
scsh/rx/re1.o: scsh/rx/re1.h scsh/regexp/libregex.a
scsh/syscalls.o: scsh/syscalls1.h scsh/dirstuff1.h scsh/fdports1.h \ scsh/syscalls.o: scsh/syscalls1.h scsh/dirstuff1.h scsh/fdports1.h \
scsh/select1.h scsh/userinfo1.h scsh/select1.h scsh/userinfo1.h
@ -424,7 +421,7 @@ clean-scm2c:
distclean: clean distclean: clean
rm -f Makefile config.log config.status c/sysdep.h config.cache \ rm -f Makefile config.log config.status c/sysdep.h config.cache \
scsh/machine scsh/regexp/Makefile \ scsh/machine \
scsh/endian.scm scsh/static.scm \ scsh/endian.scm scsh/static.scm \
exportlist.aix exportlist.aix
$(RM) a.exe $(VM).base $(VM).def $(VM).exp $(RM) a.exe $(VM).base $(VM).def $(VM).exp
@ -759,6 +756,8 @@ SCHEME =scsh/awk.scm \
scsh/pty.scm \ scsh/pty.scm \
scsh/rdelim.scm \ scsh/rdelim.scm \
scsh/rw.scm \ scsh/rw.scm \
scsh/rx/packages.scm \
scsh/rx/cond-package.scm \
scsh/scsh-condition.scm \ scsh/scsh-condition.scm \
scsh/scsh-interfaces.scm \ scsh/scsh-interfaces.scm \
scsh/scsh-package.scm \ scsh/scsh-package.scm \
@ -837,9 +836,6 @@ scsh/scsh.image: $(VM) $(SCHEME) $(CIG).image
echo ",batch on") \ echo ",batch on") \
| ./$(VM) -o ./$(VM) -i $(CIG).image -h 10000000 | ./$(VM) -o ./$(VM) -i $(CIG).image -h 10000000
scsh/regexp/libregex.a:
cd ./scsh/regexp; $(MAKE) lib
install-scsh: scsh install-scsh-image install-scsh: scsh install-scsh-image
$(RM) $(bindir)/$(RUNNABLE) $(RM) $(bindir)/$(RUNNABLE)
$(INSTALL_PROGRAM) $(srcdir)/scsh/scsh $(bindir)/$(RUNNABLE) $(INSTALL_PROGRAM) $(srcdir)/scsh/scsh $(bindir)/$(RUNNABLE)
@ -858,9 +854,8 @@ install-scsh-image:
$(INSTALL_DATA) /tmp/scsh.image $(LIB)/scsh.image $(INSTALL_DATA) /tmp/scsh.image $(LIB)/scsh.image
clean-scsh: clean-scsh:
$(RM) scsh/*.o scsh/regexp/*.o scsh/rx/*.o scsh/machine/*.o $(RM) scsh/*.o scsh/rx/*.o scsh/machine/*.o
$(RM) scsh/*.image $(RM) scsh/*.image
$(RM) $(LIBSCSHVM) $(LIBSCSH) scsh/scsh$(EXEEXT) $(RM) $(LIBSCSHVM) $(LIBSCSH) scsh/scsh$(EXEEXT)
-cd scsh/regexp; $(MAKE) clean

View File

@ -396,6 +396,6 @@ fail
AC_SUBST(TMPDIR) AC_SUBST(TMPDIR)
AC_OUTPUT(Makefile scsh/regexp/Makefile scsh/endian.scm scsh/static.scm) AC_OUTPUT(Makefile scsh/endian.scm scsh/static.scm)
chmod +x scsh/static.scm chmod +x scsh/static.scm

View File

@ -3,6 +3,47 @@
;;; scsh-level-0, and export from there. ;;; scsh-level-0, and export from there.
;;; -Olin <shivers@ai.mit.edu> 8/98 ;;; -Olin <shivers@ai.mit.edu> 8/98
;; From Scheme 48, only here temporarily
(define-structure external-util (export immutable-copy-string)
(open scheme
primitives ;copy-bytes!
features) ;immutable? make-immutable!
(begin
(define (immutable-copy-string string)
(if (immutable? string)
string
(let ((copy (copy-string string)))
(make-immutable! copy)
copy)))
; Why isn't this available elsewhere?
(define (copy-string string)
(let* ((length (string-length string))
(new (make-string length #\?)))
(copy-bytes! string 0 new 0 length)
new))))
(define-interface posix-regexps-interface
(export make-regexp
(regexp-option :syntax)
regexp?
regexp-match
match?
match-start
match-end
match-submatches
))
(define-structures ((posix-regexps posix-regexps-interface)
(posix-regexps-internal (export make-match)))
(open scheme define-record-types finite-types external-calls
signals
external-util)
(files regexp))
(define-interface basic-re-interface (define-interface basic-re-interface
(export (re-dsm? (proc (:value) :boolean)) (export (re-dsm? (proc (:value) :boolean))
(make-re-dsm (proc (:value :exact-integer :exact-integer) :value)) (make-re-dsm (proc (:value :exact-integer :exact-integer) :value))
@ -128,8 +169,7 @@
(define re-match-internals-interface (define re-match-internals-interface
(export (regexp-match:string (proc (:value) :string)) (export (regexp-match:string (proc (:value) :string))
(regexp-match:start (proc (:value) :vector)) (regexp-match:submatches (proc (:value) :vector))))
(regexp-match:end (proc (:value) :vector))))
(define-interface posix-re-interface (define-interface posix-re-interface
@ -167,7 +207,6 @@
(match:start (proc (:value &opt :exact-integer) :value)) (match:start (proc (:value &opt :exact-integer) :value))
(match:end (proc (:value &opt :exact-integer) :value)) (match:end (proc (:value &opt :exact-integer) :value))
(match:substring (proc (:value &opt :exact-integer) :value)) (match:substring (proc (:value &opt :exact-integer) :value))
(clean-up-cres (proc () :unspecific))
(regexp-search (proc (:value :string &opt :exact-integer) (regexp-search (proc (:value :string &opt :exact-integer)
:value)) :value))
(regexp-search? (proc (:value :string &opt :exact-integer) (regexp-search? (proc (:value :string &opt :exact-integer)
@ -205,6 +244,7 @@
define-record-types ; JMG debugging define-record-types ; JMG debugging
external-calls external-calls
string-lib ; string-fold string-lib ; string-fold
posix-regexps
scheme) scheme)
(files re-low re simp re-high (files re-low re simp re-high
@ -290,6 +330,7 @@
(define-structure re-subst re-subst-interface (define-structure re-subst re-subst-interface
(open re-level-0 (open re-level-0
re-match-internals re-match-internals
posix-regexps
scsh-utilities ; fold & some string utilities that need to be moved. scsh-utilities ; fold & some string utilities that need to be moved.
scsh-level-0 ; write-string scsh-level-0 ; write-string
string-lib ; string-copy! string-lib ; string-copy!
@ -332,8 +373,8 @@
;;; re-high compile-regexp regexp-search regexp-search? ;;; re-high compile-regexp regexp-search regexp-search?
;;; re-subst regexp-substitute regexp-substitute/global ;;; re-subst regexp-substitute regexp-substitute/global
;;; re-low match:start match:end match:substring ;;; re-low match:start match:end match:substring
;;; CRE record, new-cre, compile-posix-re->c-struct ;;; CRE record, new-cre
;;; cre-search cre-search? clean-up-cres ;;; cre-search cre-search?
;;; re-syntax sre-form? if-sre-form expand-rx ;;; re-syntax sre-form? if-sre-form expand-rx
;;; re.scm The ADT. flush-submatches uncase uncase-char-set ;;; re.scm The ADT. flush-submatches uncase uncase-char-set
;;; char-set-full? char-set-empty? ;;; char-set-full? char-set-empty?

View File

@ -1,38 +1,29 @@
;;; Regular expression matching for scsh ;;; Regular expression matching for scsh
;;; Copyright (c) 1994 by Olin Shivers. ;;; Copyright (c) 1994 by Olin Shivers.
(foreign-init-name "re_low")
(foreign-source
"/* Make sure foreign-function stubs interface to the C funs correctly: */"
"#include <sys/types.h>"
"#include \"../regexp/regex.h\""
"#include \"re1.h\""
"" ""
)
;;; Match data for regexp matches. ;;; Match data for regexp matches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define-record regexp-match (define-record regexp-match
string ; The string against which we matched string
start ; vector of starting indices submatches)
end) ; vector of ending indices
(define (match:start match . maybe-index) (define (match:start match . maybe-index)
(vector-ref (regexp-match:start match) (match-start
(:optional maybe-index 0))) (vector-ref (regexp-match:submatches match)
(:optional maybe-index 0))))
(define (match:end match . maybe-index) (define (match:end match . maybe-index)
(vector-ref (regexp-match:end match) (match-start
(:optional maybe-index 0))) (vector-ref (regexp-match:submatches match)
(:optional maybe-index 0))))
(define (match:substring match . maybe-index) (define (match:substring match . maybe-index)
(let* ((i (:optional maybe-index 0)) (let* ((i (:optional maybe-index 0))
(start (vector-ref (regexp-match:start match) i))) (submatch (vector-ref (regexp-match:submatches match) i)))
(and start (substring (regexp-match:string match) (and submatch (substring (regexp-match:string match)
start (match-start submatch)
(vector-ref (regexp-match:end match) i))))) (match-end submatch)))))
;;; Compiling regexps ;;; Compiling regexps
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -47,121 +38,59 @@
;(define-record cre ; A compiled regular expression ;(define-record cre ; A compiled regular expression
; string ; The Posix string form of the regexp or #F. ; string ; The Posix string form of the regexp or #F.
; max-paren ; Max paren in STRING needed for submatches. ; max-paren ; Max paren in STRING needed for submatches.
; (bytes #f) ; Pointer to the compiled form, in the C heap, or #F. ; (regexp #f) ; Compiled form or #F.
; (bytes/nm #f) ; Same as BYTES, but compiled with no-submatch. ; (regexp/nm #f) ; Same as REGEXP, but compiled with no-submatch.
; tvec ; Translation vector for the submatches ; tvec ; Translation vector for the submatches
; ((disclose self) (list "cre" (cre:string self)))) ; ((disclose self) (list "cre" (cre:string self))))
(define-record-type cre :cre (define-record-type cre :cre
(really-make-cre string max-paren bytes bytes/nm tvec debug) (really-make-cre string max-paren regexp regexp/nm tvec debug)
cre? cre?
(string cre:string set-cre:string) (string cre:string set-cre:string)
(max-paren cre:max-paren set-cre:max-paren) (max-paren cre:max-paren set-cre:max-paren)
(bytes cre:bytes set-cre:bytes) (regexp cre:regexp set-cre:regexp)
(bytes/nm cre:bytes/nm set-cre:bytes/nm) (regexp/nm cre:regexp/nm set-cre:regexp/nm)
(tvec cre:tvec set-cre:tvec) (tvec cre:tvec set-cre:tvec)
(debug cre:debug set-cre:debug)) (debug cre:debug set-cre:debug))
(define-record-discloser :cre (define-record-discloser :cre
(lambda (self) (list "cre" (cre:string self)))) (lambda (self) (list "cre" (cre:string self))))
(define-record-resumer :cre (lambda (cre)
(set-cre:bytes cre #f)
(set-cre:bytes/nm cre #f)))
(define (make-cre str max-paren tvec) (define (make-cre str max-paren tvec)
(really-make-cre str max-paren #f #f tvec #f)) (really-make-cre str max-paren #f #f tvec #f))
(define (new-cre str tvec)
(define (new-cre str tvec) (make-cre str (max-live-posix-submatch tvec) tvec)) (make-cre str (max-live-posix-submatch tvec) tvec))
(define (max-live-posix-submatch tvec) (define (max-live-posix-submatch tvec)
(vfold (lambda (sm mlpsm) (if sm (max mlpsm sm) mlpsm)) 0 tvec)) (vfold (lambda (sm mlpsm) (if sm (max mlpsm sm) mlpsm)) 0 tvec))
(define (compile-posix-re->c-struct re-string sm?)
(let ((maybe-struct (%compile-re re-string sm?)))
(if (pair? maybe-struct)
(error (car maybe-struct)
(%regerror-msg (car maybe-struct) (cdr maybe-struct))
compile-posix-re->c-struct re-string sm?)
maybe-struct)))
;;; returns pointer as number or a pair of error number and 0
(define-stubless-foreign %compile-re (pattern submatches?) "compile_re")
;;; Searching with compiled regexps ;;; Searching with compiled regexps
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; cre-search returns match info; cre-search? is just a predicate. ;;; cre-search returns match info; cre-search? is just a predicate.
;; ### we do not look at START yet
(define (cre-search cre start-vec end-vec str start) (define (cre-search cre start-vec end-vec str start)
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match. (let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
(and re-str (if (not re-str)
(let* ((C-bytes (or (cre:bytes cre) #f
(let ((C-bytes (compile-posix-re->c-struct re-str #t))) (begin
(set-cre:bytes cre C-bytes) (if (not (cre:regexp cre))
(register-re-c-struct:bytes cre) (set-cre:regexp cre (make-regexp re-str
C-bytes))) (regexp-option extended)
(retcode (%cre-search C-bytes str start (regexp-option submatches))))
(cre:tvec cre) (let ((ret (regexp-match (cre:regexp cre) str #t #f #f start)))
(cre:max-paren cre) (if (not ret)
start-vec end-vec))) #f
(if (integer? retcode) (make-regexp-match str
(error retcode (%regerror-msg retcode C-bytes) (list->vector ret))))))))
cre-search cre start-vec end-vec str start)
(and retcode (make-regexp-match str start-vec end-vec)))))))
(define (cre-search? cre str start) (define (cre-search? cre start-vec end-vec str start)
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match. (let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
(and re-str (if (not re-str)
(let* ((C-bytes (or (cre:bytes/nm cre) #f
(let ((C-bytes (compile-posix-re->c-struct re-str #f))) (begin
(set-cre:bytes/nm cre C-bytes) (if (not (cre:regexp/nm cre))
(register-re-c-struct:bytes/nm cre) (set-cre:regexp/nm cre (make-regexp re-str
C-bytes))) (regexp-option extended))))
(retcode (%cre-search C-bytes str start '#() -1 '#() '#()))) (regexp-match (cre:regexp/nm cre) str #f #f #f)))))
(if (integer? retcode)
(error retcode (%regerror-msg retcode C-bytes)
cre-search? cre str start)
retcode)))))
; 0 success, #f no-match, or non-zero int error code:
(define-stubless-foreign %cre-search
(compiled-regexp str start tvec max-psm svec evec) "re_search")
;;; Generate an error msg from an error code.
(define-stubless-foreign %regerror-msg (errcode re) "re_errint2str")
;;; Reclaiming compiled regexp storage
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define-stubless-foreign %free-re (re) "free_re")
;;; Whenever we make a new CRE, add the appropriate finalizer,
;;; so the C regex_t structure can be freeed
(define (free-bytes the-cre)
(if (cre:bytes the-cre)
(%free-re (cre:bytes the-cre))
(warn "free-bytes called on #f")))
(define (free-bytes/nm the-cre)
(if (cre:bytes/nm the-cre)
(%free-re (cre:bytes/nm the-cre))
(warn "free-bytes/nm called on #f")))
(define (register-re-c-struct:bytes cre)
(add-finalizer! cre free-bytes))
(define (register-re-c-struct:bytes/nm cre)
(add-finalizer! cre free-bytes/nm))
(define (register-re-c-struct cre c-bytes)
(error "function register-re-c-struct no longer supported"))
(define (clean-up-cres)
(warn "function clean-up-cres no longer supported"))

View File

@ -11,14 +11,17 @@
(define (regexp-substitute port match . items) (define (regexp-substitute port match . items)
(let* ((str (regexp-match:string match)) (let* ((str (regexp-match:string match))
(sv (regexp-match:start match)) (submatches (regexp-match:submatches match))
(ev (regexp-match:end match))
(range (lambda (item) ; Return start & end of (range (lambda (item) ; Return start & end of
(cond ((integer? item) ; ITEM's range in STR. (cond ((integer? item) ; ITEM's range in STR.
(values (vector-ref sv item) (let ((submatch (vector-ref submatches item)))
(vector-ref ev item))) (values (match-start submatch)
((eq? 'pre item) (values 0 (vector-ref sv 0))) (match-end submatch))))
((eq? 'post item) (values (vector-ref ev 0) ((eq? 'pre item) (values 0
(match-start
(vector-ref submatches 0))))
((eq? 'post item) (values (match-end
(vector-ref submatches 0))
(string-length str))) (string-length str)))
(else (error "Illegal substitution item." (else (error "Illegal substitution item."
item item
@ -54,11 +57,15 @@
(define (regexp-substitute/global port re str . items) (define (regexp-substitute/global port re str . items)
(let ((str-len (string-length str)) (let ((str-len (string-length str))
(range (lambda (start sv ev item) ; Return start & end of (range (lambda (start submatches item) ; Return start & end of
(cond ((integer? item) ; ITEM's range in STR. (cond ((integer? item) ; ITEM's range in STR.
(values (vector-ref sv item) (let ((submatch (vector-ref submatches item)))
(vector-ref ev item))) (values (match-start submatch)
((eq? 'pre item) (values start (vector-ref sv 0))) (match-end submatch))))
((eq? 'pre item)
(values start
(match-start
(vector-ref submatches 0))))
(else (error "Illegal substitution item." (else (error "Illegal substitution item."
item item
regexp-substitute/global))))) regexp-substitute/global)))))
@ -73,10 +80,9 @@
(if (<= start str-len) (if (<= start str-len)
(let ((match (regexp-search re str start))) (let ((match (regexp-search re str start)))
(if match (if match
(let* ((sv (regexp-match:start match)) (let* ((submatches (regexp-match:submatches match))
(ev (regexp-match:end match)) (s (match-start (vector-ref submatches 0)))
(s (vector-ref sv 0)) (e (match-end (vector-ref submatches 0)))
(e (vector-ref ev 0))
(empty? (= s e))) (empty? (= s e)))
(for-each (lambda (item) (for-each (lambda (item)
(cond ((string? item) (write-string item port)) (cond ((string? item) (write-string item port))
@ -91,7 +97,7 @@
(recur (if empty? (+ 1 e) e))) (recur (if empty? (+ 1 e) e)))
(else (receive (si ei) (else (receive (si ei)
(range start sv ev item) (range start submatches item)
(write-string str port si ei))))) (write-string str port si ei)))))
items)) items))
@ -99,14 +105,15 @@
;; Either we're making a string, or >1 POST. ;; Either we're making a string, or >1 POST.
(let* ((pieces (let recur ((start 0)) (let* ((pieces (let recur ((start 0))
(if (> start str-len) '() (if (> start str-len)
'()
(let ((match (regexp-search re str start)) (let ((match (regexp-search re str start))
(cached-post #f)) (cached-post #f))
(if match (if match
(let* ((sv (regexp-match:start match)) (let* ((submatches
(ev (regexp-match:end match)) (regexp-match:submatches match))
(s (vector-ref sv 0)) (s (match-start (vector-ref submatches 0)))
(e (vector-ref ev 0)) (e (match-end (vector-ref submatches 0)))
(empty? (= s e))) (empty? (= s e)))
(fold (lambda (item pieces) (fold (lambda (item pieces)
(cond ((string? item) (cond ((string? item)
@ -128,7 +135,7 @@
(append cached-post pieces)) (append cached-post pieces))
(else (receive (si ei) (else (receive (si ei)
(range start sv ev item) (range start submatches item)
(cons (substring str si ei) (cons (substring str si ei)
pieces))))) pieces)))))
'() items)) '() items))