Use Scheme 48 regexp code instead of ours.

This commit is contained in:
sperber 2001-08-09 13:53:18 +00:00
parent 34c5cd67bd
commit 6213213e14
5 changed files with 125 additions and 153 deletions

View File

@ -124,7 +124,7 @@ SCSHOBJS = \
scsh/machine/libansi.o \
scsh/network1.o \
scsh/putenv.o \
scsh/rx/re1.o \
scsh/rx/regexp1.o \
scsh/select.o scsh/select1.o \
scsh/sleep1.o \
scsh/syscalls.o scsh/syscalls1.o \
@ -132,11 +132,11 @@ SCSHOBJS = \
scsh/time.o scsh/time1.o \
scsh/tty.o scsh/tty1.o \
scsh/userinfo1.o \
scsh/sighandlers1.o \
scsh/regexp/libregex.a
scsh/sighandlers1.o
SCSH_INITIALIZERS = s48_init_syslog s48_init_userinfo s48_init_sighandlers \
s48_init_re_low s48_init_syscalls2 s48_init_network s48_init_flock
SCSH_INITIALIZERS = s48_init_syslog s48_init_posix_regexp \
s48_init_userinfo s48_init_sighandlers \
s48_init_syscalls2 s48_init_network s48_init_flock
UNIX_OBJS = c/unix/misc.o c/unix/io.o c/unix/fd-io.o c/unix/event.o
@ -233,10 +233,7 @@ scsh/syscalls1.o scsh/syscalls.o: scsh/syscalls1.h
scsh/time1.o scsh/time.o: scsh/time1.h
scsh/tty1.o scsh/tty.o: scsh/tty1.h
# Not really, but making regexp/libregex.a makes the regexp/regex.h file that
# re1.c actually does need.
# TODO: This is broken at the moment: regex.h is not made after checkout
scsh/rx/re1.o: scsh/rx/re1.h scsh/regexp/libregex.a
scsh/rx/regexp1.o: c/scheme48.h
scsh/syscalls.o: scsh/syscalls1.h scsh/dirstuff1.h scsh/fdports1.h \
scsh/select1.h scsh/userinfo1.h
@ -424,7 +421,7 @@ clean-scm2c:
distclean: clean
rm -f Makefile config.log config.status c/sysdep.h config.cache \
scsh/machine scsh/regexp/Makefile \
scsh/machine \
scsh/endian.scm scsh/static.scm \
exportlist.aix
$(RM) a.exe $(VM).base $(VM).def $(VM).exp
@ -759,6 +756,8 @@ SCHEME =scsh/awk.scm \
scsh/pty.scm \
scsh/rdelim.scm \
scsh/rw.scm \
scsh/rx/packages.scm \
scsh/rx/cond-package.scm \
scsh/scsh-condition.scm \
scsh/scsh-interfaces.scm \
scsh/scsh-package.scm \
@ -837,9 +836,6 @@ scsh/scsh.image: $(VM) $(SCHEME) $(CIG).image
echo ",batch on") \
| ./$(VM) -o ./$(VM) -i $(CIG).image -h 10000000
scsh/regexp/libregex.a:
cd ./scsh/regexp; $(MAKE) lib
install-scsh: scsh install-scsh-image
$(RM) $(bindir)/$(RUNNABLE)
$(INSTALL_PROGRAM) $(srcdir)/scsh/scsh $(bindir)/$(RUNNABLE)
@ -858,9 +854,8 @@ install-scsh-image:
$(INSTALL_DATA) /tmp/scsh.image $(LIB)/scsh.image
clean-scsh:
$(RM) scsh/*.o scsh/regexp/*.o scsh/rx/*.o scsh/machine/*.o
$(RM) scsh/*.o scsh/rx/*.o scsh/machine/*.o
$(RM) scsh/*.image
$(RM) $(LIBSCSHVM) $(LIBSCSH) scsh/scsh$(EXEEXT)
-cd scsh/regexp; $(MAKE) clean

View File

@ -396,6 +396,6 @@ fail
AC_SUBST(TMPDIR)
AC_OUTPUT(Makefile scsh/regexp/Makefile scsh/endian.scm scsh/static.scm)
AC_OUTPUT(Makefile scsh/endian.scm scsh/static.scm)
chmod +x scsh/static.scm

View File

@ -3,6 +3,47 @@
;;; scsh-level-0, and export from there.
;;; -Olin <shivers@ai.mit.edu> 8/98
;; From Scheme 48, only here temporarily
(define-structure external-util (export immutable-copy-string)
(open scheme
primitives ;copy-bytes!
features) ;immutable? make-immutable!
(begin
(define (immutable-copy-string string)
(if (immutable? string)
string
(let ((copy (copy-string string)))
(make-immutable! copy)
copy)))
; Why isn't this available elsewhere?
(define (copy-string string)
(let* ((length (string-length string))
(new (make-string length #\?)))
(copy-bytes! string 0 new 0 length)
new))))
(define-interface posix-regexps-interface
(export make-regexp
(regexp-option :syntax)
regexp?
regexp-match
match?
match-start
match-end
match-submatches
))
(define-structures ((posix-regexps posix-regexps-interface)
(posix-regexps-internal (export make-match)))
(open scheme define-record-types finite-types external-calls
signals
external-util)
(files regexp))
(define-interface basic-re-interface
(export (re-dsm? (proc (:value) :boolean))
(make-re-dsm (proc (:value :exact-integer :exact-integer) :value))
@ -128,8 +169,7 @@
(define re-match-internals-interface
(export (regexp-match:string (proc (:value) :string))
(regexp-match:start (proc (:value) :vector))
(regexp-match:end (proc (:value) :vector))))
(regexp-match:submatches (proc (:value) :vector))))
(define-interface posix-re-interface
@ -167,7 +207,6 @@
(match:start (proc (:value &opt :exact-integer) :value))
(match:end (proc (:value &opt :exact-integer) :value))
(match:substring (proc (:value &opt :exact-integer) :value))
(clean-up-cres (proc () :unspecific))
(regexp-search (proc (:value :string &opt :exact-integer)
:value))
(regexp-search? (proc (:value :string &opt :exact-integer)
@ -205,6 +244,7 @@
define-record-types ; JMG debugging
external-calls
string-lib ; string-fold
posix-regexps
scheme)
(files re-low re simp re-high
@ -290,6 +330,7 @@
(define-structure re-subst re-subst-interface
(open re-level-0
re-match-internals
posix-regexps
scsh-utilities ; fold & some string utilities that need to be moved.
scsh-level-0 ; write-string
string-lib ; string-copy!
@ -332,8 +373,8 @@
;;; re-high compile-regexp regexp-search regexp-search?
;;; re-subst regexp-substitute regexp-substitute/global
;;; re-low match:start match:end match:substring
;;; CRE record, new-cre, compile-posix-re->c-struct
;;; cre-search cre-search? clean-up-cres
;;; CRE record, new-cre
;;; cre-search cre-search?
;;; re-syntax sre-form? if-sre-form expand-rx
;;; re.scm The ADT. flush-submatches uncase uncase-char-set
;;; char-set-full? char-set-empty?

View File

@ -1,38 +1,29 @@
;;; Regular expression matching for scsh
;;; Copyright (c) 1994 by Olin Shivers.
(foreign-init-name "re_low")
(foreign-source
"/* Make sure foreign-function stubs interface to the C funs correctly: */"
"#include <sys/types.h>"
"#include \"../regexp/regex.h\""
"#include \"re1.h\""
"" ""
)
;;; Match data for regexp matches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define-record regexp-match
string ; The string against which we matched
start ; vector of starting indices
end) ; vector of ending indices
string
submatches)
(define (match:start match . maybe-index)
(vector-ref (regexp-match:start match)
(:optional maybe-index 0)))
(match-start
(vector-ref (regexp-match:submatches match)
(:optional maybe-index 0))))
(define (match:end match . maybe-index)
(vector-ref (regexp-match:end match)
(:optional maybe-index 0)))
(match-start
(vector-ref (regexp-match:submatches match)
(:optional maybe-index 0))))
(define (match:substring match . maybe-index)
(let* ((i (:optional maybe-index 0))
(start (vector-ref (regexp-match:start match) i)))
(and start (substring (regexp-match:string match)
start
(vector-ref (regexp-match:end match) i)))))
(submatch (vector-ref (regexp-match:submatches match) i)))
(and submatch (substring (regexp-match:string match)
(match-start submatch)
(match-end submatch)))))
;;; Compiling regexps
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@ -47,121 +38,59 @@
;(define-record cre ; A compiled regular expression
; string ; The Posix string form of the regexp or #F.
; max-paren ; Max paren in STRING needed for submatches.
; (bytes #f) ; Pointer to the compiled form, in the C heap, or #F.
; (bytes/nm #f) ; Same as BYTES, but compiled with no-submatch.
; (regexp #f) ; Compiled form or #F.
; (regexp/nm #f) ; Same as REGEXP, but compiled with no-submatch.
; tvec ; Translation vector for the submatches
; ((disclose self) (list "cre" (cre:string self))))
(define-record-type cre :cre
(really-make-cre string max-paren bytes bytes/nm tvec debug)
(really-make-cre string max-paren regexp regexp/nm tvec debug)
cre?
(string cre:string set-cre:string)
(max-paren cre:max-paren set-cre:max-paren)
(bytes cre:bytes set-cre:bytes)
(bytes/nm cre:bytes/nm set-cre:bytes/nm)
(regexp cre:regexp set-cre:regexp)
(regexp/nm cre:regexp/nm set-cre:regexp/nm)
(tvec cre:tvec set-cre:tvec)
(debug cre:debug set-cre:debug))
(define-record-discloser :cre
(lambda (self) (list "cre" (cre:string self))))
(define-record-resumer :cre (lambda (cre)
(set-cre:bytes cre #f)
(set-cre:bytes/nm cre #f)))
(define (make-cre str max-paren tvec)
(really-make-cre str max-paren #f #f tvec #f))
(define (new-cre str tvec) (make-cre str (max-live-posix-submatch tvec) tvec))
(define (new-cre str tvec)
(make-cre str (max-live-posix-submatch tvec) tvec))
(define (max-live-posix-submatch tvec)
(vfold (lambda (sm mlpsm) (if sm (max mlpsm sm) mlpsm)) 0 tvec))
(define (compile-posix-re->c-struct re-string sm?)
(let ((maybe-struct (%compile-re re-string sm?)))
(if (pair? maybe-struct)
(error (car maybe-struct)
(%regerror-msg (car maybe-struct) (cdr maybe-struct))
compile-posix-re->c-struct re-string sm?)
maybe-struct)))
;;; returns pointer as number or a pair of error number and 0
(define-stubless-foreign %compile-re (pattern submatches?) "compile_re")
;;; Searching with compiled regexps
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; cre-search returns match info; cre-search? is just a predicate.
;; ### we do not look at START yet
(define (cre-search cre start-vec end-vec str start)
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
(and re-str
(let* ((C-bytes (or (cre:bytes cre)
(let ((C-bytes (compile-posix-re->c-struct re-str #t)))
(set-cre:bytes cre C-bytes)
(register-re-c-struct:bytes cre)
C-bytes)))
(retcode (%cre-search C-bytes str start
(cre:tvec cre)
(cre:max-paren cre)
start-vec end-vec)))
(if (integer? retcode)
(error retcode (%regerror-msg retcode C-bytes)
cre-search cre start-vec end-vec str start)
(and retcode (make-regexp-match str start-vec end-vec)))))))
(if (not re-str)
#f
(begin
(if (not (cre:regexp cre))
(set-cre:regexp cre (make-regexp re-str
(regexp-option extended)
(regexp-option submatches))))
(let ((ret (regexp-match (cre:regexp cre) str #t #f #f start)))
(if (not ret)
#f
(make-regexp-match str
(list->vector ret))))))))
(define (cre-search? cre str start)
(define (cre-search? cre start-vec end-vec str start)
(let ((re-str (cre:string cre))) ;; RE-STR = #F => empty match.
(and re-str
(let* ((C-bytes (or (cre:bytes/nm cre)
(let ((C-bytes (compile-posix-re->c-struct re-str #f)))
(set-cre:bytes/nm cre C-bytes)
(register-re-c-struct:bytes/nm cre)
C-bytes)))
(retcode (%cre-search C-bytes str start '#() -1 '#() '#())))
(if (integer? retcode)
(error retcode (%regerror-msg retcode C-bytes)
cre-search? cre str start)
retcode)))))
; 0 success, #f no-match, or non-zero int error code:
(define-stubless-foreign %cre-search
(compiled-regexp str start tvec max-psm svec evec) "re_search")
;;; Generate an error msg from an error code.
(define-stubless-foreign %regerror-msg (errcode re) "re_errint2str")
;;; Reclaiming compiled regexp storage
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define-stubless-foreign %free-re (re) "free_re")
;;; Whenever we make a new CRE, add the appropriate finalizer,
;;; so the C regex_t structure can be freeed
(define (free-bytes the-cre)
(if (cre:bytes the-cre)
(%free-re (cre:bytes the-cre))
(warn "free-bytes called on #f")))
(define (free-bytes/nm the-cre)
(if (cre:bytes/nm the-cre)
(%free-re (cre:bytes/nm the-cre))
(warn "free-bytes/nm called on #f")))
(define (register-re-c-struct:bytes cre)
(add-finalizer! cre free-bytes))
(define (register-re-c-struct:bytes/nm cre)
(add-finalizer! cre free-bytes/nm))
(define (register-re-c-struct cre c-bytes)
(error "function register-re-c-struct no longer supported"))
(define (clean-up-cres)
(warn "function clean-up-cres no longer supported"))
(if (not re-str)
#f
(begin
(if (not (cre:regexp/nm cre))
(set-cre:regexp/nm cre (make-regexp re-str
(regexp-option extended))))
(regexp-match (cre:regexp/nm cre) str #f #f #f)))))

View File

@ -11,14 +11,17 @@
(define (regexp-substitute port match . items)
(let* ((str (regexp-match:string match))
(sv (regexp-match:start match))
(ev (regexp-match:end match))
(submatches (regexp-match:submatches match))
(range (lambda (item) ; Return start & end of
(cond ((integer? item) ; ITEM's range in STR.
(values (vector-ref sv item)
(vector-ref ev item)))
((eq? 'pre item) (values 0 (vector-ref sv 0)))
((eq? 'post item) (values (vector-ref ev 0)
(let ((submatch (vector-ref submatches item)))
(values (match-start submatch)
(match-end submatch))))
((eq? 'pre item) (values 0
(match-start
(vector-ref submatches 0))))
((eq? 'post item) (values (match-end
(vector-ref submatches 0))
(string-length str)))
(else (error "Illegal substitution item."
item
@ -54,11 +57,15 @@
(define (regexp-substitute/global port re str . items)
(let ((str-len (string-length str))
(range (lambda (start sv ev item) ; Return start & end of
(range (lambda (start submatches item) ; Return start & end of
(cond ((integer? item) ; ITEM's range in STR.
(values (vector-ref sv item)
(vector-ref ev item)))
((eq? 'pre item) (values start (vector-ref sv 0)))
(let ((submatch (vector-ref submatches item)))
(values (match-start submatch)
(match-end submatch))))
((eq? 'pre item)
(values start
(match-start
(vector-ref submatches 0))))
(else (error "Illegal substitution item."
item
regexp-substitute/global)))))
@ -73,10 +80,9 @@
(if (<= start str-len)
(let ((match (regexp-search re str start)))
(if match
(let* ((sv (regexp-match:start match))
(ev (regexp-match:end match))
(s (vector-ref sv 0))
(e (vector-ref ev 0))
(let* ((submatches (regexp-match:submatches match))
(s (match-start (vector-ref submatches 0)))
(e (match-end (vector-ref submatches 0)))
(empty? (= s e)))
(for-each (lambda (item)
(cond ((string? item) (write-string item port))
@ -91,7 +97,7 @@
(recur (if empty? (+ 1 e) e)))
(else (receive (si ei)
(range start sv ev item)
(range start submatches item)
(write-string str port si ei)))))
items))
@ -99,14 +105,15 @@
;; Either we're making a string, or >1 POST.
(let* ((pieces (let recur ((start 0))
(if (> start str-len) '()
(if (> start str-len)
'()
(let ((match (regexp-search re str start))
(cached-post #f))
(if match
(let* ((sv (regexp-match:start match))
(ev (regexp-match:end match))
(s (vector-ref sv 0))
(e (vector-ref ev 0))
(let* ((submatches
(regexp-match:submatches match))
(s (match-start (vector-ref submatches 0)))
(e (match-end (vector-ref submatches 0)))
(empty? (= s e)))
(fold (lambda (item pieces)
(cond ((string? item)
@ -128,7 +135,7 @@
(append cached-post pieces))
(else (receive (si ei)
(range start sv ev item)
(range start submatches item)
(cons (substring str si ei)
pieces)))))
'() items))