From 8e78e4cdbb3f3ab52e65ea77344061b05e2848ea Mon Sep 17 00:00:00 2001 From: JeffBezanson Date: Thu, 2 Apr 2009 02:22:38 +0000 Subject: [PATCH] making some utf8 routines more robust against invalid data implementing tail position in compiler adding arg count checking for instructionized builtins --- femtolisp/compiler.lsp | 160 +++++++++++++++++++++++------------------ llt/utf8.c | 13 +++- 2 files changed, 102 insertions(+), 71 deletions(-) diff --git a/femtolisp/compiler.lsp b/femtolisp/compiler.lsp index 076c05d..13426b4 100644 --- a/femtolisp/compiler.lsp +++ b/femtolisp/compiler.lsp @@ -18,13 +18,30 @@ :+ :- :* :/ :< :lognot :compare - :vector :aref :aset :length :for + :vector :aref :aset! :length :for :loadt :loadf :loadnil :load0 :load1 :loadv :loadv.l :loadg :loada :loadc :setg :seta :setc :loadg.l :setg.l - :closure :trycatch])) + :closure :trycatch :tcall :tapply])) + +(define arg-counts + (table :eq? 2 :eqv? 2 + :equal? 2 :atom? 1 + :not 1 :null? 1 + :boolean? 1 :symbol? 1 + :number? 1 :bound? 1 + :pair? 1 :builtin? 1 + :vector? 1 :fixnum? 1 + :cons 2 :car 1 + :cdr 1 :set-car! 2 + :set-cdr! 2 :eval 1 + :eval* 1 :apply 2 + :< 2 :lognot 1 + :compare 2 :aref 2 + :aset! 3 :length 1 + :for 3)) (define 1/Instructions (table.invert Instructions)) @@ -104,7 +121,7 @@ (io.write bcode (uint32 nxt)) (set! i (+ i 1))) - ((:loada :seta :call :loadv :loadg :setg :popn + ((:loada :seta :call :tcall :loadv :loadg :setg :popn :list :+ :- :* :/ :vector) (io.write bcode (uint8 nxt)) (set! i (+ i 1))) @@ -168,7 +185,7 @@ (if (null? curr) lev (+ lev 1)) #f))))) -(define (compile-sym g s env Is) +(define (compile-sym g env s Is) (let ((loc (lookup-sym s env 0 #t))) (case (car loc) (arg (emit g (aref Is 0) (cadr loc))) @@ -191,69 +208,64 @@ ,(cons 'begin (cdr clause)) ,(cond-clauses->if (cdr lst))))))) -(define (compile-if g x env) +(define (compile-if g env tail? x) (let ((elsel (make-label g)) (endl (make-label g))) - (compile-in g (cadr x) env) + (compile-in g env #f (cadr x)) (emit g :brf elsel) - (compile-in g (caddr x) env) - (emit g :jmp endl) + (compile-in g env tail? (caddr x)) + (if tail? + (emit g :ret) + (emit g :jmp endl)) (mark-label g elsel) - (compile-in g (if (pair? (cdddr x)) - (cadddr x) - #f) - env) + (compile-in g env tail? + (if (pair? (cdddr x)) + (cadddr x) + #f)) (mark-label g endl))) -(define (compile-begin g forms env) - (cond ((atom? forms) (compile-in g #f env)) +(define (compile-begin g env tail? forms) + (cond ((atom? forms) (compile-in g env tail? #f)) ((atom? (cdr forms)) - (compile-in g (car forms) env)) + (compile-in g env tail? (car forms))) (else - (compile-in g (car forms) env) + (compile-in g env #f (car forms)) (emit g :pop) - (compile-begin g (cdr forms) env)))) + (compile-begin g env tail? (cdr forms))))) -(define (compile-prog1 g x env) - (compile-in g (cadr x) env) +(define (compile-prog1 g env x) + (compile-in g env #f (cadr x)) (if (pair? (cddr x)) - (begin (compile-begin g (cddr x) env) + (begin (compile-begin g env #f (cddr x)) (emit g :pop)))) -(define (compile-while g cond body env) +(define (compile-while g env cond body) (let ((top (make-label g)) (end (make-label g))) (mark-label g top) - (compile-in g cond env) + (compile-in g env #f cond) (emit g :brf end) - (compile-in g body env) + (compile-in g env #f body) (emit g :pop) (emit g :jmp top) (mark-label g end))) -(define (compile-and g forms env) - (cond ((atom? forms) (compile-in g #t env)) - ((atom? (cdr forms)) (compile-in g (car forms) env)) +(define (compile-short-circuit g env tail? forms default branch) + (cond ((atom? forms) (compile-in g env tail? default)) + ((atom? (cdr forms)) (compile-in g env tail? (car forms))) (else (let ((end (make-label g))) - (compile-in g (car forms) env) + (compile-in g env #f (car forms)) (emit g :dup) - (emit g :brf end) + (emit g branch end) (emit g :pop) - (compile-and g (cdr forms) env) + (compile-short-circuit g env tail? (cdr forms) default branch) (mark-label g end))))) -(define (compile-or g forms env) - (cond ((atom? forms) (compile-in g #f env)) - ((atom? (cdr forms)) (compile-in g (car forms) env)) - (else - (let ((end (make-label g))) - (compile-in g (car forms) env) - (emit g :dup) - (emit g :brt end) - (emit g :pop) - (compile-or g (cdr forms) env) - (mark-label g end))))) +(define (compile-and g env tail? forms) + (compile-short-circuit g env tail? forms #t :brf)) +(define (compile-or g env tail? forms) + (compile-short-circuit g env tail? forms #f :brt)) (define MAX_ARGS 127) @@ -276,10 +288,10 @@ (define (just-compile-args g lst env) (for-each (lambda (a) - (compile-in g a env)) + (compile-in g env #f a)) lst)) -(define (compile-arglist g lst env) +(define (compile-arglist g env lst) (let ((argtail (length> lst MAX_ARGS))) (if argtail (begin (just-compile-args g (list-head lst MAX_ARGS) env) @@ -287,12 +299,12 @@ (cons nconc (map (lambda (l) (cons list l)) (list-partition argtail MAX_ARGS))))) - (compile-in g rest env)) + (compile-in g env #f rest)) (+ MAX_ARGS 1)) (begin (just-compile-args g lst env) (length lst))))) -(define (compile-app g x env) +(define (compile-app g env tail? x) (let ((head (car x))) (let ((head (if (and (symbol? head) @@ -305,16 +317,23 @@ (let ((b (and (builtin? head) (builtin->instruction head)))) (if (not b) - (compile-in g head env)) - (let ((nargs (compile-arglist g (cdr x) env))) - (if b ;; TODO check arg count - (if (memq b '(:list :+ :- :* :/ :vector)) - (emit g b nargs) - (emit g b)) - (emit g :call nargs))))))) + (compile-in g env #f head)) + (let ((nargs (compile-arglist g env (cdr x)))) + (if b + (let ((count (get arg-counts b #f))) + (if (and count + (not (length= (cdr x) count))) + (error (string "compile error: " head " expects " count + (if (= count 1) + " argument." + " arguments.")))) + (if (memq b '(:list :+ :- :* :/ :vector)) + (emit g b nargs) + (emit g (if (and tail? (eq? b :apply)) :tapply b)))) + (emit g (if tail? :tcall :call) nargs))))))) -(define (compile-in g x env) - (cond ((symbol? x) (compile-sym g x env [:loada :loadc :loadg])) +(define (compile-in g env tail? x) + (cond ((symbol? x) (compile-sym g env x [:loada :loadc :loadg])) ((atom? x) (cond ((eq? x 0) (emit g :load0)) ((eq? x 1) (emit g :load1)) @@ -325,30 +344,30 @@ (else (case (car x) (quote (emit g :loadv (cadr x))) - (cond (compile-in g (cond->if x) env)) - (if (compile-if g x env)) - (begin (compile-begin g (cdr x) env)) - (prog1 (compile-prog1 g x env)) - (lambda (begin (emit g :loadv (compile-f x env)) + (cond (compile-in g env tail? (cond->if x))) + (if (compile-if g env tail? x)) + (begin (compile-begin g env tail? (cdr x))) + (prog1 (compile-prog1 g env tail? x)) + (lambda (begin (emit g :loadv (compile-f env x)) (emit g :closure))) - (and (compile-and g (cdr x) env)) - (or (compile-or g (cdr x) env)) - (while (compile-while g (cadr x) (caddr x) env)) - (set! (compile-in g (caddr x) env) - (compile-sym g (cadr x) env [:seta :setc :setg])) - (trycatch (compile-in g `(lambda () ,(cadr x)) env) - (compile-in g (caddr x)) + (and (compile-and g env tail? (cdr x))) + (or (compile-or g env tail? (cdr x))) + (while (compile-while g env (cadr x) (caddr x))) + (set! (compile-in g env #f (caddr x)) + (compile-sym g env (cadr x) [:seta :setc :setg])) + (trycatch (compile-in g env #f `(lambda () ,(cadr x))) + (compile-in g env #f (caddr x)) (emit g :trycatch)) - (else (compile-app g x env)))))) + (else (compile-app g env tail? x)))))) -(define (compile-f f env) +(define (compile-f env f) (let ((g (make-code-emitter))) - (compile-in g (caddr f) (cons (to-proper (cadr f)) env)) + (compile-in g (cons (to-proper (cadr f)) env) #t (caddr f)) (emit g :ret) `(compiled-lambda ,(cadr f) ,(bytecode g)))) (define (compile x) - (bytecode (compile-in (make-code-emitter) x ()))) + (bytecode (compile-in (make-code-emitter) () #t x))) (define (ref-uint32-LE a i) (+ (ash (aref a (+ i 0)) 0) @@ -392,7 +411,8 @@ (print-val (aref vals (aref code i))) (set! i (+ i 1))) - ((:loada :seta :call :popn :list :+ :- :* :/ :vector) + ((:loada :seta :call :tcall :popn :list :+ :- :* :/ + :vector) (princ (number->string (aref code i))) (set! i (+ i 1))) diff --git a/llt/utf8.c b/llt/utf8.c index a42158f..284aec4 100644 --- a/llt/utf8.c +++ b/llt/utf8.c @@ -93,12 +93,20 @@ size_t u8_toucs(u_int32_t *dest, size_t sz, const char *src, size_t srcsz) return 0; while (i < sz) { + if (!isutf(*src)) { // invalid sequence + dest[i++] = 0xFFFD; + src++; + if (src >= src_end) break; + continue; + } nb = trailingBytesForUTF8[(unsigned char)*src]; if (src + nb >= src_end) break; ch = 0; switch (nb) { /* these fall through deliberately */ + case 5: ch += (unsigned char)*src++; ch <<= 6; + case 4: ch += (unsigned char)*src++; ch <<= 6; case 3: ch += (unsigned char)*src++; ch <<= 6; case 2: ch += (unsigned char)*src++; ch <<= 6; case 1: ch += (unsigned char)*src++; ch <<= 6; @@ -242,17 +250,20 @@ size_t u8_strwidth(const char *s) if (sc) tot++; } else { + if (!isutf(sc)) { tot++; s++; continue; } nb = trailingBytesForUTF8[(unsigned char)sc]; ch = 0; switch (nb) { /* these fall through deliberately */ + case 5: ch += (unsigned char)*s++; ch <<= 6; + case 4: ch += (unsigned char)*s++; ch <<= 6; case 3: ch += (unsigned char)*s++; ch <<= 6; case 2: ch += (unsigned char)*s++; ch <<= 6; case 1: ch += (unsigned char)*s++; ch <<= 6; case 0: ch += (unsigned char)*s++; } ch -= offsetsFromUTF8[nb]; - w = wcwidth(ch); + w = wcwidth(ch); // might return -1 if (w > 0) tot += w; } }