;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; Copyright 2007 William D Clinger. ; ; Permission to copy this software, in whole or in part, to use this ; software for any lawful purpose, and to redistribute this software ; is granted subject to the restriction that all copies made of this ; software must include this copyright notice in full. ; ; I also request that you send me a copy of any improvements that you ; make to this software so that they may be incorporated within it to ; the benefit of the Scheme community. ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; Tests of string <-> bytevector conversions. ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (import (rnrs base) (rnrs unicode) (rnrs bytevectors) (rnrs control) (rnrs io simple) (rnrs mutable-strings)) ; Crude test rig, just for benchmarking. (define failed-tests '()) (define (test name actual expected) (if (not (equal? actual expected)) (begin (display "******** FAILED TEST ******** ") (display name) (newline) (set! failed-tests (cons name failed-tests))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; The R6RS doesn't specify exactly how many replacement ; characters get generated by an encoding or decoding error, ; so the results of some tests are compared by treating any ; sequence of consecutive replacement characters the same as ; a single replacement character. ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (string~? s1 s2) (define (replacement? c) (char=? c #\xfffd)) (define (canonicalized s) (let loop ((rchars (reverse (string->list s))) (cchars '())) (cond ((or (null? rchars) (null? (cdr rchars))) (list->string cchars)) ((and (replacement? (car rchars)) (replacement? (cadr rchars))) (loop (cdr rchars) cchars)) (else (loop (cdr rchars) (cons (car rchars) cchars)))))) (string=? (canonicalized s1) (canonicalized s2))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; Basic sanity tests, followed by stress tests on random inputs. ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (string-bytevector-tests *random-stress-tests* *random-stress-test-max-size*) (define (test-roundtrip bvec tostring tobvec) (let* ((s1 (tostring bvec)) (b2 (tobvec s1)) (s2 (tostring b2))) (test "round trip of string conversion" (string=? s1 s2) #t))) ; This random number generator doesn't have to be good. ; It just has to be fast. (define random (letrec ((random14 (lambda (n) (set! x (mod (+ (* a x) c) (+ m 1))) (mod (div x 8) n))) (a 701) (x 1) (c 743483) (m 524287) (loop (lambda (q r n) (if (zero? q) (mod r n) (loop (div q 16384) (+ (* 16384 r) (random14 16384)) n))))) (lambda (n) (if (< n 16384) (random14 n) (loop (div n 16384) (random14 16384) n))))) ; Returns a random bytevector of length up to n. (define (random-bytevector n) (let* ((n (random n)) (bv (make-bytevector n))) (do ((i 0 (+ i 1))) ((= i n) bv) (bytevector-u8-set! bv i (random 256))))) ; Returns a random bytevector of even length up to n. (define (random-bytevector2 n) (let* ((n (random n)) (n (if (odd? n) (+ n 1) n)) (bv (make-bytevector n))) (do ((i 0 (+ i 1))) ((= i n) bv) (bytevector-u8-set! bv i (random 256))))) ; Returns a random bytevector of multiple-of-4 length up to n. (define (random-bytevector4 n) (let* ((n (random n)) (n (* 4 (round (/ n 4)))) (bv (make-bytevector n))) (do ((i 0 (+ i 1))) ((= i n) bv) (bytevector-u8-set! bv i (random 256))))) (test "utf-8, BMP" (bytevector=? (string->utf8 "k\x007f;\x0080;\x07ff;\x0800;\xffff;") '#vu8(#x6b #x7f #b11000010 #b10000000 #b11011111 #b10111111 #b11100000 #b10100000 #b10000000 #b11101111 #b10111111 #b10111111)) #t) (test "utf-8, supplemental" (bytevector=? (string->utf8 "\x010000;\x10ffff;") '#vu8(#b11110000 #b10010000 #b10000000 #b10000000 #b11110100 #b10001111 #b10111111 #b10111111)) #t) (test "utf-8, errors 1" (string~? (utf8->string '#vu8(#x61 ; a #xc0 #x62 ; ?b #xc1 #x63 ; ?c #xc2 #x64 ; ?d #x80 #x65 ; ?e #xc0 #xc0 #x66 ; ??f #xe0 #x67 ; ?g )) "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e\xfffd;\xfffd;f\xfffd;g") #t) (test "utf-8, errors 2" (string~? (utf8->string '#vu8(#xe0 #x80 #x80 #x68 ; ???h #xe0 #xc0 #x80 #x69 ; ???i #xf0 #x6a ; ?j )) "\xfffd;\xfffd;\xfffd;h\xfffd;\xfffd;\xfffd;i\xfffd;j") #t) (test "utf-8, errors 3" (string~? (utf8->string '#vu8(#x61 ; a #xf0 #x80 #x80 #x80 #x62 ; ????b #xf0 #x90 #x80 #x80 #x63 ; .c )) "a\xfffd;\xfffd;\xfffd;\xfffd;b\x10000;c") #t) (test "utf-8, errors 4" (string~? (utf8->string '#vu8(#x61 ; a #xf0 #xbf #xbf #xbf #x64 ; .d #xf0 #xbf #xbf #x65 ; ?e #xf0 #xbf #x66 ; ?f )) "a\x3ffff;d\xfffd;e\xfffd;f") #t) (test "utf-8, errors 5" (string~? (utf8->string '#vu8(#x61 ; a #xf4 #x8f #xbf #xbf #x62 ; .b #xf4 #x90 #x80 #x80 #x63 ; ????c )) "a\x10ffff;b\xfffd;\xfffd;\xfffd;\xfffd;c") #t) (test "utf-8, errors 6" (string~? (utf8->string '#vu8(#x61 ; a #xf5 #x80 #x80 #x80 #x64 ; ????d )) "a\xfffd;\xfffd;\xfffd;\xfffd;d") #t) ; ignores BOM signature ; Officially, there is no BOM signature for UTF-8, ; so this test is commented out. #;(test "utf-8, BOM" (string=? (utf8->string '#vu8(#xef #xbb #xbf #x61 #x62 #x63 #x64)) "abcd") #t) (test-roundtrip (random-bytevector 10) utf8->string string->utf8) (do ((i 0 (+ i 1))) ((= i *random-stress-tests*)) (test-roundtrip (random-bytevector *random-stress-test-max-size*) utf8->string string->utf8)) (test "utf-16, BMP" (bytevector=? (string->utf16 "k\x007f;\x0080;\x07ff;\x0800;\xffff;") '#vu8(#x00 #x6b #x00 #x7f #x00 #x80 #x07 #xff #x08 #x00 #xff #xff)) #t) (test "utf-16le, BMP" (bytevector=? (string->utf16 "k\x007f;\x0080;\x07ff;\x0800;\xffff;" 'little) '#vu8(#x6b #x00 #x7f #x00 #x80 #x00 #xff #x07 #x00 #x08 #xff #xff)) #t) (test "utf-16, supplemental" (bytevector=? (string->utf16 "\x010000;\xfdcba;\x10ffff;") '#vu8(#xd8 #x00 #xdc #x00 #xdb #xb7 #xdc #xba #xdb #xff #xdf #xff)) #t) (test "utf-16le, supplemental" (bytevector=? (string->utf16 "\x010000;\xfdcba;\x10ffff;" 'little) '#vu8(#x00 #xd8 #x00 #xdc #xb7 #xdb #xba #xdc #xff #xdb #xff #xdf)) #t) (test "utf-16be" (bytevector=? (string->utf16 "ab\x010000;\xfdcba;\x10ffff;cd") (string->utf16 "ab\x010000;\xfdcba;\x10ffff;cd" 'big)) #t) (test "utf-16, errors 1" (string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;" (utf16->string '#vu8(#x00 #x6b #x00 #x7f #x00 #x80 #x07 #xff #x08 #x00 #xff #xff) 'big)) #t) (test "utf-16, errors 2" (string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;" (utf16->string '#vu8(#x00 #x6b #x00 #x7f #x00 #x80 #x07 #xff #x08 #x00 #xff #xff) 'big #t)) #t) (test "utf-16, errors 3" (string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;" (utf16->string '#vu8(#xfe #xff ; big-endian BOM #x00 #x6b #x00 #x7f #x00 #x80 #x07 #xff #x08 #x00 #xff #xff) 'big)) #t) (test "utf-16, errors 4" (string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;" (utf16->string '#vu8(#x6b #x00 #x7f #x00 #x80 #x00 #xff #x07 #x00 #x08 #xff #xff) 'little #t)) #t) (test "utf-16, errors 5" (string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;" (utf16->string '#vu8(#xff #xfe ; little-endian BOM #x6b #x00 #x7f #x00 #x80 #x00 #xff #x07 #x00 #x08 #xff #xff) 'big)) #t) (let ((tostring (lambda (bv) (utf16->string bv 'big))) (tostring-big (lambda (bv) (utf16->string bv 'big #t))) (tostring-little (lambda (bv) (utf16->string bv 'little #t))) (tobvec string->utf16) (tobvec-big (lambda (s) (string->utf16 s 'big))) (tobvec-little (lambda (s) (string->utf16 s 'little)))) (do ((i 0 (+ i 1))) ((= i *random-stress-tests*)) (test-roundtrip (random-bytevector2 *random-stress-test-max-size*) tostring tobvec) (test-roundtrip (random-bytevector2 *random-stress-test-max-size*) tostring-big tobvec-big) (test-roundtrip (random-bytevector2 *random-stress-test-max-size*) tostring-little tobvec-little))) (test "utf-32" (bytevector=? (string->utf32 "abc") '#vu8(#x00 #x00 #x00 #x61 #x00 #x00 #x00 #x62 #x00 #x00 #x00 #x63)) #t) (test "utf-32be" (bytevector=? (string->utf32 "abc" 'big) '#vu8(#x00 #x00 #x00 #x61 #x00 #x00 #x00 #x62 #x00 #x00 #x00 #x63)) #t) (test "utf-32le" (bytevector=? (string->utf32 "abc" 'little) '#vu8(#x61 #x00 #x00 #x00 #x62 #x00 #x00 #x00 #x63 #x00 #x00 #x00)) #t) (test "utf-32, errors 1" (string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e" (utf32->string '#vu8(#x00 #x00 #x00 #x61 #x00 #x00 #xd9 #x00 #x00 #x00 #x00 #x62 #x00 #x00 #xdd #xab #x00 #x00 #x00 #x63 #x00 #x11 #x00 #x00 #x00 #x00 #x00 #x64 #x01 #x00 #x00 #x65 #x00 #x00 #x00 #x65) 'big)) #t) (test "utf-32, errors 2" (string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e" (utf32->string '#vu8(#x00 #x00 #x00 #x61 #x00 #x00 #xd9 #x00 #x00 #x00 #x00 #x62 #x00 #x00 #xdd #xab #x00 #x00 #x00 #x63 #x00 #x11 #x00 #x00 #x00 #x00 #x00 #x64 #x01 #x00 #x00 #x65 #x00 #x00 #x00 #x65) 'big #t)) #t) (test "utf-32, errors 3" (string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e" (utf32->string '#vu8(#x00 #x00 #xfe #xff ; big-endian BOM #x00 #x00 #x00 #x61 #x00 #x00 #xd9 #x00 #x00 #x00 #x00 #x62 #x00 #x00 #xdd #xab #x00 #x00 #x00 #x63 #x00 #x11 #x00 #x00 #x00 #x00 #x00 #x64 #x01 #x00 #x00 #x65 #x00 #x00 #x00 #x65) 'big)) #t) (test "utf-32, errors 4" (string~? "\xfeff;a\xfffd;b\xfffd;c\xfffd;d\xfffd;e" (utf32->string '#vu8(#x00 #x00 #xfe #xff ; big-endian BOM #x00 #x00 #x00 #x61 #x00 #x00 #xd9 #x00 #x00 #x00 #x00 #x62 #x00 #x00 #xdd #xab #x00 #x00 #x00 #x63 #x00 #x11 #x00 #x00 #x00 #x00 #x00 #x64 #x01 #x00 #x00 #x65 #x00 #x00 #x00 #x65) 'big #t)) #t) (test "utf-32, errors 5" (string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e" (utf32->string '#vu8(#x61 #x00 #x00 #x00 #x00 #xd9 #x00 #x00 #x62 #x00 #x00 #x00 #xab #xdd #x00 #x00 #x63 #x00 #x00 #x00 #x00 #x00 #x11 #x00 #x64 #x00 #x00 #x00 #x65 #x00 #x00 #x01 #x65 #x00 #x00 #x00) 'little #t)) #t) (test "utf-32, errors 6" (string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e" (utf32->string '#vu8(#xff #xfe #x00 #x00 ; little-endian BOM #x61 #x00 #x00 #x00 #x00 #xd9 #x00 #x00 #x62 #x00 #x00 #x00 #xab #xdd #x00 #x00 #x63 #x00 #x00 #x00 #x00 #x00 #x11 #x00 #x64 #x00 #x00 #x00 #x65 #x00 #x00 #x01 #x65 #x00 #x00 #x00) 'big)) #t) (test "utf-32, errors 7" (string~? "\xfeff;a\xfffd;b\xfffd;c\xfffd;d\xfffd;e" (utf32->string '#vu8(#xff #xfe #x00 #x00 ; little-endian BOM #x61 #x00 #x00 #x00 #x00 #xd9 #x00 #x00 #x62 #x00 #x00 #x00 #xab #xdd #x00 #x00 #x63 #x00 #x00 #x00 #x00 #x00 #x11 #x00 #x64 #x00 #x00 #x00 #x65 #x00 #x00 #x01 #x65 #x00 #x00 #x00) 'little #t)) #t) (let ((tostring (lambda (bv) (utf32->string bv 'big))) (tostring-big (lambda (bv) (utf32->string bv 'big #t))) (tostring-little (lambda (bv) (utf32->string bv 'little #t))) (tobvec string->utf32) (tobvec-big (lambda (s) (string->utf32 s 'big))) (tobvec-little (lambda (s) (string->utf32 s 'little)))) (do ((i 0 (+ i 1))) ((= i *random-stress-tests*)) (test-roundtrip (random-bytevector4 *random-stress-test-max-size*) tostring tobvec) (test-roundtrip (random-bytevector4 *random-stress-test-max-size*) tostring-big tobvec-big) (test-roundtrip (random-bytevector4 *random-stress-test-max-size*) tostring-little tobvec-little))) ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ; Exhaustive tests. ; ; Tests string <-> bytevector conversion on strings ; that contain every Unicode scalar value. ; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (define (exhaustive-string-bytevector-tests) ; Tests throughout an inclusive range. (define (test-char-range lo hi tostring tobytevector) (let* ((n (+ 1 (- hi lo))) (s (make-string n)) (replacement-character (integer->char #xfffd))) (do ((i lo (+ i 1))) ((> i hi)) (let ((c (if (or (<= 0 i #xd7ff) (<= #xe000 i #x10ffff)) (integer->char i) replacement-character))) (string-set! s (- i lo) c))) (test "test of long string conversion" (string=? (tostring (tobytevector s)) s) #t))) (define (test-exhaustively name tostring tobytevector) ;(display "Testing ") ;(display name) ;(display " conversions...") ;(newline) (test-char-range 0 #xffff tostring tobytevector) (test-char-range #x10000 #x1ffff tostring tobytevector) (test-char-range #x20000 #x2ffff tostring tobytevector) (test-char-range #x30000 #x3ffff tostring tobytevector) (test-char-range #x40000 #x4ffff tostring tobytevector) (test-char-range #x50000 #x5ffff tostring tobytevector) (test-char-range #x60000 #x6ffff tostring tobytevector) (test-char-range #x70000 #x7ffff tostring tobytevector) (test-char-range #x80000 #x8ffff tostring tobytevector) (test-char-range #x90000 #x9ffff tostring tobytevector) (test-char-range #xa0000 #xaffff tostring tobytevector) (test-char-range #xb0000 #xbffff tostring tobytevector) (test-char-range #xc0000 #xcffff tostring tobytevector) (test-char-range #xd0000 #xdffff tostring tobytevector) (test-char-range #xe0000 #xeffff tostring tobytevector) (test-char-range #xf0000 #xfffff tostring tobytevector) (test-char-range #x100000 #x10ffff tostring tobytevector)) ; Feel free to replace this with your favorite timing macro. (define (timeit x) x) (timeit (test-exhaustively "UTF-8" utf8->string string->utf8)) ; NOTE: An unfortunate misunderstanding led to a late deletion ; of single-argument utf16->string from the R6RS. To get the ; correct effect of single-argument utf16->string, you have to ; use two arguments, as below. ; ;(timeit (test-exhaustively "UTF-16" utf16->string string->utf16)) (timeit (test-exhaustively "UTF-16" (lambda (bv) (utf16->string bv 'big)) string->utf16)) ; NOTE: To get the correct effect of two-argument utf16->string, ; you have to use three arguments, as below. (timeit (test-exhaustively "UTF-16BE" (lambda (bv) (utf16->string bv 'big #t)) (lambda (s) (string->utf16 s 'big)))) (timeit (test-exhaustively "UTF-16LE" (lambda (bv) (utf16->string bv 'little #t)) (lambda (s) (string->utf16 s 'little)))) ; NOTE: An unfortunate misunderstanding led to a late deletion ; of single-argument utf32->string from the R6RS. To get the ; correct effect of single-argument utf32->string, you have to ; use two arguments, as below. ; ;(timeit (test-exhaustively "UTF-32" utf32->string string->utf32)) (timeit (test-exhaustively "UTF-32" (lambda (bv) (utf32->string bv 'big)) string->utf32)) ; NOTE: To get the correct effect of two-argument utf32->string, ; you have to use three arguments, as below. (timeit (test-exhaustively "UTF-32BE" (lambda (bv) (utf32->string bv 'big #t)) (lambda (s) (string->utf32 s 'big)))) (timeit (test-exhaustively "UTF-32LE" (lambda (bv) (utf32->string bv 'little #t)) (lambda (s) (string->utf32 s 'little))))) (define (main) (let* ((count (read)) (input1 (read)) (input2 (read)) (output (read)) (s3 (number->string count)) (s2 (number->string input2)) (s1 (number->string input1)) (name "bv2string")) (run-r6rs-benchmark (string-append name ":" s1 ":" s2 ":" s3) count (lambda () (string-bytevector-tests (hide count count) (hide count input1)) (exhaustive-string-bytevector-tests) (length failed-tests)) (lambda (result) (equal? result output)))))