picrin/etc/R7RS/src/bv2string.sch

600 lines
22 KiB
Scheme

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Copyright 2007 William D Clinger.
;
; Permission to copy this software, in whole or in part, to use this
; software for any lawful purpose, and to redistribute this software
; is granted subject to the restriction that all copies made of this
; software must include this copyright notice in full.
;
; I also request that you send me a copy of any improvements that you
; make to this software so that they may be incorporated within it to
; the benefit of the Scheme community.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Tests of string <-> bytevector conversions.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(import (rnrs base)
(rnrs unicode)
(rnrs bytevectors)
(rnrs control)
(rnrs io simple)
(rnrs mutable-strings))
; Crude test rig, just for benchmarking.
(define failed-tests '())
(define (test name actual expected)
(if (not (equal? actual expected))
(begin (display "******** FAILED TEST ******** ")
(display name)
(newline)
(set! failed-tests (cons name failed-tests)))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; The R6RS doesn't specify exactly how many replacement
; characters get generated by an encoding or decoding error,
; so the results of some tests are compared by treating any
; sequence of consecutive replacement characters the same as
; a single replacement character.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (string~? s1 s2)
(define (replacement? c)
(char=? c #\xfffd))
(define (canonicalized s)
(let loop ((rchars (reverse (string->list s)))
(cchars '()))
(cond ((or (null? rchars) (null? (cdr rchars)))
(list->string cchars))
((and (replacement? (car rchars))
(replacement? (cadr rchars)))
(loop (cdr rchars) cchars))
(else
(loop (cdr rchars) (cons (car rchars) cchars))))))
(string=? (canonicalized s1) (canonicalized s2)))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Basic sanity tests, followed by stress tests on random inputs.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (string-bytevector-tests
*random-stress-tests* *random-stress-test-max-size*)
(define (test-roundtrip bvec tostring tobvec)
(let* ((s1 (tostring bvec))
(b2 (tobvec s1))
(s2 (tostring b2)))
(test "round trip of string conversion" (string=? s1 s2) #t)))
; This random number generator doesn't have to be good.
; It just has to be fast.
(define random
(letrec ((random14
(lambda (n)
(set! x (mod (+ (* a x) c) (+ m 1)))
(mod (div x 8) n)))
(a 701)
(x 1)
(c 743483)
(m 524287)
(loop
(lambda (q r n)
(if (zero? q)
(mod r n)
(loop (div q 16384)
(+ (* 16384 r) (random14 16384))
n)))))
(lambda (n)
(if (< n 16384)
(random14 n)
(loop (div n 16384) (random14 16384) n)))))
; Returns a random bytevector of length up to n.
(define (random-bytevector n)
(let* ((n (random n))
(bv (make-bytevector n)))
(do ((i 0 (+ i 1)))
((= i n) bv)
(bytevector-u8-set! bv i (random 256)))))
; Returns a random bytevector of even length up to n.
(define (random-bytevector2 n)
(let* ((n (random n))
(n (if (odd? n) (+ n 1) n))
(bv (make-bytevector n)))
(do ((i 0 (+ i 1)))
((= i n) bv)
(bytevector-u8-set! bv i (random 256)))))
; Returns a random bytevector of multiple-of-4 length up to n.
(define (random-bytevector4 n)
(let* ((n (random n))
(n (* 4 (round (/ n 4))))
(bv (make-bytevector n)))
(do ((i 0 (+ i 1)))
((= i n) bv)
(bytevector-u8-set! bv i (random 256)))))
(test "utf-8, BMP"
(bytevector=? (string->utf8 "k\x007f;\x0080;\x07ff;\x0800;\xffff;")
'#vu8(#x6b
#x7f
#b11000010 #b10000000
#b11011111 #b10111111
#b11100000 #b10100000 #b10000000
#b11101111 #b10111111 #b10111111))
#t)
(test "utf-8, supplemental"
(bytevector=? (string->utf8 "\x010000;\x10ffff;")
'#vu8(#b11110000 #b10010000 #b10000000 #b10000000
#b11110100 #b10001111 #b10111111 #b10111111))
#t)
(test "utf-8, errors 1"
(string~? (utf8->string '#vu8(#x61 ; a
#xc0 #x62 ; ?b
#xc1 #x63 ; ?c
#xc2 #x64 ; ?d
#x80 #x65 ; ?e
#xc0 #xc0 #x66 ; ??f
#xe0 #x67 ; ?g
))
"a\xfffd;b\xfffd;c\xfffd;d\xfffd;e\xfffd;\xfffd;f\xfffd;g")
#t)
(test "utf-8, errors 2"
(string~? (utf8->string '#vu8(#xe0 #x80 #x80 #x68 ; ???h
#xe0 #xc0 #x80 #x69 ; ???i
#xf0 #x6a ; ?j
))
"\xfffd;\xfffd;\xfffd;h\xfffd;\xfffd;\xfffd;i\xfffd;j")
#t)
(test "utf-8, errors 3"
(string~? (utf8->string '#vu8(#x61 ; a
#xf0 #x80 #x80 #x80 #x62 ; ????b
#xf0 #x90 #x80 #x80 #x63 ; .c
))
"a\xfffd;\xfffd;\xfffd;\xfffd;b\x10000;c")
#t)
(test "utf-8, errors 4"
(string~? (utf8->string '#vu8(#x61 ; a
#xf0 #xbf #xbf #xbf #x64 ; .d
#xf0 #xbf #xbf #x65 ; ?e
#xf0 #xbf #x66 ; ?f
))
"a\x3ffff;d\xfffd;e\xfffd;f")
#t)
(test "utf-8, errors 5"
(string~? (utf8->string '#vu8(#x61 ; a
#xf4 #x8f #xbf #xbf #x62 ; .b
#xf4 #x90 #x80 #x80 #x63 ; ????c
))
"a\x10ffff;b\xfffd;\xfffd;\xfffd;\xfffd;c")
#t)
(test "utf-8, errors 6"
(string~? (utf8->string '#vu8(#x61 ; a
#xf5 #x80 #x80 #x80 #x64 ; ????d
))
"a\xfffd;\xfffd;\xfffd;\xfffd;d")
#t)
; ignores BOM signature
; Officially, there is no BOM signature for UTF-8,
; so this test is commented out.
#;(test "utf-8, BOM"
(string=? (utf8->string '#vu8(#xef #xbb #xbf #x61 #x62 #x63 #x64))
"abcd")
#t)
(test-roundtrip (random-bytevector 10) utf8->string string->utf8)
(do ((i 0 (+ i 1)))
((= i *random-stress-tests*))
(test-roundtrip (random-bytevector *random-stress-test-max-size*)
utf8->string string->utf8))
(test "utf-16, BMP"
(bytevector=? (string->utf16 "k\x007f;\x0080;\x07ff;\x0800;\xffff;")
'#vu8(#x00 #x6b
#x00 #x7f
#x00 #x80
#x07 #xff
#x08 #x00
#xff #xff))
#t)
(test "utf-16le, BMP"
(bytevector=? (string->utf16 "k\x007f;\x0080;\x07ff;\x0800;\xffff;"
'little)
'#vu8(#x6b #x00
#x7f #x00
#x80 #x00
#xff #x07
#x00 #x08
#xff #xff))
#t)
(test "utf-16, supplemental"
(bytevector=? (string->utf16 "\x010000;\xfdcba;\x10ffff;")
'#vu8(#xd8 #x00 #xdc #x00
#xdb #xb7 #xdc #xba
#xdb #xff #xdf #xff))
#t)
(test "utf-16le, supplemental"
(bytevector=? (string->utf16 "\x010000;\xfdcba;\x10ffff;" 'little)
'#vu8(#x00 #xd8 #x00 #xdc
#xb7 #xdb #xba #xdc
#xff #xdb #xff #xdf))
#t)
(test "utf-16be"
(bytevector=? (string->utf16 "ab\x010000;\xfdcba;\x10ffff;cd")
(string->utf16 "ab\x010000;\xfdcba;\x10ffff;cd" 'big))
#t)
(test "utf-16, errors 1"
(string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;"
(utf16->string
'#vu8(#x00 #x6b
#x00 #x7f
#x00 #x80
#x07 #xff
#x08 #x00
#xff #xff)
'big))
#t)
(test "utf-16, errors 2"
(string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;"
(utf16->string
'#vu8(#x00 #x6b
#x00 #x7f
#x00 #x80
#x07 #xff
#x08 #x00
#xff #xff)
'big #t))
#t)
(test "utf-16, errors 3"
(string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;"
(utf16->string
'#vu8(#xfe #xff ; big-endian BOM
#x00 #x6b
#x00 #x7f
#x00 #x80
#x07 #xff
#x08 #x00
#xff #xff)
'big))
#t)
(test "utf-16, errors 4"
(string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;"
(utf16->string
'#vu8(#x6b #x00
#x7f #x00
#x80 #x00
#xff #x07
#x00 #x08
#xff #xff)
'little #t))
#t)
(test "utf-16, errors 5"
(string~? "k\x007f;\x0080;\x07ff;\x0800;\xffff;"
(utf16->string
'#vu8(#xff #xfe ; little-endian BOM
#x6b #x00
#x7f #x00
#x80 #x00
#xff #x07
#x00 #x08
#xff #xff)
'big))
#t)
(let ((tostring (lambda (bv) (utf16->string bv 'big)))
(tostring-big (lambda (bv) (utf16->string bv 'big #t)))
(tostring-little (lambda (bv) (utf16->string bv 'little #t)))
(tobvec string->utf16)
(tobvec-big (lambda (s) (string->utf16 s 'big)))
(tobvec-little (lambda (s) (string->utf16 s 'little))))
(do ((i 0 (+ i 1)))
((= i *random-stress-tests*))
(test-roundtrip (random-bytevector2 *random-stress-test-max-size*)
tostring tobvec)
(test-roundtrip (random-bytevector2 *random-stress-test-max-size*)
tostring-big tobvec-big)
(test-roundtrip (random-bytevector2 *random-stress-test-max-size*)
tostring-little tobvec-little)))
(test "utf-32"
(bytevector=? (string->utf32 "abc")
'#vu8(#x00 #x00 #x00 #x61
#x00 #x00 #x00 #x62
#x00 #x00 #x00 #x63))
#t)
(test "utf-32be"
(bytevector=? (string->utf32 "abc" 'big)
'#vu8(#x00 #x00 #x00 #x61
#x00 #x00 #x00 #x62
#x00 #x00 #x00 #x63))
#t)
(test "utf-32le"
(bytevector=? (string->utf32 "abc" 'little)
'#vu8(#x61 #x00 #x00 #x00
#x62 #x00 #x00 #x00
#x63 #x00 #x00 #x00))
#t)
(test "utf-32, errors 1"
(string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e"
(utf32->string
'#vu8(#x00 #x00 #x00 #x61
#x00 #x00 #xd9 #x00
#x00 #x00 #x00 #x62
#x00 #x00 #xdd #xab
#x00 #x00 #x00 #x63
#x00 #x11 #x00 #x00
#x00 #x00 #x00 #x64
#x01 #x00 #x00 #x65
#x00 #x00 #x00 #x65)
'big))
#t)
(test "utf-32, errors 2"
(string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e"
(utf32->string
'#vu8(#x00 #x00 #x00 #x61
#x00 #x00 #xd9 #x00
#x00 #x00 #x00 #x62
#x00 #x00 #xdd #xab
#x00 #x00 #x00 #x63
#x00 #x11 #x00 #x00
#x00 #x00 #x00 #x64
#x01 #x00 #x00 #x65
#x00 #x00 #x00 #x65)
'big #t))
#t)
(test "utf-32, errors 3"
(string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e"
(utf32->string
'#vu8(#x00 #x00 #xfe #xff ; big-endian BOM
#x00 #x00 #x00 #x61
#x00 #x00 #xd9 #x00
#x00 #x00 #x00 #x62
#x00 #x00 #xdd #xab
#x00 #x00 #x00 #x63
#x00 #x11 #x00 #x00
#x00 #x00 #x00 #x64
#x01 #x00 #x00 #x65
#x00 #x00 #x00 #x65)
'big))
#t)
(test "utf-32, errors 4"
(string~? "\xfeff;a\xfffd;b\xfffd;c\xfffd;d\xfffd;e"
(utf32->string
'#vu8(#x00 #x00 #xfe #xff ; big-endian BOM
#x00 #x00 #x00 #x61
#x00 #x00 #xd9 #x00
#x00 #x00 #x00 #x62
#x00 #x00 #xdd #xab
#x00 #x00 #x00 #x63
#x00 #x11 #x00 #x00
#x00 #x00 #x00 #x64
#x01 #x00 #x00 #x65
#x00 #x00 #x00 #x65)
'big #t))
#t)
(test "utf-32, errors 5"
(string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e"
(utf32->string
'#vu8(#x61 #x00 #x00 #x00
#x00 #xd9 #x00 #x00
#x62 #x00 #x00 #x00
#xab #xdd #x00 #x00
#x63 #x00 #x00 #x00
#x00 #x00 #x11 #x00
#x64 #x00 #x00 #x00
#x65 #x00 #x00 #x01
#x65 #x00 #x00 #x00)
'little #t))
#t)
(test "utf-32, errors 6"
(string~? "a\xfffd;b\xfffd;c\xfffd;d\xfffd;e"
(utf32->string
'#vu8(#xff #xfe #x00 #x00 ; little-endian BOM
#x61 #x00 #x00 #x00
#x00 #xd9 #x00 #x00
#x62 #x00 #x00 #x00
#xab #xdd #x00 #x00
#x63 #x00 #x00 #x00
#x00 #x00 #x11 #x00
#x64 #x00 #x00 #x00
#x65 #x00 #x00 #x01
#x65 #x00 #x00 #x00)
'big))
#t)
(test "utf-32, errors 7"
(string~? "\xfeff;a\xfffd;b\xfffd;c\xfffd;d\xfffd;e"
(utf32->string
'#vu8(#xff #xfe #x00 #x00 ; little-endian BOM
#x61 #x00 #x00 #x00
#x00 #xd9 #x00 #x00
#x62 #x00 #x00 #x00
#xab #xdd #x00 #x00
#x63 #x00 #x00 #x00
#x00 #x00 #x11 #x00
#x64 #x00 #x00 #x00
#x65 #x00 #x00 #x01
#x65 #x00 #x00 #x00)
'little #t))
#t)
(let ((tostring (lambda (bv) (utf32->string bv 'big)))
(tostring-big (lambda (bv) (utf32->string bv 'big #t)))
(tostring-little (lambda (bv) (utf32->string bv 'little #t)))
(tobvec string->utf32)
(tobvec-big (lambda (s) (string->utf32 s 'big)))
(tobvec-little (lambda (s) (string->utf32 s 'little))))
(do ((i 0 (+ i 1)))
((= i *random-stress-tests*))
(test-roundtrip (random-bytevector4 *random-stress-test-max-size*)
tostring tobvec)
(test-roundtrip (random-bytevector4 *random-stress-test-max-size*)
tostring-big tobvec-big)
(test-roundtrip (random-bytevector4 *random-stress-test-max-size*)
tostring-little tobvec-little)))
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Exhaustive tests.
;
; Tests string <-> bytevector conversion on strings
; that contain every Unicode scalar value.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(define (exhaustive-string-bytevector-tests)
; Tests throughout an inclusive range.
(define (test-char-range lo hi tostring tobytevector)
(let* ((n (+ 1 (- hi lo)))
(s (make-string n))
(replacement-character (integer->char #xfffd)))
(do ((i lo (+ i 1)))
((> i hi))
(let ((c (if (or (<= 0 i #xd7ff)
(<= #xe000 i #x10ffff))
(integer->char i)
replacement-character)))
(string-set! s (- i lo) c)))
(test "test of long string conversion"
(string=? (tostring (tobytevector s)) s) #t)))
(define (test-exhaustively name tostring tobytevector)
;(display "Testing ")
;(display name)
;(display " conversions...")
;(newline)
(test-char-range 0 #xffff tostring tobytevector)
(test-char-range #x10000 #x1ffff tostring tobytevector)
(test-char-range #x20000 #x2ffff tostring tobytevector)
(test-char-range #x30000 #x3ffff tostring tobytevector)
(test-char-range #x40000 #x4ffff tostring tobytevector)
(test-char-range #x50000 #x5ffff tostring tobytevector)
(test-char-range #x60000 #x6ffff tostring tobytevector)
(test-char-range #x70000 #x7ffff tostring tobytevector)
(test-char-range #x80000 #x8ffff tostring tobytevector)
(test-char-range #x90000 #x9ffff tostring tobytevector)
(test-char-range #xa0000 #xaffff tostring tobytevector)
(test-char-range #xb0000 #xbffff tostring tobytevector)
(test-char-range #xc0000 #xcffff tostring tobytevector)
(test-char-range #xd0000 #xdffff tostring tobytevector)
(test-char-range #xe0000 #xeffff tostring tobytevector)
(test-char-range #xf0000 #xfffff tostring tobytevector)
(test-char-range #x100000 #x10ffff tostring tobytevector))
; Feel free to replace this with your favorite timing macro.
(define (timeit x) x)
(timeit (test-exhaustively "UTF-8" utf8->string string->utf8))
; NOTE: An unfortunate misunderstanding led to a late deletion
; of single-argument utf16->string from the R6RS. To get the
; correct effect of single-argument utf16->string, you have to
; use two arguments, as below.
;
;(timeit (test-exhaustively "UTF-16" utf16->string string->utf16))
(timeit (test-exhaustively "UTF-16"
(lambda (bv) (utf16->string bv 'big))
string->utf16))
; NOTE: To get the correct effect of two-argument utf16->string,
; you have to use three arguments, as below.
(timeit (test-exhaustively "UTF-16BE"
(lambda (bv) (utf16->string bv 'big #t))
(lambda (s) (string->utf16 s 'big))))
(timeit (test-exhaustively "UTF-16LE"
(lambda (bv) (utf16->string bv 'little #t))
(lambda (s) (string->utf16 s 'little))))
; NOTE: An unfortunate misunderstanding led to a late deletion
; of single-argument utf32->string from the R6RS. To get the
; correct effect of single-argument utf32->string, you have to
; use two arguments, as below.
;
;(timeit (test-exhaustively "UTF-32" utf32->string string->utf32))
(timeit (test-exhaustively "UTF-32"
(lambda (bv) (utf32->string bv 'big))
string->utf32))
; NOTE: To get the correct effect of two-argument utf32->string,
; you have to use three arguments, as below.
(timeit (test-exhaustively "UTF-32BE"
(lambda (bv) (utf32->string bv 'big #t))
(lambda (s) (string->utf32 s 'big))))
(timeit (test-exhaustively "UTF-32LE"
(lambda (bv) (utf32->string bv 'little #t))
(lambda (s) (string->utf32 s 'little)))))
(define (main)
(let* ((count (read))
(input1 (read))
(input2 (read))
(output (read))
(s3 (number->string count))
(s2 (number->string input2))
(s1 (number->string input1))
(name "bv2string"))
(run-r6rs-benchmark
(string-append name ":" s1 ":" s2 ":" s3)
count
(lambda ()
(string-bytevector-tests (hide count count) (hide count input1))
(exhaustive-string-bytevector-tests)
(length failed-tests))
(lambda (result) (equal? result output)))))