From 0370fcb8bc49bd62b9a1bfa350901c34dcaa3f54 Mon Sep 17 00:00:00 2001 From: Martin Gasbichler Date: Mon, 14 Apr 2003 06:38:47 +0000 Subject: [PATCH] Added Oleg's binary-parsing --- s48/binary-parse/AUTHORS | 1 + s48/binary-parse/BLURB | 1 + s48/binary-parse/README | 18 ++ s48/binary-parse/binary-parse.scm | 462 ++++++++++++++++++++++++++++++ s48/binary-parse/interfaces.scm | 2 + s48/binary-parse/packages.scm | 6 + 6 files changed, 490 insertions(+) create mode 100644 s48/binary-parse/AUTHORS create mode 100644 s48/binary-parse/BLURB create mode 100644 s48/binary-parse/README create mode 100644 s48/binary-parse/binary-parse.scm create mode 100644 s48/binary-parse/interfaces.scm create mode 100644 s48/binary-parse/packages.scm diff --git a/s48/binary-parse/AUTHORS b/s48/binary-parse/AUTHORS new file mode 100644 index 0000000..eae3d05 --- /dev/null +++ b/s48/binary-parse/AUTHORS @@ -0,0 +1 @@ +Martin Gasbichler, Oleg Kiselyov \ No newline at end of file diff --git a/s48/binary-parse/BLURB b/s48/binary-parse/BLURB new file mode 100644 index 0000000..707b252 --- /dev/null +++ b/s48/binary-parse/BLURB @@ -0,0 +1 @@ +binary-parse: Reading of bits from a byte-stream. \ No newline at end of file diff --git a/s48/binary-parse/README b/s48/binary-parse/README new file mode 100644 index 0000000..61edb37 --- /dev/null +++ b/s48/binary-parse/README @@ -0,0 +1,18 @@ +(make-bit-reader byte-reader) -> bit-reader + +Given a BYTE-READER (a thunk), construct and return a function + + (bit-reader N) -> integer + +that reads N bits from a byte-stream represented by the BYTE-READER. + +The BYTE-READER is a function that takes no arguments and returns +the current byte as an exact integer [0-255]. The byte reader +should return #f on EOF. +The bit reader returns N bits as an exact unsigned integer, +0 -... (no limit). N must be a positive integer, otherwise the bit reader +returns #f. There is no upper limit on N -- other than the size of the +input stream itself and the amount of (virtual) memory an OS is willing +to give to your process. If you want to read 1M of _bits_, go ahead. + +It is assumed that the bit order is the most-significant bit first. diff --git a/s48/binary-parse/binary-parse.scm b/s48/binary-parse/binary-parse.scm new file mode 100644 index 0000000..740ed5c --- /dev/null +++ b/s48/binary-parse/binary-parse.scm @@ -0,0 +1,462 @@ +; Binary parsing + +;---------------------------------------- +; Apologia +; +; Binary parsing and unparsing are transformations between primitive or +; composite Scheme values and their external binary representations. +; +; Examples include reading and writing JPEG, TIFF, MP3, ELF file +; formats, communicating with DNS, Kerberos, LDAP, SLP internet +; services, participating in Sun RPC and CORBA/IIOP distributed systems, +; storing and retrieving (arrays of) floating-point numbers in a +; portable and efficient way. This project will propose a set of low- and +; intermediate- level procedures that make binary parsing possible. + +; Scheme is a good language to do research in text compression. Text +; compression involves a great deal of building and traversing +; dictionaries, trees and similar data structures, where Scheme +; excels. Performance doesn't matter in research, but the size of +; compressed files does (to figure out the bpc for the common +; benchmarks). Variable-bit i/o is a necessity. It is implemented +; in the present file. + +; ASN.1 corresponds to a higher-level parsing (LR parser +; vs. lexer). Information in LDAP responses and X.509 certificates is +; structural and recursive, and so lends itself to be processed in +; Scheme. Variable bit i/o is necessary, and so is a binary lexer for +; a LR parser. Parsing of ASN.1 is a highly profitable enterprise + +;---------------------------------------- +; The outline of the project +; +; Primitives and streams +; +; - read-byte +; - read-u8vector (cf. read-string) +; - with-input-from-u8vector, with-input-from-encoded-u8vector 'base64,... +; building binary i/o streams from a sequence of bytes. Streams over +; u8vector, u16vector, etc. provide a serial access to memory. See SRFI-4 +; +; - read-bit, read-bits via overlayed streams given read-byte +; implemented in the present file. +; +; - mmap-u8vector, munmap-u8vector +; +; Conversions +; - u8vector->integer u8vector endianness, +; u8vector->sinteger u8vector endianness +; These conversion procedures turn a sequence of bytes to an unsigned or +; signed integer, minding the byte order. The u8vector in question can +; have size 1,2,4,8, 3 etc. bytes. These two functions therefore can be +; used to read shorts, longs, extra longs, etc. numbers. +; - u8vector-reverse and other useful u8vector operations +; +; - modf, frexp, ldexp +; The above primitives can be emulated in R5RS, yet they are quite handy +; (for portable FP manipulation) and can be executed very efficiently by +; an FPU. +; +; Higher-level parsing and combinators +; These are combinators that can compose primitives above for more +; complex (possibly iterative) actions. +; +; - skip-bits, next-u8token,... +; - IIOP, RPC/XDR, RMI +; - binary lexer for existing LR/LL-parsers +; +; The composition of primitives and combinators will represent binary +; parsing language in a _full_ notation. This is similar to XPath +; expressions in full notation. Later we need to find out the +; most-frequently used patterns of the binary parsing language and +; design an abbreviated notation. The latter will need a special +; "interpreter". The abbreviated notation may turn out to look like +; Olin's regular expressions. + +; $Id: binary-parse.scm,v 1.1 2003/04/14 06:38:47 mainzelM Exp $ + + +;---------------------------------------- +; Test harness +; +; The following macro runs built-in test cases -- or does not run, +; depending on which of the two lines below you commented out +;(define-syntax run-test +; (syntax-rules () +; ((run-test body ...) +; (begin (newline) +; (display "-->Test") +; (newline) +; body ...)))) + +(define-syntax run-test + (syntax-rules () + ((run-test body ...) + (begin #f)))) + +;(define-macro (run-test . body) '(begin #f)) +;(defmacro run-test body `(begin (display "\n-->Test\n") ,@body)) + +;;======================================================================== +;; Configuration section +;; +; Performance is very important for binary parsing. We have to get all +; help from a particular Scheme system we can get. If a Scheme function +; can support the following primitives faster, we should take +; advantage of that fact. + + +;--- +; Configuration for Gambit. See below for other systems, as well as R5RS +; implementations + +;(define-macro (logior x y) `(##fixnum.logior ,x ,y)) +;(define-macro (logand x y) `(##fixnum.logand ,x ,y)) +;(define-macro (lsh-left x n) `(##fixnum.shl ,x ,n)) +;(define-macro (lsh-right x n) `(##fixnum.lshr ,x ,n)) +;(define-macro (lsh-left-one x) `(##fixnum.shl ,x 1)) +;(define-macro (lsh-right-one x) `(##fixnum.lshr ,x 1)) + +;(define-macro (-- x) `(##fixnum.- ,x 1)) +;(define-macro (++ x) `(##fixnum.+ ,x 1)) + +;(define-macro (bit-set? x mask) ; return x & mask != 0 +; `(##not (##fixnum.zero? (logand ,x ,mask))) +;) +; End of the Gambit-specific configuration section +;--- + +; --- +; Configuration for S48/scsh + +(define logior bitwise-ior) +(define logand bitwise-and) +(define (lsh-right x n) (arithmetic-shift x (- n))) +(define lsh-left arithmetic-shift) +(define (lsh-left-one x) (arithmetic-shift x 1)) +(define (lsh-right-one x) (arithmetic-shift x -1)) + +(define (bit-set? x mask) + (not (zero? (bitwise-and x mask)))) + +; -- and ++ are not valid R5RS names: +; ++ is not used in this file anyway +; -- is replaced by decrement-one: +(define (decrement-one n) + (- n 1)) + +(define-syntax time + (syntax-rules () + ((time expr) + expr))) + +; End of S48/scsh configuration +; --- + +; combine bytes in the MSB order. A byte may be #f +(define (combine-two b1 b2) ; The result is for sure a fixnum + (and b1 b2 (logior (lsh-left b1 8) b2))) + +(define (combine-three b1 b2 b3) ; The result is for sure a fixnum + (and b1 b2 b3 (logior (lsh-left (logior (lsh-left b1 8) b2) 8) b3))) + +; Here the result may be a BIGNUM +(define (combine-bytes . bytes) + (cond + ((null? bytes) 0) + ((not (car bytes)) #f) + (else + (let loop ((bytes (cdr bytes)) (result (car bytes))) + (cond + ((null? bytes) result) + ((not (car bytes)) #f) + (else (loop (cdr bytes) (+ (car bytes) (* 256 result))))))))) + +;--- +; R5RS implementations of the primitives +; This is the most portable -- and the slowest implementation + +; See also logical.scm from SLIB +; (define (logior x y) +; (cond ((= x y) x) +; ((zero? x) y) +; ((zero? y) x) +; (else +; (+ (* (logior (quotient x 2) (quotient y 2)) 2) +; (if (and (even? x) (even? y)) 0 1))))) +; (define (logand x y) +; (cond ((= x y) x) +; ((zero? x) 0) +; ((zero? y) 0) +; (else +; (+ (* (logand (quotient x 2) (quotient y 2)) 2) +; (if (or (even? x) (even? y)) 0 1))))) + +; (define (lsh-left x n) (* x (expt 2 n))) +; (define (lsh-right x n) (quotient x (expt 2 n))) +; (define (lsh-left-one x) (* x 2)) +; (define (lsh-right-one x) (quotient x 2)) + + +; (define (-- x) (- x 1)) +; (define (++ x) (+ x 1)) + + ;(define (bit-set? x mask) ; return x & mask != 0 +; (odd? (quotient x mask)) ; mask is an exact power of two + ;) + + +;======================================================================== +; Reading a byte + +; Read-byte is a fundamental primitive; it needs to be +; added to the standard. Most of the other functions are library +; procedures. The following is an approximation, which clearly doesn't +; hold if the port is a Unicode (especially UTF-8) character stream. + +; Return a byte as an exact integer [0,255], or the EOF object +(define (read-byte port) + (let ((c (read-char port))) + (if (eof-object? c) c (char->ascii c)))) + +; The same as above, but returns #f on EOF. +(define (read-byte-f port) + (let ((c (read-char port))) + (and (not (eof-object? c)) (char->ascii c)))) + + + +;======================================================================== +; Bit stream + +; -- Function: make-bit-reader BYTE-READER + +; Given a BYTE-READER (a thunk), construct and return a function +; bit-reader N +; +; that reads N bits from a byte-stream represented by the BYTE-READER. +; The BYTE-READER is a function that takes no arguments and returns +; the current byte as an exact integer [0-255]. The byte reader +; should return #f on EOF. +; The bit reader returns N bits as an exact unsigned integer, +; 0 -... (no limit). N must be a positive integer, otherwise the bit reader +; returns #f. There is no upper limit on N -- other than the size of the +; input stream itself and the amount of (virtual) memory an OS is willing +; to give to your process. If you want to read 1M of _bits_, go ahead. +; +; It is assumed that the bit order is the most-significant bit first. +; +; Note the bit reader keeps the following condition true at all times: +; (= current-inport-pos (ceiling (/ no-bits-read 8))) +; That is, no byte is read until the very moment we really need (some of) +; its bits. The bit reader does _not_ "byte read ahead". +; Therefore, it can be used to handle a concatenation of different +; bit/byte streams *STRICTLY* sequentially, _without_ 'backing up a char', +; 'unreading-char' etc. tricks. +; For example, make-bit-reader has been used to read GRIB files of +; meteorological data, which made of several bitstreams with headers and +; tags. +; Thus careful attention to byte-buffering and optimization are the +; features of this bit reader. +; +; Usage example: +; (define bit-reader (make-bit-reader (lambda () #b11000101))) +; (bit-reader 3) ==> 6 +; (bit-reader 4) ==> 2 +; The test driver below is another example. +; +; Notes on the algorithm. +; The function recognizes and handles the following special cases: +; - the buffer is empty and 8, 16, 24 bits are to be read +; - reading all bits which are currently in the byte-buffer +; (and then maybe more) +; - reading only one bit + +; Since the bit reader is going to be called many times, optimization is +; critical. We need all the help from the compiler/interpreter +; we can get. + + +(define (make-bit-reader byte-reader) + (let ((buffer 0) (mask 0) ; mask = 128 means that the buffer is full and + ; the msb bit is the current (yet unread) bit + (bits-in-buffer 0)) + + ; read the byte into the buffer and set up the counters. + ; return #f on eof + (define (set-buffer) + (set! buffer (byte-reader)) + (and buffer + (begin + (set! bits-in-buffer 8) + (set! mask 128) + #t))) + + ; Read fewer bits than there are in the buffer + (define (read-few-bits n) + (let ((value (logand buffer ; all bits in buffer + (decrement-one (lsh-left-one mask))))) + (set! bits-in-buffer (- bits-in-buffer n)) + (set! mask (lsh-right mask n)) + (lsh-right value bits-in-buffer))) ; remove extra bits + + ; read n bits given an empty buffer, and append them to value, n>=8 + (define (add-more-bits value n) + (let loop ((value value) (n n)) + (cond + ((zero? n) value) + ((< n 8) + (let ((rest (read-n-bits n))) + (and rest (+ (* value (lsh-left 1 n)) rest)))) + (else + (let ((b (byte-reader))) + (and b (loop (+ (* value 256) b) (- n 8)))))))) + + ; The main module + (define (read-n-bits n) + ; Check the most common cases first + (cond + ((not (positive? n)) #f) + ((zero? bits-in-buffer) ; the bit-buffer is empty + (case n + ((8) (byte-reader)) + ((16) + (let ((b (byte-reader))) + (combine-two b (byte-reader)))) + ((24) + (let* ((b1 (byte-reader)) (b2 (byte-reader))) + (combine-three b1 b2 (byte-reader)))) + (else + (cond + ((< n 8) + (and (set-buffer) (read-few-bits n))) + ((< n 16) + (let ((b (byte-reader))) + (and (set-buffer) + (logior (lsh-left b (- n 8)) + (read-few-bits (- n 8)))))) + (else + (let ((b (byte-reader))) + (and b (add-more-bits b (- n 8))))))))) + + ((= n 1) ; read one bit + (let ((value (if (bit-set? buffer mask) 1 0))) + (set! mask (lsh-right-one mask)) + (set! bits-in-buffer (decrement-one bits-in-buffer)) + value)) + + ((>= n bits-in-buffer) ; will empty the buffer + (let ((n-rem (- n bits-in-buffer)) + (value (logand buffer ; for mask=64, it'll be &63 + (decrement-one (lsh-left-one mask))))) + (set! bits-in-buffer 0) + (cond + ((zero? n-rem) value) + ((<= n-rem 16) + (let ((rest (read-n-bits n-rem))) + (and rest (logior (lsh-left value n-rem) rest)))) + (else (add-more-bits value n-rem))))) + (else (read-few-bits n)) + )) + + read-n-bits) + ) + + +; Validation tests + +(run-test + (define (read-bits numbers nbits) + (let* ((left-numbers numbers) + (bit-reader + (make-bit-reader + (lambda () + (and (pair? left-numbers) + (let ((byte (car left-numbers))) + (set! left-numbers (cdr left-numbers)) + byte)))))) + (let loop ((result '())) + (let ((num (bit-reader nbits))) + (if num (loop (cons num result)) (reverse result)))))) + (define (do-test numbers nbits expected) + (let ((result (read-bits numbers nbits))) + (for-each display + (list "Reading " numbers " by " nbits " bits" ;newline + "The result is: " result )) + (or (equal? result expected) + (error "the result differs from the expected: " expected)))) + + (do-test '(1 2 3 4 5 6 7) 8 '(1 2 3 4 5 6 7)) + (do-test '(193 5 131 4) 1 + '(1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 1 + 0 0 0 0 0 1 0 0)) + (do-test '(193 5 131 4 5) 2 + '(3 0 0 1 0 0 1 1 2 0 0 3 0 0 1 0 0 0 1 1)) + (do-test '(193 5 131 4) 3 + '(6 0 2 0 2 6 0 3 0 1)) + (do-test '(193 5 131 4 5 6 7) 4 + '(12 1 0 5 8 3 0 4 0 5 0 6 0 7)) + (do-test '(193 5 131 4 5 6 7) 5 + '(24 4 2 24 6 1 0 5 0 24 3)) + (do-test '(193 5 131 4 5 6 7 8 17 24) 8 + '(193 5 131 4 5 6 7 8 17 24)) + (do-test '(193 5 131 4 5 6 7 8 17 24) 9 + '(386 22 24 64 160 385 388 17)) + (do-test '(193 5 131 4 5 6 7 8 17) 16 + '(49413 33540 1286 1800)) + (do-test '(193 5 131 4 5 6 104) 17 + '(98827 3088 10291)) + (do-test '(193 5 131 4 5 6 104) 19 + '(395308 49409)) + (do-test '(193 5 131 4 5 6 104) 55 + '(27165365385724724)) + (do-test '(193 5 131 4 5 6 104) 56 + '(54330730771449448)) + ) + + +; Timing test +; This test relies on a Gambit special form 'time' to clock +; evaluation of an expression. +; R5RS does not provide any timing facilities. So the test below +; might not run on your particular system, and probably needs +; adjustment anyway. + +(run-test + (let ((fname "/tmp/a") (size 10240) + (pattern (ascii->char #x55))) + (define (read-by-bits n) + (for-each display + (list "Reading the file by " n " bits ")) + (call-with-input-file fname + (lambda (port) + (let ((bit-reader (make-bit-reader + (lambda () (read-byte-f port))))) + (time + (do ((c (bit-reader n) (bit-reader n))) ((not c)))))))) + + (for-each display + (list "Creating a file " fname " of size " size " filled with " + pattern ;"\n" + )) + (with-output-to-file fname + (lambda () (do ((i 0 (+ 1 i))) ((>= i size)) (write-char pattern)))) + + (newline) + (display "Reading the file by characters: the baseline ") + (call-with-input-file fname + (lambda (port) + (time + (do ((c (read-char port) (read-char port))) ((eof-object? c)))))) + + (newline) + (display "Reading the file by bytes: ") + (call-with-input-file fname + (lambda (port) + (time + (do ((c (read-byte-f port) (read-byte-f port))) ((not c)))))) + + (for-each read-by-bits + (list 1 2 3 4 5 6 7 8 9 10 11 15 16 17 23 24 25 30 32 65535 + (* 8 size))) +)) diff --git a/s48/binary-parse/interfaces.scm b/s48/binary-parse/interfaces.scm new file mode 100644 index 0000000..aa7b202 --- /dev/null +++ b/s48/binary-parse/interfaces.scm @@ -0,0 +1,2 @@ +(define-interface binary-parse-interface + (export)) \ No newline at end of file diff --git a/s48/binary-parse/packages.scm b/s48/binary-parse/packages.scm new file mode 100644 index 0000000..302fc9a --- /dev/null +++ b/s48/binary-parse/packages.scm @@ -0,0 +1,6 @@ +(define-structure binary-parse binary-parse-interface + (open scheme + signals + ascii + bitwise) + (files binary-parse)) \ No newline at end of file