upscheme/llt/bitvector.c

/*
  bit vector primitives

  todo:
  * reverse
  * nreverse
 (- rotate left/right)
  * shl_to
  * not
  - shr_row, shl_row

  These routines are the back end supporting bit matrices. Many operations
  on bit matrices are slow (such as accessing or setting a single element!)
  but certain operations are privileged and lend themselves to extremely
  efficient implementation due to the bit-vector nature of machine integers.
  These are:
  done:
    &  |  $  ~  copy  reverse  fill  sum  prod
  todo:
    shift  trans  rowswap
  would be nice:
    channel  interleave

  Important note:
  Out-of-place functions always assume dest and source have the same amount
  of space available.

  shr_to, shl_to, not_to, and reverse_to assume source and dest don't overlap
  and_to, or_to, and xor_to allow overlap.
*/

#include <stdlib.h>
#include <assert.h>
#include <string.h>

#include "dtypes.h"
#include "bitvector.h"

#ifdef WIN32
#include <malloc.h>
#define alloca _alloca
#endif

// greater than this # of words we use malloc instead of alloca
#define MALLOC_CUTOFF 2000

u_int32_t *bitvector_resize(u_int32_t *b, u_int64_t n, int initzero)
{
    u_int32_t *p;
    size_t sz = ((n+31)>>5) * 4;
    p = realloc(b, sz);
    if (p == NULL) return NULL;
    if (initzero) memset(p, 0, sz);
    return p;
}

u_int32_t *bitvector_new(u_int64_t n, int initzero)
{
    return bitvector_resize(NULL, n, initzero);
}

size_t bitvector_nwords(u_int64_t nbits)
{
    return ((nbits+31)>>5) * 4;
}

void bitvector_set(u_int32_t *b, u_int64_t n, u_int32_t c)
{
    if (c)
        b[n>>5] |= (1<<(n&31));
    else
        b[n>>5] &= ~(1<<(n&31));
}

u_int32_t bitvector_get(u_int32_t *b, u_int64_t n)
{
    return b[n>>5] & (1<<(n&31));
}

u_int32_t bitreverse(u_int32_t x)
{
    u_int32_t m;

#ifdef __INTEL_COMPILER
    x = _bswap(x);
#else
    x = (x >> 16)      | (x << 16);        m = 0xff00ff00;
    x = ((x & m) >> 8) | ((x & ~m) << 8);
#endif
    m = 0xf0f0f0f0;
    x = ((x & m) >> 4) | ((x & ~m) << 4);  m = 0xcccccccc;
    x = ((x & m) >> 2) | ((x & ~m) << 2);  m = 0xaaaaaaaa;
    x = ((x & m) >> 1) | ((x & ~m) << 1);

    return x;
}

// shift all bits in a long bit vector
// n is # of int32s to consider, s is shift distance
// lowest bit-index is bit 0 of word 0
// TODO: handle boundary case of shift distance >= data size?
void bitvector_shr(u_int32_t *b, size_t n, u_int32_t s)
{
    u_int32_t i;
    if (s == 0 || n == 0) return;
    i = (s>>5);
    if (i) {
        n -= i;
        memmove(b, &b[i], n*4);
        memset(&b[n], 0, i*4);
        s &= 31;
    }
    for(i=0; i < n-1; i++) {
        b[i] = (b[i]>>s) | (b[i+1]<<(32-s));
    }
    b[i]>>=s;
}

// out-of-place version, good for re-aligning a strided submatrix to
// linear representation when a copy is needed
// assumes that dest has the same amount of space as source, even if it
// wouldn't have been necessary to hold the shifted bits
void bitvector_shr_to(u_int32_t *dest, u_int32_t *b, size_t n, u_int32_t s)
{
    u_int32_t i, j;
    if (n == 0) return;
    if (s == 0) {
        memcpy(dest, b, n*4);
        return;
    }
    j = (s>>5);
    if (j) {
        n -= j;
        memset(&dest[n], 0, j*4);
        s &= 31;
        b = &b[j];
    }
    for(i=0; i < n-1; i++) {
        dest[i] = (b[i]>>s) | (b[i+1]<<(32-s));
    }
    dest[i] = b[i]>>s;
}

void bitvector_shl(u_int32_t *b, size_t n, u_int32_t s)
{
    u_int32_t i, scrap=0, temp;
    if (s == 0 || n == 0) return;
    i = (s>>5);
    if (i) {
        n -= i;
        memmove(&b[i], b, n*4);
        memset(b, 0, i*4);
        s &= 31;
        b = &b[i];
    }
    for(i=0; i < n; i++) {
        temp = (b[i]<<s) | scrap;
        scrap = b[i]>>(32-s);
        b[i] = temp;
    }
}

// if dest has more space than source, set scrap to true to keep the
// top bits that would otherwise be shifted out
void bitvector_shl_to(u_int32_t *dest, u_int32_t *b, size_t n, u_int32_t s,
                      bool_t scrap)
{
    u_int32_t i, j, sc=0;
    if (n == 0) return;
    if (s == 0) {
        memcpy(dest, b, n*4);
        return;
    }
    j = (s>>5);
    if (j) {
        n -= j;
        memset(dest, 0, j*4);
        s &= 31;
        dest = &dest[j];
    }
    for(i=0; i < n; i++) {
        dest[i] = (b[i]<<s) | sc;
        sc = b[i]>>(32-s);
    }
    if (scrap)
        dest[i] = sc;
}

// set nbits to c, starting at given bit offset
// assumes offs < 32
void bitvector_fill(u_int32_t *b, u_int32_t offs, u_int32_t c, u_int32_t nbits)
{
    index_t i;
    u_int32_t nw, tail;
    u_int32_t mask;

    if (nbits == 0) return;
    nw = (offs+nbits+31)>>5;

    if (nw == 1) {
        mask = (lomask(nbits)<<offs);
        if (c) b[0]|=mask; else b[0]&=(~mask);
        return;
    }

    mask = lomask(offs);
    if (c) b[0]|=(~mask); else b[0]&=mask;

    if (c) mask=ONES32; else mask = 0;
    for(i=1; i < nw-1; i++)
        b[i] = mask;

    tail = (offs+nbits)&31;
    if (tail==0) {
        b[i] = mask;
    }
    else {
        mask = lomask(tail);
        if (c) b[i]|=mask; else b[i]&=(~mask);
    }
}

void bitvector_not(u_int32_t *b, u_int32_t offs, u_int32_t nbits)
{
    index_t i;
    u_int32_t nw, tail;
    u_int32_t mask;

    if (nbits == 0) return;
    nw = (offs+nbits+31)>>5;

    if (nw == 1) {
        mask = (lomask(nbits)<<offs);
        b[0] ^= mask;
        return;
    }

    mask = ~lomask(offs);
    b[0]^=mask;

    for(i=1; i < nw-1; i++)
        b[i] = ~b[i];

    tail = (offs+nbits)&31;
    if (tail==0) {
        b[i] = ~b[i];
    }
    else {
        mask = lomask(tail);
        b[i]^=mask;
    }
}

// constant-space bit vector copy in a single pass, with arbitrary
// offsets and lengths. to get this right, there are 16 cases to handle!
#define BITVECTOR_COPY_OP(name, OP)                                          \
void bitvector_##name(u_int32_t *dest, u_int32_t doffs,                      \
                      u_int32_t *src, u_int32_t soffs, u_int32_t nbits)      \
{                                                                            \
    index_t i;                                                               \
    u_int32_t s, nw, tail, snw;                                              \
    u_int32_t mask, scrap;                                                   \
                                                                             \
    if (nbits == 0) return;                                                  \
    nw = (doffs+nbits+31)>>5;                                                \
                                                                             \
    if (soffs == doffs) {                                                    \
        if (nw == 1) {                                                       \
            mask = (lomask(nbits)<<doffs);                                   \
            dest[0] = (dest[0] & ~mask) | (OP(src[0]) & mask);               \
            return;                                                          \
        }                                                                    \
        mask = ~lomask(doffs);                                               \
        dest[0] = (dest[0] & ~mask) | (OP(src[0]) & mask);                   \
        for(i=1; i < nw-1; i++)                                              \
            dest[i] = OP(src[i]);                                            \
        tail = (doffs+nbits)&31;                                             \
        if (tail==0) { dest[i]=src[i]; } else {                              \
            mask = lomask(tail);                                             \
            dest[i] = (dest[i] & ~mask) | (OP(src[i]) & mask); }             \
        return;                                                              \
    }                                                                        \
    snw = (soffs+nbits+31)>>5;                                               \
    if (soffs < doffs) {                                                     \
        s = doffs-soffs;                                                     \
        if (nw == 1) {                                                       \
            mask = (lomask(nbits)<<doffs);                                   \
            dest[0] = (dest[0] & ~mask) | ((OP(src[0])<<s) & mask);          \
            return;                                                          \
        }                                                                    \
        mask = ~lomask(doffs);                                               \
        dest[0] = (dest[0] & ~mask) | ((OP(src[0])<<s) & mask);              \
        scrap = OP(src[0])>>(32-s);                                          \
        for(i=1; i < snw-1; i++) {                                           \
            dest[i] = (OP(src[i])<<s) | scrap;                               \
            scrap = OP(src[i])>>(32-s);                                      \
        }                                                                    \
        tail = (doffs+nbits)&31;                                             \
        if (tail==0) { mask=ONES32; } else { mask = lomask(tail); }          \
        if (snw == nw) {                                                     \
            dest[i] = (dest[i] & ~mask) | (((OP(src[i])<<s)|scrap) & mask);  \
        }                                                                    \
        else /* snw < nw */ {                                                \
            if (snw == 1) {                                                  \
                dest[i] = (dest[i] & ~mask) |                                \
                    (((OP(src[i])<<s) | scrap) & mask);                      \
            }                                                                \
            else {                                                           \
                dest[i] = (OP(src[i])<<s) | scrap;                           \
                scrap = OP(src[i])>>(32-s);                                  \
                i++;                                                         \
                dest[i] = (dest[i] & ~mask) | (scrap & mask);                \
            }                                                                \
        }                                                                    \
    }                                                                        \
    else {                                                                   \
        s = soffs-doffs;                                                     \
        if (snw == 1) {                                                      \
            mask = (lomask(nbits)<<doffs);                                   \
            dest[0] = (dest[0] & ~mask) | ((OP(src[0])>>s) & mask);          \
            return;                                                          \
        }                                                                    \
        if (nw == 1) {                                                       \
            mask = (lomask(nbits)<<doffs);                                   \
            dest[0] = (dest[0] & ~mask) |                                    \
                (((OP(src[0])>>s)|(OP(src[1])<<(32-s))) & mask);             \
            return;                                                          \
        }                                                                    \
        mask = ~lomask(doffs);                                               \
        dest[0] = (dest[0] & ~mask) |                                        \
            (((OP(src[0])>>s)|(OP(src[1])<<(32-s))) & mask);                 \
        for(i=1; i < nw-1; i++) {                                            \
            dest[i] = (OP(src[i])>>s) | (OP(src[i+1])<<(32-s));              \
        }                                                                    \
        tail = (doffs+nbits)&31;                                             \
        if (tail==0) { mask=ONES32; } else { mask = lomask(tail); }          \
        if (snw == nw) {                                                     \
            dest[i] = (dest[i] & ~mask) | ((OP(src[i])>>s) & mask);          \
        }                                                                    \
        else /* snw > nw */ {                                                \
            dest[i] = (dest[i] & ~mask) |                                    \
                (((OP(src[i])>>s)|(OP(src[i+1])<<(32-s))) & mask);           \
        }                                                                    \
    }                                                                        \
}

#define BV_COPY(a) (a)
#define BV_NOT(a) (~(a))
BITVECTOR_COPY_OP(copy, BV_COPY)
BITVECTOR_COPY_OP(not_to, BV_NOT)

// right-shift the bits in one logical "row" of a long 2d bit vector
/*
void bitvector_shr_row(u_int32_t *b, u_int32_t offs, size_t nbits, u_int32_t s)
{
}
*/

// copy from source to dest while reversing bit-order
// assumes dest offset == 0
// assumes source and dest don't overlap
// assumes offset < 32
void bitvector_reverse_to(u_int32_t *dest, u_int32_t *src, u_int32_t soffs,
                          u_int32_t nbits)
{
    index_t i;
    u_int32_t nw, tail;

    if (nbits == 0) return;

    nw = (soffs+nbits+31)>>5;
    // first, reverse the words while reversing bit order within each word
    for(i=0; i < nw/2; i++) {
        dest[i]      = bitreverse(src[nw-i-1]);
        dest[nw-i-1] = bitreverse(src[i]);
    }
    if (nw&0x1)
        dest[i] = bitreverse(src[i]);

    tail = (soffs+nbits)&31;
    if (tail)
        bitvector_shr(dest, nw, 32-tail);
}

void bitvector_reverse(u_int32_t *b, u_int32_t offs, u_int32_t nbits)
{
    index_t i;
    u_int32_t nw, tail;
    u_int32_t *temp;

    if (nbits == 0) return;

    nw = (offs+nbits+31)>>5;
    temp = (nw > MALLOC_CUTOFF) ? malloc(nw*4) : alloca(nw*4);
    for(i=0; i < nw/2; i++) {
        temp[i]      = bitreverse(b[nw-i-1]);
        temp[nw-i-1] = bitreverse(b[i]);
    }
    if (nw&0x1)
        temp[i] = bitreverse(b[i]);

    tail = (offs+nbits)&31;
    bitvector_copy(b, offs, temp, (32-tail)&31, nbits);
    if (nw > MALLOC_CUTOFF) free(temp);
}

u_int64_t bitvector_count(u_int32_t *b, u_int32_t offs, u_int64_t nbits)
{
    size_t i, nw;
    u_int32_t ntail;
    u_int64_t ans;

    if (nbits == 0) return 0;
    nw = ((u_int64_t)offs+nbits+31)>>5;

    if (nw == 1) {
        return count_bits(b[0] & (lomask(nbits)<<offs));
    }

    ans = count_bits(b[0]>>offs);  // first end cap

    for(i=1; i < nw-1; i++) {
        /* popcnt can be computed branch-free, so these special cases
           probably don't help much */
        /*
        v = b[i];
        if (v == 0)
            continue;
        if (v == ONES32)
            ans += 32;
        else
        */
        ans += count_bits(b[i]);
    }

    ntail = (offs+(u_int32_t)nbits)&31;
    ans += count_bits(b[i]&(ntail>0?lomask(ntail):ONES32));  // last end cap

    return ans;
}

u_int32_t bitvector_any0(u_int32_t *b, u_int32_t offs, u_int32_t nbits)
{
    index_t i;
    u_int32_t nw, tail;
    u_int32_t mask;

    if (nbits == 0) return 0;
    nw = (offs+nbits+31)>>5;

    if (nw == 1) {
        mask = (lomask(nbits)<<offs);
        if ((b[0] & mask) != mask) return 1;
        return 0;
    }

    mask = ~lomask(offs);
    if ((b[0] & mask) != mask) return 1;

    for(i=1; i < nw-1; i++) {
        if (b[i] != ONES32) return 1;
    }

    tail = (offs+nbits)&31;
    if (tail==0) {
        if (b[i] != ONES32) return 1;
    }
    else {
        mask = lomask(tail);
        if ((b[i] & mask) != mask) return 1;
    }
    return 0;
}

u_int32_t bitvector_any1(u_int32_t *b, u_int32_t offs, u_int32_t nbits)
{
    index_t i;
    u_int32_t nw, tail;
    u_int32_t mask;

    if (nbits == 0) return 0;
    nw = (offs+nbits+31)>>5;

    if (nw == 1) {
        mask = (lomask(nbits)<<offs);
        if ((b[0] & mask) != 0) return 1;
        return 0;
    }

    mask = ~lomask(offs);
    if ((b[0] & mask) != 0) return 1;

    for(i=1; i < nw-1; i++) {
        if (b[i] != 0) return 1;
    }

    tail = (offs+nbits)&31;
    if (tail==0) {
        if (b[i] != 0) return 1;
    }
    else {
        mask = lomask(tail);
        if ((b[i] & mask) != 0) return 1;
    }
    return 0;
}

static void adjust_offset_to(u_int32_t *dest, u_int32_t *src, u_int32_t nw,
                             u_int32_t soffs, u_int32_t newoffs)
{
    if (newoffs > soffs)
        bitvector_shl_to(dest, src, nw, newoffs-soffs, true);
    else
        bitvector_shr_to(dest, src, nw, soffs-newoffs);
}

#define BITVECTOR_BINARY_OP_TO(opname, OP)                                   \
void bitvector_##opname##_to(u_int32_t *dest, u_int32_t doffs,               \
                             u_int32_t *a, u_int32_t aoffs,                  \
                             u_int32_t *b, u_int32_t boffs, u_int32_t nbits) \
{                                                                            \
    u_int32_t nw = (doffs+nbits+31)>>5;                                      \
    u_int32_t *temp = nw>MALLOC_CUTOFF ? malloc((nw+1)*4) : alloca((nw+1)*4);\
    u_int32_t i, anw, bnw;                                                   \
    if (aoffs == boffs) {                                                    \
        anw = (aoffs+nbits+31)>>5;                                           \
    }                                                                        \
    else if (aoffs == doffs) {                                               \
        bnw = (boffs+nbits+31)>>5;                                           \
        adjust_offset_to(temp, b, bnw, boffs, aoffs);                        \
        b = temp; anw = nw;                                                  \
    }                                                                        \
    else {                                                                   \
        anw = (aoffs+nbits+31)>>5;                                           \
        bnw = (boffs+nbits+31)>>5;                                           \
        adjust_offset_to(temp, a, anw, aoffs, boffs);                        \
        a = temp; aoffs = boffs; anw = bnw;                                  \
    }                                                                        \
    for(i=0; i < anw; i++) temp[i] = OP(a[i], b[i]);                         \
    bitvector_copy(dest, doffs, temp, aoffs, nbits);                         \
    if (nw>MALLOC_CUTOFF) free(temp);                                        \
}

#define BV_AND(a,b) ((a)&(b))
#define BV_OR(a,b)  ((a)|(b))
#define BV_XOR(a,b) ((a)^(b))
BITVECTOR_BINARY_OP_TO(and, BV_AND)
BITVECTOR_BINARY_OP_TO(or,  BV_OR)
BITVECTOR_BINARY_OP_TO(xor, BV_XOR)