Faster byte-wise compress

Posted at 2013-12-16

Back in March I wrote about the following code to compress a word given a mask:

uint32_t exch (uint32_t x, uint32_t m, uint32_t k)
{
    uint32_t t = (x ^ (x >> k)) & m;

    return x ^ t ^ (t << k);
}

uint32_t compress (uint32_t x, uint32_t m)
{
    if (m) {
        x = exch (x, 0x0000ff, (m & 0x008000) >> 12);
        x = exch (x, 0xff0000, (m & 0x800000) >> 20);

        if (m & 0x008080)
            x >>= 8;
    }

    return x;
}

A couple of weeks later I found an ever so slightly better way to do the job, again exploiting the fact that we know 0 <= popcount(mask) <= 2, and that the only bits set could be the high bits:

uint32_t f(uint32_t x, uint32_t n)
{
    uint32_t t;

    n |= -n; /* Propagate bits to the left. */

    uint32_t have = x;
    uint32_t want = x >> 8;
    t = (have ^ want) & n;

    return x ^ t;
}

uint32_t
compress (uint32_t x, uint32_t m)
{
    if (m) {
        m >>= 7; /* Move high bits into low bits. */

        uint32_t n0 = m & -m; /* The low bit of m. */
        uint32_t n1 = m & ~n0; /* The high bit of m (might be 0). */

        x = f(x, n1);
        x = f(x, n0);
    }

    return x;
}

When compiling this code for ARMv5, the difference is just two instructions. The latter code avoids one or two bugs in gcc which lead to sub-optimal machine code being generated.

Tags