Faster byte-wise compress
Posted at 2013-12-16
Back in March I wrote about the following code to compress a word given a mask:
uint32_t exch (uint32_t x, uint32_t m, uint32_t k)
{
uint32_t t = (x ^ (x >> k)) & m;
return x ^ t ^ (t << k);
}
uint32_t compress (uint32_t x, uint32_t m)
{
if (m) {
x = exch (x, 0x0000ff, (m & 0x008000) >> 12);
x = exch (x, 0xff0000, (m & 0x800000) >> 20);
if (m & 0x008080)
x >>= 8;
}
return x;
}
A couple of weeks later I found an ever so slightly better way to do the job, again exploiting the fact that we know 0 <= popcount(mask) <= 2, and that the only bits set could be the high bits:
uint32_t f(uint32_t x, uint32_t n)
{
uint32_t t;
n |= -n; /* Propagate bits to the left. */
uint32_t have = x;
uint32_t want = x >> 8;
t = (have ^ want) & n;
return x ^ t;
}
uint32_t
compress (uint32_t x, uint32_t m)
{
if (m) {
m >>= 7; /* Move high bits into low bits. */
uint32_t n0 = m & -m; /* The low bit of m. */
uint32_t n1 = m & ~n0; /* The high bit of m (might be 0). */
x = f(x, n1);
x = f(x, n0);
}
return x;
}
When compiling this code for ARMv5, the difference is just two instructions. The latter code avoids one or two bugs in gcc which lead to sub-optimal machine code being generated.