Posted at 2013-12-16

Back in March I wrote about the following code to compress a word given a mask:

```
uint32_t exch (uint32_t x, uint32_t m, uint32_t k)
{
uint32_t t = (x ^ (x >> k)) & m;
return x ^ t ^ (t << k);
}
uint32_t compress (uint32_t x, uint32_t m)
{
if (m) {
x = exch (x, 0x0000ff, (m & 0x008000) >> 12);
x = exch (x, 0xff0000, (m & 0x800000) >> 20);
if (m & 0x008080)
x >>= 8;
}
return x;
}
```

A couple of weeks later I found an ever so slightly better way to do the job, again exploiting the fact that we know 0 <= popcount(mask) <= 2, and that the only bits set could be the high bits:

```
uint32_t f(uint32_t x, uint32_t n)
{
uint32_t t;
n |= -n; /* Propagate bits to the left. */
uint32_t have = x;
uint32_t want = x >> 8;
t = (have ^ want) & n;
return x ^ t;
}
uint32_t
compress (uint32_t x, uint32_t m)
{
if (m) {
m >>= 7; /* Move high bits into low bits. */
uint32_t n0 = m & -m; /* The low bit of m. */
uint32_t n1 = m & ~n0; /* The high bit of m (might be 0). */
x = f(x, n1);
x = f(x, n0);
}
return x;
}
```

When compiling this code for ARMv5, the difference is just two instructions. The latter code avoids one or two bugs in gcc which lead to sub-optimal machine code being generated.

Tags yenc

Posted at 2013-03-06

Some weeks ago I have been trying to figure out how to decode yEnc-encoded files one word at a time since decoding them byte-by-byte is both slow and lame ;)

Let's first look at how the naive implementation of a yEnc decoder looks like:

```
uint8_t input[...]; // Input bytes, NUL terminated.
uint8_t output[...]; // Output bytes
uint8_t escape = 0x3d;
int i = 0, j = 0;
while (input[i]) {
uint8_t c = input[i++];
// Escape character?
if (c == escape)
c = input[i++] - 64;
output[j++] = c - 42;
}
```

And this is what I've come up with. Note that this code only works if 0x3d itself is never escaped, ie there will be no pairs 3d3d in the input stream. Thus we will have at most two escaped bytes per word. Yes, this is cheating, but it seems to work well enough with the encoders that are in commong use today.

On the ARMv5 CPU powering my GuruPlug, this code is ~10% faster than the naive approach.

So here's the (optimized) code. The explanations in there aren't nearly as good as they should be, but they are all you'll get for now ;p

```
void decode_line()
{
uint8_t input[...];
uint8_t output[...];
size_t length = number of input bytes to process;
/* Align to a multiple of 4. */
size_t orig_length = length;
length = (length + 3) & ~3;
ssize_t faked = length - orig_length;
uint32_t state = 0;
uint32_t *wp = (uint32_t *) input;
int j = 0;
for (; length; length -= 4) {
uint32_t word = *wp++;
uint32_t decoded, ndecoded;
decoded = decode (&state, word, &ndecoded);
output[j++] = decoded;
decoded >>= 8;
output[j++] = decoded;
ndecoded--;
if (--ndecoded) {
decoded >>= 8;
output[j++] = decoded;
if (--ndecoded) {
decoded >>= 8;
output[j++] = decoded;
}
}
}
/* We cheated before when we aligned 'length'.
* Adjust for that now, so j will contain the
* actual number of output bytes written.
*/
j -= faked;
}
/**
* @param state Used to remember whether the first byte was meant to be escaped.
* @param word The word to decode.
* @param ndecoded Will store the number of decoded bytes (ranging from 2 to 4).
* @return The decoded word.
*/
uint32_t decode (uint32_t *state, uint32_t word, uint32_t *ndecoded)
{
uint32_t b64 = *state >> 25;
uint32_t add_magic = 0xd6d6d6d6 & ~b64; /* 256 - 42 = 214 = 0xd6 */
uint32_t mask, result;
/* Find the 0x3d bytes. */
mask = find_eq (word | b64);
/* From the mask, we can deduce how many bytes we will decode. */
*ndecoded = count00 (mask);
/* Perform the actual decoding.
* Since we need to apply the offset to the bytes _following_ the
* escape bytes, we're shifting the mask.
*
* We're using add instead of sub because the former needs
* fewer instructions in ARM machine code.
*/
result = add (word, add_magic & ~(mask << 7));
/* Check for MSB to see if we had 0x3d in the leftmost byte. */
*state = mask;
/* Kick out bytes that originated from 0x3d. */
return compress (result, mask);
}
/* Turns bytes that are 0x3d in x into 0x80 and all others into 0x00
* (known as the Mycroft test).
*/
static inline uint32_t
find_eq (uint32_t x)
{
x ^= 0x3d3d3d3d;
return (x - 0x01010101) & ~x & 0x80808080;
}
/* Multibyte add, from Hacker's Delight. */
uint32_t add (uint32_t x, uint32_t y)
{
uint32_t a, b;
a = (x & 0x7f7f7f7f) + (y & 0x7f7f7f7f);
b = (x ^ y) & ~0x7f7f7f7f;
return a ^ b;
}
/* Exchange bits m in x that are k bits apart
* (from Hacker's Delight).
*/
uint32_t exch (uint32_t x, uint32_t m, uint32_t k)
{
uint32_t t = (x ^ (x >> k)) & m;
return x ^ t ^ (t << k);
}
uint32_t compress (uint32_t x, uint32_t m)
{
if (m) {
x = exch (x, 0x0000ff, (m & 0x008000) >> 12);
x = exch (x, 0xff0000, (m & 0x800000) >> 20);
if (m & 0x008080)
x >>= 8;
}
return x;
}
/* Count number of zero bytes in x. */
uint32_t count00 (uint32_t x)
{
/* In x, zero bytes are the bytes that were !0x3d
* in the input word.
* We know there can be at most two 0x3d bytes,
* thus we will have 2-4 zero bytes in x.
*/
uint32_t count = 4;
if (x)
count--;
x = x & (x - 1);
if (x)
count--;
return count;
}
```

Tags yenc