I have implemented a very simple but robust implementation of the Golomb-Rice coding. To understand my motivation, see the minimalist data compressor that is based on this. At the moment, the implementation works well. However, I intend to make it more elegant, concise, and possibly faster. Any suggestions for improvement or constructive criticism are welcome.
Prerequisites
- The
typedef
nameuint32
designates an unsigned integer type with width exactly 32 bits. - The
uchar
name is an alias forunsigned char
. - Function
size_t minsize(size_t a, size_t b)
returns the minimum ofa
andb
. - Function
size_t ctzu32(uint32 n)
returns the number of trailing 0-bits inn
, starting at the least significant bit position. Ifn
is 0, the result is 32.
Code
The implementation uses the following data structure:
enum {
BIO_MODE_READ,
BIO_MODE_WRITE
};
struct bio {
int mode; /* reading or writing? */
uchar *ptr; /* pointer to memory */
uint32 b; /* bit buffer */
size_t c; /* bit counter */
};
Furthermore, I use several auxiliary functions (it is a bit long):
static void bio_reset_after_flush(struct bio *bio)
{
assert(bio != NULL);
bio->b = 0;
bio->c = 0;
}
static void bio_open(struct bio *bio, uchar *ptr, int mode)
{
assert(bio != NULL);
assert(ptr != NULL);
bio->mode = mode;
bio->ptr = ptr;
switch (mode) {
case BIO_MODE_READ:
bio->c = 32;
break;
case BIO_MODE_WRITE:
bio_reset_after_flush(bio);
break;
}
}
static void bio_flush_buffer(struct bio *bio)
{
assert(bio != NULL);
assert(bio->ptr != NULL);
assert(sizeof(uint32) * CHAR_BIT == 32);
*((uint32 *)bio->ptr) = bio->b;
bio->ptr += 4;
}
static void bio_reload_buffer(struct bio *bio)
{
assert(bio != NULL);
assert(bio->ptr != NULL);
bio->b = *(uint32 *)bio->ptr;
bio->ptr += 4;
}
static void bio_put_nonzero_bit(struct bio *bio)
{
assert(bio != NULL);
assert(bio->c < 32);
bio->b |= (uint32)1 << bio->c;
bio->c++;
if (bio->c == 32) {
bio_flush_buffer(bio);
bio_reset_after_flush(bio);
}
}
static void bio_write_bits(struct bio *bio, uint32 b, size_t n)
{
assert(n <= 32);
while (n > 0) {
size_t m;
assert(bio->c < 32);
m = minsize(32 - bio->c, n);
assert(32 >= bio->c + m);
bio->b |= (uint32)((b & (((uint32)1 << m) - 1)) << bio->c);
bio->c += m;
if (bio->c == 32) {
bio_flush_buffer(bio);
bio_reset_after_flush(bio);
}
b >>= m;
n -= m;
}
}
static void bio_write_zero_bits(struct bio *bio, size_t n)
{
assert(n <= 32);
while (n > 0) {
size_t m;
assert(bio->c < 32);
m = minsize(32 - bio->c, n);
assert(32 >= bio->c + m);
bio->c += m;
if (bio->c == 32) {
bio_flush_buffer(bio);
bio_reset_after_flush(bio);
}
n -= m;
}
}
static uint32 bio_read_bits(struct bio *bio, size_t n)
{
uint32 w;
size_t s;
/* reload? */
if (bio->c == 32) {
bio_reload_buffer(bio);
bio->c = 0;
}
/* get the avail. least-significant bits */
s = minsize(32 - bio->c, n);
w = bio->b & (((uint32)1 << s) - 1);
bio->b >>= s;
bio->c += s;
n -= s;
/* need more bits? reload & get the most-significant bits */
if (n > 0) {
assert(bio->c == 32);
bio_reload_buffer(bio);
bio->c = 0;
w |= (bio->b & (((uint32)1 << n) - 1)) << s;
bio->b >>= n;
bio->c += n;
}
return w;
}
static void bio_close(struct bio *bio)
{
assert(bio != NULL);
if (bio->mode == BIO_MODE_WRITE && bio->c > 0) {
bio_flush_buffer(bio);
}
}
static void bio_write_unary(struct bio *bio, uint32 N)
{
while (N > 32) {
bio_write_zero_bits(bio, 32);
N -= 32;
}
bio_write_zero_bits(bio, N);
bio_put_nonzero_bit(bio);
}
static uint32 bio_read_unary(struct bio *bio)
{
/* get zeros... */
uint32 total_zeros = 0;
assert(bio != NULL);
do {
size_t s;
/* reload? */
if (bio->c == 32) {
bio_reload_buffer(bio);
bio->c = 0;
}
/* get trailing zeros */
s = minsize(32 - bio->c, ctzu32(bio->b));
bio->b >>= s;
bio->c += s;
total_zeros += s;
} while (bio->c == 32);
/* ...and drop non-zero bit */
assert(bio->c < 32);
bio->b >>= 1;
bio->c++;
return total_zeros;
}
Finally, the main entry functions (that encode/decode non-negative integer N
using parameter 2^k
) are defined as follows:
void bio_write_gr(struct bio *bio, size_t k, uint32 N)
{
uint32 Q = N >> k;
bio_write_unary(bio, Q);
assert(k <= 32);
bio_write_bits(bio, N, k);
}
uint32 bio_read_gr(struct bio *bio, size_t k)
{
uint32 Q;
uint32 N;
Q = bio_read_unary(bio);
N = Q << k;
assert(k <= 32);
N |= bio_read_bits(bio, k);
return N;
}
1 Answer 1
Give the mode enum a name
While enum
s are not strongly typed in C, it is more elegant to pretend they are. Give the enum
you are declaring a type, and use it in struct bio
, like so:
enum bio_mode {
BIO_MODE_READ,
BIO_MODE_WRITE,
};
struct bio {
enum bio_mode mode;
...
};
Compilers can use this information, for example if you write a switch (mode) {...}
statement and you forget to handle all possible modes, the compiler will warn about this.
Also change functions that take int mode
as a parameter to enum bio_mode mode
.
Use standard types where possible
Use the standard fixed width integer types from <stdint.h>
instead of inventing your own names. So instead of uint32
, use uint32_t
, and instead of uchar
, use uint8_t
.
There is no need to assert()
that the size of uint32_t
is 32 bits.
Reorder struct bio
to be more compact
On most 64-bit architectures, the layout of struct bio
is suboptimal, because pointers and size_t
have a 64-bit alignment, while int
s have 32-bit alignment. I suggest the following:
struct bio {
enum bio_mode mode;
uint32_t b;
uint8_t *ptr;
size_t c;
};
Make ptr
uint32_t *
Since you are casting ptr
to uint32_t *
in many places, it makes more sense to store it directly as that type, and only cast it once in bio_open()
. I also recommend you take a void *
in bio_open()
, so there is no need for the caller to do any casting.
struct bio {
enum bio_mode mode;
uint32_t b;
uint32_t *ptr;
size_t c;
};
static void bio_open(struct bio *bio, void *ptr, int mode)
{
...
bio->ptr = ptr;
...
}
Remember to also change all occurrences of bio->ptr += 4
to bio->ptr++
.
Assert that ptr
is 32-bit aligned
Casting a pointer to an uint32_t *
is only valid if the pointer is 32-bit aligned. On some architectures, accessing memory through a pointer that is not properly aligned is not allowed. On those that do, it might be less efficient than having the pointer properly aligned. To assert this write:
assert(((uintptr_t)ptr & 3) == 0);
Another option would be to allow ptr
to be non-aligned in the call to bio_open()
, but then to initialize bio->b
such that it contains the first few bytes up to the first 32-bit aligned address, and of course set bio->c
accordingly.
Assert the right mode is set in bio_read_*()
and bio_write_*()
To avoid accidental reuse of a struct bio
, or mixing read and write calls on the same bio
, assert(bio->mode == BIO_MODE_READ)
in read functions, and so on.
Optimizing bio_write_bits()
There's a lot of things in bio_write_bits()
that can be optimized. First, there is a lot of unnecessary casting going. While it doesn't change the actual binary, it cleans up the source code to remove them, and makes it easier to see the actual equations. For example, you can just write:
bio->b |= (b & ((1 << m) - 1)) << bio->c;
In the above, you are masking the lower bits of b
before shifting it by bio->c
. However, this is completely unnecessary, as either those high bits were zero to begin with, or they will be shifted out anyway. So you can write:
bio->b |= b << bio->c;
More importantly, you have written this function as a loop, but you would only ever have at most two iterations of the loop: either all n
bits fit in bio->b
, or you have to flush once and put the rest of the bits in. You can rewrite the code as follows:
static void bio_write_bits(struct bio *bio, uint32_t b, size_t n)
{
assert(n <= 32);
assert((b >> n) == 0);
assert(bio->c < 32);
bio->b |= b << bio->c;
bio->c += n;
/* Exit early if we didn't fill bio->b yet */
if (bio->c < 32)
return;
bio_flush_buffer(bio);
/* Store the remaining bits */
bio->c -= 32;
bio->b = b >> (n - bio->c);
}
A similar optimization is possible for bio_write_zero_bits()
.
Reset ptr
in bio_close()
To catch potential use of a struct bio
after calling bio_close()
, set bio->ptr = NULL
in bio_close()
.
Validate your input
In bio_read_unary()
, you have a loop reading zero bits. What if the input is malformed and just contains zero bits? After consuming the whole input, bio_read_unary()
would continue reading past the end of the input.
First, you can get rid of the loop by just assuming you have to do at most two iterations, just like in bio_write_bits()
. Second, it would be good to have an extra field in struct bio
that is either the remaining size in the buffer, or the end pointer, and keep track of how much you have read and written. Don't use assert()
to check that you don't go past the end, but use an actual if
-statement, and return an error or at least call abort()
if the input is invalid.
-
1\$\begingroup\$ This is exactly what I needed. I have switched from C89 to C99, so now don't need to define my own type a can use the
uint32_t
. (This also greatly shortened the entire code.) Unfortunately, reordering thestruct bio
didn't improve the performance. Similarly, the masking the lower bits ofb
before shifting it bybio->c
seems to be necessary, otherwise the more significant bits ofb
screw up the most significant bits ofbio->b
(which are intended to be filled by future calls of the function). Anyhow, excellent answer! \$\endgroup\$DaBler– DaBler2020年02月16日 08:31:03 +00:00Commented Feb 16, 2020 at 8:31 -
\$\begingroup\$ The masking should not be necessary if the input to
bio_write_bits()
satisfies(b >> n) == 0
. Either all bits fit in the first step, or they don't in which the ones that did not fit are shifted out. In the second step,b
is shifted right so only those bits that didn't fit in the first step are left. Note that in the second step the result is directly assigned tobio->b
, we don't OR it. \$\endgroup\$G. Sliepen– G. Sliepen2020年02月16日 10:58:43 +00:00Commented Feb 16, 2020 at 10:58
assert()
should not be in production code and does not display the information needed when debugging \$\endgroup\$NDEBUG
is defined at the moment<assert.h>
was last included, the macroassert()
generates no code, and hence does nothing at all. And further: The error message includes the name of the file and function containing theassert()
call, the source code line number of the call, and the text of the argument. \$\endgroup\$