I have written a pretty simple hash table in C. It uses a prime modulus, linear probing, open addressing, and robin hood hashing. The program can also be found on GitHub.
For clarification, uin
is a typedef that uses uint32_t
or uint64_t
depending on whether the system is x86 or x86_64.
I'd now like to optimize the performance as much as possible, but I'm unsure of how to do so. I've considered using fastrange or fibonacci hashing instead of a prime modulus and consistent hashing to speed up the resizes. However, I'd like to streamline it beforehand. My apologies for the gotos, I know they are evil (but I kinda like them I'm sorry). I'd appreciate any feedback.
#ifndef FTABLE_FTABLE_H
#define FTABLE_FTABLE_H
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#define MAX_LOAD 0.5
/* Set uin as uint32_t or uint64_t depending on system */
#ifdef __x86
typedef uint32_t uin;
/* Table of prime number sizes, each approx double the prev, that fits
* into a uint32_t */
const uin tableSizes[] = {
5, 11, 23, 47, 97, 197, 397, 797, 1597,
3203, 6421, 12853, 25717, 51437, 102877,
205759, 411527, 823117, 1646237, 3292489,
6584983, 13169977, 26339969, 52679969,
105359939, 210719881, 421439783, 842879579,
1685759167, 3371518343 };
#elif __x86_64
typedef uint64_t uin;
/* Table of prime number sizes, each approx double the prev, that fits
* into a uint64_t */
const uin tableSizes[] = {
5, 11, 23, 47, 97, 197, 397, 797, 1597,
3203, 6421, 12853, 25717, 51437, 102877,
205759, 411527, 823117, 1646237, 3292489,
6584983, 13169977, 26339969, 52679969,
105359939, 210719881, 421439783, 842879579,
1685759167, 3371518343, 6743036717, 13486073473,
26972146961, 53944293929, 107888587883,
215777175787, 431554351609, 863108703229,
1726217406467, 3452434812973, 6904869625999,
13809739252051, 27619478504183, 55238957008387,
110477914016779, 220955828033581, 441911656067171,
883823312134381, 1767646624268779, 3535293248537579,
7070586497075177, 14141172994150357,
28282345988300791, 56564691976601587,
113129383953203213, 226258767906406483,
452517535812813007, 905035071625626043,
1810070143251252131, 3620140286502504283,
7240280573005008577, 14480561146010017169,
18446744073709551557};
#endif
/* Table of bitmasks to use */
const uin mask[] = {
0x7, 0xF,
0x1F, 0x3F, 0x7F, 0xFF,
0x1FF, 0x3FF, 0x7FF, 0xFFF,
0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF,
0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF,
0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF,
0x1FFFFFFF, 0x3FFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF,
0x1FFFFFFFF, 0x3FFFFFFFF, 0x7FFFFFFFF, 0xFFFFFFFFF,
0x1FFFFFFFFF, 0x3FFFFFFFFF, 0x7FFFFFFFFF, 0xFFFFFFFFFF,
0x1FFFFFFFFFF, 0x3FFFFFFFFFF, 0x7FFFFFFFFFF, 0xFFFFFFFFFFF,
0x1FFFFFFFFFFF, 0x3FFFFFFFFFFF, 0x7FFFFFFFFFFF, 0xFFFFFFFFFFFF,
0x1FFFFFFFFFFFF, 0x3FFFFFFFFFFFF, 0x7FFFFFFFFFFFF, 0xFFFFFFFFFFFFF,
0x1FFFFFFFFFFFFF, 0x3FFFFFFFFFFFFF, 0x7FFFFFFFFFFFFF, 0xFFFFFFFFFFFFFF,
0x1FFFFFFFFFFFFFF, 0x3FFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFF,
0x1FFFFFFFFFFFFFFF, 0x3FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
};
/* Linear probing max distance */
#define MAX_PROBES 10
/* Bucket States: Empty, Occupied, Tombstone */
#define EMPTY 0
#define OCCPD 1
#define TMBSTN 2
typedef struct sftbl_bckt ftbucket;
/* Hash table bucket: Key, value, distance from 'ideal' position,
* and data field indicating the bucket state */
struct sftbl_bckt {
uin key;
uin val;
uint8_t dist;
uint8_t data;
};
typedef struct sftbl ftable;
struct sftbl {
ftbucket* buckets;
uin size;
uin count;
uint8_t lvl;
};
ftable* alloc_ftable() {
ftable* out = malloc(sizeof(ftable));
memset(out, 0, sizeof(ftable));
return out;
}
ftable* insert(ftable* ft, uin key, uin val);
void free_table(ftable* ft);
ftable* resize(ftable* ft) {
ftable* nt = malloc(sizeof(ftable));
/* Increase the index in the prime table used for the size */
nt->lvl = ft->lvl + 1;
nt->size = tableSizes[nt->lvl];;
nt->count = 0;
nt->buckets = malloc(sizeof(ftbucket) * nt->size);
memset(nt->buckets, 0, sizeof(ftbucket) * nt->size);
/* Iterate through every valid entry and insert into new table */
for (uin i = 0; i < ft->size; i++) {
if (ft->buckets[i].data == OCCPD) {
nt = insert(nt, ft->buckets[i].key, ft->buckets[i].val);
}
}
/* Free old table and return new one */
free_table(ft);
return nt;
}
ftable* insert(ftable* ft, uin key, uin val) {
if (((float) ft->count + 1) / ((float) ft->size) > MAX_LOAD) {
ft = resize(ft);
}
binsert:;
/* Prime modulus */
uin index = key % ft->size;
uint8_t dist = 0;
while (1) {
/* If more than MAX_PROBES away from ideal location
* resize table and attempt to insert again (goto binsert) */
if (dist > MAX_PROBES) {
ft = resize(ft);
goto binsert;
}
// uin nind = (index + dist) % ft->size;
uin nind = (index + dist) & mask[ft->lvl];
/**
* Above line can be replaced with
* uin nind = (index + dist) & mask[ft->lvl];
* for worse memory usage but faster perf
**/
if (ft->buckets[nind].data == OCCPD) {
if (ft->buckets[nind].dist < dist) {
/* Robin hood hashing: If a 'richer' node is found,
* steal from it: swap */
uin tkey = ft->buckets[nind].key;
uin tval = ft->buckets[nind].val;
uint8_t tdist = ft->buckets[nind].dist;
ft->buckets[nind].key = key;
ft->buckets[nind].val = val;
ft->buckets[nind].dist = dist;
key = tkey;
val = tval;
dist = tdist;
}
}
if (ft->buckets[nind].data == EMPTY || ft->buckets[index + dist].data == TMBSTN) {
/* Occupy any empty or tombstone buckets */
ft->buckets[nind].data = OCCPD;
ft->buckets[nind].key = key;
ft->buckets[nind].val = val;
ft->buckets[nind].dist = dist;
ft->count++;
return ft;
}
dist++;
}
}
void delete(ftable* ft, uin key) {
uin index = key % ft->size;
uint8_t dist = 0;
while (1) {
if (dist > MAX_PROBES) {
/* Object not present in table. Return. */
return;
}
// uin nind = (index + dist) % ft->size;
uin nind = (index + dist) & mask[ft->lvl];
/**
* Above line can be replaced with
* uin nind = (index + dist) & mask[ft->lvl];
* for worse memory usage but faster perf
**/
if (ft->buckets[nind].data == OCCPD) {
if (ft->buckets[nind].key == key) {
/* Set bucket data to tombstone and
* clear key and value */
ft->buckets[nind].data = TMBSTN;
ft->buckets[nind].key = 0;
ft->buckets[nind].val = 0;
ft->count--;
return;
}
}
dist++;
}
}
uin get(ftable* ft, uin key) {
uin index = key % ft->size;
uint8_t dist = 0;
while (1) {
if (dist > MAX_PROBES) {
/* Object not present in table. Return. */
perror("Went over max probes!");
return -1;
}
// uin nind = (index + dist) % ft->size;
uin nind = (index + dist) & mask[ft->lvl];
/**
* Above line can be replaced with
* uin nind = (index + dist) & mask[ft->lvl];
* for worse memory usage but faster perf
**/
if (ft->buckets[nind].data == OCCPD) {
if (ft->buckets[nind].key == key) {
return ft->buckets[nind].val;
}
} else if (ft->buckets[nind].data == EMPTY) {
/* If empty, return early. Further elements
* would have been bridged by a tombstone or a
* occupied bucket. */
return -1;
}
dist++;
}
}
void free_table(ftable* ft) {
free(ft->buckets);
free(ft);
}
#endif
2 Answers 2
Here are some things that may help you improve your code.
Separate interface from implementation
It makes the code somewhat longer for a code review, but it's often very useful to separate the interface from the implementation. In C, this is usually done by putting the interface into separate .h
files and the corresponding implementation into .c
files. It helps users (or reviewers) of the code see and understand the interface and hides implementation details. The other important reason is that you might have multiple source files including the .h
file but only one instance of the corresponding .c
file. In other words, split your existing .h
file into a .h
file and a .c
file.
Make sure you have all required #include
s
The code uses perror
but doesn't #include <stdio.h>
. Also, carefully consider which #include
s are part of the interface (and belong in the .h
file) and which are part of the implementation per the above advice.
Don't print from a library
Because you're creating something like a library that might be called by many different kinds of programs, the code should not print anything or assume that there even is anything on which to print. For that reason, I would strongly advise removing the perror
line.
Provide complete code to reviewers
This is not so much a change to the code as a change in how you present it to other people. Without the full context of the code and an example of how to use it, it takes more effort for other people to understand your code. This affects not only code reviews, but also maintenance of the code in the future, by you or by others. One good way to address that is by the use of comments. Another good technique is to include test code showing how your code is intended to be used. Here's code I wrote to try out your functions:
#include "ftable.h"
#include <assert.h>
int main() {
ftable *hash = alloc_ftable();
for (unsigned i = 0; i < 100; ++i) {
hash = insert(hash, i, i*i);
}
for (unsigned i = 0; i < 100; ++i) {
assert(i*i == get(hash, i));
}
// delete odd keys
for (unsigned i = 1; i < 100; i += 2) {
delete(hash, i);
}
// verify that it's still correct
for (unsigned i = 0; i < 100; ++i) {
if (i & 1) {
assert((uin)-1 == get(hash, i));
} else {
assert(i*i == get(hash, i));
}
}
// resize hash table
hash = resize(hash);
// verify that it's still correct
for (unsigned i = 0; i < 100; ++i) {
if (i & 1) {
assert((uin)-1 == get(hash, i));
} else {
assert(i*i == get(hash, i));
}
}
free_table(hash);
}
Measure the performance before and after any changes
As with the test function above, you should write many different test functions for your hash and measure their performance. It's only by actually measuring before and after any change that you will be able to tell for certain whether you are improving or worsening the performance.
Consider using better naming
Although some of the names are quite brief, I didn't have much difficulty in understanding them, so I think the current names are adequate. However, although you as the programmer are interested in the hash table mechanism, from another programmer's point of view trying to use this code, it would probably be better to call it a map
or hashmap
or even associative_array
because that's essentially what the code is for, even if the details happen to feature a hashing algorithm internally. Also, it seems to me that resize
should probably not be used other than internally. For that reason, I'd suggest that it should be static
and solely within ftable.c
. Also data
should clearly be state
or bucket_state
.
Combine typedef
with struct
declaration
It's purely a stylistic preference, but if you're going to use typedef
s for your struct
s, you should know that it's common practice to combine them for brevity and clarity:
typedef struct sftbl {
ftbucket* buckets;
unsigned size;
unsigned count;
uint8_t lvl;
} ftable;
Use const
where practical
In the get
routine, the underlying structure is not modified and so that parameter should be declared const
to signal that fact:
uin get(const ftable* ft, uin key);
Check the return value of malloc
If the system is running out of memory, malloc
will return NULL
. Code must check the return value to make sure it is not NULL
before dereferencing the variable or the program will crash.
Consider unsigned
instead of a custom type
The code currently won't compile for an ARM processor since neither __x86
nor __x86_64
are defined for that processor type. That's not really a necessary restriction, so I'd recommend instead simply using unsigned
and making the typedef
like this:
#include <limits.h>
#if UINT_MAX == 4294967295u
// 32-bit version
#elif UINT_MAX == 18446744073709551615u
// 64-bit version
#else
#error "unsigned type does not appear to be 32- or 64-bit value."
#endif
Understand constant values
In C, when you write a value like 14480561146010017169
or 0x7FFFFFFFFFFFFFF
it is interpreted by the preprocessor as a signed value. If you want unsigned values, you must say so, so these constants should be written as 14480561146010017169u
or 0x7FFFFFFFFFFFFFFu
with the trailing u
signifying unsigned. Also, your mask
values should be sized appropriately as per the previous advice.
Goto is still considered dangerous
The goto
in this code makes a difficult-to-understand control flow even more difficult to understand. That is not a good idea. So first let's look at the dubious while(1)
loop. Does it really never exit? No, that's misleading. If we study the code, we see it exits when it's able to place the data in a bucket. So instead of while(1)
, I would write this:
unsigned nind = index & mask[ft->lvl];
for (dist = 0;
ft->buckets[nind].data != EMPTY && ft->buckets[index + dist].data != TMBSTN;
++dist)
{
// the loop
}
/* Write the data in this bucket */
ft->buckets[nind].data = OCCPD;
ft->buckets[nind].key = key;
ft->buckets[nind].val = val;
ft->buckets[nind].dist = dist;
ft->count++;
return ft;
Now we can eliminate the goto
by rewriting the clause within the loop:
if (dist > MAX_PROBES) {
ft = resize(ft);
index = key % ft->size;
nind = index & mask[ft->lvl];
dist = 0;
continue;
}
A similar transformation can be applied elsewhere as with get
:
unsigned get(const ftable* ft, unsigned key) {
unsigned index = key % ft->size;
unsigned retval = -1;
for (uint8_t dist = 0; dist <= MAX_PROBES; ++dist) {
unsigned nind = (index + dist) & mask[ft->lvl];
if (ft->buckets[nind].data == OCCPD && ft->buckets[nind].key == key) {
retval = ft->buckets[nind].val;
break;
} else if (ft->buckets[nind].data == EMPTY) {
break;
}
}
return retval;
}
Use library calls efficiently
Instead of these two lines:
nt->buckets = malloc(sizeof(ftbucket) * nt->size);
memset(nt->buckets, 0, sizeof(ftbucket) * nt->size);
I'd write this:
nt->buckets = calloc(nt->size, sizeof(ftbucket));
Avoid C++ keywords
There may come a time that you or someone else wants to incorporate this C code into a C++ project. Unfortunately, the delete
function sits atop the C++ reserved word delete
. Rename it to remove
to avoid such clashes.
-
1\$\begingroup\$ If I could up vote twice, I would. You got everything I wanted to write about. \$\endgroup\$2020年06月08日 14:06:24 +00:00Commented Jun 8, 2020 at 14:06
-
\$\begingroup\$ @pacmaninbw: you did good work too, guiding the OP toward creating a reviewable question. Kudos! \$\endgroup\$Edward– Edward2020年06月08日 14:10:06 +00:00Commented Jun 8, 2020 at 14:10
-
1\$\begingroup\$ Thank you both for your help! I appreciate the advice and will get to work on fixing up my code! \$\endgroup\$Lev Knoblock– Lev Knoblock2020年06月08日 17:01:23 +00:00Commented Jun 8, 2020 at 17:01
Use valid constants
14480561146010017169, 18446744073709551557
are typically outside the long long
range. Append a u
.
Simplify allocation sizing
Insptead of p = some_alloc(sizeof(matching pointer type) * n)
, use p = some_alloc(sizeof *p * n)
. It is easier to code right, review and maintain.
// nt->buckets = malloc(sizeof(ftbucket) * nt->size);
nt->buckets = malloc(sizeof *(nt->buckets) * nt->size);
Use size_t
for indexing
uin
is not the best type for array index, it may be too narrow or wide for array indexing and sizing. Use size_t
.
I'd reccomend unsigned long long
or uintmax_t
for the key type though.
Avoid FP math for an integer problem.
//if (((float) ft->count + 1) / ((float) ft->size) > MAX_LOAD) {
// ft = resize(ft);
//}
#define MAX_LOAD_N 1
#define MAX_LOAD_D 2
// if ((ft->count + 1) / ft->size > MAX_LOAD_N / MAX_LOAD_D) {
if ((ft->count+1) / MAX_LOAD_N > ft->size / MAX_LOAD_D) {
ft = resize(ft);
}
typedef
does not help. \$\endgroup\$