An attempt at a clean implementation of this important data structure in C.
It's hard to make such structures generic in C without losing performance, so this specialises on char*
keys and int
values, but uses some type aliases, such that only a few places need changing to change the key or value types.
The hash table dynamically grows by 2x when 80% of the slots are used to keep efficiency high.
As a demo, we parse the complete works of Shakespeare, call ht_inc()
for each word. Then build a flat "view" of the hashtable, sort and report the 10 most common words. On my machine the entire task takes just 50ms.
Feedback on quality of coding style please.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef char* ht_key_t;
typedef int ht_value_t;
// key => value plus pointer to next item for hash collisions
typedef struct HashTableItem HashTableItem;
struct HashTableItem {
ht_key_t key;
ht_value_t value;
HashTableItem* next;
};
// Array of pointers to HashTableItems, plus counters
typedef struct HashTable {
HashTableItem** slots;
size_t size; // how many slots are there
size_t scount; // how many slots are used
size_t itemcount; // how many items are there
} HashTable;
// Creates a new HashTableItem
HashTableItem* ht_create_item(ht_key_t key, ht_value_t value) {
HashTableItem* item = malloc(sizeof *item);
if (!item) {
perror("malloc item");
exit(EXIT_FAILURE);
}
item->key = strdup(key); // take a copy
item->value = value;
item->next = NULL;
return item;
}
// Creates a new HashTable
HashTable* ht_create(size_t size) {
HashTable* table = malloc(sizeof *table);
if (!table) {
perror("malloc table");
exit(EXIT_FAILURE);
}
table->size = size;
table->scount = 0;
table->itemcount = 0;
table->slots = calloc(table->size, sizeof *table->slots); // NOLINT
if (!table->slots) {
perror("calloc items");
exit(EXIT_FAILURE);
}
return table;
}
// Frees an item depending on their types, if the key or value members
// need freeing that needs to happen here too
void ht_free_item(HashTableItem* item) {
free(item->key);
free(item);
}
// Frees the whole hashtable
void ht_free(HashTable* table) {
// free the HashTableItems in the linked lists
for (size_t i = 0; i < table->size; i++) {
HashTableItem* item = table->slots[i];
while (item) {
HashTableItem* next = item->next;
ht_free_item(item);
item = next;
}
}
// free the array of pointers to HashTableItems
free(table->slots);
free(table);
}
// hash function. crucial to efficient operation
// returns value in range [0, size)
// an appropriate hash function for short strings is FNV-1a
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function#FNV-1a_hash
size_t ht_hash(size_t size, const char* str) {
uint64_t hash = 0xcbf29ce484222325; // FNV_offset_basis
while (*str) hash = (hash ^ (uint8_t)*str++) * 0x100000001b3; // FNV_prime
return hash % size; // fit to table
}
void ht_grow(HashTable* table) {
table->scount++;
if (table->scount * 100 / table->size >= 80) {
size_t nsize = table->size * 2;
HashTableItem** nslots = calloc(nsize, sizeof *table->slots); // NOLINT
if (!nslots) {
perror("calloc nitems");
exit(EXIT_FAILURE);
}
size_t ncount = 0;
for (size_t i = 0; i < table->size; i++) {
HashTableItem* item = table->slots[i];
while (item) {
HashTableItem* next = item->next; // save next
HashTableItem** nslot = &nslots[ht_hash(nsize, item->key)];
if (!*nslot) ++ncount;
item->next = *nslot; // push into new list
*nslot = item;
item = next;
}
}
free(table->slots);
table->scount = ncount;
table->size = nsize;
table->slots = nslots;
}
}
// Inserts an item (or updates if exists)
void ht_insert(HashTable* table, ht_key_t key, ht_value_t value) {
HashTableItem** slot = &table->slots[ht_hash(table->size, key)];
HashTableItem* item = *slot;
bool direct_slot = true;
while (item) {
if (strcmp(item->key, key) == 0) {
// exists, update value, free old value if needed
item->value = value;
return;
}
slot = &item->next;
item = *slot;
direct_slot = false;
}
*slot = ht_create_item(key, value); // new entry
++table->itemcount;
if (direct_slot) ht_grow(table); // HashTable accounting
}
// increments the value for a key or inserts with value = 1
// specialised for ht_value_t=int and faster than search then update.
void ht_inc(HashTable* table, ht_key_t key) {
HashTableItem** slot = &table->slots[ht_hash(table->size, key)];
HashTableItem* item = *slot;
bool direct_slot = true;
while (item) {
if (strcmp(item->key, key) == 0) {
++item->value;
return;
}
slot = &item->next;
item = *slot;
direct_slot = false;
}
*slot = ht_create_item(key, 1); // not found, init with one
++table->itemcount;
if (direct_slot) ht_grow(table); // HashTable accounting
}
// Deletes an item from the table
void ht_delete(HashTable* table, ht_key_t key) {
HashTableItem** slot = &table->slots[ht_hash(table->size, key)];
HashTableItem* item = *slot;
bool direct_slot = true;
while (item) {
if (strcmp(item->key, key) == 0) {
*slot = item->next; // remove item from linked list
ht_free_item(item);
--table->itemcount;
if (direct_slot) --table->scount; // HashTable accounting
return;
}
slot = &item->next;
item = *slot;
direct_slot = false;
}
}
// Searches the key in the hashtable
// and returns NULL ptr if it doesn't exist
HashTableItem* ht_search(HashTable* table, ht_key_t key) {
HashTableItem* item = table->slots[ht_hash(table->size, key)];
while (item) {
if (strcmp(item->key, key) == 0) return item;
item = item->next;
}
return NULL;
}
// debug printing. customise printf format strings by key & value types
void ht_print(HashTable* table) {
printf("\n---- Hash Table ---\n");
for (size_t i = 0; i < table->size; i++) {
printf("@%zu: ", i);
HashTableItem* item = table->slots[i];
while (item) {
printf("%s => %d | ", item->key, item->value);
item = item->next;
}
printf("\n");
}
printf("-------------------\n");
}
// debug printing. customise printf format strings by key & value types
void ht_print_search(HashTable* table, ht_key_t key) {
HashTableItem* val;
if ((val = ht_search(table, key)) == NULL) {
printf("Key:%s does not exist\n", key);
return;
}
printf("Key:%s => %d\n", key, val->value);
}
static inline bool ht_is_alpha(char c) {
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
static inline char ht_tolower(char c) { return (c < 'a') ? c + 'a' - 'A' : c; }
int cmp_ht_items(const void* a, const void* b) {
int a_val = (*(HashTableItem**)a)->value;
int b_val = (*(HashTableItem**)b)->value;
if (a_val == b_val) return 0;
return a_val < b_val ? 1 : -1;
}
int main() {
// some tests
HashTable* ht = ht_create(4);
ht_insert(ht, "1", 10);
ht_insert(ht, "2", 20);
ht_insert(ht, "Hel3", 30);
ht_insert(ht, "Cau4", 40);
ht_print(ht);
ht_insert(ht, "Cau6", 60); // grow!
ht_print(ht);
ht_inc(ht, "11"); // increment
ht_inc(ht, "21"); // increment
ht_inc(ht, "31"); // increment
ht_inc(ht, "41"); // increment
ht_inc(ht, "51"); // increment
ht_inc(ht, "61"); // increment
ht_inc(ht, "71"); // increment
ht_inc(ht, "81"); // increment and grow again
ht_print(ht);
ht_print_search(ht, "1");
ht_print_search(ht, "2");
ht_print_search(ht, "3");
ht_print_search(ht, "Hel3");
ht_print_search(ht, "Cau4");
ht_insert(ht, "Cau6", 61); // update
ht_inc(ht, "1"); // increment
ht_inc(ht, "Hel3"); // increment
ht_inc(ht, "new"); // increment
ht_print(ht);
ht_delete(ht, "Hel3");
ht_delete(ht, "Cau6");
ht_delete(ht, "1");
ht_print(ht);
ht_free(ht);
// end of tests
// shakespeare demo
FILE* fp = fopen("shakespeare.txt", "re");
if (!fp) {
perror("fopen");
exit(EXIT_FAILURE);
}
ht = ht_create(32 * 1024);
#define BUFSIZE 1024
#define WORDSIZE 50
char buf[BUFSIZE];
char word[WORDSIZE];
char* word_ptr = word;
while (!ferror(fp) && !feof(fp)) {
size_t bytes_read = fread(buf, 1, BUFSIZE, fp);
char* buf_ptr = buf;
while ((buf_ptr < buf + bytes_read)) {
char c = *buf_ptr;
if (ht_is_alpha(c)) {
*(word_ptr++) = ht_tolower(c);
if (word_ptr - word == WORDSIZE - 1) { // -1 for NULL terminator
fputs("word too long. terminating\n", stderr);
exit(EXIT_FAILURE);
}
} else if (word_ptr == word) {
// ignore repeated terminators
} else {
*(word_ptr++) = '0円';
ht_inc(ht, word); // record
word_ptr = word; // restart new word
}
++buf_ptr; // next char from buf
}
}
fclose(fp);
// build a flat view of the hashtable items
HashTableItem** itemview = malloc(ht->itemcount * sizeof *itemview); // NOLINT
if (!itemview) {
perror("malloc itemview");
exit(EXIT_FAILURE);
}
HashTableItem** curritem = itemview;
for (size_t i = 0; i < ht->size; i++) {
HashTableItem* item = ht->slots[i];
while (item) {
*curritem++ = item;
item = item->next;
}
}
// now sort that view and print the top 10
qsort(itemview, ht->itemcount, sizeof *itemview, cmp_ht_items); // NOLINT
for (size_t i = 0; i < 10; i++)
printf("%s => %d\n", itemview[i]->key, itemview[i]->value);
free(itemview);
// ht_print(ht);
ht_free(ht);
}
1 Answer 1
Make more use of calloc()
I would use calloc()
almost everywhere. It guarantees the memory is zeroed, so you won't have surprises if you forgot to initialize a member variable. I wouldn't worry about performance much; the compiler should be able to optimize the zeroing away if it sees you are setting a member variable to something else. So, use it in ht_create_item()
and ht_create()
where you currently use malloc()
.
Consider avoiding a layer of indirection
Your hash table has an array slots
that contains pointers to items, however items themselves are quite small, they are just two pointers and an int
. Since you aim at a load factor of 0.4 to 0.8, it will use less memory to remove the indirection:
typedef struct HashTable {
HashTableItem* slots;
...
} HashTable;
This should also give a small performance gain.
Avoid code duplication
ht_inc()
and ht_delete()
look very similar. It might be possible to put the common parts into a separate function. I think you can do this by creating a function that finds the pointer to where the item should be/go, and return a pointer to that pointer. Basically, a ht_search()
that returns the slot instead of the item itself.
static HashTableItem** ht_search_slot(HashTable* table, ht_key_t key, bool *direct) {
HashTableItem** slot = &table->slots[ht_hash(table->size, key)];
*direct = true;
while (*slot) {
if (strcmp((*slot)->key, key) == 0) break;
*direct = false;
slot = &(*slot)->next;
}
return slot;
}
void ht_inc(HashTable* table, ht_key_t key) {
bool direct_slot;
HashTableItem** slot = ht_search_slot(table, key, &direct_slot);
HashTableItem* item = *slot;
if (item) {
++item->value;
return;
}
*slot = ht_create_item(key, 1); // not found, init with one
++table->itemcount;
if (direct_slot) ht_grow(table); // HashTable accounting
}
void ht_delete(HashTable* table, ht_key_t key) {
bool direct_slot;
HashTableItem** slot = ht_search_slot(table, key, &direct_slot);
HashTableItem* item = *slot;
if (item) {
*slot = item->next; // remove item from linked list
ht_free_item(item);
--table->itemcount;
if (direct_slot) --table->scount; // HashTable accounting
}
}
HashTableItem* ht_search(HashTable* table, ht_key_t key) {
bool direct_slot;
return *ht_search_slot(table, key, &direct_slot);
}
Of course, this approach will conflict with the above advice of avoiding indirection, although perhaps something similar is still possible.
Always call ht_grow()
when inserting elements
You are only calling ht_grow()
when an empty direct slot is used. However, if an item is inserted in an already used slot, you don't call it. That means that if you are unlucky, everything hashes to the same slot, and you will never grow and rebalance. I would play it safe and just unconditionally call ht_grow()
when inserting an element. Of course, ht_grow()
should use itemcount
instead of scount
to check whether it needs to grow the hash table.
Avoid division and modulo operations
Divisions (and thus also modulo operations) are quite expensive. I would avoid them by ensuring the hash table size is always a power of two. And if you ensure that the size is at least 16 elements, then you can replace the check in ht_grow()
by:
if (table->itemcount >= 13 * (table->size >> 4)) {
...
}
This will trigger a resize when the hash table is 81.25% full. You can even get rid of the multiplication by storing the binary logarithm of the size, but unless you plan to run this code on microcontrollers I would not bother with that. And in the hash function you can replace the modulo this way:
return hash & (size - 1);
-
\$\begingroup\$ Good stuff, thanks. I considered most of these. I kept the indirection to not tie myself to this key=>value structure, but it's certainly debatable. I considered calloc, meh.. The code duplication is very interesting and I was still wrestling with this. Each function is only 10 lines, but they are "somewhat complex" pointer juggling. So not not just duplication but also clarity might improve if I factor out "finding the slot incl linked list part". I thought about load factor by itemcount vs slotcount, more common? And I really like the power of 2 idea and remove modulo/divide. Thanks again. \$\endgroup\$Oliver Schönrock– Oliver Schönrock2021年01月18日 23:13:55 +00:00Commented Jan 18, 2021 at 23:13
-
\$\begingroup\$ Are you sure this is right?
table->itemcount >= 13 * (table->size >> 2)
I can't seem to make that work in my head. For me that would only trigger at >300% load factor? Or am I missing something? shouldn't it betable->itemcount * 10 >= (table->size << 3)
for an 80% cutoff? And no restriction for size >=16 (which breaks my tests). In fact if I do the swap to itemcount, then the threshhold should be more lke 160% but that's easy to adjust. \$\endgroup\$Oliver Schönrock– Oliver Schönrock2021年01月19日 00:54:09 +00:00Commented Jan 19, 2021 at 0:54 -
\$\begingroup\$ Interestingly, removing the
%
modulo inht_hash
gave a 20% gain immediately, whereas changing thatif
inht_grow
did nothing at all (similar number of calls). I guess on a big Intel CPU division is fast (but modulo is not), so unless I am going for an 8-bit pic18 the division is probably more readable? (the 8-bit pic can't handle uint64 anyway, nor malloc etc etc...) \$\endgroup\$Oliver Schönrock– Oliver Schönrock2021年01月19日 01:13:10 +00:00Commented Jan 19, 2021 at 1:13 -
\$\begingroup\$ It should've been
>> 4
, which is equal to/ 16
. In fact you can just write the latter, the compiler is able to optimize that because it can see it's a constant. Your option to do... * 10 >= ... << 3
should also work, but it might have issues with very large values oftable->size
. \$\endgroup\$G. Sliepen– G. Sliepen2021年01月19日 07:24:32 +00:00Commented Jan 19, 2021 at 7:24 -
1\$\begingroup\$ Thought you might like this commit: github.com/oschonrock/cproj/commit/… \$\endgroup\$Oliver Schönrock– Oliver Schönrock2021年01月20日 20:30:00 +00:00Commented Jan 20, 2021 at 20:30