Hashtable with separate chaining and dynamic resizing

Question 1

An attempt at a clean implementation of this important data structure in C.

It's hard to make such structures generic in C without losing performance, so this specialises on char* keys and int values, but uses some type aliases, such that only a few places need changing to change the key or value types.

The hash table dynamically grows by 2x when 80% of the slots are used to keep efficiency high.

As a demo, we parse the complete works of Shakespeare, call ht_inc() for each word. Then build a flat "view" of the hashtable, sort and report the 10 most common words. On my machine the entire task takes just 50ms.

Feedback on quality of coding style please.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef char* ht_key_t;
typedef int ht_value_t;
// key => value plus pointer to next item for hash collisions
typedef struct HashTableItem HashTableItem;
struct HashTableItem {
 ht_key_t key;
 ht_value_t value;
 HashTableItem* next;
};
// Array of pointers to HashTableItems, plus counters
typedef struct HashTable {
 HashTableItem** slots;
 size_t size; // how many slots are there
 size_t scount; // how many slots are used
 size_t itemcount; // how many items are there
} HashTable;
// Creates a new HashTableItem 
HashTableItem* ht_create_item(ht_key_t key, ht_value_t value) {
 HashTableItem* item = malloc(sizeof *item);
 if (!item) {
 perror("malloc item");
 exit(EXIT_FAILURE);
 }
 item->key = strdup(key); // take a copy
 item->value = value;
 item->next = NULL;
 return item;
}
// Creates a new HashTable
HashTable* ht_create(size_t size) {
 HashTable* table = malloc(sizeof *table);
 if (!table) {
 perror("malloc table");
 exit(EXIT_FAILURE);
 }
 table->size = size;
 table->scount = 0;
 table->itemcount = 0;
 table->slots = calloc(table->size, sizeof *table->slots); // NOLINT
 if (!table->slots) {
 perror("calloc items");
 exit(EXIT_FAILURE);
 }
 return table;
}
// Frees an item depending on their types, if the key or value members
// need freeing that needs to happen here too
void ht_free_item(HashTableItem* item) {
 free(item->key);
 free(item);
}
// Frees the whole hashtable
void ht_free(HashTable* table) {
 // free the HashTableItems in the linked lists
 for (size_t i = 0; i < table->size; i++) {
 HashTableItem* item = table->slots[i];
 while (item) {
 HashTableItem* next = item->next;
 ht_free_item(item);
 item = next;
 }
 }
 // free the array of pointers to HashTableItems
 free(table->slots);
 free(table);
}
// hash function. crucial to efficient operation
// returns value in range [0, size)
// an appropriate hash function for short strings is FNV-1a
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function#FNV-1a_hash
size_t ht_hash(size_t size, const char* str) {
 uint64_t hash = 0xcbf29ce484222325; // FNV_offset_basis
 while (*str) hash = (hash ^ (uint8_t)*str++) * 0x100000001b3; // FNV_prime
 return hash % size; // fit to table
}
void ht_grow(HashTable* table) {
 table->scount++;
 if (table->scount * 100 / table->size >= 80) {
 size_t nsize = table->size * 2;
 HashTableItem** nslots = calloc(nsize, sizeof *table->slots); // NOLINT
 if (!nslots) {
 perror("calloc nitems");
 exit(EXIT_FAILURE);
 }
 size_t ncount = 0;
 for (size_t i = 0; i < table->size; i++) {
 HashTableItem* item = table->slots[i];
 while (item) {
 HashTableItem* next = item->next; // save next
 HashTableItem** nslot = &nslots[ht_hash(nsize, item->key)];
 if (!*nslot) ++ncount;
 item->next = *nslot; // push into new list
 *nslot = item;
 item = next;
 }
 }
 free(table->slots);
 table->scount = ncount;
 table->size = nsize;
 table->slots = nslots;
 }
}
// Inserts an item (or updates if exists)
void ht_insert(HashTable* table, ht_key_t key, ht_value_t value) {
 HashTableItem** slot = &table->slots[ht_hash(table->size, key)];
 HashTableItem* item = *slot;
 bool direct_slot = true;
 while (item) {
 if (strcmp(item->key, key) == 0) {
 // exists, update value, free old value if needed
 item->value = value;
 return;
 }
 slot = &item->next;
 item = *slot;
 direct_slot = false;
 }
 *slot = ht_create_item(key, value); // new entry
 ++table->itemcount;
 if (direct_slot) ht_grow(table); // HashTable accounting
}
// increments the value for a key or inserts with value = 1
// specialised for ht_value_t=int and faster than search then update.
void ht_inc(HashTable* table, ht_key_t key) {
 HashTableItem** slot = &table->slots[ht_hash(table->size, key)];
 HashTableItem* item = *slot;
 bool direct_slot = true;
 while (item) {
 if (strcmp(item->key, key) == 0) {
 ++item->value;
 return;
 }
 slot = &item->next;
 item = *slot;
 direct_slot = false;
 }
 *slot = ht_create_item(key, 1); // not found, init with one
 ++table->itemcount;
 if (direct_slot) ht_grow(table); // HashTable accounting
}
// Deletes an item from the table
void ht_delete(HashTable* table, ht_key_t key) {
 HashTableItem** slot = &table->slots[ht_hash(table->size, key)];
 HashTableItem* item = *slot;
 bool direct_slot = true;
 while (item) {
 if (strcmp(item->key, key) == 0) {
 *slot = item->next; // remove item from linked list
 ht_free_item(item);
 --table->itemcount;
 if (direct_slot) --table->scount; // HashTable accounting
 return;
 }
 slot = &item->next;
 item = *slot;
 direct_slot = false;
 }
}
// Searches the key in the hashtable
// and returns NULL ptr if it doesn't exist
HashTableItem* ht_search(HashTable* table, ht_key_t key) {
 HashTableItem* item = table->slots[ht_hash(table->size, key)];
 while (item) {
 if (strcmp(item->key, key) == 0) return item;
 item = item->next;
 }
 return NULL;
}
// debug printing. customise printf format strings by key & value types
void ht_print(HashTable* table) {
 printf("\n---- Hash Table ---\n");
 for (size_t i = 0; i < table->size; i++) {
 printf("@%zu: ", i);
 HashTableItem* item = table->slots[i];
 while (item) {
 printf("%s => %d | ", item->key, item->value);
 item = item->next;
 }
 printf("\n");
 }
 printf("-------------------\n");
}
// debug printing. customise printf format strings by key & value types
void ht_print_search(HashTable* table, ht_key_t key) {
 HashTableItem* val;
 if ((val = ht_search(table, key)) == NULL) {
 printf("Key:%s does not exist\n", key);
 return;
 }
 printf("Key:%s => %d\n", key, val->value);
}
static inline bool ht_is_alpha(char c) {
 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
static inline char ht_tolower(char c) { return (c < 'a') ? c + 'a' - 'A' : c; }
int cmp_ht_items(const void* a, const void* b) {
 int a_val = (*(HashTableItem**)a)->value;
 int b_val = (*(HashTableItem**)b)->value;
 if (a_val == b_val) return 0;
 return a_val < b_val ? 1 : -1;
}
int main() {
 // some tests
 HashTable* ht = ht_create(4);
 ht_insert(ht, "1", 10);
 ht_insert(ht, "2", 20);
 ht_insert(ht, "Hel3", 30);
 ht_insert(ht, "Cau4", 40);
 ht_print(ht);
 ht_insert(ht, "Cau6", 60); // grow!
 ht_print(ht);
 ht_inc(ht, "11"); // increment
 ht_inc(ht, "21"); // increment
 ht_inc(ht, "31"); // increment
 ht_inc(ht, "41"); // increment
 ht_inc(ht, "51"); // increment
 ht_inc(ht, "61"); // increment
 ht_inc(ht, "71"); // increment
 ht_inc(ht, "81"); // increment and grow again
 ht_print(ht);
 ht_print_search(ht, "1");
 ht_print_search(ht, "2");
 ht_print_search(ht, "3");
 ht_print_search(ht, "Hel3");
 ht_print_search(ht, "Cau4");
 ht_insert(ht, "Cau6", 61); // update
 ht_inc(ht, "1"); // increment
 ht_inc(ht, "Hel3"); // increment
 ht_inc(ht, "new"); // increment
 ht_print(ht);
 ht_delete(ht, "Hel3");
 ht_delete(ht, "Cau6");
 ht_delete(ht, "1");
 ht_print(ht);
 ht_free(ht);
 // end of tests
 // shakespeare demo
 FILE* fp = fopen("shakespeare.txt", "re");
 if (!fp) {
 perror("fopen");
 exit(EXIT_FAILURE);
 }
 ht = ht_create(32 * 1024);
#define BUFSIZE 1024
#define WORDSIZE 50
 char buf[BUFSIZE];
 char word[WORDSIZE];
 char* word_ptr = word;
 while (!ferror(fp) && !feof(fp)) {
 size_t bytes_read = fread(buf, 1, BUFSIZE, fp);
 char* buf_ptr = buf;
 while ((buf_ptr < buf + bytes_read)) {
 char c = *buf_ptr;
 if (ht_is_alpha(c)) {
 *(word_ptr++) = ht_tolower(c);
 if (word_ptr - word == WORDSIZE - 1) { // -1 for NULL terminator
 fputs("word too long. terminating\n", stderr);
 exit(EXIT_FAILURE);
 }
 } else if (word_ptr == word) {
 // ignore repeated terminators
 } else {
 *(word_ptr++) = '0円';
 ht_inc(ht, word); // record
 word_ptr = word; // restart new word
 }
 ++buf_ptr; // next char from buf
 }
 }
 fclose(fp);
 // build a flat view of the hashtable items
 HashTableItem** itemview = malloc(ht->itemcount * sizeof *itemview); // NOLINT
 if (!itemview) {
 perror("malloc itemview");
 exit(EXIT_FAILURE);
 }
 HashTableItem** curritem = itemview;
 for (size_t i = 0; i < ht->size; i++) {
 HashTableItem* item = ht->slots[i];
 while (item) {
 *curritem++ = item;
 item = item->next;
 }
 }
 // now sort that view and print the top 10
 qsort(itemview, ht->itemcount, sizeof *itemview, cmp_ht_items); // NOLINT
 for (size_t i = 0; i < 10; i++)
 printf("%s => %d\n", itemview[i]->key, itemview[i]->value);
 free(itemview);
 // ht_print(ht);
 ht_free(ht);
}

Question 2

Make more use of `calloc()`

I would use calloc() almost everywhere. It guarantees the memory is zeroed, so you won't have surprises if you forgot to initialize a member variable. I wouldn't worry about performance much; the compiler should be able to optimize the zeroing away if it sees you are setting a member variable to something else. So, use it in ht_create_item() and ht_create() where you currently use malloc().

Consider avoiding a layer of indirection

Your hash table has an array slots that contains pointers to items, however items themselves are quite small, they are just two pointers and an int. Since you aim at a load factor of 0.4 to 0.8, it will use less memory to remove the indirection:

typedef struct HashTable {
 HashTableItem* slots;
 ...
} HashTable;

This should also give a small performance gain.

Avoid code duplication

ht_inc() and ht_delete() look very similar. It might be possible to put the common parts into a separate function. I think you can do this by creating a function that finds the pointer to where the item should be/go, and return a pointer to that pointer. Basically, a ht_search() that returns the slot instead of the item itself.

static HashTableItem** ht_search_slot(HashTable* table, ht_key_t key, bool *direct) {
 HashTableItem** slot = &table->slots[ht_hash(table->size, key)];
 *direct = true;
 while (*slot) {
 if (strcmp((*slot)->key, key) == 0) break;
 *direct = false;
 slot = &(*slot)->next;
 }
 return slot;
}
void ht_inc(HashTable* table, ht_key_t key) {
 bool direct_slot;
 HashTableItem** slot = ht_search_slot(table, key, &direct_slot);
 HashTableItem* item = *slot;
 if (item) {
 ++item->value;
 return;
 }
 *slot = ht_create_item(key, 1); // not found, init with one
 ++table->itemcount;
 if (direct_slot) ht_grow(table); // HashTable accounting
}
void ht_delete(HashTable* table, ht_key_t key) {
 bool direct_slot;
 HashTableItem** slot = ht_search_slot(table, key, &direct_slot);
 HashTableItem* item = *slot;
 if (item) {
 *slot = item->next; // remove item from linked list
 ht_free_item(item);
 --table->itemcount;
 if (direct_slot) --table->scount; // HashTable accounting
 }
}
HashTableItem* ht_search(HashTable* table, ht_key_t key) {
 bool direct_slot;
 return *ht_search_slot(table, key, &direct_slot);
}

Of course, this approach will conflict with the above advice of avoiding indirection, although perhaps something similar is still possible.

Always call `ht_grow()` when inserting elements

You are only calling ht_grow() when an empty direct slot is used. However, if an item is inserted in an already used slot, you don't call it. That means that if you are unlucky, everything hashes to the same slot, and you will never grow and rebalance. I would play it safe and just unconditionally call ht_grow() when inserting an element. Of course, ht_grow() should use itemcount instead of scount to check whether it needs to grow the hash table.

Avoid division and modulo operations

Divisions (and thus also modulo operations) are quite expensive. I would avoid them by ensuring the hash table size is always a power of two. And if you ensure that the size is at least 16 elements, then you can replace the check in ht_grow() by:

if (table->itemcount >= 13 * (table->size >> 4)) {
 ...
}

This will trigger a resize when the hash table is 81.25% full. You can even get rid of the multiplication by storing the binary logarithm of the size, but unless you plan to run this code on microcontrollers I would not bother with that. And in the hash function you can replace the modulo this way:

return hash & (size - 1);

Question 3

Good stuff, thanks. I considered most of these. I kept the indirection to not tie myself to this key=>value structure, but it's certainly debatable. I considered calloc, meh.. The code duplication is very interesting and I was still wrestling with this. Each function is only 10 lines, but they are "somewhat complex" pointer juggling. So not not just duplication but also clarity might improve if I factor out "finding the slot incl linked list part". I thought about load factor by itemcount vs slotcount, more common? And I really like the power of 2 idea and remove modulo/divide. Thanks again.

Question 4

Are you sure this is right? table->itemcount >= 13 * (table->size >> 2) I can't seem to make that work in my head. For me that would only trigger at >300% load factor? Or am I missing something? shouldn't it be table->itemcount * 10 >= (table->size << 3) for an 80% cutoff? And no restriction for size >=16 (which breaks my tests). In fact if I do the swap to itemcount, then the threshhold should be more lke 160% but that's easy to adjust.

Question 5

Interestingly, removing the % modulo in ht_hash gave a 20% gain immediately, whereas changing that if in ht_grow did nothing at all (similar number of calls). I guess on a big Intel CPU division is fast (but modulo is not), so unless I am going for an 8-bit pic18 the division is probably more readable? (the 8-bit pic can't handle uint64 anyway, nor malloc etc etc...)

Question 6

It should've been >> 4, which is equal to / 16. In fact you can just write the latter, the compiler is able to optimize that because it can see it's a constant. Your option to do ... * 10 >= ... << 3 should also work, but it might have issues with very large values of table->size.

Question 7

Thought you might like this commit: github.com/oschonrock/cproj/commit/…

G. Sliepen G. Sliepen 69k3 gold badges74 silver badges180 bronze badges · Accepted Answer · 2021-01-18 22:35:41Z

Make more use of `calloc()`

I would use calloc() almost everywhere. It guarantees the memory is zeroed, so you won't have surprises if you forgot to initialize a member variable. I wouldn't worry about performance much; the compiler should be able to optimize the zeroing away if it sees you are setting a member variable to something else. So, use it in ht_create_item() and ht_create() where you currently use malloc().

Consider avoiding a layer of indirection

Your hash table has an array slots that contains pointers to items, however items themselves are quite small, they are just two pointers and an int. Since you aim at a load factor of 0.4 to 0.8, it will use less memory to remove the indirection:

typedef struct HashTable {
 HashTableItem* slots;
 ...
} HashTable;

This should also give a small performance gain.

Avoid code duplication

ht_inc() and ht_delete() look very similar. It might be possible to put the common parts into a separate function. I think you can do this by creating a function that finds the pointer to where the item should be/go, and return a pointer to that pointer. Basically, a ht_search() that returns the slot instead of the item itself.

static HashTableItem** ht_search_slot(HashTable* table, ht_key_t key, bool *direct) {
 HashTableItem** slot = &table->slots[ht_hash(table->size, key)];
 *direct = true;
 while (*slot) {
 if (strcmp((*slot)->key, key) == 0) break;
 *direct = false;
 slot = &(*slot)->next;
 }
 return slot;
}
void ht_inc(HashTable* table, ht_key_t key) {
 bool direct_slot;
 HashTableItem** slot = ht_search_slot(table, key, &direct_slot);
 HashTableItem* item = *slot;
 if (item) {
 ++item->value;
 return;
 }
 *slot = ht_create_item(key, 1); // not found, init with one
 ++table->itemcount;
 if (direct_slot) ht_grow(table); // HashTable accounting
}
void ht_delete(HashTable* table, ht_key_t key) {
 bool direct_slot;
 HashTableItem** slot = ht_search_slot(table, key, &direct_slot);
 HashTableItem* item = *slot;
 if (item) {
 *slot = item->next; // remove item from linked list
 ht_free_item(item);
 --table->itemcount;
 if (direct_slot) --table->scount; // HashTable accounting
 }
}
HashTableItem* ht_search(HashTable* table, ht_key_t key) {
 bool direct_slot;
 return *ht_search_slot(table, key, &direct_slot);
}

Of course, this approach will conflict with the above advice of avoiding indirection, although perhaps something similar is still possible.

Always call `ht_grow()` when inserting elements

You are only calling ht_grow() when an empty direct slot is used. However, if an item is inserted in an already used slot, you don't call it. That means that if you are unlucky, everything hashes to the same slot, and you will never grow and rebalance. I would play it safe and just unconditionally call ht_grow() when inserting an element. Of course, ht_grow() should use itemcount instead of scount to check whether it needs to grow the hash table.

Avoid division and modulo operations

Divisions (and thus also modulo operations) are quite expensive. I would avoid them by ensuring the hash table size is always a power of two. And if you ensure that the size is at least 16 elements, then you can replace the check in ht_grow() by:

if (table->itemcount >= 13 * (table->size >> 4)) {
 ...
}

This will trigger a resize when the hash table is 81.25% full. You can even get rid of the multiplication by storing the binary logarithm of the size, but unless you plan to run this code on microcontrollers I would not bother with that. And in the hash function you can replace the modulo this way:

return hash & (size - 1);

Good stuff, thanks. I considered most of these. I kept the indirection to not tie myself to this key=>value structure, but it's certainly debatable. I considered calloc, meh.. The code duplication is very interesting and I was still wrestling with this. Each function is only 10 lines, but they are "somewhat complex" pointer juggling. So not not just duplication but also clarity might improve if I factor out "finding the slot incl linked list part". I thought about load factor by itemcount vs slotcount, more common? And I really like the power of 2 idea and remove modulo/divide. Thanks again.
Are you sure this is right? table->itemcount >= 13 * (table->size >> 2) I can't seem to make that work in my head. For me that would only trigger at >300% load factor? Or am I missing something? shouldn't it be table->itemcount * 10 >= (table->size << 3) for an 80% cutoff? And no restriction for size >=16 (which breaks my tests). In fact if I do the swap to itemcount, then the threshhold should be more lke 160% but that's easy to adjust.
Interestingly, removing the % modulo in ht_hash gave a 20% gain immediately, whereas changing that if in ht_grow did nothing at all (similar number of calls). I guess on a big Intel CPU division is fast (but modulo is not), so unless I am going for an 8-bit pic18 the division is probably more readable? (the 8-bit pic can't handle uint64 anyway, nor malloc etc etc...)
It should've been >> 4, which is equal to / 16. In fact you can just write the latter, the compiler is able to optimize that because it can see it's a constant. Your option to do ... * 10 >= ... << 3 should also work, but it might have issues with very large values of table->size.
Thought you might like this commit: github.com/oschonrock/cproj/commit/…

Stack Exchange Network

Hashtable with separate chaining and dynamic resizing

1 Answer 1

Make more use of `calloc()`

Consider avoiding a layer of indirection

Avoid code duplication

Always call `ht_grow()` when inserting elements

Avoid division and modulo operations

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Hashtable with separate chaining and dynamic resizing

1 Answer 1

Make more use of calloc()

Consider avoiding a layer of indirection

Avoid code duplication

Always call ht_grow() when inserting elements

Avoid division and modulo operations

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions

Make more use of `calloc()`

Always call `ht_grow()` when inserting elements