I've written a C implementation of a hashmap of strings including the basic operations and a strategy for handling hash collisons.
Additionally, I've included some tests, specifically for cases involving hash collisions which, I confess, gave me a bit of trouble initially.
Thank you in advance for your time :)
ps, if you prefer to view on github: https://github.com/JosephSBoyle/HashMap
hashmap.h
#include <stddef.h>
#include "hashmap.c"
/* Create a hashmap.*/
Node** hm_create(void);
/* Add an item to the hashmap */
void hm_add(Node** hmap, char key[], char value[]);
/* Remove an item from the hashmap */
void hm_del(Node** hmap, char key[]);
/** Get an item from the hashmap
* @returns the value corresponding to 'key' or the terminating character '0円'
* if no such item exists. */
char* hm_get(Node** hmap, char key[]);
hashmap.c
#define BUCKETS 1000000
#define PRIME_SEED_1 17
#define PRIME_SEED_2 97
#define KEY_CHARS 128
#define VALUE_CHARS 128
#define NODE_SIZE sizeof(Node)
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>
/* A key-value node in a single-linked list. */
typedef struct Node Node;
struct Node {
Node* next;
char key[KEY_CHARS];
char value[VALUE_CHARS];
};
Node* create_sentinel_node(){
void* ptr = malloc(NODE_SIZE);
if (ptr == NULL){
exit(EXIT_FAILURE);
} else {
Node* node = (Node*)ptr; // safely typecast the pointer
node->next = NULL;
strcpy(node->key, "0円");
strcpy(node->value, "0円");
return node;
}
}
size_t modular_hash(char* string) {
size_t hash = 0;
for (char* character=string; *character != '0円'; character++) {
hash += ((int)*character + PRIME_SEED_1) * PRIME_SEED_2;
}
return hash;
}
Node** hm_create(void){
static Node* keys[BUCKETS] = {};
for (size_t i=0; i<BUCKETS; i++){
keys[i] = create_sentinel_node();
}
return keys;
}
void hm_add(Node** hmap, char key[], char value[]){
Node* node = hmap[modular_hash(key)];
if (strcmp(node->value, "0円") == 0){
// there's no item in this bucket.
strcpy(node->key, key);
strcpy(node->value, value);
return;
}
while(true){
if (strcmp(node->key, key) == 0){
// the key already exists as a node in the linked list
// so we can simply replace the value.
strcpy(node->value, value);
return;
}
if (node->next == NULL){
// no more nodes, append a new node to the linked list.
void* ptr = malloc(NODE_SIZE);
if (ptr == NULL){
exit(EXIT_FAILURE);
}
Node* new_node = (Node*)ptr; // safely typecast the pointer
new_node->next = NULL;
strcpy(node->value, value);
strcpy(new_node->key, key);
node->next = new_node;
return;
}
node = node->next;
}
}
void hm_del(Node** hmap, char key[]){
size_t khash = modular_hash(key);
Node* node = hmap[khash];
void* prev_node = NULL;
Node* head = node;
// find the node
while (strcmp(head->key, key) != 0){
if (head->next == NULL){
// the key doesn't exist in this hashmap.
return;
}
head = head->next;
prev_node = head;
}
// are we deleting the first node?
if (prev_node == NULL){
// are we deleting the only node?
if (head->next == NULL){
// reset the node to the sentinel node.
strcpy(node->value, "0円");
strcpy(node->key, "0円");
} else {
// we are deleting the first of n>1 nodes.
hmap[khash] = head->next;
}
} else {
// there must be a previous node.
Node* prev_node = (Node*)prev_node;
// is this the last node?
if (head->next == NULL){
prev_node->next = NULL;
} else {
// we are deleting neither the first nor the last node.
prev_node->next = head->next;
}
free(head);
}
}
char* hm_get(Node** hmap, char key[]){
Node* node = hmap[modular_hash(key)];
// skip list elements in the bucket (linked list) that don't match our key.
while (strcmp(node->key, key) != 0){
if (node->next==NULL){
return "0円";
}
node = node->next;
}
return node->value;
}
tests.c
#include "hashmap.h"
/** TESTS
* These tests test the collision handling of the hashmap.
* NOTE: To cause collisions, replace the return value of the hash function with a constant int, e.g `return 0;`
*/
void main(){
// test adding and getting an item
Node** hm0 = hm_create();
hm_add(hm0, "bob", "x");
char* a = hm_get(hm0, "bob");
assert(strcmp(a, "x") == 0);
// test deleting an item that doesn't exist
Node** hm1 = hm_create();
hm_del(hm1, "bob");
char* b = hm_get(hm1, "bob");
assert(strcmp(b, "0円") == 0);
// test getting a deleted item
Node** hm2 = hm_create();
hm_add(hm0, "bob", "the dog");
hm_del(hm2, "bob");
char* c = hm_get(hm2, "bob");
assert(strcmp(c, "0円") == 0);
// test overwriting a value
Node** hm3 = hm_create();
hm_add(hm3, "bob", "foo");
hm_add(hm3, "bob", "bar");
char* d = hm_get(hm3, "bob");
assert(strcmp(d, "bar") == 0);
// test deleting the last item
Node** hm4 = hm_create();
hm_add(hm4, "bob", "foo");
hm_add(hm4, "alice", "bar");
hm_add(hm4, "jane", "baz");
hm_del(hm4, "jane");
char* e = hm_get(hm4, "jane");
assert(strcmp(e, "0円") == 0);
// test deleting an item in the middle of the linked list.
Node** hm5 = hm_create();
hm_add(hm5, "bob", "abc");
hm_add(hm5, "alice", "123");
hm_add(hm5, "jane", "xyz");
hm_del(hm5, "alice");
char* f = hm_get(hm5, "alice");
assert(strcmp(f, "0円") == 0);
printf(
"############################\n"
"----- ALL TESTS PASSED -----\n"
"############################\n"
);
}
```
-
\$\begingroup\$ Note: I'm not sure my hash function is a great choice for quasi-evenly distributing items across the 'buckets'. Perhaps higher prime seed values could ameliorate that. \$\endgroup\$Joe Boyle– Joe Boyle2022年08月21日 18:27:41 +00:00Commented Aug 21, 2022 at 18:27
-
\$\begingroup\$ I will also note that my interface is missing a way to delete an entire hashmap. \$\endgroup\$Joe Boyle– Joe Boyle2022年08月21日 18:30:28 +00:00Commented Aug 21, 2022 at 18:30
1 Answer 1
hashmap.c
in hashmap.h
??
Very unusual to #include "hashmap.c"
in a header file. Header files are expected to not generate code, just define and declare things.
Best to remove #include "hashmap.c"
Strange string compare
Simplify
// if (strcmp(node->value, "0円") == 0){
if (node->value[0] == 0) {
// strcpy(node->key, "0円");
node->key[0] = 0;
Use const
When referenced data does not change, use const
.
// void hm_add(Node** hmap, char key[], char value[]){
void hm_add(Node** hmap, const char key[], const char value[]) {
Prevent buffer overflow
strcpy()
risks buffer overflow. Consider protection. Example:
// strcpy(node->value, value);
snprintf(node->value, sizeof node->value, "%s", value);
Of course if the string is too long, better code would detect that and complain. Even better, save a copy of the string with strdup()
.
Bug: hash
modular_hash()
does not limit its return value to [0...BUCKETS). I'd expect a mod:
size_t modular_hash(const char* string) {
size_t hash = 0;
for (char* character=string; *character != '0円'; character++) {
hash += ((int)*character + PRIME_SEED_1) * PRIME_SEED_2;
}
hash %= BUCKETS; // add
return hash;
}
Weak hash
Does not vary on order.
modular_hash("abc")
== modular_hash("bca")
== modular_hash("cab")
== ...
Tip: mod by prime
Using hash %= BUCKETS;
above, with BUCKETS
as a prime or not, makes little difference if the hash function was good. Yet with a modest or weak hash, using a prime for BUCKETS
improves the overall hash. I recommend changing BUCKETS
from 1000000 to a prime near that.
Pointless cast
Cast (int)*character
in modular_hash(char* string)
serves no purpose. If anything a cast to unsigned
makes some sense.
Good for only 1 map
hm_create(void)
uses static Node* keys[BUCKETS] = {};
limiting us to 1 hash map. Better to allocate to allow multiple instantiations of the hash map.
Unneeded include
#include <stddef.h>
not needed in hashmap.h
.
Unneeded cast
Casts does not improve safety - not needed to convert a void *
to some object pointer..
// Node* node = (Node*)ptr; // safely typecast the pointer
Node* node = ptr;
"0円"
??
Every place where "0円"
(2 null characters, one explicit, one implied) is used can be replaced with ""
.
Good test code
Return distinction
hm_get()
returns a pointer to ""
when none is found. Yet if the item in the table was ""
, we do not know if the function succeeded or failed. Instead, consider returning NULL
on failure.
Fixed hash size
A more advanced hash table would adjust the hash table size as needed.
Comments
IHO, a satisfying amount of comments.