Generic Macro Generated Hashset in C

Question 1

The Hashset uses open addressing, linear probing and Robin Hood hashing for handling collisions. It comes with insert and remove operations, iteration (back and forward using a single iterator), min and max keys (this is to interface with a treeset that I've made, but not very useful) and set operations like union, intersection, difference and symmetric_difference.

Concerns

This is the first time I've used Robin Hood hashing. It seems way too simple so I'm not sure if it is implemented correctly!
Is it better to have an array of entries (buckets) or an array of pointers to entries (initialized with NULL)?
Are there any improvements that can be done in this hashtable?

This data structure is generated using macros. They follow the same idea as two of my previous questions (this one and this one).

How to

You can use HASHSET_GENERATE to generate a hashset of any type you want. It has 4 parameters:

PFX - Functions prefix, or namespace;
SNAME - Structure name (typedef SNAME##_s SNAME;);
FMOD - Functions modifier (static or empty, edit: not sure if works with inline);
V - Your data type to be worked with.

Or you can generate each part individually using HASHSET_GENERATE_STRUCT, HASHSET_GENERATE_HEADER and HASHSET_GENERATE_SOURCE.

hashset.h

#ifndef CMC_HASHSET_H
#define CMC_HASHSET_H
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#ifndef CMC_HASH_TABLE_SETUP
#define CMC_HASH_TABLE_SETUP
typedef enum EntryState_e
{
 ES_DELETED = -1,
 ES_EMPTY = 0,
 ES_FILLED = 1
} EntryState;
static const size_t cmc_hashtable_primes[] = {53, 97, 193, 389, 769, 1543, 3079,
 6151, 12289, 24593, 49157, 98317,
 196613, 393241, 786433, 1572869,
 3145739, 6291469, 12582917,
 25165843, 50331653, 100663319,
 201326611, 402653189, 805306457,
 1610612741};
#endif /* CMC_HASH_TABLE_SETUP */
#define HASHSET_GENERATE(PFX, SNAME, FMOD, V) \
 HASHSET_GENERATE_STRUCT(PFX, SNAME, FMOD, V) \
 HASHSET_GENERATE_HEADER(PFX, SNAME, FMOD, V) \
 HASHSET_GENERATE_SOURCE(PFX, SNAME, FMOD, V)
/* PRIVATE *******************************************************************/
#define HASHSET_GENERATE_HEADER_PRIVATE(PFX, SNAME, FMOD, K, V) \
 HASHSET_GENERATE_HEADER(PFX, SNAME, FMOD, V)
#define HASHSET_GENERATE_SOURCE_PRIVATE(PFX, SNAME, FMOD, K, V) \
 HASHSET_GENERATE_STRUCT(PFX, SNAME, FMOD, V) \
 HASHSET_GENERATE_SOURCE(PFX, SNAME, FMOD, V)
/* PUBLIC ********************************************************************/
#define HASHSET_GENERATE_HEADER_PUBLIC(PFX, SNAME, FMOD, K, V) \
 HASHSET_GENERATE_STRUCT(PFX, SNAME, FMOD, V) \
 HASHSET_GENERATE_HEADER(PFX, SNAME, FMOD, V)
#define HASHSET_GENERATE_SOURCE_PUBLIC(PFX, SNAME, FMOD, K, V) \
 HASHSET_GENERATE_SOURCE(PFX, SNAME, FMOD, V)
/* STRUCT ********************************************************************/
#define HASHSET_GENERATE_STRUCT(PFX, SNAME, FMOD, V) \
 \
 struct SNAME##_s \
 { \
 struct SNAME##_entry_s *buffer; \
 size_t capacity; \
 size_t count; \
 double load; \
 int (*cmp)(V, V); \
 size_t (*hash)(V); \
 }; \
 \
 struct SNAME##_entry_s \
 { \
 V value; \
 size_t dist; \
 enum EntryState_e state; \
 }; \
 \
 struct SNAME##_iter_s \
 { \
 struct SNAME##_s *target; \
 size_t cursor; \
 size_t index; \
 size_t first; \
 size_t last; \
 bool start; \
 bool end; \
 }; \
 \
/* HEADER ********************************************************************/
#define HASHSET_GENERATE_HEADER(PFX, SNAME, FMOD, V) \
 \
 typedef struct SNAME##_s SNAME; \
 typedef struct SNAME##_entry_s SNAME##_entry; \
 typedef struct SNAME##_iter_s SNAME##_iter; \
 \
 FMOD SNAME *PFX##_new(size_t size, double load, int (*compare)(V, V), size_t (*hash)(V)); \
 FMOD void PFX##_clear(SNAME *_set_); \
 FMOD void PFX##_free(SNAME *_set_); \
 FMOD bool PFX##_insert(SNAME *_set_, V element); \
 FMOD bool PFX##_remove(SNAME *_set_, V element); \
 FMOD bool PFX##_insert_if(SNAME *_set_, V element, bool condition); \
 FMOD bool PFX##_remove_if(SNAME *_set_, V element, bool condition); \
 FMOD V PFX##_max(SNAME *_set_); \
 FMOD V PFX##_min(SNAME *_set_); \
 FMOD bool PFX##_empty(SNAME *_set_); \
 FMOD size_t PFX##_count(SNAME *_set_); \
 \
 FMOD SNAME *PFX##_union(SNAME *_set1_, SNAME *_set2_); \
 FMOD SNAME *PFX##_intersection(SNAME *_set1_, SNAME *_set2_); \
 FMOD SNAME *PFX##_difference(SNAME *_set1_, SNAME *_set2_); \
 FMOD SNAME *PFX##_symmetric_difference(SNAME *_set1_, SNAME *_set2_); \
 \
 FMOD void PFX##_iter_new(SNAME##_iter *iter, SNAME *target); \
 FMOD bool PFX##_iter_start(SNAME##_iter *iter); \
 FMOD bool PFX##_iter_end(SNAME##_iter *iter); \
 FMOD void PFX##_iter_tostart(SNAME##_iter *iter); \
 FMOD void PFX##_iter_toend(SNAME##_iter *iter); \
 FMOD bool PFX##_iter_next(SNAME##_iter *iter, V *value, size_t *index); \
 FMOD bool PFX##_iter_prev(SNAME##_iter *iter, V *value, size_t *index); \
 \
/* SOURCE ********************************************************************/
#define HASHSET_GENERATE_SOURCE(PFX, SNAME, FMOD, V) \
 \
 FMOD bool PFX##_grow(SNAME *_set_); \
 FMOD SNAME##_entry *PFX##_get_entry(SNAME *_set_, V element); \
 FMOD size_t PFX##_calculate_size(size_t required); \
 \
 FMOD SNAME *PFX##_new(size_t size, double load, int (*compare)(V, V), size_t (*hash)(V)) \
 { \
 if (size == 0 || load <= 0 || load >= 1) \
 return NULL; \
 \
 size_t real_size = PFX##_calculate_size(size); \
 \
 SNAME *_set_ = malloc(sizeof(SNAME)); \
 \
 if (!_set_) \
 return NULL; \
 \
 _set_->buffer = malloc(sizeof(SNAME##_entry) * real_size); \
 \
 if (!_set_->buffer) \
 { \
 free(_set_); \
 return NULL; \
 } \
 \
 memset(_set_->buffer, 0, sizeof(SNAME##_entry) * real_size); \
 \
 _set_->count = 0; \
 _set_->capacity = real_size; \
 _set_->load = load; \
 _set_->cmp = compare; \
 _set_->hash = hash; \
 \
 return _set_; \
 } \
 \
 FMOD void PFX##_clear(SNAME *_set_) \
 { \
 memset(_set_->buffer, 0, sizeof(SNAME##_entry) * _set_->capacity); \
 \
 _set_->count = 0; \
 } \
 \
 FMOD void PFX##_free(SNAME *_set_) \
 { \
 free(_set_->buffer); \
 free(_set_); \
 } \
 \
 FMOD bool PFX##_insert(SNAME *_set_, V element) \
 { \
 if ((double)_set_->capacity * _set_->load <= (double)_set_->count) \
 { \
 if (!PFX##_grow(_set_)) \
 return false; \
 } \
 \
 size_t hash = _set_->hash(element); \
 size_t original_pos = hash % _set_->capacity; \
 size_t pos = original_pos; \
 \
 SNAME##_entry *target = &(_set_->buffer[pos]); \
 \
 if (PFX##_get_entry(_set_, element) != NULL) \
 return false; \
 \
 if (target->state == ES_EMPTY || target->state == ES_DELETED) \
 { \
 target->value = element; \
 target->dist = pos - original_pos; \
 target->state = ES_FILLED; \
 } \
 else \
 { \
 while (true) \
 { \
 pos++; \
 target = &(_set_->buffer[pos % _set_->capacity]); \
 \
 if (target->state == ES_EMPTY || target->state == ES_DELETED) \
 { \
 target->value = element; \
 target->dist = pos - original_pos; \
 target->state = ES_FILLED; \
 \
 break; \
 } \
 else if (target->dist < pos - original_pos) \
 { \
 V tmp = target->value; \
 size_t tmp_dist = target->dist; \
 \
 target->value = element; \
 target->dist = pos - original_pos; \
 \
 element = tmp; \
 original_pos = pos - tmp_dist; \
 } \
 } \
 } \
 \
 _set_->count++; \
 \
 return true; \
 } \
 \
 FMOD bool PFX##_remove(SNAME *_set_, V element) \
 { \
 SNAME##_entry *result = PFX##_get_entry(_set_, element); \
 \
 if (result == NULL) \
 return false; \
 \
 result->value = 0; \
 result->dist = 0; \
 result->state = ES_DELETED; \
 \
 _set_->count--; \
 \
 return true; \
 } \
 \
 FMOD bool PFX##_insert_if(SNAME *_set_, V element, bool condition) \
 { \
 if (condition) \
 return PFX##_insert(_set_, element); \
 \
 return false; \
 } \
 \
 FMOD bool PFX##_remove_if(SNAME *_set_, V element, bool condition) \
 { \
 if (condition) \
 return PFX##_remove(_set_, element); \
 \
 return false; \
 } \
 \
 FMOD V PFX##_max(SNAME *_set_) \
 { \
 if (PFX##_empty(_set_)) \
 return 0; \
 \
 V result, max; \
 size_t index; \
 SNAME##_iter iter; \
 \
 for (PFX##_iter_new(&iter, _set_); !PFX##_iter_end(&iter);) \
 { \
 PFX##_iter_next(&iter, &result, &index); \
 \
 if (index == 0) \
 max = result; \
 else if (_set_->cmp(result, max) > 0) \
 max = result; \
 } \
 \
 return max; \
 } \
 \
 FMOD V PFX##_min(SNAME *_set_) \
 { \
 if (PFX##_empty(_set_)) \
 return 0; \
 \
 V result, min; \
 size_t index; \
 SNAME##_iter iter; \
 \
 for (PFX##_iter_new(&iter, _set_); !PFX##_iter_end(&iter);) \
 { \
 PFX##_iter_next(&iter, &result, &index); \
 \
 if (index == 0) \
 min = result; \
 else if (_set_->cmp(result, min) < 0) \
 min = result; \
 } \
 \
 return min; \
 } \
 \
 FMOD bool PFX##_empty(SNAME *_set_) \
 { \
 return _set_->count == 0; \
 } \
 \
 FMOD size_t PFX##_count(SNAME *_set_) \
 { \
 return _set_->count; \
 } \
 \
 FMOD SNAME *PFX##_union(SNAME *_set1_, SNAME *_set2_) \
 { \
 V value; \
 size_t index; \
 SNAME##_iter iter1, iter2; \
 \
 SNAME *_set_r_ = PFX##_new(_set1_->capacity, _set1_->load, _set1_->cmp, _set1_->hash); \
 \
 if (!_set_r_) \
 return false; \
 \
 PFX##_iter_new(&iter1, _set1_); \
 PFX##_iter_new(&iter2, _set2_); \
 \
 for (PFX##_iter_tostart(&iter1); !PFX##_iter_end(&iter1);) \
 { \
 PFX##_iter_next(&iter1, &value, &index); \
 PFX##_insert(_set_r_, value); \
 } \
 \
 for (PFX##_iter_tostart(&iter2); !PFX##_iter_end(&iter2);) \
 { \
 PFX##_iter_next(&iter2, &value, &index); \
 PFX##_insert(_set_r_, value); \
 } \
 \
 return _set_r_; \
 } \
 \
 FMOD SNAME *PFX##_intersection(SNAME *_set1_, SNAME *_set2_) \
 { \
 V value; \
 size_t index; \
 SNAME##_iter iter; \
 \
 SNAME *_set_r_ = PFX##_new(_set1_->capacity, _set1_->load, _set1_->cmp, _set1_->hash); \
 \
 if (!_set_r_) \
 return false; \
 \
 SNAME *_set_A_ = _set1_->count < _set2_->count ? _set1_ : _set2_; \
 SNAME *_set_B_ = _set_A_ == _set1_ ? _set2_ : _set1_; \
 \
 PFX##_iter_new(&iter, _set_A_); \
 \
 for (PFX##_iter_tostart(&iter); !PFX##_iter_end(&iter);) \
 { \
 PFX##_iter_next(&iter, &value, &index); \
 \
 if (PFX##_get_entry(_set_B_, value) != NULL) \
 PFX##_insert(_set_r_, value); \
 } \
 \
 return _set_r_; \
 } \
 \
 FMOD SNAME *PFX##_difference(SNAME *_set1_, SNAME *_set2_) \
 { \
 V value; \
 size_t index; \
 SNAME##_iter iter; \
 \
 SNAME *_set_r_ = PFX##_new(_set1_->capacity, _set1_->load, _set1_->cmp, _set1_->hash); \
 \
 if (!_set_r_) \
 return false; \
 \
 PFX##_iter_new(&iter, _set1_); \
 \
 for (PFX##_iter_tostart(&iter); !PFX##_iter_end(&iter);) \
 { \
 PFX##_iter_next(&iter, &value, &index); \
 \
 if (PFX##_get_entry(_set2_, value) == NULL) \
 PFX##_insert(_set_r_, value); \
 } \
 \
 return _set_r_; \
 } \
 \
 FMOD SNAME *PFX##_symmetric_difference(SNAME *_set1_, SNAME *_set2_) \
 { \
 V value; \
 size_t index; \
 SNAME##_iter iter1, iter2; \
 \
 SNAME *_set_r_ = PFX##_new(_set1_->capacity, _set1_->load, _set1_->cmp, _set1_->hash); \
 \
 if (!_set_r_) \
 return false; \
 \
 PFX##_iter_new(&iter1, _set1_); \
 PFX##_iter_new(&iter2, _set2_); \
 \
 for (PFX##_iter_tostart(&iter1); !PFX##_iter_end(&iter1);) \
 { \
 PFX##_iter_next(&iter1, &value, &index); \
 \
 if (PFX##_get_entry(_set2_, value) == NULL) \
 PFX##_insert(_set_r_, value); \
 } \
 \
 for (PFX##_iter_tostart(&iter2); !PFX##_iter_end(&iter2);) \
 { \
 PFX##_iter_next(&iter2, &value, &index); \
 \
 if (PFX##_get_entry(_set1_, value) == NULL) \
 PFX##_insert(_set_r_, value); \
 } \
 \
 return _set_r_; \
 } \
 \
 FMOD void PFX##_iter_new(SNAME##_iter *iter, SNAME *target) \
 { \
 iter->target = target; \
 iter->cursor = 0; \
 iter->index = 0; \
 iter->start = true; \
 iter->end = PFX##_empty(target); \
 \
 if (!PFX##_empty(target)) \
 { \
 for (size_t i = 0; i < target->capacity; i++) \
 { \
 if (target->buffer[i].state == ES_FILLED) \
 { \
 iter->first = i; \
 break; \
 } \
 } \
 \
 iter->cursor = iter->first; \
 \
 for (size_t i = target->capacity; i > 0; i--) \
 { \
 if (target->buffer[i - 1].state == ES_FILLED) \
 { \
 iter->last = i - 1; \
 break; \
 } \
 } \
 } \
 } \
 \
 FMOD bool PFX##_iter_start(SNAME##_iter *iter) \
 { \
 return PFX##_empty(iter->target) || iter->start; \
 } \
 \
 FMOD bool PFX##_iter_end(SNAME##_iter *iter) \
 { \
 return PFX##_empty(iter->target) || iter->end; \
 } \
 \
 FMOD void PFX##_iter_tostart(SNAME##_iter *iter) \
 { \
 iter->cursor = iter->first; \
 iter->index = 0; \
 iter->start = true; \
 iter->end = PFX##_empty(iter->target); \
 } \
 \
 FMOD void PFX##_iter_toend(SNAME##_iter *iter) \
 { \
 iter->cursor = iter->last; \
 iter->index = iter->target->count - 1; \
 iter->start = PFX##_empty(iter->target); \
 iter->end = true; \
 } \
 \
 FMOD bool PFX##_iter_next(SNAME##_iter *iter, V *value, size_t *index) \
 { \
 if (iter->end) \
 return false; \
 \
 SNAME##_entry *scan = &(iter->target->buffer[iter->cursor]); \
 \
 *value = scan->value; \
 *index = iter->index; \
 \
 if (iter->cursor == iter->last) \
 iter->end = true; \
 else \
 { \
 iter->index++; \
 \
 while (1) \
 { \
 iter->cursor++; \
 scan = &(iter->target->buffer[iter->cursor]); \
 \
 if (scan->state == ES_FILLED) \
 break; \
 } \
 } \
 \
 iter->start = PFX##_empty(iter->target); \
 \
 return true; \
 } \
 \
 FMOD bool PFX##_iter_prev(SNAME##_iter *iter, V *value, size_t *index) \
 { \
 if (iter->start) \
 return false; \
 \
 SNAME##_entry *scan = &(iter->target->buffer[iter->cursor]); \
 \
 *value = scan->value; \
 *index = iter->index; \
 \
 if (iter->cursor == iter->first) \
 iter->start = true; \
 else \
 { \
 iter->index--; \
 \
 while (1) \
 { \
 iter->cursor--; \
 scan = &(iter->target->buffer[iter->cursor]); \
 \
 if (scan->state == ES_FILLED) \
 break; \
 } \
 } \
 \
 iter->end = PFX##_empty(iter->target); \
 \
 return true; \
 } \
 \
 FMOD bool PFX##_grow(SNAME *_set_) \
 { \
 size_t new_size = PFX##_calculate_size((size_t)((double)_set_->capacity * 1.5)); \
 \
 SNAME *_new_set_ = PFX##_new(new_size, _set_->load, _set_->cmp, _set_->hash); \
 \
 if (!_new_set_) \
 return false; \
 \
 V value; \
 size_t index; \
 SNAME##_iter iter; \
 \
 for (PFX##_iter_new(&iter, _set_); !PFX##_iter_end(&iter);) \
 { \
 PFX##_iter_next(&iter, &value, &index); \
 PFX##_insert(_new_set_, value); \
 } \
 \
 if (_set_->count != _new_set_->count) \
 { \
 PFX##_free(_new_set_); \
 \
 return false; \
 } \
 \
 SNAME##_entry *tmp = _set_->buffer; \
 _set_->buffer = _new_set_->buffer; \
 _new_set_->buffer = tmp; \
 \
 _set_->capacity = _new_set_->capacity; \
 \
 PFX##_free(_new_set_); \
 \
 return true; \
 } \
 \
 FMOD SNAME##_entry *PFX##_get_entry(SNAME *_set_, V element) \
 { \
 size_t hash = _set_->hash(element); \
 size_t pos = hash % _set_->capacity; \
 \
 SNAME##_entry *target = &(_set_->buffer[pos % _set_->capacity]); \
 \
 while (target->state == ES_FILLED || target->state == ES_DELETED) \
 { \
 if (_set_->cmp(target->value, element) == 0) \
 return target; \
 \
 pos++; \
 target = &(_set_->buffer[pos % _set_->capacity]); \
 } \
 \
 return NULL; \
 } \
 \
 FMOD size_t PFX##_calculate_size(size_t required) \
 { \
 const size_t count = sizeof(cmc_hashtable_primes) / sizeof(cmc_hashtable_primes[0]); \
 \
 if (cmc_hashtable_primes[count - 1] < required) \
 return required; \
 \
 size_t i = 0; \
 while (cmc_hashtable_primes[i] < required) \
 i++; \
 \
 return cmc_hashtable_primes[i]; \
 }
#endif /* CMC_HASHSET_H */

EXAMPLE

#include <stdio.h>
#include <stdlib.h>
#include "hashset.h"
HASHSET_GENERATE(set, hash_set, /* static */, int)
void print_set(hash_set *s)
{
 size_t index;
 int result;
 hash_set_iter iter;
 set_iter_new(&iter, s);
 for (set_iter_tostart(&iter); !set_iter_end(&iter);)
 {
 set_iter_next(&iter, &result, &index);
 if (index == 0)
 printf("[ %2d, ", result);
 else if (index == s->count - 1)
 printf("%2d ]\n", result);
 else
 printf("%2d, ", result);
 }
}
int intcmp(int a, int b)
{
 return a - b;
}
size_t inthash(int t)
{
 size_t a = t;
 a += ~(a << 15);
 a ^= (a >> 10);
 a += (a << 3);
 a ^= (a >> 6);
 a += ~(a << 11);
 a ^= (a >> 16);
 return a;
}
int main(int argc, char const *argv[])
{
 hash_set *set1 = set_new(50, 0.9, intcmp, inthash);
 hash_set *set2 = set_new(50, 0.9, intcmp, inthash);
 for (int i = 1; i <= 20; i++)
 set_insert(set1, i);
 for (int i = 11; i <= 30; i++)
 set_insert(set2, i);
 hash_set *set3 = set_union(set1, set2);
 print_set(set1);
 printf("UNION\n");
 print_set(set2);
 printf("=\n");
 print_set(set3);
 printf("\n\n");
 hash_set *set4 = set_intersection(set1, set2);
 print_set(set1);
 printf("INTERSECTION\n");
 print_set(set2);
 printf("=\n");
 print_set(set4);
 printf("\n\n");
 hash_set *set5 = set_difference(set1, set2);
 print_set(set1);
 printf("DIFFERENCE\n");
 print_set(set2);
 printf("=\n");
 print_set(set5);
 printf("\n\n");
 hash_set *set6 = set_symmetric_difference(set1, set2);
 print_set(set1);
 printf("SYMMETRIC DIFFERENCE\n");
 print_set(set2);
 printf("=\n");
 print_set(set6);
 set_free(set1);
 set_free(set2);
 set_free(set3);
 set_free(set4);
 set_free(set5);
 set_free(set6);
 return 0;
}

Question 2

An array of entries has the advantage of cache coherence, but it's bigger in memory and only supports a fixed size of entry. From Wikipedia, "If the table is expected to have a high load factor, the records are large, or the data is variable-sized, chained hash tables often perform as well or better." So, it depends?

Question 3

Is it possible to have an associative array with this? It seems to me that that's not enough parameters.

Question 4

@NeilEdelman To make a Hashmap is very very very close to the Hashset. In fact I have it implemented here.

Question 5

I still don't see how this is enough parameters. In your example, how do you link intcmp and inthash into your header?

Question 6

Ahh, I see how it's done now. If you coded it in HASHSET_GENERATE instead of the constructor, all the same hashset types would be compatible. And really, how much do you really have two or more equiality relations? I mean, I guess you could, but I would think it's rare.

Question 7

... used Robin Hood hashing. It seems way too simple so I'm not sure if it is implemented correctly!

No comment.

Is it better to have an array of entries (buckets) or an array of pointers to entries (initialized with NULL)?

6.01 or half-dozen of the other. Depends on use case.

I prefer a dynamic array of entries for less (perceived) fragmentation.

Are there any improvements that can be done in this hashtable?

Lack of comments.

I'd expect something to help indicate usage and limitation of the various generated functions in the .h files. Perhaps even a commented terse example?

Put that How to in the .h

Name space impact.

hashset.h creates

CMC_HASHSET_...
CMC_HASH_...
EntryState...
ES_...
cmc_hashtable...
HASHSET_...

I'd expect a more uniform name space usage. Example

cmcht.h
cmcht_...
CMCHT_...

Assumed range

size_t cmc_hashtable_primes[] = {53, 97, 193, 389, ... 402653189, 805306457, 1610612741 assumes size_t is 32 bit. For wider portability, conditionally handle 16 to 64 bit.

Unclear why table lacked entries near 2,000,000,000 and 4,000,000,000.

Why double math?

(size_t)((double)_set_->capacity * 1.5)
vs.
_set_->capacity + _set_->capacity/2

If concerned about overflow, a prior test is useful.

Linear search vs binary

With dozens of values to search, instead of a linear search while (cmc_hashtable_primes[i] < required) i++;, perhaps a binary one?

Allocate to sizeof de-referenced pointer

Original code was hard to review for correctnesses.

_set_->buffer = malloc(sizeof(SNAME##_entry) * real_size); 
// vs
_set_->buffer = malloc(sizeof *_set_->buffer * real_size);

Good use of size_t/bool

Good use of prime numbers in hashing

size_t hash = _set_->hash(element); 
size_t original_pos = hash % _set_->capacity;

I'd re-order includes to insure "hashset.h" stands on its own

//#include <stdio.h>
//#include <stdlib.h>
#include "hashset.h"
#include <stdio.h>
#include <stdlib.h>

It appears the .h code is designed to handle multiple HASHSET_GENERATE(). If so, good to demo it.

A full range int compare for intcmp(int a, int b) without potential UB is return (a > b) - (a < b)

Bonus

OP had "only go to 32 bit integers is because I took them from here." so I extended to 64-bit.

0x 3 3
0x 7 7
0x D 13
0x 1D 29
0x 35 53
0x 61 97
0x C1 193
0x 185 389
0x 301 769
0x 607 1543
0x C07 3079
0x 1807 6151
0x 3001 12289
0x 6011 24593
0x C005 49157
0x 1800D 98317
0x 30005 196613
0x 60019 393241
0x C0001 786433
0x 180005 1572869
0x 30000B 3145739
0x 60000D 6291469
0x C00005 12582917
0x 1800013 25165843
0x 3000005 50331653
0x 6000017 100663319
0x C000013 201326611
0x 18000005 402653189
0x 30000059 805306457
0x 60000005 1610612741
0x C0000001 3221225473
0x 180000017 6442450967
0x 300000005 12884901893
0x 600000017 25769803799
0x C0000002F 51539607599
0x 1800000007 103079215111
0x 3000000001 206158430209
0x 6000000019 412316860441
0x C000000005 824633720837
0x 18000000011 1649267441681
0x 30000000059 3298534883417
0x 60000000001 6597069766657
0x C0000000025 13194139533349
0x 18000000002F 26388279066671
0x 300000000037 52776558133303
0x 60000000000D 105553116266509
0x C00000000037 211106232533047
0x 1800000000011 422212465066001
0x 3000000000059 844424930132057
0x 6000000000011 1688849860263953
0x C000000000019 3377699720527897
0x 18000000000053 6755399441055827
0x 3000000000001F 13510798882111519
0x 6000000000005F 27021597764223071
0x C0000000000005 54043195528445957
0x 180000000000025 108086391056891941
0x 300000000000023 216172782113783843
0x 600000000000005 432345564227567621
0x C00000000000031 864691128455135281
0x1800000000000011 1729382256910270481
0x3000000000000005 3458764513820540933
0x600000000000002F 6917529027641081903
0xC000000000000011 13835058055282163729
0xFFFFFFFFFFFFFFC5 18446744073709551557 extra

Question 8

Your implementation of intcmp performs two comparisons while only 1.5 of them are actually required.

Question 9

@bipll Both (a > b) - (a < b) and the "1.5 compare" approaches are O(1) complexity. Compilers readily emit efficient code from (a > b) - (a < b) so to assess either is a micro-optimization. My goal is first correct functionality over range which a - b does not have.

Question 10

Hi. First, thanks for the review! About the prime numbers and why they only go to 32 bit integers is because I took them from here. I still couldn't find prime numbers that go up to 64 bit values.

Question 11

@LeoVen Finding primes (each number is as far as possible from the nearest two powers of two) up to 2^64 looks like a fun programming task.... ;-)

chux chux 36.4k2 gold badges43 silver badges96 bronze badges · Accepted Answer · 2019-04-13 12:44:53Z

... used Robin Hood hashing. It seems way too simple so I'm not sure if it is implemented correctly!

No comment.

Is it better to have an array of entries (buckets) or an array of pointers to entries (initialized with NULL)?

6.01 or half-dozen of the other. Depends on use case.

I prefer a dynamic array of entries for less (perceived) fragmentation.

Are there any improvements that can be done in this hashtable?

Lack of comments.

I'd expect something to help indicate usage and limitation of the various generated functions in the .h files. Perhaps even a commented terse example?

Put that How to in the .h

Name space impact.

hashset.h creates

CMC_HASHSET_...
CMC_HASH_...
EntryState...
ES_...
cmc_hashtable...
HASHSET_...

I'd expect a more uniform name space usage. Example

cmcht.h
cmcht_...
CMCHT_...

Assumed range

size_t cmc_hashtable_primes[] = {53, 97, 193, 389, ... 402653189, 805306457, 1610612741 assumes size_t is 32 bit. For wider portability, conditionally handle 16 to 64 bit.

Unclear why table lacked entries near 2,000,000,000 and 4,000,000,000.

Why double math?

(size_t)((double)_set_->capacity * 1.5)
vs.
_set_->capacity + _set_->capacity/2

If concerned about overflow, a prior test is useful.

Linear search vs binary

With dozens of values to search, instead of a linear search while (cmc_hashtable_primes[i] < required) i++;, perhaps a binary one?

Allocate to sizeof de-referenced pointer

Original code was hard to review for correctnesses.

_set_->buffer = malloc(sizeof(SNAME##_entry) * real_size); 
// vs
_set_->buffer = malloc(sizeof *_set_->buffer * real_size);

Good use of size_t/bool

Good use of prime numbers in hashing

size_t hash = _set_->hash(element); 
size_t original_pos = hash % _set_->capacity;

I'd re-order includes to insure "hashset.h" stands on its own

//#include <stdio.h>
//#include <stdlib.h>
#include "hashset.h"
#include <stdio.h>
#include <stdlib.h>

It appears the .h code is designed to handle multiple HASHSET_GENERATE(). If so, good to demo it.

A full range int compare for intcmp(int a, int b) without potential UB is return (a > b) - (a < b)

Bonus

OP had "only go to 32 bit integers is because I took them from here." so I extended to 64-bit.

0x 3 3
0x 7 7
0x D 13
0x 1D 29
0x 35 53
0x 61 97
0x C1 193
0x 185 389
0x 301 769
0x 607 1543
0x C07 3079
0x 1807 6151
0x 3001 12289
0x 6011 24593
0x C005 49157
0x 1800D 98317
0x 30005 196613
0x 60019 393241
0x C0001 786433
0x 180005 1572869
0x 30000B 3145739
0x 60000D 6291469
0x C00005 12582917
0x 1800013 25165843
0x 3000005 50331653
0x 6000017 100663319
0x C000013 201326611
0x 18000005 402653189
0x 30000059 805306457
0x 60000005 1610612741
0x C0000001 3221225473
0x 180000017 6442450967
0x 300000005 12884901893
0x 600000017 25769803799
0x C0000002F 51539607599
0x 1800000007 103079215111
0x 3000000001 206158430209
0x 6000000019 412316860441
0x C000000005 824633720837
0x 18000000011 1649267441681
0x 30000000059 3298534883417
0x 60000000001 6597069766657
0x C0000000025 13194139533349
0x 18000000002F 26388279066671
0x 300000000037 52776558133303
0x 60000000000D 105553116266509
0x C00000000037 211106232533047
0x 1800000000011 422212465066001
0x 3000000000059 844424930132057
0x 6000000000011 1688849860263953
0x C000000000019 3377699720527897
0x 18000000000053 6755399441055827
0x 3000000000001F 13510798882111519
0x 6000000000005F 27021597764223071
0x C0000000000005 54043195528445957
0x 180000000000025 108086391056891941
0x 300000000000023 216172782113783843
0x 600000000000005 432345564227567621
0x C00000000000031 864691128455135281
0x1800000000000011 1729382256910270481
0x3000000000000005 3458764513820540933
0x600000000000002F 6917529027641081903
0xC000000000000011 13835058055282163729
0xFFFFFFFFFFFFFFC5 18446744073709551557 extra

Your implementation of intcmp performs two comparisons while only 1.5 of them are actually required.
@bipll Both (a > b) - (a < b) and the "1.5 compare" approaches are O(1) complexity. Compilers readily emit efficient code from (a > b) - (a < b) so to assess either is a micro-optimization. My goal is first correct functionality over range which a - b does not have.
Hi. First, thanks for the review! About the prime numbers and why they only go to 32 bit integers is because I took them from here. I still couldn't find prime numbers that go up to 64 bit values.
@LeoVen Finding primes (each number is as far as possible from the nearest two powers of two) up to 2^64 looks like a fun programming task.... ;-)

Stack Exchange Network

Generic Macro Generated Hashset in C

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Generic Macro Generated Hashset in C

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions