A simple Java integer hash set

Question 1

The following data structure implements a hash table based set for int values:

com.github.coderodde.util.IntHashSet

package com.github.coderodde.util;
/**
 * This class implements a simple hash set for non-negative {@code int} values.
 * It is used in the {@link com.github.coderodde.util.LinkedList} in order to 
 * keep track of nodes that are being pointed to by fingers.
 * 
 * @author Rodion "rodde" Efremov
 * @version 1.6 (Aug 29, 2021)
 * @since 1.6 (Aug 29, 2021)
 */
class IntHashSet {
 private static final int INITIAL_CAPACITY = 8;
 private static final float MAXIMUM_LOAD_FACTOR = 0.75f;
 static final class IntHashTableCollisionChainNode {
 IntHashTableCollisionChainNode next;
 int integer;
 IntHashTableCollisionChainNode(
 int integer, 
 IntHashTableCollisionChainNode next) {
 this.integer = integer;
 this.next = next;
 }
 @Override
 public String toString() {
 return "Chain node, integer = " + integer;
 }
 }
 void add(int integer) {
 if (contains(integer)) {
 return;
 }
 size++;
 if (shouldExpand())
 expand();
 final int targetCollisionChainIndex = integer & mask;
 final IntHashTableCollisionChainNode newNode = 
 new IntHashTableCollisionChainNode(
 integer, 
 table[targetCollisionChainIndex]);
 newNode.next = table[targetCollisionChainIndex];
 table[targetCollisionChainIndex] = newNode;
 }
 boolean contains(int integer) {
 final int collisionChainIndex = integer & mask;
 IntHashTableCollisionChainNode node = table[collisionChainIndex];
 while (node != null) {
 if (node.integer == integer) {
 return true;
 }
 node = node.next;
 }
 return false;
 }
 void remove(int integer) {
 if (!contains(integer)) {
 return;
 }
 size--;
 if (shouldContract()) 
 contract();
 final int targetCollisionChainIndex = integer & mask;
 IntHashTableCollisionChainNode current = 
 table[targetCollisionChainIndex];
 IntHashTableCollisionChainNode previous = null;
 while (current != null) {
 IntHashTableCollisionChainNode next = current.next;
 if (current.integer == integer) {
 if (previous == null) {
 table[targetCollisionChainIndex] = next;
 } else {
 previous.next = next;
 }
 return;
 }
 previous = current;
 current = next;
 }
 }
 public void clear() {
 size = 0;
 table = new IntHashTableCollisionChainNode[INITIAL_CAPACITY];
 mask = table.length - 1;
 }
 private IntHashTableCollisionChainNode[] table = 
 new IntHashTableCollisionChainNode[INITIAL_CAPACITY];
 private int size = 0;
 private int mask = INITIAL_CAPACITY - 1;
 // Keep add(int) an amortized O(1)
 private boolean shouldExpand() {
 return size > table.length * MAXIMUM_LOAD_FACTOR;
 }
 // Keep remove(int) an amortized O(1)
 private boolean shouldContract() {
 if (table.length == INITIAL_CAPACITY) {
 return false;
 }
 return MAXIMUM_LOAD_FACTOR * size * 4 < table.length;
 }
 private void expand() {
 IntHashTableCollisionChainNode[] newTable = 
 new IntHashTableCollisionChainNode[table.length * 2];
 rehash(newTable);
 table = newTable;
 mask = table.length - 1;
 }
 private void contract() {
 IntHashTableCollisionChainNode[] newTable = 
 new IntHashTableCollisionChainNode[table.length / 4];
 rehash(newTable);
 table = newTable;
 mask = table.length - 1;
 }
 private void rehash(IntHashTableCollisionChainNode[] newTable) {
 for (IntHashTableCollisionChainNode node : table) {
 while (node != null) {
 final IntHashTableCollisionChainNode next = node.next;
 final int rehashedIndex = rehash(node.integer, newTable);
 node.next = newTable[rehashedIndex];
 newTable[rehashedIndex] = node;
 node = next;
 }
 }
 }
 private static int rehash(
 int integer, 
 IntHashTableCollisionChainNode[] newTable) {
 return integer & (newTable.length - 1);
 }
}

com.github.coderodde.util.IntHashSetTest

package com.github.coderodde.util;
import java.util.Random;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import org.junit.Before;
import org.junit.Test;
public class IntHashSetTest {
 private final IntHashSet set = new IntHashSet();
 @Before
 public void beforeTest() {
 set.clear();
 }
 @Test
 public void add() {
 for (int i = 0; i < 500; i++) {
 set.add(i);
 }
 for (int i = 0; i < 500; i++) {
 assertTrue(set.contains(i));
 }
 for (int i = 500; i < 1_000; i++) {
 assertFalse(set.contains(i));
 }
 for (int i = 450; i < 550; i++) {
 set.remove(i);
 }
 for (int i = 450; i < 1_000; i++) {
 assertFalse(set.contains(i));
 }
 for (int i = 0; i < 450; i++) {
 assertTrue(set.contains(i));
 }
 }
 @Test
 public void contains() {
 set.add(10);
 set.add(20);
 set.add(30);
 for (int i = 1; i < 40; i++) {
 if (i % 10 == 0) {
 assertTrue(set.contains(i));
 } else {
 assertFalse(set.contains(i));
 }
 }
 }
 @Test
 public void remove() {
 set.add(1);
 set.add(2);
 set.add(3);
 set.add(4);
 set.add(5);
 set.remove(2);
 set.remove(4);
 set.add(2);
 assertFalse(set.contains(4));
 assertTrue(set.contains(1));
 assertTrue(set.contains(2));
 assertTrue(set.contains(3));
 assertTrue(set.contains(5));
 }
 @Test
 public void clear() {
 for (int i = 0; i < 100; i++) {
 set.add(i);
 }
 for (int i = 0; i < 100; i++) {
 assertTrue(set.contains(i));
 }
 set.clear();
 for (int i = 0; i < 100; i++) {
 assertFalse(set.contains(i));
 }
 }
 @Test 
 public void bruteForceAdd() {
 long seed = System.currentTimeMillis();
 System.out.println(
 "--- IntHashSetTest.bruteForceAdd: seed = " + seed + " ---");
 Random random = new Random(seed);
 int[] data = new int[10_000];
 for (int i = 0; i < data.length; i++) {
 int datum = random.nextInt(5_000);
 data[i] = datum;
 set.add(datum);
 }
 for (int i = 0; i < data.length; i++) {
 assertTrue(set.contains(data[i]));
 }
 }
 @Test
 public void bruteForceRemove() {
 long seed = System.currentTimeMillis();
 System.out.println(
 "--- IntHashSetTest.bruteForceRemove: seed = " + seed + " ---");
 Random random = new Random(seed);
 int[] data = new int[10_000];
 for (int i = 0; i < data.length; i++) {
 int datum = random.nextInt(5_000);
 data[i] = datum;
 set.add(datum);
 }
 shuffle(data, random);
 for (int i = 0; i < data.length; i++) {
 int datum = data[i];
 if (set.contains(datum)) {
 set.remove(datum);
 } 
 assertFalse(set.contains(datum));
 }
 }
 private static void shuffle(int[] data, Random random) {
 for (int i = data.length - 1; i > 0; --i) {
 final int j = random.nextInt(i + 1);
 swap(data, i, j);
 }
 }
 private static void swap(int[] data, int i, int j) {
 int tmp = data[i];
 data[i] = data[j];
 data[j] = tmp;
 }
}

Critique request

Please, tell me anything that comes to mind! ^^

Question 2

Hiding fields between public methods and private methods is really weird to me. Does this follow some specific coding standard? I do get the point of public methods being the interface and thus the most important part, but the interface is documented in the JavaDocs and the source code should try to be as clear as possible instead of trying to disguise the fields (I did spend a while wondering where the mask variable came from as I read the code).

Question 3

Bug: array shrinks too much

The logic in shouldContract looks like it is intended to prevent the size from shrinking below INITIAL_CAPACITY, but it doesn't do that. For example, suppose the current table size is 16, then table.length == INITIAL_CAPACITY is false and MAXIMUM_LOAD_FACTOR * size * 4 < table.length can be true, so a resize is performed and the new size is 4. Then table.length == INITIAL_CAPACITY is still false, and MAXIMUM_LOAD_FACTOR * size * 4 < table.length can be true again if more items are removed, and the next size is 1. You can see where this is headed: the next size after this is going to be zero. That then causes the rehash to fail.

Code to reproduce:

for (int i = 0; i < 9; i++)
 set.add(i);
for (int i = 0; i < 9; i++)
 set.remove(i);

Access modifiers are inconsistent

clear is public but the rest of the "public interface" has the default access level, which might be OK but then why is clear public. I recommend picking one of them.

Non-negative?

The description says that it's a set for non-negative integers, but it seems to do fine with any integer.

Weak hash (identity hash)

Identity-hashing (ie using the value as the hash, only applying range reduction) for integers mostly works, but can easily cause values to some distribute unevenly across buckets. For example, if all values are even (or all odd) then only half the buckets are used. Or if all values are a multiple of 256, then only 1 in 256 buckets can be used. In many cases that will simply not happen because input is well-distributed, but a set of integers that are a multiple of a power of two is not a rarity. The danger could be mitigated (not removed entirely, but the sets of values that trigger bad behaviour with hashing are quite unusual sets) by applying some "diffusive" bijective transformation to the value before range-reducing it, for example (borrowing the "hash finalizer" from MurmurHash3):

static int hash(int x) {
 x ^= x >>> 16;
 x *= 0x85ebca6b;
 x ^= x >>> 13;
 x *= 0xc2b2ae35;
 x ^= x >>> 16;
 return x;
}
// elsewhere in the code ...
int targetCollisionChainIndex = hash(integer) & mask;

There is a trade-off there of course, that hash isn't free to evaluate.

Question 4

Just an unordered list of things that came to my mind:

You have two rehash() methods doing very different things, an int-returning one computing the index for a content integer, and the other void one doing the rehashing process.
You have (at least) two places where you compute the array index for a content integer, one being inside the add() method, the other being the int rehash() method.
You re-invented the wheel. Functionally, a HashSet<Integer> does the job and is well-tested and optimized. Probably, you did your own implementation for efficiency reasons, but I doubt your class outperforms HashSet<Integer> regarding execution time or memory footprint. But if you have performance data, I'd be curious to see them.
Your "hashing" by using a bit mask can lead to unnecessary collisions in some (quite plausible) use cases.
You use a singly-linked list in the buckets. In your test cases for the remove() method, I didn't find explicit ones for the edge cases: removing the first entry and removing the last entry from the list. The random tests might cover that as well with some probability, but "some probability" is a bad idea for testing.

Question 5

The JDK builtin HashSet<Integer> is actually a rather poorly optimized implementation. Not only does it waste time and memory due to primitive (un)boxing, but it's actually a HashMap in disguise and thus also wastes space to store pointers to the unused map values and time to update them. It's also quite prone to collisions, and while it does deal with them decently well (essentially falling back to a TreeMap), that in itself adds quite a bit of complexity and performance overhead. Something like HPPC IntHashSet would be a much better comparison.

Question 6

IntHashTableCollisionChainNode is already nested within IntHashSet, making the IntHashTable part of its name redundant. Plus the fact that it's part of a "chain" is kind of implied by how it's used, it's not really a property of the class itself.

Consider perhaps just IntHashSet.Node.
IntHashTableCollisionChainNode can be private

You didn't mention which Java version you're using, but if it's higher than 10, you should use var for type inference on local variables. It'll reduce the incidence rate of things like:

final IntHashTableCollisionChainNode newNode = 
 new IntHashTableCollisionChainNode(
 integer, 
 table[targetCollisionChainIndex]);

Which can just become:

final var newNode = new Node(integer, table[targetCollisionChainIndex]);

Marking local variables in Java is pretty tedious. IMO, it's a lot of noise without much return. Take a look at the keywords used to declare constants vs variables in various langauges:
```
 Language | Constant keyword | Mutable keyword
------------|------------------+-----------------
 Java | final | <blank>
 JavaScript | const | let
 Swift | let | var
 Rust | let | let mut
```
Notice the big difference: the other languages make it practically just as easy to declare a constant as it is to declare a variable. In Java, it's additive, and really tedious. Given that Java's final also has no bearing on the mutability on the referenced object (it only cares about the immutability of the reference itself), I think it's quite low-value, and not worth the spam to use it on all local variables.

Question 7

Nice one, but I have to disagree on final: if nothing else, it documents that a variable/reference is not changed in the current method.

Question 8

@coderodde I’m aware, but I just don’t think it’s that useful compared to its cost. Particularly when it doesn’t stop you from mutating the referenced object.

Question 9

Performance

If one considers speed (which seems to be a goal since no safety net is put in place such as HashSet has), I'd say that you're in good spot, but some improvements can be done.

Checking contains inside of add and removes makes the code loop twice through the nodes when an element is absent. Consider checking later in your add and remove method.
Consider caching heavily used numbers, such as table.length * MAXIMUM_LOAD_FACTOR (=nextExpandSize?).

Readability

I have a hard time reading the code. I've read several performance-related code, and they're much more legible.

Consider renaming IntHashTableCollisionChainNode to just Node. It expresses exactly the same, but is much shorter.
We're now close to Java 17. Just use var instead of the full type name.
Remove the final for local variables. There's absolutely zero added value to it in your code, especially since Java now has the concept of effectively final variables.
Consider merging expand and contract into resize with a parameter: the code is absolutely identical except for the new size.
OR consider merging shouldExpand and expand together (expandIfNeeded), as well as shouldContract and contract (contractIfNeeded).
There are magic numbers such as 4 and 2. Express them as constants such as EXPAND_FACTOR and CONTRACT_FACTOR.

Testing

The bruteForceAdd and bruteForceRemove tests are not reproducible because the seed is the current timestamp. If your implementation is buggy, you might have the same test that sometimes succeeds and sometimes fails. Tests are about reproducibility. So it's better to have a seed that is constant.
The bruteForceRemove is even more problematic because there is a conditional remove. I would write it like this:

for (int i = 0; i < 10000; i++) {
 data[i] = i;
 set.add(i);
}
shuffle(data);
for (var i : data) {
 assumeTrue(data.contains(i)); // not assert.
 data.remove(i);
 assertFalse(data.contains(i));
}

user555045 user555045 12.2k1 gold badge19 silver badges37 bronze badges · Accepted Answer · 2021-08-31 11:12:23Z

Bug: array shrinks too much

The logic in shouldContract looks like it is intended to prevent the size from shrinking below INITIAL_CAPACITY, but it doesn't do that. For example, suppose the current table size is 16, then table.length == INITIAL_CAPACITY is false and MAXIMUM_LOAD_FACTOR * size * 4 < table.length can be true, so a resize is performed and the new size is 4. Then table.length == INITIAL_CAPACITY is still false, and MAXIMUM_LOAD_FACTOR * size * 4 < table.length can be true again if more items are removed, and the next size is 1. You can see where this is headed: the next size after this is going to be zero. That then causes the rehash to fail.

Code to reproduce:

for (int i = 0; i < 9; i++)
 set.add(i);
for (int i = 0; i < 9; i++)
 set.remove(i);

Access modifiers are inconsistent

clear is public but the rest of the "public interface" has the default access level, which might be OK but then why is clear public. I recommend picking one of them.

Non-negative?

The description says that it's a set for non-negative integers, but it seems to do fine with any integer.

Weak hash (identity hash)

Identity-hashing (ie using the value as the hash, only applying range reduction) for integers mostly works, but can easily cause values to some distribute unevenly across buckets. For example, if all values are even (or all odd) then only half the buckets are used. Or if all values are a multiple of 256, then only 1 in 256 buckets can be used. In many cases that will simply not happen because input is well-distributed, but a set of integers that are a multiple of a power of two is not a rarity. The danger could be mitigated (not removed entirely, but the sets of values that trigger bad behaviour with hashing are quite unusual sets) by applying some "diffusive" bijective transformation to the value before range-reducing it, for example (borrowing the "hash finalizer" from MurmurHash3):

static int hash(int x) {
 x ^= x >>> 16;
 x *= 0x85ebca6b;
 x ^= x >>> 13;
 x *= 0xc2b2ae35;
 x ^= x >>> 16;
 return x;
}
// elsewhere in the code ...
int targetCollisionChainIndex = hash(integer) & mask;

There is a trade-off there of course, that hash isn't free to evaluate.

Stack Exchange Network

A simple Java integer hash set

Critique request

4 Answers 4

Bug: array shrinks too much

Access modifiers are inconsistent

Non-negative?

Weak hash (identity hash)

Performance

Readability

Testing

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

A simple Java integer hash set

Critique request

4 Answers 4

Bug: array shrinks too much

Access modifiers are inconsistent

Non-negative?

Weak hash (identity hash)

Performance

Readability

Testing

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions