(See the next version.)
The following data structure implements a hash table based set for int
values:
com.github.coderodde.util.IntHashSet
package com.github.coderodde.util;
/**
* This class implements a simple hash set for non-negative {@code int} values.
* It is used in the {@link com.github.coderodde.util.LinkedList} in order to
* keep track of nodes that are being pointed to by fingers.
*
* @author Rodion "rodde" Efremov
* @version 1.6 (Aug 29, 2021)
* @since 1.6 (Aug 29, 2021)
*/
class IntHashSet {
private static final int INITIAL_CAPACITY = 8;
private static final float MAXIMUM_LOAD_FACTOR = 0.75f;
static final class IntHashTableCollisionChainNode {
IntHashTableCollisionChainNode next;
int integer;
IntHashTableCollisionChainNode(
int integer,
IntHashTableCollisionChainNode next) {
this.integer = integer;
this.next = next;
}
@Override
public String toString() {
return "Chain node, integer = " + integer;
}
}
void add(int integer) {
if (contains(integer)) {
return;
}
size++;
if (shouldExpand())
expand();
final int targetCollisionChainIndex = integer & mask;
final IntHashTableCollisionChainNode newNode =
new IntHashTableCollisionChainNode(
integer,
table[targetCollisionChainIndex]);
newNode.next = table[targetCollisionChainIndex];
table[targetCollisionChainIndex] = newNode;
}
boolean contains(int integer) {
final int collisionChainIndex = integer & mask;
IntHashTableCollisionChainNode node = table[collisionChainIndex];
while (node != null) {
if (node.integer == integer) {
return true;
}
node = node.next;
}
return false;
}
void remove(int integer) {
if (!contains(integer)) {
return;
}
size--;
if (shouldContract())
contract();
final int targetCollisionChainIndex = integer & mask;
IntHashTableCollisionChainNode current =
table[targetCollisionChainIndex];
IntHashTableCollisionChainNode previous = null;
while (current != null) {
IntHashTableCollisionChainNode next = current.next;
if (current.integer == integer) {
if (previous == null) {
table[targetCollisionChainIndex] = next;
} else {
previous.next = next;
}
return;
}
previous = current;
current = next;
}
}
public void clear() {
size = 0;
table = new IntHashTableCollisionChainNode[INITIAL_CAPACITY];
mask = table.length - 1;
}
private IntHashTableCollisionChainNode[] table =
new IntHashTableCollisionChainNode[INITIAL_CAPACITY];
private int size = 0;
private int mask = INITIAL_CAPACITY - 1;
// Keep add(int) an amortized O(1)
private boolean shouldExpand() {
return size > table.length * MAXIMUM_LOAD_FACTOR;
}
// Keep remove(int) an amortized O(1)
private boolean shouldContract() {
if (table.length == INITIAL_CAPACITY) {
return false;
}
return MAXIMUM_LOAD_FACTOR * size * 4 < table.length;
}
private void expand() {
IntHashTableCollisionChainNode[] newTable =
new IntHashTableCollisionChainNode[table.length * 2];
rehash(newTable);
table = newTable;
mask = table.length - 1;
}
private void contract() {
IntHashTableCollisionChainNode[] newTable =
new IntHashTableCollisionChainNode[table.length / 4];
rehash(newTable);
table = newTable;
mask = table.length - 1;
}
private void rehash(IntHashTableCollisionChainNode[] newTable) {
for (IntHashTableCollisionChainNode node : table) {
while (node != null) {
final IntHashTableCollisionChainNode next = node.next;
final int rehashedIndex = rehash(node.integer, newTable);
node.next = newTable[rehashedIndex];
newTable[rehashedIndex] = node;
node = next;
}
}
}
private static int rehash(
int integer,
IntHashTableCollisionChainNode[] newTable) {
return integer & (newTable.length - 1);
}
}
com.github.coderodde.util.IntHashSetTest
package com.github.coderodde.util;
import java.util.Random;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import org.junit.Before;
import org.junit.Test;
public class IntHashSetTest {
private final IntHashSet set = new IntHashSet();
@Before
public void beforeTest() {
set.clear();
}
@Test
public void add() {
for (int i = 0; i < 500; i++) {
set.add(i);
}
for (int i = 0; i < 500; i++) {
assertTrue(set.contains(i));
}
for (int i = 500; i < 1_000; i++) {
assertFalse(set.contains(i));
}
for (int i = 450; i < 550; i++) {
set.remove(i);
}
for (int i = 450; i < 1_000; i++) {
assertFalse(set.contains(i));
}
for (int i = 0; i < 450; i++) {
assertTrue(set.contains(i));
}
}
@Test
public void contains() {
set.add(10);
set.add(20);
set.add(30);
for (int i = 1; i < 40; i++) {
if (i % 10 == 0) {
assertTrue(set.contains(i));
} else {
assertFalse(set.contains(i));
}
}
}
@Test
public void remove() {
set.add(1);
set.add(2);
set.add(3);
set.add(4);
set.add(5);
set.remove(2);
set.remove(4);
set.add(2);
assertFalse(set.contains(4));
assertTrue(set.contains(1));
assertTrue(set.contains(2));
assertTrue(set.contains(3));
assertTrue(set.contains(5));
}
@Test
public void clear() {
for (int i = 0; i < 100; i++) {
set.add(i);
}
for (int i = 0; i < 100; i++) {
assertTrue(set.contains(i));
}
set.clear();
for (int i = 0; i < 100; i++) {
assertFalse(set.contains(i));
}
}
@Test
public void bruteForceAdd() {
long seed = System.currentTimeMillis();
System.out.println(
"--- IntHashSetTest.bruteForceAdd: seed = " + seed + " ---");
Random random = new Random(seed);
int[] data = new int[10_000];
for (int i = 0; i < data.length; i++) {
int datum = random.nextInt(5_000);
data[i] = datum;
set.add(datum);
}
for (int i = 0; i < data.length; i++) {
assertTrue(set.contains(data[i]));
}
}
@Test
public void bruteForceRemove() {
long seed = System.currentTimeMillis();
System.out.println(
"--- IntHashSetTest.bruteForceRemove: seed = " + seed + " ---");
Random random = new Random(seed);
int[] data = new int[10_000];
for (int i = 0; i < data.length; i++) {
int datum = random.nextInt(5_000);
data[i] = datum;
set.add(datum);
}
shuffle(data, random);
for (int i = 0; i < data.length; i++) {
int datum = data[i];
if (set.contains(datum)) {
set.remove(datum);
}
assertFalse(set.contains(datum));
}
}
private static void shuffle(int[] data, Random random) {
for (int i = data.length - 1; i > 0; --i) {
final int j = random.nextInt(i + 1);
swap(data, i, j);
}
}
private static void swap(int[] data, int i, int j) {
int tmp = data[i];
data[i] = data[j];
data[j] = tmp;
}
}
Critique request
Please, tell me anything that comes to mind! ^^
-
2\$\begingroup\$ Hiding fields between public methods and private methods is really weird to me. Does this follow some specific coding standard? I do get the point of public methods being the interface and thus the most important part, but the interface is documented in the JavaDocs and the source code should try to be as clear as possible instead of trying to disguise the fields (I did spend a while wondering where the mask variable came from as I read the code). \$\endgroup\$TorbenPutkonen– TorbenPutkonen2021年08月31日 10:31:02 +00:00Commented Aug 31, 2021 at 10:31
4 Answers 4
Bug: array shrinks too much
The logic in shouldContract
looks like it is intended to prevent the size from shrinking below INITIAL_CAPACITY
, but it doesn't do that. For example, suppose the current table size is 16, then table.length == INITIAL_CAPACITY
is false and MAXIMUM_LOAD_FACTOR * size * 4 < table.length
can be true, so a resize is performed and the new size is 4. Then table.length == INITIAL_CAPACITY
is still false, and MAXIMUM_LOAD_FACTOR * size * 4 < table.length
can be true again if more items are removed, and the next size is 1. You can see where this is headed: the next size after this is going to be zero. That then causes the rehash to fail.
Code to reproduce:
for (int i = 0; i < 9; i++)
set.add(i);
for (int i = 0; i < 9; i++)
set.remove(i);
Access modifiers are inconsistent
clear
is public
but the rest of the "public interface" has the default access level, which might be OK but then why is clear
public. I recommend picking one of them.
Non-negative?
The description says that it's a set for non-negative integers, but it seems to do fine with any integer.
Weak hash (identity hash)
Identity-hashing (ie using the value as the hash, only applying range reduction) for integers mostly works, but can easily cause values to some distribute unevenly across buckets. For example, if all values are even (or all odd) then only half the buckets are used. Or if all values are a multiple of 256, then only 1 in 256 buckets can be used. In many cases that will simply not happen because input is well-distributed, but a set of integers that are a multiple of a power of two is not a rarity. The danger could be mitigated (not removed entirely, but the sets of values that trigger bad behaviour with hashing are quite unusual sets) by applying some "diffusive" bijective transformation to the value before range-reducing it, for example (borrowing the "hash finalizer" from MurmurHash3):
static int hash(int x) {
x ^= x >>> 16;
x *= 0x85ebca6b;
x ^= x >>> 13;
x *= 0xc2b2ae35;
x ^= x >>> 16;
return x;
}
// elsewhere in the code ...
int targetCollisionChainIndex = hash(integer) & mask;
There is a trade-off there of course, that hash isn't free to evaluate.
Just an unordered list of things that came to my mind:
You have two
rehash()
methods doing very different things, anint
-returning one computing the index for a content integer, and the othervoid
one doing the rehashing process.You have (at least) two places where you compute the array index for a content integer, one being inside the
add()
method, the other being theint rehash()
method.You re-invented the wheel. Functionally, a
HashSet<Integer>
does the job and is well-tested and optimized. Probably, you did your own implementation for efficiency reasons, but I doubt your class outperformsHashSet<Integer>
regarding execution time or memory footprint. But if you have performance data, I'd be curious to see them.Your "hashing" by using a bit mask can lead to unnecessary collisions in some (quite plausible) use cases.
You use a singly-linked list in the buckets. In your test cases for the
remove()
method, I didn't find explicit ones for the edge cases: removing the first entry and removing the last entry from the list. The random tests might cover that as well with some probability, but "some probability" is a bad idea for testing.
-
1\$\begingroup\$ The JDK builtin
HashSet<Integer>
is actually a rather poorly optimized implementation. Not only does it waste time and memory due to primitive (un)boxing, but it's actually aHashMap
in disguise and thus also wastes space to store pointers to the unused map values and time to update them. It's also quite prone to collisions, and while it does deal with them decently well (essentially falling back to aTreeMap
), that in itself adds quite a bit of complexity and performance overhead. Something like HPPCIntHashSet
would be a much better comparison. \$\endgroup\$Ilmari Karonen– Ilmari Karonen2021年08月31日 16:59:30 +00:00Commented Aug 31, 2021 at 16:59
IntHashTableCollisionChainNode
is already nested withinIntHashSet
, making theIntHashTable
part of its name redundant. Plus the fact that it's part of a "chain" is kind of implied by how it's used, it's not really a property of the class itself.Consider perhaps just
IntHashSet.Node
.IntHashTableCollisionChainNode
can beprivate
You didn't mention which Java version you're using, but if it's higher than 10, you should use
var
for type inference on local variables. It'll reduce the incidence rate of things like:final IntHashTableCollisionChainNode newNode = new IntHashTableCollisionChainNode( integer, table[targetCollisionChainIndex]);
Which can just become:
final var newNode = new Node(integer, table[targetCollisionChainIndex]);
Marking local variables in Java is pretty tedious. IMO, it's a lot of noise without much return. Take a look at the keywords used to declare constants vs variables in various langauges:
Language | Constant keyword | Mutable keyword ------------|------------------+----------------- Java | final | <blank> JavaScript | const | let Swift | let | var Rust | let | let mut
Notice the big difference: the other languages make it practically just as easy to declare a constant as it is to declare a variable. In Java, it's additive, and really tedious. Given that Java's
final
also has no bearing on the mutability on the referenced object (it only cares about the immutability of the reference itself), I think it's quite low-value, and not worth the spam to use it on all local variables.
-
\$\begingroup\$ Nice one, but I have to disagree on
final
: if nothing else, it documents that a variable/reference is not changed in the current method. \$\endgroup\$coderodde– coderodde2021年09月01日 11:39:34 +00:00Commented Sep 1, 2021 at 11:39 -
\$\begingroup\$ @coderodde I’m aware, but I just don’t think it’s that useful compared to its cost. Particularly when it doesn’t stop you from mutating the referenced object. \$\endgroup\$Alexander– Alexander2021年09月01日 12:15:56 +00:00Commented Sep 1, 2021 at 12:15
Performance
If one considers speed (which seems to be a goal since no safety net is put in place such as HashSet
has), I'd say that you're in good spot, but some improvements can be done.
- Checking
contains
inside ofadd
andremoves
makes the code loop twice through the nodes when an element is absent. Consider checking later in youradd
andremove
method. - Consider caching heavily used numbers, such as
table.length * MAXIMUM_LOAD_FACTOR
(=nextExpandSize
?).
Readability
I have a hard time reading the code. I've read several performance-related code, and they're much more legible.
- Consider renaming
IntHashTableCollisionChainNode
to justNode
. It expresses exactly the same, but is much shorter. - We're now close to Java 17. Just use
var
instead of the full type name. - Remove the
final
for local variables. There's absolutely zero added value to it in your code, especially since Java now has the concept of effectively final variables. - Consider merging
expand
andcontract
intoresize
with a parameter: the code is absolutely identical except for the new size. - OR consider merging
shouldExpand
andexpand
together (expandIfNeeded
), as well asshouldContract
andcontract
(contractIfNeeded
). - There are magic numbers such as
4
and2
. Express them as constants such asEXPAND_FACTOR
andCONTRACT_FACTOR
.
Testing
- The
bruteForceAdd
andbruteForceRemove
tests are not reproducible because the seed is the current timestamp. If your implementation is buggy, you might have the same test that sometimes succeeds and sometimes fails. Tests are about reproducibility. So it's better to have a seed that is constant. - The
bruteForceRemove
is even more problematic because there is a conditional remove. I would write it like this:
for (int i = 0; i < 10000; i++) {
data[i] = i;
set.add(i);
}
shuffle(data);
for (var i : data) {
assumeTrue(data.contains(i)); // not assert.
data.remove(i);
assertFalse(data.contains(i));
}
Explore related questions
See similar questions with these tags.