Can you review the following Python code?
import random
class Point3D:
SEED = 1
def __init__(self, x, y, z):
self.X = x
self.Y = y
self.Z = z
def __hash__(self):
hash_value = 0
# Number of hash functions to use
num_hashes = 5
# Random vectors for projection
vectors = [[0] * 3 for _ in range(num_hashes)]
random.seed(Point3D.SEED)
for i in range(num_hashes):
for j in range(3):
vectors[i][j] = random.uniform(-1, 1) # Random value between -1 and 1
# Compute hash value for each random vector
for i in range(num_hashes):
dot_product = self.X * vectors[i][0] + self.Y * vectors[i][1] + self.Z * vectors[i][2]
hash_value <<= 1 # Left shift hash value by 1 bit
if dot_product >= 0:
hash_value |= 1 # Set the last bit to 1
return hash_value
def __str__(self):
return f"({self.X},{self.Y},{self.Z})"
if __name__ == '__main__':
hash_set = set()
for i in range(3):
for j in range(3):
for k in range(3):
point = Point3D(i, j, k)
hash_value = hash(point)
# print(f"{count}--({i},{j},{k}) = {hash_value}")
hash_set.add(hash_value)
sorted_list = sorted(list(hash_set))
print(f"Cont : {len(sorted_list)}")
for item in sorted_list:
print(f"{item:.2f}")
```
1 Answer 1
If this code makes it into production,
random.seed(Point3D.SEED)
is eventually going to make someone very confused and then angry. Every hash call of every instance of your class modifies the global random state. The quick and bad fix is to reinstantiate and seed a new Random
on every call to __hash__
. The quick and less-bad fix is to set up your vectors once and store them as a static. This will also improve performance.
Without pulling in any other libraries or being tricky with C FFI, there are still some modifications you can make that should help with speed:
- Add
__slots__
- Cache your hash value (this assumes that the class does not mutate, which may or may not hold - consider using
NamedTuple
to guarantee this) - Loop like a native: unpack each vector row to its components with no indexing
Otherwise, some Python basics:
- Add typehints
- Use a set comprehension for your hash set test
- Don't cast
hash_set
to a list;sorted
works fine on any iterable - Convert your
Cont:
from a print to anassert
, and add more of these (I have only shown one) item
is anint
, so justprint
it; don't:.2f
format it- Move the code from the
__main__
guard - which is still global - into a function
Suggested
from random import Random
from typing import Optional
def _make_vectors(seed: int = 1, num_hashes: int = 5) -> tuple[tuple[int, int, int], ...]:
rand = Random(seed)
return tuple(
tuple(
rand.uniform(-1, 1)
for _ in range(3)
)
for _ in range(num_hashes)
)
class Point3D:
__slots__ = ('x', 'y', 'z', '_hash')
VECTORS = _make_vectors()
def __init__(self, x: int, y: int, z: int) -> None:
self.x = x
self.y = y
self.z = z
self._hash: Optional[int] = None
def __hash__(self) -> int:
if self._hash is None:
hash_value = 0
for vx, vy, vz in self.VECTORS:
dot_product = self.x*vx + self.y*vy + self.z*vz
hash_value <<= 1
if dot_product >= 0:
hash_value |= 1
self._hash = hash_value
return self._hash
def __str__(self) -> str:
return f"({self.x},{self.y},{self.z})"
def test() -> None:
hash_set = {
hash(Point3D(i, j, k))
for i in range(3)
for j in range(3)
for k in range(3)
}
sorted_list = sorted(hash_set)
assert len(sorted_list) == 9
for item in sorted_list:
print(item)
if __name__ == '__main__':
test()
The above partially hides a problem, though: your hash function has a lot of collisions, and you can't see those collisions in any detail. Instead,
hash_set = Counter(
hash(Point3D(i, j, k))
for i in range(3)
for j in range(3)
for k in range(3)
)
print(hash_set)
Counter({22: 5, 16: 4, 18: 4, 5: 3, 1: 3, 4: 3, 20: 3, 31: 1, 17: 1})
Even at 100,000 hash iterations instead of five, there are still seven collisions.