3
\$\begingroup\$

I'm attempting to implement a basic hash table in Python using linked lists. Any tips would be appreciated (including better hash functions). I intend this to handle collisions with separate chaining.

I include my gist for my linked list:

Question: Are there any things I handled incorrectly or could have implemented in a better way?

I also checked my function against the unit-test and make sure it all 11 tests and passes them.

#!python
from linkedlist import LinkedList
class HashTable(object):
 def __init__(self, init_size=8):
 """Initialize this hash table with the given initial size."""
 self.buckets = [LinkedList() for i in range(init_size)]
 self.size = 0 # Number of key-value entries
 def __str__(self):
 """Return a formatted string representation of this hash table."""
 items = ['{!r}: {!r}'.format(key, val) for key, val in self.items()]
 return '{' + ', '.join(items) + '}'
 def __repr__(self):
 """Return a string representation of this hash table."""
 return 'HashTable({!r})'.format(self.items())
 def _bucket_index(self, key):
 """Return the bucket index where the given key would be stored."""
 return hash(key) % len(self.buckets)
 def load_factor(self):
 """Return the load factor, the ratio of number of entries to buckets.
 Best and worst case running time: Theta(n) it always depends on the number
 of items that need to be counted, one by one"""
 return float(self.size) / (len(self.buckets))
 def keys(self):
 """Return a list of all keys in this hash table.
 Best and worst case running time: O(n^2)"""
 # Collect all keys in each of the buckets
 all_keys = []
 for bucket in self.buckets:
 for key, value in bucket.items():
 all_keys.append(key)
 return all_keys
 def values(self):
 """Return a list of all values in this hash table.
 Best and worst case running time: O(n^2)"""
 # Collect all values in each of the buckets
 all_values = []
 for bucket in self.buckets:
 for key, value in bucket.items():
 all_values.append(value)
 return all_values
 def items(self):
 """Return a list of all entries (key-value pairs) in this hash table.
 Best and worst case running time: O(n^2)"""
 # Collect all pairs of key-value entries in each of the buckets
 all_items = []
 for bucket in self.buckets:
 all_items.extend(bucket.items())
 return all_items
 def length(self):
 """Return the number of key-value entries by traversing its buckets.
 Best and worst case running time: O(1)"""
 return self.size
 def contains(self, key):
 """Return True if this hash table contains the given key, or False.
 Best case running time: O(1) near beginning of list of keys
 Worst case running time: O(n) near end of list of keys """
 # Find the bucket the given key belongs in
 index = self._bucket_index(key)
 bucket = self.buckets[index]
 entry = bucket.find(lambda key_value: key_value[0] == key)
 return entry is not None # True or False
 def get(self, key):
 """Return the value associated with the given key, or raise KeyError.
 Best case running time: O(1) if entry is near beginning of list in first bucket
 Worst case running time: O(n) in most cases it will be theta(n) and depend
 on where it is located in the list of items"""
 # Find the bucket the given key belongs in
 index = self._bucket_index(key)
 bucket = self.buckets[index]
 # # Find the entry with the given key in that bucket, if one exists
 entry = bucket.find(lambda key_value: key_value[0] == key)
 # Return the given key's associated value
 if entry is not None:
 assert isinstance(entry, tuple)
 assert len(entry) == 2
 return entry[1]
 else: 
 raise KeyError('Key not found: {}'.format(key))
 def set(self, key, value):
 """Insert or update the given key with its associated value.
 Best case running time: O(1) if it's near beginning of LL
 Worst case running time: O(n) in most cases, depends on position in LL"""
 # Find the bucket the given key belongs in
 index = self._bucket_index(key)
 bucket = self.buckets[index]
 # Find the entry with the given key in that bucket, if one exists
 # Check if an entry with the given key exists in that bucket
 entry = bucket.find(lambda key_value: key_value[0] == key) 
 if entry is not None: # Found
 # In this case, the given key's value is being updated
 # Remove the old key-value entry from the bucket first
 bucket.delete(entry) 
 else:
 # If not found, increase size by one to account for new item
 self.size += 1
 # Insert the new key-value entry into the bucket in either case
 bucket.append((key, value))
 # Check if load_factor exceeds threshold, if it does, resize hashtable
 if self.load_factor() > 0.75:
 self._resize()
 def delete(self, key):
 """Delete the given key and its associated value, or raise KeyError.
 Best case running time: O(1) if near beginning of LL
 Worst case running time: O(n) in most cases, depends on position in LL"""
 # Find the bucket the given key belongs in
 index = self._bucket_index(key)
 bucket = self.buckets[index]
 # Find the entry with the given key in that bucket, if one exists
 entry = bucket.find(lambda key_value: key_value[0] == key)
 if entry is not None: # Found
 # Remove the key-value entry from the bucket
 bucket.delete(entry)
 self.size -= 1
 else: # Not found
 raise KeyError('Key not found: {}'.format(key))
 def _resize(self, new_bucket_count=None):
 """Resize this hash table's buckets and rehash all key-value entries.
 Should be called automatically when load factor exceeds a threshold
 such as 0.75 after an insertion (when set is called with a new key).
 Best and worst case running time: ??? under what conditions? [TODO]
 Best and worst case space usage: ??? what uses this memory? [TODO]"""
 # If unspecified, choose new size dynamically based on current size
 if new_bucket_count is None:
 new_bucket_count = len(self.buckets) * 2 # Double the bucket
 # Option to reduce size if buckets are sparsely filled (low load factor)
 elif new_bucket_count is 0:
 new_bucket_count = len(self.buckets) // 2 # Half the bucket
 # Get a list to temporarily hold all current key-value entries
 old_items = self.items()
 # Create a list of new buckets to be used for rehashing
 self.buckets = [LinkedList() for _ in range(new_bucket_count)]
 # Reset size
 self.size = 0
 # Insert each key-value entry into the new list of buckets,
 for key, value in old_items:
 self.set(key, value)
def test_hash_table():
 ht = HashTable(4)
 print('HashTable: ' + str(ht))
 print('Setting entries:')
 ht.set('I', 1)
 print('set(I, 1): ' + str(ht))
 ht.set('V', 5)
 print('set(V, 5): ' + str(ht))
 print('size: ' + str(ht.size))
 print('length: ' + str(ht.length()))
 print('buckets: ' + str(len(ht.buckets)))
 print('load_factor: ' + str(ht.load_factor()))
 ht.set('X', 10)
 print('set(X, 10): ' + str(ht))
 ht.set('L', 50) # Should trigger resize
 print('set(L, 50): ' + str(ht))
 print('size: ' + str(ht.size))
 print('length: ' + str(ht.length()))
 print('buckets: ' + str(len(ht.buckets)))
 print('load_factor: ' + str(ht.load_factor()))
 print('Getting entries:')
 print('get(I): ' + str(ht.get('I')))
 print('get(V): ' + str(ht.get('V')))
 print('get(X): ' + str(ht.get('X')))
 print('get(L): ' + str(ht.get('L')))
 print('contains(X): ' + str(ht.contains('X')))
 print('contains(Z): ' + str(ht.contains('Z')))
 print('Deleting entries:')
 ht.delete('I')
 print('delete(I): ' + str(ht))
 ht.delete('V')
 print('delete(V): ' + str(ht))
 ht.delete('X')
 print('delete(X): ' + str(ht))
 ht.delete('L')
 print('delete(L): ' + str(ht))
 print('contains(X): ' + str(ht.contains('X')))
 print('size: ' + str(ht.size))
 print('length: ' + str(ht.length()))
 print('buckets: ' + str(len(ht.buckets)))
 print('load_factor: ' + str(ht.load_factor()))
if __name__ == '__main__':
 test_hash_table()
#!python
from hashtable import HashTable
import unittest
# Python 2 and 3 compatibility: unittest module renamed this assertion method
if not hasattr(unittest.TestCase, 'assertCountEqual'):
 unittest.TestCase.assertCountEqual = unittest.TestCase.assertItemsEqual
class HashTableTest(unittest.TestCase):
 def test_init(self):
 ht = HashTable(4)
 assert len(ht.buckets) == 4
 assert ht.length() == 0
 assert ht.size == 0
 def test_keys(self):
 ht = HashTable()
 assert ht.keys() == []
 ht.set('I', 1)
 assert ht.keys() == ['I']
 ht.set('V', 5)
 self.assertCountEqual(ht.keys(), ['I', 'V']) # Ignore item order
 ht.set('X', 10)
 self.assertCountEqual(ht.keys(), ['I', 'V', 'X']) # Ignore item order
 def test_values(self):
 ht = HashTable()
 assert ht.values() == []
 ht.set('I', 1)
 assert ht.values() == [1]
 ht.set('V', 5)
 self.assertCountEqual(ht.values(), [1, 5]) # Ignore item order
 ht.set('X', 10)
 self.assertCountEqual(ht.values(), [1, 5, 10]) # Ignore item order
 def test_items(self):
 ht = HashTable()
 assert ht.items() == []
 ht.set('I', 1)
 assert ht.items() == [('I', 1)]
 ht.set('V', 5)
 self.assertCountEqual(ht.items(), [('I', 1), ('V', 5)])
 ht.set('X', 10)
 self.assertCountEqual(ht.items(), [('I', 1), ('V', 5), ('X', 10)])
 def test_length(self):
 ht = HashTable()
 assert ht.length() == 0
 ht.set('I', 1)
 assert ht.length() == 1
 ht.set('V', 5)
 assert ht.length() == 2
 ht.set('X', 10)
 assert ht.length() == 3
 def test_size(self):
 ht = HashTable()
 assert ht.size == 0
 ht.set('I', 1)
 assert ht.size == 1
 ht.set('V', 5)
 assert ht.size == 2
 ht.set('X', 10)
 assert ht.size == 3
 def test_resize(self):
 ht = HashTable(2) # Set init_size to 2
 assert ht.size == 0
 assert len(ht.buckets) == 2
 assert ht.load_factor() == 0
 ht.set('I', 1)
 assert ht.size == 1
 assert len(ht.buckets) == 2
 assert ht.load_factor() == 0.5
 ht.set('V', 5) # Should trigger resize
 assert ht.size == 2
 assert len(ht.buckets) == 4
 assert ht.load_factor() == 0.5
 ht.set('X', 10)
 assert ht.size == 3
 assert len(ht.buckets) == 4
 assert ht.load_factor() == 0.75
 ht.set('L', 50) # Should trigger resize
 assert ht.size == 4
 assert len(ht.buckets) == 8
 assert ht.load_factor() == 0.5
 def test_contains(self):
 ht = HashTable()
 ht.set('I', 1)
 ht.set('V', 5)
 ht.set('X', 10)
 assert ht.contains('I') is True
 assert ht.contains('V') is True
 assert ht.contains('X') is True
 assert ht.contains('A') is False
 def test_set_and_get(self):
 ht = HashTable()
 ht.set('I', 1)
 ht.set('V', 5)
 ht.set('X', 10)
 assert ht.get('I') == 1
 assert ht.get('V') == 5
 assert ht.get('X') == 10
 assert ht.length() == 3
 assert ht.size == 3
 with self.assertRaises(KeyError):
 ht.get('A') # Key does not exist
 def test_set_twice_and_get(self):
 ht = HashTable()
 ht.set('I', 1)
 ht.set('V', 4)
 ht.set('X', 9)
 assert ht.length() == 3
 assert ht.size == 3
 ht.set('V', 5) # Update value
 ht.set('X', 10) # Update value
 assert ht.get('I') == 1
 assert ht.get('V') == 5
 assert ht.get('X') == 10
 assert ht.length() == 3 # Check length is not overcounting
 assert ht.size == 3 # Check size is not overcounting
 def test_delete(self):
 ht = HashTable()
 ht.set('I', 1)
 ht.set('V', 5)
 ht.set('X', 10)
 assert ht.length() == 3
 assert ht.size == 3
 ht.delete('I')
 ht.delete('X')
 assert ht.length() == 1
 assert ht.size == 1
 with self.assertRaises(KeyError):
 ht.delete('X') # Key no longer exists
 with self.assertRaises(KeyError):
 ht.delete('A') # Key does not exist
if __name__ == '__main__':
 unittest.main()
Daniel
4,6122 gold badges18 silver badges40 bronze badges
asked Dec 4, 2017 at 9:28
\$\endgroup\$

1 Answer 1

2
\$\begingroup\$

Few high-level ideas and minor nitpicks:

  • have you considered json.dumps() to use for the HashTable's string representation
  • the keys() and values() methods can make use of a list comprehension:

    def keys(self):
     """Return a list of all keys in this hash table.
     Best and worst case running time: O(n^2)"""
     # Collect all keys in each of the buckets
     return [key 
     for bucket in self.buckets
     for key, _ in bucket.items()]
    def values(self):
     """Return a list of all values in this hash table.
     Best and worst case running time: O(n^2)"""
     # Collect all values in each of the buckets
     return [value 
     for bucket in self.buckets
     for _, value in bucket.items()]
    

    Please check if you can use bucket.keys() and bucket.values() directly, instead of looping over .items().

  • actually, can be applied for items() as well:

    return [item
     for bucket in self.buckets
     for item in bucket.items()]
    
  • consider using Type Annotations. For instance, look how you've added this inline comment:

    return entry is not None # True or False
    

    With type hints, you could've removed the comment and add a return type as a hint:

    def contains(self, key) -> bool:
    
  • unused loop variable. Since i is unused in the self.buckets = [LinkedList() for i in range(init_size)] expression, use _ as a variable name:

    self.buckets = [LinkedList() for _ in range(init_size)]
    
  • format Documentation Strings properly. There are some recommendations like, add a newline into the beginning of a docstring if it consists of multiple lines, end it with a dot etc.

answered Dec 4, 2017 at 12:52
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.