Trie implementation in Python

Question 1

I have created a trie in Python and am looking for feedback. Specifically, I am looking for feedback on:

If my code is 'pythonic'
If my logic is proper when inserting and retrieving from the trie

I have implemented the following logic:

Insert a word into the trie
Check if a word exists in the trie
Given a prefix, get all possible words . Get all the words in the trie

My purpose for coding the trie is to improve my Python and algorithm knowledge.

from collections import deque
class Node:
 def __init__(self, character, parent):
 self.character = character
 if self.character is not None:
 self.character = self.character.lower()
 self.parent = parent
 self.children = dict()
 self.terminus = False
 def add(self, child_node):
 self.children[child_node.character] = child_node
class Trie:
 def __init__(self):
 self._root = Node(None, None)
 def insert(self, word):
 if word:
 current_node = self._root
 for i, character in enumerate(self._normalize_word(word)):
 if character in current_node.children:
 current_node = current_node.children[character]
 else:
 child_node = Node(character, current_node)
 current_node.add(child_node)
 current_node = child_node
 current_node.terminus = True
 def __contains__(self, item):
 current_node = self._root
 contained = True
 for symbol in self._normalize_word(item):
 if symbol in current_node.children:
 current_node = current_node.children[symbol]
 else:
 contained = False
 break
 return contained and current_node.terminus
 def _normalize_word(self, word):
 return word.strip().lower()
 def _get_all_words(self, prefix, node, word_list):
 if node.character:
 prefix.append(node.character)
 for child in node.children.values():
 self._get_all_words(prefix, child, word_list)
 if node.terminus:
 word_list.append("".join([i[0] for i in prefix]))
 if len(prefix) > 0:
 prefix.pop()
 def get_possible_words(self, prefix):
 current_node = self._root
 found_prefix = True
 word_list = []
 prefix_deque = deque()
 for symbol in prefix.strip().lower():
 if symbol in current_node.children:
 current_node = current_node.children[symbol]
 else:
 found_prefix = False
 break
 if found_prefix:
 self._get_all_words(prefix_deque, current_node, word_list)
 # This is a bit kludgy - add the prefix to the rest of the characters found,
 # but I take off the last character from the prefix because it was added
 # in the _get_all_words method call since it is the current_node being passed into it. 
 # Is there a better way to do this?
 word_list = list(map(lambda word: prefix[:len(prefix)-1] + word, word_list))
 return word_list
 def get_all_words(self):
 word_list = []
 self._get_all_words(deque(), self._root, word_list)
 return word_list

Here is my unit test:

import unittest
from algorithms import trie
class TestTrie(unittest.TestCase):
 @classmethod
 def setUpClass(cls):
 # root
 # / \
 # / \
 # a * b
 # / \ /
 # d n * a
 # / / \ / | \
 # d * d * y * g * t * y *
 # / \ \
 # e s * h *
 # /
 # l *
 # asterisk denotes a word
 cls._trie = trie.Trie()
 cls._trie.insert("a")
 cls._trie.insert("add")
 cls._trie.insert("an")
 cls._trie.insert("and")
 cls._trie.insert("any")
 cls._trie.insert("bagel")
 cls._trie.insert("bag")
 cls._trie.insert("bags")
 cls._trie.insert("bat")
 cls._trie.insert("bath")
 cls._trie.insert("bay")
 cls._trie_length = 11 # magic number, the number of words in the trie
 def test(self):
 assert len(self._trie.get_all_words()) == self._trie_length
 assert "a" in self._trie
 assert "add" in self._trie
 assert "an" in self._trie
 assert "and" in self._trie
 assert "any" in self._trie
 assert "bagel" in self._trie
 assert "bag" in self._trie
 assert "bags" in self._trie
 assert "bat" in self._trie
 assert "bath" in self._trie
 assert "bay" in self._trie
 def test_duplicate_entries(self):
 """Adding a word that already exists should not create a new word in the trie"""
 t = self._trie
 t.insert("bag")
 assert len(t.get_all_words()) == self._trie_length
 assert "bag" in t
 def test_mixed_case(self):
 """insert and retrieval are case insensitive"""
 t = trie.Trie()
 t.insert("APPLE")
 t.insert("oRANge")
 assert "apple" in t
 assert "orange" in t
 assert "APPLE" in t
 assert "ORANGE" in t
 assert "aPpLe" in t
 assert "oRangE" in t
 def test_hyphenated_words(self):
 t = trie.Trie()
 t.insert("e-mail")
 t.insert("above-said")
 t.insert("above-water")
 t.insert("above-written")
 t.insert("above")
 t.insert("abode")
 t.insert("exit")
 assert len(t.get_all_words()) == 7
 assert "abode" in t
 assert "above" in t
 assert "above-written" in t
 assert "above-water" in t
 assert "above-said" in t
 assert "e-mail" in t
 assert "exit" in t
 def test_empty_trie(self):
 t = trie.Trie()
 assert len(t.get_all_words()) == 0
 def test_first_symbol_is_a_word(self):
 t = trie.Trie()
 t.insert("a")
 t.insert("apple")
 assert "a" in t
 assert "apple" in t
 words = t.get_all_words()
 assert len(words) == 2
 assert "a" in words
 assert "apple" in words
 def test_get_possible_words(self):
 prefix = 'an'
 expected_words = ['an', 'and', 'any']
 actual_words = self._trie.get_possible_words(prefix)
 assert len(expected_words) == len(actual_words)
 for word in expected_words:
 assert word in actual_words
 prefix = 'ba'
 expected_words = ["bagel", "bag", "bags", "bat", "bath", "bay"]
 actual_words = self._trie.get_possible_words(prefix)
 assert len(expected_words) == len(actual_words)
 for word in expected_words:
 assert word in actual_words
 def test_get_possible_words_no_more_words(self):
 """test that given a prefix that is a terminus with no children in the trie, returns that one word"""
 prefix = 'any'
 actual_words = self._trie.get_possible_words(prefix)
 assert len(actual_words) == 1
 assert prefix in actual_words
 def test_get_possible_words_prefix_not_in_trie(self):
 prefix = 'z'
 actual_words = self._trie.get_possible_words(prefix)
 assert len(actual_words) == 0

Question 2

Starting in the Node class, there was a parent attribute that never gets referenced. Also I find if you are just assigning a variable based off one condition doing it in one line makes the code cleaner and easier to read.

class Node:
 def __init__(self, character):
 self.character = character if character is None else character.lower()
 self.children = dict()
 self.terminus = False
 def add(self, child_node):
 self.children[child_node.character] = child_node

In your Trie class specifically with the .insert and __contains__ methods:

def insert(self, word):
 if word: # checking for truth to continue
 current_node = self._root
 for i, character in enumerate(self._normalize_word(word)):
 if character in current_node.children:
 current_node = current_node.children[character]
 else:
 child_node = Node(character, current_node)
 current_node.add(child_node)
 current_node = child_node
 current_node.terminus = True

You can save a level of indentation by returning on false, essentially short-circuiting your method:

def insert(self, word):
 if not word:
 return
 current_node = self._root
 for i, character in enumerate(self._normalize_word(word)):
 if character in current_node.children:
 current_node = current_node.children[character]
 else:
 child_node = Node(character, current_node)
 current_node.add(child_node)
 current_node = child_node
 current_node.terminus = True

You also make a call to enumerate but never use it:

def insert(self, word):
 if not word:
 return
 current_node = self._root
 for character in self._normalize_word(word):
 if character in current_node.children:
 current_node = current_node.children[character]
 else:
 child_node = Node(character, current_node)
 current_node.add(child_node)
 current_node = child_node
 current_node.terminus = True

Seeing your logic in your method made me think you could also have used a recursive method, something along the lines of:

def insert(self, word):
 if not word: 
 return
 self.__insert(self._normalize_word(word), self._root).terminus = True
def __insert(self, word, node):
 if not word:
 return node
 elif {word[0]} & node.children.keys():
 return self.__insert(word[1:], node.children[word[0]])
 else:
 new_node = Node(word[0])
 node.add(new_node)
 return self.__insert(word[1:], new_node)

One other change I made which depending on the size of the tree will improve performance is the use of a logical & on the children.keys(). PEP 3106 brought about changes that allowed for set operations on dict-keys. This means you could just as well check if a dict has a value as a key by {value} & dict.keys() rather than iterating over every key.

In the __contains__ method where you set a truth flag to false then break and then test with a logical &:

def __contains__(self, item):
 current_node = self._root
 contained = True
 for symbol in self._normalize_word(item):
 if symbol in current_node.children:
 current_node = current_node.children[symbol]
 else:
 contained = False
 break
 return contained and current_node.terminus

If contained is set to False theres no need to check at the bottom of the method. Just get rid of contained and return False if your first condition isn't met. Similar to the last point in .insert you can save a couple of lines if you short-circuit, basically no need for else if you are returning on it:

def __contains__(self, item):
 current_node = self._root
 for symbol in self._normalize_word(item):
 if symbol not in current_node.children:
 return False
 current_node = current_node.children[symbol]
 return current_node.terminus

I saw this as another place where you could have done this recursively as well:

def __contains(self, word, node):
 if not word:
 return node
 return self.__contains(word[1:], node.children[word[0]]) if \
 {word[0]} & node.children.keys() else None
def __contains__(self, item):
 word = self._normalize_word(item)
 return self.__contains(word, self._root) is not None

In the __contains method you can return the Node of the last letter or None. This is useful where you can get a boolean value from your dunder-contains method to check for membership but also you can use the __contains method with the get_possible_words method later. This is similar to how I adjusted the __insert method to return the final Node after inserting to set .terminus.

def get_possible_words(self, word):
 word_node = self.__contains(word, self._root)
 return [] if word_node is None else \
 self.__get_possible_words(word, word_node)

Here you can get your final Node and that will be the starting point to retrieve all possible words. If its None just return an empty list else:

def __get_possible_words(self, word, word_node, word_list=None):
 word_list = [] if word_list is None else word_list
 if word_node.terminus:
 word_list.append(word)
 for letter in word_node.children:
 if not word_node.children[letter]:
 word_list.append(word + letter)
 else:
 self.__get_possible_words(word + letter,
 word_node.children[letter], word_list)

And putting it altogether:

class Node:
 def __init__(self, character):
 self.character = character if character is None else character.lower()
 self.children = dict()
 self.terminus = False
 def add(self, child_node):
 self.children[child_node.character] = child_node
class Trie:
 def __init__(self):
 self._root = Node(None) 
 def insert(self, word):
 if not word: 
 return
 self.__insert(self._normalize_word(word), self._root).terminus = True
 def __insert(self, word, node):
 if not word:
 return node
 elif {word[0]} & node.children.keys():
 return self.__insert(word[1:], node.children[word[0]])
 else:
 new_node = Node(word[0])
 node.add(new_node)
 return self.__insert(word[1:], new_node)
 def __contains(self, word, node):
 if not word:
 return node
 return self.__contains(word[1:], node.children[word[0]]) if \
 {word[0]} & node.children.keys() else None
 def __contains__(self, item):
 word = self._normalize_word(item)
 return self.__contains(word, self._root) is not None
 def _normalize_word(self, word):
 return word.strip().lower()
 def get_possible_words(self, word):
 word_node = self.__contains(word, self._root)
 return [] if word_node is None else \
 self.__get_possible_words(word, word_node)
 def __get_possible_words(self, word, word_node, word_list=None):
 word_list = [] if word_list is None else word_list
 if word_node.terminus:
 word_list.append(word)
 for letter in word_node.children:
 if not word_node.children[letter]:
 word_list.append(word + letter)
 else:
 self.__get_possible_words(word + letter,
 word_node.children[letter], word_list)
 return word_list
 def get_all_words(self):
 word_list = []
 for letter in self._root.children:
 word_list.extend(self.get_possible_words(letter))
 return word_list

Question 3

Thank you so much for the reply @tijko. I did not know about PEP 3106 so it is great to learn about it. The code looks much more condensed. Thank you!

tijko tijko 7821 gold badge5 silver badges13 bronze badges · Accepted Answer · 2016-11-24 00:34:07Z

Starting in the Node class, there was a parent attribute that never gets referenced. Also I find if you are just assigning a variable based off one condition doing it in one line makes the code cleaner and easier to read.

class Node:
 def __init__(self, character):
 self.character = character if character is None else character.lower()
 self.children = dict()
 self.terminus = False
 def add(self, child_node):
 self.children[child_node.character] = child_node

In your Trie class specifically with the .insert and __contains__ methods:

def insert(self, word):
 if word: # checking for truth to continue
 current_node = self._root
 for i, character in enumerate(self._normalize_word(word)):
 if character in current_node.children:
 current_node = current_node.children[character]
 else:
 child_node = Node(character, current_node)
 current_node.add(child_node)
 current_node = child_node
 current_node.terminus = True

You can save a level of indentation by returning on false, essentially short-circuiting your method:

def insert(self, word):
 if not word:
 return
 current_node = self._root
 for i, character in enumerate(self._normalize_word(word)):
 if character in current_node.children:
 current_node = current_node.children[character]
 else:
 child_node = Node(character, current_node)
 current_node.add(child_node)
 current_node = child_node
 current_node.terminus = True

You also make a call to enumerate but never use it:

def insert(self, word):
 if not word:
 return
 current_node = self._root
 for character in self._normalize_word(word):
 if character in current_node.children:
 current_node = current_node.children[character]
 else:
 child_node = Node(character, current_node)
 current_node.add(child_node)
 current_node = child_node
 current_node.terminus = True

Seeing your logic in your method made me think you could also have used a recursive method, something along the lines of:

def insert(self, word):
 if not word: 
 return
 self.__insert(self._normalize_word(word), self._root).terminus = True
def __insert(self, word, node):
 if not word:
 return node
 elif {word[0]} & node.children.keys():
 return self.__insert(word[1:], node.children[word[0]])
 else:
 new_node = Node(word[0])
 node.add(new_node)
 return self.__insert(word[1:], new_node)

One other change I made which depending on the size of the tree will improve performance is the use of a logical & on the children.keys(). PEP 3106 brought about changes that allowed for set operations on dict-keys. This means you could just as well check if a dict has a value as a key by {value} & dict.keys() rather than iterating over every key.

In the __contains__ method where you set a truth flag to false then break and then test with a logical &:

def __contains__(self, item):
 current_node = self._root
 contained = True
 for symbol in self._normalize_word(item):
 if symbol in current_node.children:
 current_node = current_node.children[symbol]
 else:
 contained = False
 break
 return contained and current_node.terminus

If contained is set to False theres no need to check at the bottom of the method. Just get rid of contained and return False if your first condition isn't met. Similar to the last point in .insert you can save a couple of lines if you short-circuit, basically no need for else if you are returning on it:

def __contains__(self, item):
 current_node = self._root
 for symbol in self._normalize_word(item):
 if symbol not in current_node.children:
 return False
 current_node = current_node.children[symbol]
 return current_node.terminus

I saw this as another place where you could have done this recursively as well:

def __contains(self, word, node):
 if not word:
 return node
 return self.__contains(word[1:], node.children[word[0]]) if \
 {word[0]} & node.children.keys() else None
def __contains__(self, item):
 word = self._normalize_word(item)
 return self.__contains(word, self._root) is not None

In the __contains method you can return the Node of the last letter or None. This is useful where you can get a boolean value from your dunder-contains method to check for membership but also you can use the __contains method with the get_possible_words method later. This is similar to how I adjusted the __insert method to return the final Node after inserting to set .terminus.

def get_possible_words(self, word):
 word_node = self.__contains(word, self._root)
 return [] if word_node is None else \
 self.__get_possible_words(word, word_node)

Here you can get your final Node and that will be the starting point to retrieve all possible words. If its None just return an empty list else:

def __get_possible_words(self, word, word_node, word_list=None):
 word_list = [] if word_list is None else word_list
 if word_node.terminus:
 word_list.append(word)
 for letter in word_node.children:
 if not word_node.children[letter]:
 word_list.append(word + letter)
 else:
 self.__get_possible_words(word + letter,
 word_node.children[letter], word_list)

And putting it altogether:

class Node:
 def __init__(self, character):
 self.character = character if character is None else character.lower()
 self.children = dict()
 self.terminus = False
 def add(self, child_node):
 self.children[child_node.character] = child_node
class Trie:
 def __init__(self):
 self._root = Node(None) 
 def insert(self, word):
 if not word: 
 return
 self.__insert(self._normalize_word(word), self._root).terminus = True
 def __insert(self, word, node):
 if not word:
 return node
 elif {word[0]} & node.children.keys():
 return self.__insert(word[1:], node.children[word[0]])
 else:
 new_node = Node(word[0])
 node.add(new_node)
 return self.__insert(word[1:], new_node)
 def __contains(self, word, node):
 if not word:
 return node
 return self.__contains(word[1:], node.children[word[0]]) if \
 {word[0]} & node.children.keys() else None
 def __contains__(self, item):
 word = self._normalize_word(item)
 return self.__contains(word, self._root) is not None
 def _normalize_word(self, word):
 return word.strip().lower()
 def get_possible_words(self, word):
 word_node = self.__contains(word, self._root)
 return [] if word_node is None else \
 self.__get_possible_words(word, word_node)
 def __get_possible_words(self, word, word_node, word_list=None):
 word_list = [] if word_list is None else word_list
 if word_node.terminus:
 word_list.append(word)
 for letter in word_node.children:
 if not word_node.children[letter]:
 word_list.append(word + letter)
 else:
 self.__get_possible_words(word + letter,
 word_node.children[letter], word_list)
 return word_list
 def get_all_words(self):
 word_list = []
 for letter in self._root.children:
 word_list.extend(self.get_possible_words(letter))
 return word_list

Thank you so much for the reply @tijko. I did not know about PEP 3106 so it is great to learn about it. The code looks much more condensed. Thank you!

Stack Exchange Network

Trie implementation in Python

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Trie implementation in Python

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions