Autocomplete implementation in Python

Question 1

I have an autocomplete implementation in Python, and it passed all of the unit tests that I have included at the bottom.

#!python
import random
import time
def get_words(filename):
 with open(filename, 'r') as f:
 words_list = f.read().split()
 return words_list
 print(words_list)
def binarysearch(list_, item, left=None, right=None):
 if left is None and right is None:
 left = 0
 right = len(list_) - 1
 middle = (right + left) // 2
 item_length = len(item)
 middle_item = list_[middle]
 if middle_item[:item_length].lower() == item.lower():
 return find_all_words(list_, item, middle)
 elif middle_item[:item_length].lower() < item.lower():
 return binarysearch(list_, item, middle + 1, right)
 elif middle_item[:item_length].lower() > item.lower():
 return binarysearch(list_, item, left, middle - 1)
 elif left > right:
 return None
def find_all_words(list_, item, middle):
 predicted_words = [list_[middle]]
 item_length = len(item)
 left_match = True
 right_match = False
 left_index = middle
 right_index = middle
 while left_match:
 left_index -= 1
 if left_index < 0:
 left_match = False
 else:
 word = list_[left_index]
 if word[:item_length] == item:
 predicted_words.append(word)
 else:
 left_match = False
 while right_match:
 right_index += 1
 if right_index >= len(list_):
 right_match = False
 else:
 word = list_[right_index]
 if word[:item_length] == item:
 predicted_words.append(word)
 else:
 right_match = False
 return predicted_words
def autocomplete(prefix=""):
 filename = "/usr/share/dict/words"
 words_list = get_words(filename)
 predicted_words = binarysearch(words_list, prefix)
 return predicted_words
def benchmark(all_prefixes):
 t1 = time.time()
 for prefix in all_prefixes:
 autocomplete(prefix)
 t2 = time.time()
 return t2 - t1
def main():
 all_words = get_words('/usr/share/dict/words')
 all_prefixes = set([word[:len(word)//2] for word in all_words])
 time = benchmark(all_prefixes)
 print('Took {} seconds to benchmark {} prefixes on {} words'.format(time, len(all_prefixes), len(all_words)))
 # import sys
 # prefix = sys.argv[1]
 # words = autocomplete(prefix)
 # print(words)
if __name__ == '__main__':
 main()
# autocomplete_test.py
#!python
from autocomplete import insert_word
from tries import Trie
import unittest
class TrieTest(unittest.TestCase):
 def test_init(self):
 trie = Trie()
 data, is_word = trie.root.data
 assert data == ""
 assert is_word is False
 assert trie.root.next == {}
 def test_insert_with_3_items(self):
 trie = Trie()
 assert trie.root.data == ('', False)
 assert trie.root.next == {}
 trie.insert("a")
 assert trie.root.next['a'] is not None
 assert len(trie.root.next) == 1
 node = trie.root.next['a']
 assert node.data == ('a', True)
 assert node.next == {}
 trie.insert("a")
 assert trie.root.next['a'] is not None
 assert len(trie.root.next) == 1
 node = trie.root.next['a']
 assert node.data == ('a', True)
 assert node.next == {}
 trie.insert("b")
 assert trie.root.next['b'] is not None
 assert len(trie.root.next) == 2
 node = trie.root.next['b']
 assert node.data == ('b', True)
 assert node.next == {}
 trie.insert("c")
 assert trie.root.next['c'] is not None
 assert len(trie.root.next) == 3
 node = trie.root.next['c']
 assert node.data == ('c', True)
 assert node.next == {}
 def test_inset_with_7_items(self):
 trie = Trie()
 words_list = ['a', 'app', 'apple', 'be', 'bee', 'beat', 'c']
 for word in words_list:
 trie.insert(word)
 assert trie.root.data == ('', False)
 assert len(trie.root.next) == 3
 assert trie.root.next['a'] is not None
 node = trie.root.next['a']
 assert node.data == ('a', True)
 assert len(node.next) == 1
 assert node.next['p'] is not None
 node = node.next['p']
 assert node.data == ('ap', False)
 assert len(node.next) == 1
 assert node.next['p'] is not None
 node = node.next['p']
 assert node.data == ('app', True)
 assert len(node.next) == 1
 assert node.next['l'] is not None
 node = node.next['l']
 assert node.data == ('appl', False)
 assert len(node.next) == 1
 assert node.next['e'] is not None
 node = node.next['e']
 assert node.data == ('apple', True)
 assert len(node.next) == 0
 assert trie.root.next['b'] is not None
 node = trie.root.next['b']
 assert node.data == ('b', False)
 assert len(node.next) == 1
 assert node.next['e'] is not None
 node = node.next['e']
 assert node.data == ('be', True)
 assert len(node.next) == 2
 assert node.next['e'] is not None
 node1 = node.next['e']
 assert node1.data == ('bee', True)
 assert len(node1.next) == 0
 assert node.next['a'] is not None
 node2 = node.next['a']
 assert node2.data == ('bea', False)
 assert len(node2.next) == 1
 assert node2.next['t'] is not None
 node = node2.next['t']
 assert node.data == ('beat', True)
 assert len(node.next) == 0
 assert trie.root.next['c'] is not None
 node = trie.root.next['c']
 assert node.data == ('c', True)
 assert len(node.next) == 0
class AutocompleteTest(unittest.TestCase):
 def test_insert_word():

Question 2

You don't seem to have used the random module, so that import can be deleted. In your first function get_words(), you have put a print(words_list) immediately after an unconditional return, which is unreachable. Unless this is for debugging, it is best to either put it before the return statement or delete it altogether.

binarysearch is composed of two words, and should be separated with an underscore for consistency. The conditional for the first if statement is a bit redundant, since whenever one index is None, the other must also be the same. If you do want to check both, you should probably use an or rather than and, since neither is allowed to be None.

The final elif of the function is again unreachable, this time because one of the previous comparisons (==, <, >) will always be true. This implementation means your binary search will never terminate correctly if the search term is not present in the dictionary — instead, check if the search should terminate at the beginning of the function.

You can also do the slicing and lowercasing of your variables immediately instead of repeating for each if statement.

def binary_search(list_, item, left=None, right=None):
 if left is None or right is None:
 left = 0
 right = len(list_) - 1
 elif left > right:
 return None
 item = item.lower()
 middle = (right + left) // 2
 sliced_middle_item = list_[middle][:len(item)].lower()
 if sliced_middle_item == item:
 return find_all_words(list_, item, middle)
 if sliced_middle_item < item:
 return binary_search(list_, item, middle + 1, right)
 if sliced_middle_item > item:
 return binary_search(list_, item, left, middle - 1)

Since this is a single branch problem and Python does not optimise tail calls, it may be more efficent to impement your binary search iteratively using a loop instead of recursively.

def binary_search(list_, item):
 left = 0
 right = len(list_) - 1
 item = item.lower()
 while True:
 if left > right:
 return None
 middle = (left + right) // 2
 sliced_middle_item = list_[middle][:len(item)]
 if sliced_middle_item < item:
 left = middle + 1
 elif sliced_middle_item > item:
 right = middle - 1
 elif sliced_middle_item == item:
 return find_all_words(list_, item, middle)

In find_all_words, you have initialized right_match to False, which means the second loop will never run, and you will not find any values to the right of the first match. Also, using a flag variable that controls the loop is really no different from using a break statement, except you won't need to put the second half of your loop in an else block.

Instead of building your list one at a time using .append(), it is probably more efficent to find the first match and last match and then using the indcies to slice the word list. As a bonus this will also maintain the lexical ordering of the list. This will also mean you can use a binary search to find the first and last matches instead of a linear search. It would be best to continue from the previous search rather than starting from scratch, taking left and right as parameters as well.

def binary_find_all(list_, item, left, middle, right):
 half_right = middle
 half_left = middle
 while left < half_right:
 middle = (left + half_right) // 2
 sliced_middle_item = list_[middle][:len(item)]
 if sliced_middle_item < item:
 left = middle + 1
 else:
 half_right = middle
 while half_left < right:
 middle = (half_left + right) // 2
 sliced_middle_item = list_[middle][:len(item)]
 if sliced_middle_item == item:
 half_left = middle + 1
 else:
 right = middle
 return list_[left:right]

It is really unnessary and inefficent to reopen the file and reset the word list for every invocation of autocomplete() — on my machine, 1000 invocations of the binary search takes around 60 ms for most search terms, while 1000 invocations of autocomplete() as it is written would take over 30 seconds: 500 times slower! Just delete it, or make it an alias of binary_search() if you have to.

For your unit tests, I think your tests for the above code has been cut off, either by the site or during cutting and pasting. I only see the tests for the Trie data type. I'm not sure why you're testing it, or why you haven't used it in your main code, but the tests themselves look fine.

Question 3

i don't think I miss any unit testings of my file. you can find my file here at github

Question 4

@NinjaG, I'm afraid that I am unable to find any public repository of gist containing what appears to be your autocomplete linked to the GitHub username in your profile (Jeffchiucp). Nor can I see any code in the class AutocompleteTest, except the line def test_insert_word(): with nothing in the function. I think one more thing I forgot to mention is that, depending on use case, it might be a good idea to add docstrings to your functions. Also, binary search might return [] instead of None. I'll edit it in only when there's something else significant to add though.

timuzhti timuzhti 2232 silver badges11 bronze badges · Accepted Answer · 2018-07-03 13:09:50Z

You don't seem to have used the random module, so that import can be deleted. In your first function get_words(), you have put a print(words_list) immediately after an unconditional return, which is unreachable. Unless this is for debugging, it is best to either put it before the return statement or delete it altogether.

binarysearch is composed of two words, and should be separated with an underscore for consistency. The conditional for the first if statement is a bit redundant, since whenever one index is None, the other must also be the same. If you do want to check both, you should probably use an or rather than and, since neither is allowed to be None.

The final elif of the function is again unreachable, this time because one of the previous comparisons (==, <, >) will always be true. This implementation means your binary search will never terminate correctly if the search term is not present in the dictionary — instead, check if the search should terminate at the beginning of the function.

You can also do the slicing and lowercasing of your variables immediately instead of repeating for each if statement.

def binary_search(list_, item, left=None, right=None):
 if left is None or right is None:
 left = 0
 right = len(list_) - 1
 elif left > right:
 return None
 item = item.lower()
 middle = (right + left) // 2
 sliced_middle_item = list_[middle][:len(item)].lower()
 if sliced_middle_item == item:
 return find_all_words(list_, item, middle)
 if sliced_middle_item < item:
 return binary_search(list_, item, middle + 1, right)
 if sliced_middle_item > item:
 return binary_search(list_, item, left, middle - 1)

Since this is a single branch problem and Python does not optimise tail calls, it may be more efficent to impement your binary search iteratively using a loop instead of recursively.

def binary_search(list_, item):
 left = 0
 right = len(list_) - 1
 item = item.lower()
 while True:
 if left > right:
 return None
 middle = (left + right) // 2
 sliced_middle_item = list_[middle][:len(item)]
 if sliced_middle_item < item:
 left = middle + 1
 elif sliced_middle_item > item:
 right = middle - 1
 elif sliced_middle_item == item:
 return find_all_words(list_, item, middle)

In find_all_words, you have initialized right_match to False, which means the second loop will never run, and you will not find any values to the right of the first match. Also, using a flag variable that controls the loop is really no different from using a break statement, except you won't need to put the second half of your loop in an else block.

Instead of building your list one at a time using .append(), it is probably more efficent to find the first match and last match and then using the indcies to slice the word list. As a bonus this will also maintain the lexical ordering of the list. This will also mean you can use a binary search to find the first and last matches instead of a linear search. It would be best to continue from the previous search rather than starting from scratch, taking left and right as parameters as well.

def binary_find_all(list_, item, left, middle, right):
 half_right = middle
 half_left = middle
 while left < half_right:
 middle = (left + half_right) // 2
 sliced_middle_item = list_[middle][:len(item)]
 if sliced_middle_item < item:
 left = middle + 1
 else:
 half_right = middle
 while half_left < right:
 middle = (half_left + right) // 2
 sliced_middle_item = list_[middle][:len(item)]
 if sliced_middle_item == item:
 half_left = middle + 1
 else:
 right = middle
 return list_[left:right]

It is really unnessary and inefficent to reopen the file and reset the word list for every invocation of autocomplete() — on my machine, 1000 invocations of the binary search takes around 60 ms for most search terms, while 1000 invocations of autocomplete() as it is written would take over 30 seconds: 500 times slower! Just delete it, or make it an alias of binary_search() if you have to.

For your unit tests, I think your tests for the above code has been cut off, either by the site or during cutting and pasting. I only see the tests for the Trie data type. I'm not sure why you're testing it, or why you haven't used it in your main code, but the tests themselves look fine.

i don't think I miss any unit testings of my file. you can find my file here at github
@NinjaG, I'm afraid that I am unable to find any public repository of gist containing what appears to be your autocomplete linked to the GitHub username in your profile (Jeffchiucp). Nor can I see any code in the class AutocompleteTest, except the line def test_insert_word(): with nothing in the function. I think one more thing I forgot to mention is that, depending on use case, it might be a good idea to add docstrings to your functions. Also, binary search might return [] instead of None. I'll edit it in only when there's something else significant to add though.

Stack Exchange Network

Autocomplete implementation in Python

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Autocomplete implementation in Python

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions