I have an autocomplete implementation in Python, and it passed all of the unit tests that I have included at the bottom.
#!python
import random
import time
def get_words(filename):
with open(filename, 'r') as f:
words_list = f.read().split()
return words_list
print(words_list)
def binarysearch(list_, item, left=None, right=None):
if left is None and right is None:
left = 0
right = len(list_) - 1
middle = (right + left) // 2
item_length = len(item)
middle_item = list_[middle]
if middle_item[:item_length].lower() == item.lower():
return find_all_words(list_, item, middle)
elif middle_item[:item_length].lower() < item.lower():
return binarysearch(list_, item, middle + 1, right)
elif middle_item[:item_length].lower() > item.lower():
return binarysearch(list_, item, left, middle - 1)
elif left > right:
return None
def find_all_words(list_, item, middle):
predicted_words = [list_[middle]]
item_length = len(item)
left_match = True
right_match = False
left_index = middle
right_index = middle
while left_match:
left_index -= 1
if left_index < 0:
left_match = False
else:
word = list_[left_index]
if word[:item_length] == item:
predicted_words.append(word)
else:
left_match = False
while right_match:
right_index += 1
if right_index >= len(list_):
right_match = False
else:
word = list_[right_index]
if word[:item_length] == item:
predicted_words.append(word)
else:
right_match = False
return predicted_words
def autocomplete(prefix=""):
filename = "/usr/share/dict/words"
words_list = get_words(filename)
predicted_words = binarysearch(words_list, prefix)
return predicted_words
def benchmark(all_prefixes):
t1 = time.time()
for prefix in all_prefixes:
autocomplete(prefix)
t2 = time.time()
return t2 - t1
def main():
all_words = get_words('/usr/share/dict/words')
all_prefixes = set([word[:len(word)//2] for word in all_words])
time = benchmark(all_prefixes)
print('Took {} seconds to benchmark {} prefixes on {} words'.format(time, len(all_prefixes), len(all_words)))
# import sys
# prefix = sys.argv[1]
# words = autocomplete(prefix)
# print(words)
if __name__ == '__main__':
main()
# autocomplete_test.py
#!python
from autocomplete import insert_word
from tries import Trie
import unittest
class TrieTest(unittest.TestCase):
def test_init(self):
trie = Trie()
data, is_word = trie.root.data
assert data == ""
assert is_word is False
assert trie.root.next == {}
def test_insert_with_3_items(self):
trie = Trie()
assert trie.root.data == ('', False)
assert trie.root.next == {}
trie.insert("a")
assert trie.root.next['a'] is not None
assert len(trie.root.next) == 1
node = trie.root.next['a']
assert node.data == ('a', True)
assert node.next == {}
trie.insert("a")
assert trie.root.next['a'] is not None
assert len(trie.root.next) == 1
node = trie.root.next['a']
assert node.data == ('a', True)
assert node.next == {}
trie.insert("b")
assert trie.root.next['b'] is not None
assert len(trie.root.next) == 2
node = trie.root.next['b']
assert node.data == ('b', True)
assert node.next == {}
trie.insert("c")
assert trie.root.next['c'] is not None
assert len(trie.root.next) == 3
node = trie.root.next['c']
assert node.data == ('c', True)
assert node.next == {}
def test_inset_with_7_items(self):
trie = Trie()
words_list = ['a', 'app', 'apple', 'be', 'bee', 'beat', 'c']
for word in words_list:
trie.insert(word)
assert trie.root.data == ('', False)
assert len(trie.root.next) == 3
assert trie.root.next['a'] is not None
node = trie.root.next['a']
assert node.data == ('a', True)
assert len(node.next) == 1
assert node.next['p'] is not None
node = node.next['p']
assert node.data == ('ap', False)
assert len(node.next) == 1
assert node.next['p'] is not None
node = node.next['p']
assert node.data == ('app', True)
assert len(node.next) == 1
assert node.next['l'] is not None
node = node.next['l']
assert node.data == ('appl', False)
assert len(node.next) == 1
assert node.next['e'] is not None
node = node.next['e']
assert node.data == ('apple', True)
assert len(node.next) == 0
assert trie.root.next['b'] is not None
node = trie.root.next['b']
assert node.data == ('b', False)
assert len(node.next) == 1
assert node.next['e'] is not None
node = node.next['e']
assert node.data == ('be', True)
assert len(node.next) == 2
assert node.next['e'] is not None
node1 = node.next['e']
assert node1.data == ('bee', True)
assert len(node1.next) == 0
assert node.next['a'] is not None
node2 = node.next['a']
assert node2.data == ('bea', False)
assert len(node2.next) == 1
assert node2.next['t'] is not None
node = node2.next['t']
assert node.data == ('beat', True)
assert len(node.next) == 0
assert trie.root.next['c'] is not None
node = trie.root.next['c']
assert node.data == ('c', True)
assert len(node.next) == 0
class AutocompleteTest(unittest.TestCase):
def test_insert_word():
1 Answer 1
You don't seem to have used the random
module, so that import can be deleted. In your first function get_words()
, you have put a print(words_list)
immediately after an unconditional return, which is unreachable. Unless this is for debugging, it is best to either put it before the return statement or delete it altogether.
binarysearch
is composed of two words, and should be separated with an underscore for consistency. The conditional for the first if
statement is a bit redundant, since whenever one index is None
, the other must also be the same. If you do want to check both, you should probably use an or
rather than and
, since neither is allowed to be None
.
The final elif
of the function is again unreachable, this time because one of the previous comparisons (==
, <
, >
) will always be true. This implementation means your binary search will never terminate correctly if the search term is not present in the dictionary — instead, check if the search should terminate at the beginning of the function.
You can also do the slicing and lowercasing of your variables immediately instead of repeating for each if
statement.
def binary_search(list_, item, left=None, right=None):
if left is None or right is None:
left = 0
right = len(list_) - 1
elif left > right:
return None
item = item.lower()
middle = (right + left) // 2
sliced_middle_item = list_[middle][:len(item)].lower()
if sliced_middle_item == item:
return find_all_words(list_, item, middle)
if sliced_middle_item < item:
return binary_search(list_, item, middle + 1, right)
if sliced_middle_item > item:
return binary_search(list_, item, left, middle - 1)
Since this is a single branch problem and Python does not optimise tail calls, it may be more efficent to impement your binary search iteratively using a loop instead of recursively.
def binary_search(list_, item):
left = 0
right = len(list_) - 1
item = item.lower()
while True:
if left > right:
return None
middle = (left + right) // 2
sliced_middle_item = list_[middle][:len(item)]
if sliced_middle_item < item:
left = middle + 1
elif sliced_middle_item > item:
right = middle - 1
elif sliced_middle_item == item:
return find_all_words(list_, item, middle)
In find_all_words
, you have initialized right_match
to False
, which means the second loop will never run, and you will not find any values to the right of the first match. Also, using a flag variable that controls the loop is really no different from using a break
statement, except you won't need to put the second half of your loop in an else
block.
Instead of building your list one at a time using .append()
, it is probably more efficent to find the first match and last match and then using the indcies to slice the word list. As a bonus this will also maintain the lexical ordering of the list. This will also mean you can use a binary search to find the first and last matches instead of a linear search. It would be best to continue from the previous search rather than starting from scratch, taking left
and right
as parameters as well.
def binary_find_all(list_, item, left, middle, right):
half_right = middle
half_left = middle
while left < half_right:
middle = (left + half_right) // 2
sliced_middle_item = list_[middle][:len(item)]
if sliced_middle_item < item:
left = middle + 1
else:
half_right = middle
while half_left < right:
middle = (half_left + right) // 2
sliced_middle_item = list_[middle][:len(item)]
if sliced_middle_item == item:
half_left = middle + 1
else:
right = middle
return list_[left:right]
It is really unnessary and inefficent to reopen the file and reset the word list for every invocation of autocomplete()
— on my machine, 1000 invocations of the binary search takes around 60 ms for most search terms, while 1000 invocations of autocomplete()
as it is written would take over 30 seconds: 500 times slower! Just delete it, or make it an alias of binary_search()
if you have to.
For your unit tests, I think your tests for the above code has been cut off, either by the site or during cutting and pasting. I only see the tests for the Trie data type. I'm not sure why you're testing it, or why you haven't used it in your main code, but the tests themselves look fine.
-
\$\begingroup\$ i don't think I miss any unit testings of my file. you can find my file here at github \$\endgroup\$NinjaG– NinjaG2018年07月04日 03:19:43 +00:00Commented Jul 4, 2018 at 3:19
-
\$\begingroup\$ @NinjaG, I'm afraid that I am unable to find any public repository of gist containing what appears to be your autocomplete linked to the GitHub username in your profile (Jeffchiucp). Nor can I see any code in the class AutocompleteTest, except the line
def test_insert_word():
with nothing in the function. I think one more thing I forgot to mention is that, depending on use case, it might be a good idea to add docstrings to your functions. Also, binary search might return[]
instead ofNone
. I'll edit it in only when there's something else significant to add though. \$\endgroup\$timuzhti– timuzhti2018年07月04日 06:59:26 +00:00Commented Jul 4, 2018 at 6:59
Explore related questions
See similar questions with these tags.