As an introduction to Python classes and an exercise in recursion / tree-traversal, I wrote a small script that forms all English words using only the letters in an input string. This is useful for things like creating words from musical-note letters or typing "words" on upside-down calculators. I used this word list as my raw data.
I'm particularly interested in adherence to best practices.
Notes:
tqdm
was included simply to display to the user that the script was not hanging while reading the word list file into memory.- Sets were used instead of lists for
get_child_letters
and forallowed_letters
becausex in set
is faster thanx in list
, andin
-tests is the only use-case of both sets. (After the word list is processed, the output is nearly instantaneous on my machine regardless of the list/set implementation, but I felt I should do this anyway.)
These are things I feel could be improved, but I am not sure how:
- Importing
re
just to do a singlere.fullmatch
for input validation seems clumsy. - I feel that my docstrings are awkward; I feel that I may be overexplaining obvious things and underexplaining non-obvious things.
- Are my
__str__
and__repr__
implementations appropriate? - Is
child.letter or ""
okay? I know that it's dependent on which comes first (i.e."" or child.letter
will beNone
ifchild.letter
isNone
), so it felt a little weird, but in searching for a "coalesce" operator in Python, StackOverflow recommended the above construction.
#! /usr/bin/env python
'''Given a string of letters, return all English words formable from those letters only.'''
import re, sys, tqdm
from tqdm import tqdm
class TreeNode:
'''A node of an up-to-26-ary tree that stores English words, letter-by-letter.
Each node contains a single letter (or None, indicating end-of-word), along with
a list of its children. These children represent the next letter of the word.
A TreeNode can have a None child alongside non-None children; this indicates that
there exists a word that ends at the current position in the word, as well as
words that continue beyond the current position in the word. For example, at the
position in the tree representing "ant", a None child is present indicating that
"ant" is a complete valid English word, but an "e" child is also present, which
leads onward to "anteater", "antechamber", "antelope", and others.
A new TreeNode intended to be the root of a tree should be initialized with "".
None is not appropriate in this case (reserved for end-of-word), and the "" will
disappear when child letters are concatenated to it later.
'''
def __init__(self, letter):
self.letter = letter
self.children = []
def __str__(self):
return "letter: " + str(self.letter) + ", children: " + ", ".join([child.letter for child in self.children])
def __repr__(self):
return self.letter
def add_child(self, letter):
'''Append a child to the Tree's children with the provided letter and return the new child.'''
self.children.append(TreeNode(letter))
return self.get_child_of_letter(letter)
def get_child_letters(self):
'''Return a set consisting of the letters of all this TreeNode's children.'''
return set([child.letter for child in self.children])
def get_child_of_letter(self, letter):
'''Return the child of this TreeNode whose letter matches the one provided.'''
for child in self.children:
if child.letter == letter:
return child
raise KeyError("No child was found with letter '" + letter + "'!")
def add_word_to_tree(word, tree_node):
'''Add a word letter-by-letter to the tree, starting at the root and traveling down.
For each successive letter ("X") in the word, if there is a child of the current node
whose letter is X, add the remainder of the word (everything after X) to that child.
Otherwise, create a new child of the current node for X, and add the remainder to it.
When the word has been entirely "consumed", add a None child and return, signifying
the end of the word.
'''
if not word: # Analogous to "if len(word) == 0"
tree_node.add_child(None)
return
elif word[0] in tree_node.get_child_letters():
add_word_to_tree(word[1:], tree_node.get_child_of_letter(word[0]))
else:
add_word_to_tree(word[1:], tree_node.add_child(word[0]))
def parse_word_list(word_list_file):
'''Open the text file of legitimate English words and add each one to the TreeNode.'''
with open(word_list_file, "r") as infile:
words = infile.read().splitlines()
head = TreeNode("")
for word in tqdm(words, desc = "Reading word list..."):
add_word_to_tree(word, head)
return head
def create_result_words(tree, allowed_letters, letter_stack, results):
'''Traverse the tree to return the words formable from the allowed letters.
When a child's letter is None, this indicates the end of the word. The letter stack
at that time can be completed as a word and appended to the results. Otherwise, we
look through the children's letters, descending into those with acceptable letters.
'''
if tree.letter is None:
return results + [letter_stack]
for child in tree.children:
if child.letter in allowed_letters:
# Using 'child.letter or ""' coalesces None to "" to handle end-of-word instances.
results = create_result_words(child, allowed_letters, letter_stack + (child.letter or ""), results)
return results
def output_result_words(result_words):
'''Write all result words to a text file, and print some quick stats.'''
longest = [""] # Initialized to [""] so it will get immediately overwritten.
with open("word_constructor.txt", "w") as outfile:
for word in result_words:
if len(word) > len(longest[0]):
longest = [word]
elif len(word) == len(longest[0]):
longest = longest + [word]
outfile.write(word + "\n")
print(str(len(result_words)) + " words can be made from the letters provided.")
print("Longest word(s): " + ", ".join(longest))
def main():
'''Validate command-line options and execute top-level functionality.'''
if len(sys.argv) != 2 or re.fullmatch("[a-z]+", sys.argv[1].lower()) is None:
print("This requires as input a string of allowed letters ONLY.")
print("Don't separate the letters by spaces, commas, or anything else.")
sys.exit(1)
allowed_letters = set(sys.argv[1].lower())
tree = parse_word_list("words_alpha.txt")
result_words = create_result_words(tree, allowed_letters | {None}, "", [])
output_result_words(result_words)
main()
1 Answer 1
Overview
The code layout is good, and you used meaningful names for classes, functions and variables.
Lint check
pylint identified a few issues. There are some long lines that can be shortened.
You could split up the multiple imports on one line onto one line per import.
This also revealed that you import tqdm
twice.
Consider using a set comprehension for the following line:
return set([child.letter for child in self.children])
Documentation
When I run the code, it is not clear what it expects for input or what the output is, especially when I don't do what the code expects.
You could add more relevant usage information to the header comment, such as the required input file, the required input string and the output file.
Command line
argparse is a standard alternative to
parsing the command line yourself. It can generate a help message in a consistent way
as well as providing a built-in -h
option.
DRY
You can assign the result of sys.argv[1].lower()
to a variable instead
of calling it multiple times.
Here is new code with the suggestions above:
'''
Given a string of letters, return all English words formable from those letters only.
Reads file named "words_alpha.txt from the current directory"
Creates file named "word_constructor.txt" in the current directory
Requires a string of letter as input on the command line, like "grade"
'''
import re
import sys
import argparse
from tqdm import tqdm
class TreeNode:
'''A node of an up-to-26-ary tree that stores English words, letter-by-letter.
Each node contains a single letter (or None, indicating end-of-word), along with
a list of its children. These children represent the next letter of the word.
A TreeNode can have a None child alongside non-None children; this indicates that
there exists a word that ends at the current position in the word, as well as
words that continue beyond the current position in the word. For example, at the
position in the tree representing "ant", a None child is present indicating that
"ant" is a complete valid English word, but an "e" child is also present, which
leads onward to "anteater", "antechamber", "antelope", and others.
A new TreeNode intended to be the root of a tree should be initialized with "".
None is not appropriate in this case (reserved for end-of-word), and the "" will
disappear when child letters are concatenated to it later.
'''
def __init__(self, letter):
self.letter = letter
self.children = []
def __str__(self):
text = "letter: " + str(self.letter) + ", children: " + ", "
return text.join([child.letter for child in self.children])
def __repr__(self):
return self.letter
def add_child(self, letter):
'''Append a child to the Tree's children with the provided letter and return the new child.'''
self.children.append(TreeNode(letter))
return self.get_child_of_letter(letter)
def get_child_letters(self):
'''Return a set consisting of the letters of all this TreeNode's children.'''
return {child.letter for child in self.children}
def get_child_of_letter(self, letter):
'''Return the child of this TreeNode whose letter matches the one provided.'''
for child in self.children:
if child.letter == letter:
return child
raise KeyError("No child was found with letter '" + letter + "'!")
def add_word_to_tree(word, tree_node):
'''Add a word letter-by-letter to the tree, starting at the root and traveling down.
For each successive letter ("X") in the word, if there is a child of the current node
whose letter is X, add the remainder of the word (everything after X) to that child.
Otherwise, create a new child of the current node for X, and add the remainder to it.
When the word has been entirely "consumed", add a None child and return, signifying
the end of the word.
'''
if not word: # Analogous to "if len(word) == 0"
tree_node.add_child(None)
return
elif word[0] in tree_node.get_child_letters():
add_word_to_tree(word[1:], tree_node.get_child_of_letter(word[0]))
else:
add_word_to_tree(word[1:], tree_node.add_child(word[0]))
def parse_word_list(word_list_file):
'''Open the text file of legitimate English words and add each one to the TreeNode.'''
with open(word_list_file, "r") as infile:
words = infile.read().splitlines()
head = TreeNode("")
for word in tqdm(words, desc = "Reading word list..."):
add_word_to_tree(word, head)
return head
def create_result_words(tree, allowed_letters, letter_stack, results):
'''Traverse the tree to return the words formable from the allowed letters.
When a child's letter is None, this indicates the end of the word. The letter stack
at that time can be completed as a word and appended to the results. Otherwise, we
look through the children's letters, descending into those with acceptable letters.
'''
if tree.letter is None:
return results + [letter_stack]
for child in tree.children:
if child.letter in allowed_letters:
# Using 'child.letter or ""' coalesces None to "" to handle end-of-word instances.
results = create_result_words(
child, allowed_letters, letter_stack + (child.letter or ""), results
)
return results
def output_result_words(result_words):
'''Write all result words to a text file, and print some quick stats.'''
longest = [""] # Initialized to [""] so it will get immediately overwritten.
with open("word_constructor.txt", "w") as outfile:
for word in result_words:
if len(word) > len(longest[0]):
longest = [word]
elif len(word) == len(longest[0]):
longest = longest + [word]
outfile.write(word + "\n")
print(str(len(result_words)) + " words can be made from the letters provided.")
print("Longest word(s): " + ", ".join(longest))
def main():
'''Validate command-line options and execute top-level functionality.'''
msg = 'This requires as input a string of letters: a-z'
parser = argparse.ArgumentParser()
parser.add_argument('string', help = msg)
args = parser.parse_args()
string = args.string.lower()
if re.fullmatch("[a-z]+", string) is None:
print(f'Error: {msg}')
sys.exit(1)
allowed_letters = set(string)
tree = parse_word_list("words_alpha.txt")
result_words = create_result_words(tree, allowed_letters | {None}, "", [])
output_result_words(result_words)
main()
Explore related questions
See similar questions with these tags.
for line in infie:
, thenline
will be the same line as ininfile.read().splitlines()
. With the added benefit that you will read file in a buffered way (not that it is particularly important in your case, but in general, that's the way to go). There are also standard implementations of prefix tree in Python, but I imagine that you did this as an exercise, so it's not important. \$\endgroup\$