I have this code which looks for query words in a given text. I preprocess the give text and store it in an inverted-index (dictionary with word as key and position in text as value). Searching for a query words then becomes contant time operation.
The below code also prints the surrounding text which contains the maximum number of queries.
Please help me understand the shortcomings (coding style, naming conventions, performance issues) of the code.
# -*- coding: cp1252 -*-
from collections import defaultdict
class c_Highlight():
"""Highlight Class to find and print relevant snippet of a text"""
Grammar_Suffix = ['ing' ,'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']
Punctuation = [',', '!', '.', '?', '"', '\n']
def __init__(self):
self.InvertedIndex = defaultdict( list )
def m_Read_File( self, f_name ):
"""
Function to Read a file, if the text is to be read from file
Args: FileName
Returns: List of words
"""
list_words = []
try:
f = open(f_name)
for line in f:
line = line.strip()
list_words.extend( line.split())
f.close()
except IOError as (errno, strerror):
print "File I/O Error({0}): {1}".format(errno, strerror)
exit(0)
return list_words
def __m_Stem( self, word ):
"""
Function to remove Suffix from the end of a word.
Input: Word
Returns: Cropped Word
"""
word = word.lower()
for suffix in c_Highlight.Grammar_Suffix:
if word.endswith(suffix):
word = word[:-len(suffix)]
break
for punctuation in c_Highlight.Punctuation:
while word.endswith(punctuation):
word = word[:-len(punctuation)]
return word #can be empty list
def m_Create_InvertedIndex( self, words ):
"""
Function to parse the input words and store them in a InvertedIndex
The Key is the word itself and Value is the position of the word in the text
Input: List of Words
Return: None
"""
idx = 0
for word in words:
word = self.__m_Stem(word)
self.InvertedIndex[word].append(idx)
idx = idx+1
def m_Search_Query( self, search_query, length ):
"""
Function to search for query words in the InvertedIndex
Input: List of query words, and the number of words in the text
Return: Integer List of length same as the number of words. Each index indicates
if the word is present in the query or not. So if List[i]==0, the word is not present
and if List[i]==2, then the 2nd word in the query is present at location i in the input text.
"""
words_present = []
idx = 1
for x in range(length):
words_present.append(0)
for word in search_query:
word = self.__m_Stem(word)
if word in self.InvertedIndex:
for word_index in self.InvertedIndex[word]:
words_present[word_index] = idx
idx = idx + 1
return words_present
def m_Find_Snippet( self, words_present, num_words, MaxWindow ):
"""
Function to find a snippet of input text with has the most number of query
words present.
Input: Integer List, length, and number of words to be present in the snippet
Return: begin, end position of the window and the count of query words present
in the snippet
"""
begin = 0
count = 0
end = 0
max_count = -1
Snippet_End = 0
Snippet_Begin = 0
num_words = len(words_present)
try:
while begin < num_words:
if words_present[begin]!=0:
count = 0
end = begin
end_done = min((MaxWindow + begin), num_words)
while end < end_done:
if words_present[end]!=0:
count+=1
end+=1
if end == num_words:
end-=1
if count > max_count:
max_count = count
Snippet_Begin = begin
Snippet_End = end
begin+=1
while Snippet_End > 0:
if words_present[Snippet_End]!=0:
break
Snippet_End-=1
except IndexError:
print "Tying to access out of bound index in method m_Find_Snippet"
return Snippet_Begin, Snippet_End, max_count
def m_Sentence_End( self, word ):
for suffix in ['.', '!', '?']:
if word.endswith(suffix):
return 1
return 0
def m_Print_Snippet( self, Snippet_Begin, Snippet_End, raw_data, words_present ):
"""
Function to generate the output text.
Input: begin and end position of snippet
Returns: Beautifies the snippet and returns the new start and end position of the snippet
"""
start = Snippet_Begin
new_start = Snippet_Begin
end = Snippet_End
num_sentence = 0
flag = 0
num_words = len(raw_data)
try:
start_done = min( 20, Snippet_Begin )
while (Snippet_Begin - start) <= start_done:
if self.m_Sentence_End( raw_data[start] ):
num_sentence += 1
new_start = start+1
if num_sentence == 2:
flag = 1
break
start -= 1
if flag == 1:
Snippet_Begin = start + 1
elif flag == 0:
if start_done == Snippet_Begin:
Snippet_Begin = start + 1
else:
Snippet_Begin = new_start
num_sentence = 0
new_end = Snippet_End
flag = 0
end_done = min( 20, num_words - Snippet_End )
while (end - Snippet_End) < end_done:
if self.m_Sentence_End( raw_data[end] ):
num_sentence+=1
new_end = end
if num_sentence == 2:
flag = 1
break
end += 1
if flag == 1:
Snippet_End = end
elif flag == 0:
if end_done == (num_words - Snippet_End):
Snippet_End = num_words - 1
else:
Snippet_End = new_end
except IndexError:
print "Tying to access out of bound index in method m_Print_Snippet"
output_text = []
query_start = 0
try:
for x in range( Snippet_Begin, Snippet_End+1 ):
if words_present[x]!=0:
if query_start == 0:
output_text.append('[[HIGHLIGHT]]'+raw_data[x])
query_start = 1
else:
output_text.append(raw_data[x])
else:
if query_start == 1:
w = output_text.pop()
output_text.append(w+'[[ENDHIGHLIGHT]]')
output_text.append(raw_data[x])
query_start = 0
else:
output_text.append(raw_data[x])
if words_present[x]!=0:
w = output_text.pop()
output_text.append(w+'[[ENDHIGHLIGHT]]')
except IndexError:
print "Trying to access out of bound index in method m_Print_Snippet"
return ' '.join(output_text)
def m_Highlight_doc( doc, query ):
"""
Args:
doc-string that is a document to be c_Highlighted
query-string that contains the search query
Return:
Text Snippet
"""
WINDOW_LENGTH = 35
text = doc.split()
query = query.split()
num_words = len(text)
Search = c_Highlight()
Search.m_Create_InvertedIndex(text)
words_present = Search.m_Search_Query(query, num_words)
begin, end, count = Search.m_Find_Snippet( words_present, num_words, WINDOW_LENGTH )
if count <= 0:
return "Query Not Found"
else:
return Search.m_Print_Snippet( begin, end, text, words_present )
1 Answer 1
# -*- coding: cp1252 -*-
from collections import defaultdict
class c_Highlight():
Prefixed like c_ and m_ are frowned upon as useless.
"""Highlight Class to find and print relevant snippet of a text"""
Grammar_Suffix = ['ing' ,'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']
Punctuation = [',', '!', '.', '?', '"', '\n']
The official python style guide recommends ALL CAPS for constants like this
def __init__(self):
self.InvertedIndex = defaultdict( list )
The official python style guide recommends all_lower_case_with_underscores for variable names
def m_Read_File( self, f_name ):
"""
Function to Read a file, if the text is to be read from file
Args: FileName
Returns: List of words
"""
list_words = []
try:
f = open(f_name)
for line in f:
line = line.strip()
list_words.extend( line.split())
f.close()
This can be better if you use with
with open(f_name) as f:
for line in f:
line = line.strip()
list_words.extend(line.split())
This will close the file even an exception is thrown.
except IOError as (errno, strerror):
print "File I/O Error({0}): {1}".format(errno, strerror)
exit(0)
exit(0) means the program succeeded in this case it didn't.
return list_words
Since this function doesn't manipulate any object attributes, it might be better as a separate utility function.
def __m_Stem( self, word ):
"""
Function to remove Suffix from the end of a word.
Input: Word
Returns: Cropped Word
"""
word = word.lower()
for suffix in c_Highlight.Grammar_Suffix:
if word.endswith(suffix):
word = word[:-len(suffix)]
break
I recommend use return word[:-len(suffix)] instead.
for punctuation in c_Highlight.Punctuation:
while word.endswith(punctuation):
word = word[:-len(punctuation)]
You don't break here like you did before
return word #can be empty list
Since your words appear to be strings, this comment is just wrong.
def m_Create_InvertedIndex( self, words ):
"""
Function to parse the input words and store them in a InvertedIndex
The Key is the word itself and Value is the position of the word in the text
Input: List of Words
Return: None
"""
idx = 0
for word in words:
word = self.__m_Stem(word)
self.InvertedIndex[word].append(idx)
idx = idx+1
Use enumerate
for idx, word in enumerate(words):
word = self.__m_Stem(word)
self.InvertedIndex[word].append(index)
See? much cleaner.
def m_Search_Query( self, search_query, length ):
"""
Function to search for query words in the InvertedIndex
Input: List of query words, and the number of words in the text
Return: Integer List of length same as the number of words. Each index indicates
if the word is present in the query or not. So if List[i]==0, the word is not present
and if List[i]==2, then the 2nd word in the query is present at location i in the input text.
"""
Requiring the user pass in the length of the input text seems strange. The class itself knows the length of the input text and the user probably doesn't.
words_present = []
idx = 1
for x in range(length):
words_present.append(0)
Use: word_present = [0] * length
to create words_present, it'll be faster and clearer.
for word in search_query:
word = self.__m_Stem(word)
if word in self.InvertedIndex:
for word_index in self.InvertedIndex[word]:
words_present[word_index] = idx
idx = idx + 1
You only increment idx
when a match is found. This seems strange as I'd expect the idx to correspond the indexes in search_query. Also returning a list like this seems strange as its pretty much as hard to find values in this list as it is to find the words in the first place.
return words_present
def m_Find_Snippet( self, words_present, num_words, MaxWindow ):
"""
Function to find a snippet of input text with has the most number of query
words present.
Input: Integer List, length, and number of words to be present in the snippet
Return: begin, end position of the window and the count of query words present
in the snippet
"""
begin = 0
count = 0
end = 0
max_count = -1
Snippet_End = 0
Snippet_Begin = 0
Python style guide recommaeds this_style for local variables.
num_words = len(words_present)
try:
while begin < num_words:
if words_present[begin]!=0:
count = 0
end = begin
end_done = min((MaxWindow + begin), num_words)
while end < end_done:
if words_present[end]!=0:
count+=1
end+=1
You are really counting from begin
to end_done
use a for loop.
if end == num_words:
end-=1
You should be able to get rid of this ugliness when you use a for loop above
if count > max_count:
max_count = count
Snippet_Begin = begin
Snippet_End = end
I suggest restructuring this code to use a generator. The generator would generate the snippets begin/ends and the calling code could use max to pick the best one.
begin+=1
Use a for loop for begin. If you find yourself wanting to use a while loop, slap yourself and use a for loop.
while Snippet_End > 0:
if words_present[Snippet_End]!=0:
break
Snippet_End-=1
Again, use a for loop. Whenever you are counting, you should use a for loop. Also, do this when you first construct the snippet, not now.
except IndexError:
print "Tying to access out of bound index in method m_Find_Snippet"
Why are catching an index error? An IndexError is usually indicative of a bug in your code. All you accomplish by putting this here is making your code harder to debug by not showing the stack trace and by continuing on so you don't notice the bug.
return Snippet_Begin, Snippet_End, max_count
def m_Sentence_End( self, word ):
for suffix in ['.', '!', '?']:
if word.endswith(suffix):
return 1
return 0
Use True
and False
not 1 and 0.
def m_Print_Snippet( self, Snippet_Begin, Snippet_End, raw_data, words_present ):
"""
Function to generate the output text.
Input: begin and end position of snippet
Returns: Beautifies the snippet and returns the new start and end position of the snippet
"""
You explain Snippet_Begin and Snippet end which were obvious. But you don't bother to explain what raw_data means.
start = Snippet_Begin
new_start = Snippet_Begin
end = Snippet_End
num_sentence = 0
flag = 0
num_words = len(raw_data)
try:
start_done = min( 20, Snippet_Begin )
while (Snippet_Begin - start) <= start_done:
if self.m_Sentence_End( raw_data[start] ):
num_sentence += 1
new_start = start+1
if num_sentence == 2:
flag = 1
break
start -= 1
If you must count, you must use a for loop! Also don't use 1 for True
. I'm also very suspicous of boolean flags. I think they are delayed gotos.
if flag == 1:
Use if flag:
Snippet_Begin = start + 1
elif flag == 0:
If the flag can only True
or False
just use else
if start_done == Snippet_Begin:
Snippet_Begin = start + 1
else:
Snippet_Begin = new_start
num_sentence = 0
new_end = Snippet_End
flag = 0
end_done = min( 20, num_words - Snippet_End )
while (end - Snippet_End) < end_done:
if self.m_Sentence_End( raw_data[end] ):
num_sentence+=1
new_end = end
if num_sentence == 2:
flag = 1
break
end += 1
if flag == 1:
Snippet_End = end
elif flag == 0:
if end_done == (num_words - Snippet_End):
Snippet_End = num_words - 1
else:
Snippet_End = new_end
except IndexError:
print "Tying to access out of bound index in method m_Print_Snippet"
Again, don't do this. Don't do this. DON'T DO THIS!
output_text = []
query_start = 0
try:
for x in range( Snippet_Begin, Snippet_End+1 ):
if words_present[x]!=0:
if query_start == 0:
output_text.append('[[HIGHLIGHT]]'+raw_data[x])
Its probably faster to use StringIO rather then appending into a list
query_start = 1
else:
output_text.append(raw_data[x])
else:
if query_start == 1:
w = output_text.pop()
output_text.append(w+'[[ENDHIGHLIGHT]]')
output_text.append(raw_data[x])
query_start = 0
else:
output_text.append(raw_data[x])
if words_present[x]!=0:
w = output_text.pop()
output_text.append(w+'[[ENDHIGHLIGHT]]')
except IndexError:
print "Trying to access out of bound index in method m_Print_Snippet"
...
return ' '.join(output_text)
def m_Highlight_doc( doc, query ):
"""
Args:
doc-string that is a document to be c_Highlighted
query-string that contains the search query
Return:
Text Snippet
"""
WINDOW_LENGTH = 35
text = doc.split()
query = query.split()
num_words = len(text)
Search = c_Highlight()
Search.m_Create_InvertedIndex(text)
words_present = Search.m_Search_Query(query, num_words)
begin, end, count = Search.m_Find_Snippet( words_present, num_words, WINDOW_LENGTH )
if count <= 0:
return "Query Not Found"
else:
return Search.m_Print_Snippet( begin, end, text, words_present )
m_Print_Snippet is a fairly complicated function. It can be simplified. Here my simplified version. (Its not tested probably broken, but may give you clues as to how you can simplify your code)
def rewind_to_sentance_start(self, words, current_start):
minimum_start = max(0, current_start - 20)
num_sentence = 0
new_start = minimum_start
for possible_start in range(current_start, minimum_start, -1):
word = words[possible_start]
if self.m_Sentence_End(word):
num_sentence += 1
current_start = possible_start
if num_sentence == 2:
return possible_start + 1
return new_start
def forward_to_sentance_end(self, words, current_end):
maximum_end = min(len(words)-1, current_end + 20)
num_sentence = 0
new_end = maximum_end
for possible_end in range(current_end, minimum_end, 1):
word = words[possible_end]
if self.m_Sentence_End(word):
num_sentence += 1
current_end = possible_end
if num_sentence == 2:
return possible_end + 1
return new_end
def m_Print_Snippet( self, Snippet_Begin, Snippet_End, raw_data, words_present ):
"""
Function to generate the output text.
Input: begin and end position of snippet
Returns: Beautifies the snippet and returns the new start and end position of the snippet
"""
Snippet_Begin = rewind_to_sentance_start(raw_data, Snippet_Begin)
Snippet_End = forward_to_sentance_end(raw_data, Snippet_End)
output = StringIO.StringIO()
query_start = 0
snippet_words = raw_data[Snippet_Begin:Snippet_End]
words_present = map(bool, words_present[Snippet_Begin:Snippet_End])
# map(bool makes sure that they are really bools in there
for highlighted, data in itertools.groupby( zip(words_present, snippet_words), operator.itemgetter(0) ):
if highlighted:
output.write('[[HIGHLIGHT]]')
output.write(' '.join(word for present, word in data))
if highlighted:
output.write('[[ENDHIGHLIGHT]]')
return output.getvalue()
Notes:
- Any portion of the logic of a function thats at all complex should be broken out into its own function.
- Python itertools has many useful tools for moving the logic out of your function into it making your code clearer