3
\$\begingroup\$

I have this code which looks for query words in a given text. I preprocess the give text and store it in an inverted-index (dictionary with word as key and position in text as value). Searching for a query words then becomes contant time operation.

The below code also prints the surrounding text which contains the maximum number of queries.

Please help me understand the shortcomings (coding style, naming conventions, performance issues) of the code.

# -*- coding: cp1252 -*-
from collections import defaultdict
class c_Highlight():
"""Highlight Class to find and print relevant snippet of a text"""
Grammar_Suffix = ['ing' ,'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']
Punctuation = [',', '!', '.', '?', '"', '\n']
def __init__(self):
 self.InvertedIndex = defaultdict( list )
def m_Read_File( self, f_name ):
 """
 Function to Read a file, if the text is to be read from file
 Args: FileName
 Returns: List of words
 """
 list_words = []
 try:
 f = open(f_name)
 for line in f:
 line = line.strip()
 list_words.extend( line.split())
 f.close()
 except IOError as (errno, strerror):
 print "File I/O Error({0}): {1}".format(errno, strerror)
 exit(0)
 return list_words
def __m_Stem( self, word ):
 """
 Function to remove Suffix from the end of a word.
 Input: Word
 Returns: Cropped Word
 """
 word = word.lower()
 for suffix in c_Highlight.Grammar_Suffix: 
 if word.endswith(suffix):
 word = word[:-len(suffix)]
 break
 for punctuation in c_Highlight.Punctuation:
 while word.endswith(punctuation):
 word = word[:-len(punctuation)]
 return word #can be empty list
def m_Create_InvertedIndex( self, words ):
 """
 Function to parse the input words and store them in a InvertedIndex
 The Key is the word itself and Value is the position of the word in the text
 Input: List of Words
 Return: None
 """
 idx = 0
 for word in words:
 word = self.__m_Stem(word)
 self.InvertedIndex[word].append(idx)
 idx = idx+1
def m_Search_Query( self, search_query, length ):
 """
 Function to search for query words in the InvertedIndex
 Input: List of query words, and the number of words in the text
 Return: Integer List of length same as the number of words. Each index indicates
 if the word is present in the query or not. So if List[i]==0, the word is not present
 and if List[i]==2, then the 2nd word in the query is present at location i in the input text. 
 """
 words_present = []
 idx = 1
 for x in range(length):
 words_present.append(0)
 for word in search_query:
 word = self.__m_Stem(word)
 if word in self.InvertedIndex:
 for word_index in self.InvertedIndex[word]:
 words_present[word_index] = idx
 idx = idx + 1
 return words_present
def m_Find_Snippet( self, words_present, num_words, MaxWindow ):
 """
 Function to find a snippet of input text with has the most number of query
 words present.
 Input: Integer List, length, and number of words to be present in the snippet
 Return: begin, end position of the window and the count of query words present
 in the snippet
 """
 begin = 0
 count = 0
 end = 0
 max_count = -1
 Snippet_End = 0
 Snippet_Begin = 0
 num_words = len(words_present)
 try:
 while begin < num_words:
 if words_present[begin]!=0:
 count = 0
 end = begin
 end_done = min((MaxWindow + begin), num_words)
 while end < end_done:
 if words_present[end]!=0:
 count+=1
 end+=1
 if end == num_words:
 end-=1
 if count > max_count:
 max_count = count
 Snippet_Begin = begin
 Snippet_End = end
 begin+=1
 while Snippet_End > 0:
 if words_present[Snippet_End]!=0:
 break
 Snippet_End-=1 
 except IndexError:
 print "Tying to access out of bound index in method m_Find_Snippet"
 return Snippet_Begin, Snippet_End, max_count
def m_Sentence_End( self, word ):
 for suffix in ['.', '!', '?']:
 if word.endswith(suffix):
 return 1
 return 0
def m_Print_Snippet( self, Snippet_Begin, Snippet_End, raw_data, words_present ):
 """
 Function to generate the output text. 
 Input: begin and end position of snippet
 Returns: Beautifies the snippet and returns the new start and end position of the snippet 
 """
 start = Snippet_Begin
 new_start = Snippet_Begin
 end = Snippet_End
 num_sentence = 0
 flag = 0
 num_words = len(raw_data)
 try:
 start_done = min( 20, Snippet_Begin )
 while (Snippet_Begin - start) <= start_done:
 if self.m_Sentence_End( raw_data[start] ):
 num_sentence += 1
 new_start = start+1
 if num_sentence == 2:
 flag = 1
 break
 start -= 1
 if flag == 1:
 Snippet_Begin = start + 1
 elif flag == 0:
 if start_done == Snippet_Begin:
 Snippet_Begin = start + 1
 else:
 Snippet_Begin = new_start
 num_sentence = 0
 new_end = Snippet_End
 flag = 0
 end_done = min( 20, num_words - Snippet_End )
 while (end - Snippet_End) < end_done:
 if self.m_Sentence_End( raw_data[end] ):
 num_sentence+=1
 new_end = end
 if num_sentence == 2:
 flag = 1
 break
 end += 1
 if flag == 1:
 Snippet_End = end
 elif flag == 0:
 if end_done == (num_words - Snippet_End):
 Snippet_End = num_words - 1
 else:
 Snippet_End = new_end
 except IndexError:
 print "Tying to access out of bound index in method m_Print_Snippet"
 output_text = []
 query_start = 0
 try:
 for x in range( Snippet_Begin, Snippet_End+1 ):
 if words_present[x]!=0:
 if query_start == 0:
 output_text.append('[[HIGHLIGHT]]'+raw_data[x])
 query_start = 1
 else:
 output_text.append(raw_data[x])
 else:
 if query_start == 1:
 w = output_text.pop()
 output_text.append(w+'[[ENDHIGHLIGHT]]')
 output_text.append(raw_data[x])
 query_start = 0
 else:
 output_text.append(raw_data[x])
 if words_present[x]!=0:
 w = output_text.pop()
 output_text.append(w+'[[ENDHIGHLIGHT]]')
 except IndexError:
 print "Trying to access out of bound index in method m_Print_Snippet"
 return ' '.join(output_text)
def m_Highlight_doc( doc, query ):
"""
Args:
 doc-string that is a document to be c_Highlighted
 query-string that contains the search query
Return:
 Text Snippet
"""
WINDOW_LENGTH = 35
text = doc.split()
query = query.split()
num_words = len(text)
Search = c_Highlight()
Search.m_Create_InvertedIndex(text)
words_present = Search.m_Search_Query(query, num_words)
begin, end, count = Search.m_Find_Snippet( words_present, num_words, WINDOW_LENGTH )
if count <= 0:
 return "Query Not Found"
else:
 return Search.m_Print_Snippet( begin, end, text, words_present )
asked May 28, 2011 at 0:18
\$\endgroup\$

1 Answer 1

4
\$\begingroup\$
# -*- coding: cp1252 -*-
from collections import defaultdict
class c_Highlight():

Prefixed like c_ and m_ are frowned upon as useless.

"""Highlight Class to find and print relevant snippet of a text"""
Grammar_Suffix = ['ing' ,'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']
Punctuation = [',', '!', '.', '?', '"', '\n']

The official python style guide recommends ALL CAPS for constants like this

def __init__(self):
 self.InvertedIndex = defaultdict( list )

The official python style guide recommends all_lower_case_with_underscores for variable names

def m_Read_File( self, f_name ):
 """
 Function to Read a file, if the text is to be read from file
 Args: FileName
 Returns: List of words
 """
 list_words = []
 try:
 f = open(f_name)
 for line in f:
 line = line.strip()
 list_words.extend( line.split())
 f.close()

This can be better if you use with

with open(f_name) as f:
 for line in f:
 line = line.strip()
 list_words.extend(line.split())

This will close the file even an exception is thrown.

 except IOError as (errno, strerror):
 print "File I/O Error({0}): {1}".format(errno, strerror)
 exit(0)

exit(0) means the program succeeded in this case it didn't.

 return list_words

Since this function doesn't manipulate any object attributes, it might be better as a separate utility function.

def __m_Stem( self, word ):
 """
 Function to remove Suffix from the end of a word.
 Input: Word
 Returns: Cropped Word
 """
 word = word.lower()
 for suffix in c_Highlight.Grammar_Suffix: 
 if word.endswith(suffix):
 word = word[:-len(suffix)]
 break

I recommend use return word[:-len(suffix)] instead.

 for punctuation in c_Highlight.Punctuation:
 while word.endswith(punctuation):
 word = word[:-len(punctuation)]

You don't break here like you did before

 return word #can be empty list

Since your words appear to be strings, this comment is just wrong.

def m_Create_InvertedIndex( self, words ):
 """
 Function to parse the input words and store them in a InvertedIndex
 The Key is the word itself and Value is the position of the word in the text
 Input: List of Words
 Return: None
 """
 idx = 0
 for word in words:
 word = self.__m_Stem(word)
 self.InvertedIndex[word].append(idx)
 idx = idx+1

Use enumerate

for idx, word in enumerate(words):
 word = self.__m_Stem(word)
 self.InvertedIndex[word].append(index)

See? much cleaner.

def m_Search_Query( self, search_query, length ):
 """
 Function to search for query words in the InvertedIndex
 Input: List of query words, and the number of words in the text
 Return: Integer List of length same as the number of words. Each index indicates
 if the word is present in the query or not. So if List[i]==0, the word is not present
 and if List[i]==2, then the 2nd word in the query is present at location i in the input text. 
 """

Requiring the user pass in the length of the input text seems strange. The class itself knows the length of the input text and the user probably doesn't.

 words_present = []
 idx = 1
 for x in range(length):
 words_present.append(0)

Use: word_present = [0] * length to create words_present, it'll be faster and clearer.

 for word in search_query:
 word = self.__m_Stem(word)
 if word in self.InvertedIndex:
 for word_index in self.InvertedIndex[word]:
 words_present[word_index] = idx
 idx = idx + 1

You only increment idx when a match is found. This seems strange as I'd expect the idx to correspond the indexes in search_query. Also returning a list like this seems strange as its pretty much as hard to find values in this list as it is to find the words in the first place.

 return words_present
def m_Find_Snippet( self, words_present, num_words, MaxWindow ):
 """
 Function to find a snippet of input text with has the most number of query
 words present.
 Input: Integer List, length, and number of words to be present in the snippet
 Return: begin, end position of the window and the count of query words present
 in the snippet
 """
 begin = 0
 count = 0
 end = 0
 max_count = -1
 Snippet_End = 0
 Snippet_Begin = 0

Python style guide recommaeds this_style for local variables.

 num_words = len(words_present)
 try:
 while begin < num_words:
 if words_present[begin]!=0:
 count = 0
 end = begin
 end_done = min((MaxWindow + begin), num_words)
 while end < end_done:
 if words_present[end]!=0:
 count+=1
 end+=1

You are really counting from begin to end_done use a for loop.

 if end == num_words:
 end-=1

You should be able to get rid of this ugliness when you use a for loop above

 if count > max_count:
 max_count = count
 Snippet_Begin = begin
 Snippet_End = end

I suggest restructuring this code to use a generator. The generator would generate the snippets begin/ends and the calling code could use max to pick the best one.

 begin+=1

Use a for loop for begin. If you find yourself wanting to use a while loop, slap yourself and use a for loop.

 while Snippet_End > 0:
 if words_present[Snippet_End]!=0:
 break
 Snippet_End-=1 

Again, use a for loop. Whenever you are counting, you should use a for loop. Also, do this when you first construct the snippet, not now.

 except IndexError:
 print "Tying to access out of bound index in method m_Find_Snippet"

Why are catching an index error? An IndexError is usually indicative of a bug in your code. All you accomplish by putting this here is making your code harder to debug by not showing the stack trace and by continuing on so you don't notice the bug.

 return Snippet_Begin, Snippet_End, max_count
def m_Sentence_End( self, word ):
 for suffix in ['.', '!', '?']:
 if word.endswith(suffix):
 return 1
 return 0

Use True and False not 1 and 0.

def m_Print_Snippet( self, Snippet_Begin, Snippet_End, raw_data, words_present ):
 """
 Function to generate the output text. 
 Input: begin and end position of snippet
 Returns: Beautifies the snippet and returns the new start and end position of the snippet 
 """

You explain Snippet_Begin and Snippet end which were obvious. But you don't bother to explain what raw_data means.

 start = Snippet_Begin
 new_start = Snippet_Begin
 end = Snippet_End
 num_sentence = 0
 flag = 0
 num_words = len(raw_data)
 try:
 start_done = min( 20, Snippet_Begin )
 while (Snippet_Begin - start) <= start_done:
 if self.m_Sentence_End( raw_data[start] ):
 num_sentence += 1
 new_start = start+1
 if num_sentence == 2:
 flag = 1
 break
 start -= 1

If you must count, you must use a for loop! Also don't use 1 for True. I'm also very suspicous of boolean flags. I think they are delayed gotos.

 if flag == 1:

Use if flag:

 Snippet_Begin = start + 1
 elif flag == 0:

If the flag can only True or False just use else

 if start_done == Snippet_Begin:
 Snippet_Begin = start + 1
 else:
 Snippet_Begin = new_start
 num_sentence = 0
 new_end = Snippet_End
 flag = 0
 end_done = min( 20, num_words - Snippet_End )
 while (end - Snippet_End) < end_done:
 if self.m_Sentence_End( raw_data[end] ):
 num_sentence+=1
 new_end = end
 if num_sentence == 2:
 flag = 1
 break
 end += 1
 if flag == 1:
 Snippet_End = end
 elif flag == 0:
 if end_done == (num_words - Snippet_End):
 Snippet_End = num_words - 1
 else:
 Snippet_End = new_end
 except IndexError:
 print "Tying to access out of bound index in method m_Print_Snippet"

Again, don't do this. Don't do this. DON'T DO THIS!

 output_text = []
 query_start = 0
 try:
 for x in range( Snippet_Begin, Snippet_End+1 ):
 if words_present[x]!=0:
 if query_start == 0:
 output_text.append('[[HIGHLIGHT]]'+raw_data[x])

Its probably faster to use StringIO rather then appending into a list

 query_start = 1
 else:
 output_text.append(raw_data[x])
 else:
 if query_start == 1:
 w = output_text.pop()
 output_text.append(w+'[[ENDHIGHLIGHT]]')
 output_text.append(raw_data[x])
 query_start = 0
 else:
 output_text.append(raw_data[x])
 if words_present[x]!=0:
 w = output_text.pop()
 output_text.append(w+'[[ENDHIGHLIGHT]]')
 except IndexError:
 print "Trying to access out of bound index in method m_Print_Snippet"

...

 return ' '.join(output_text)
def m_Highlight_doc( doc, query ):
"""
Args:
 doc-string that is a document to be c_Highlighted
 query-string that contains the search query
Return:
 Text Snippet
"""
WINDOW_LENGTH = 35
text = doc.split()
query = query.split()
num_words = len(text)
Search = c_Highlight()
Search.m_Create_InvertedIndex(text)
words_present = Search.m_Search_Query(query, num_words)
begin, end, count = Search.m_Find_Snippet( words_present, num_words, WINDOW_LENGTH )
if count <= 0:
 return "Query Not Found"
else:
 return Search.m_Print_Snippet( begin, end, text, words_present )

m_Print_Snippet is a fairly complicated function. It can be simplified. Here my simplified version. (Its not tested probably broken, but may give you clues as to how you can simplify your code)

def rewind_to_sentance_start(self, words, current_start):
 minimum_start = max(0, current_start - 20)
 num_sentence = 0
 new_start = minimum_start
 for possible_start in range(current_start, minimum_start, -1):
 word = words[possible_start]
 if self.m_Sentence_End(word):
 num_sentence += 1
 current_start = possible_start
 if num_sentence == 2:
 return possible_start + 1
 return new_start
def forward_to_sentance_end(self, words, current_end):
 maximum_end = min(len(words)-1, current_end + 20)
 num_sentence = 0
 new_end = maximum_end
 for possible_end in range(current_end, minimum_end, 1):
 word = words[possible_end]
 if self.m_Sentence_End(word):
 num_sentence += 1
 current_end = possible_end
 if num_sentence == 2:
 return possible_end + 1
 return new_end
def m_Print_Snippet( self, Snippet_Begin, Snippet_End, raw_data, words_present ):
 """
 Function to generate the output text. 
 Input: begin and end position of snippet
 Returns: Beautifies the snippet and returns the new start and end position of the snippet 
 """
 Snippet_Begin = rewind_to_sentance_start(raw_data, Snippet_Begin)
 Snippet_End = forward_to_sentance_end(raw_data, Snippet_End)
 output = StringIO.StringIO()
 query_start = 0
 snippet_words = raw_data[Snippet_Begin:Snippet_End]
 words_present = map(bool, words_present[Snippet_Begin:Snippet_End])
 # map(bool makes sure that they are really bools in there
 for highlighted, data in itertools.groupby( zip(words_present, snippet_words), operator.itemgetter(0) ):
 if highlighted:
 output.write('[[HIGHLIGHT]]')
 output.write(' '.join(word for present, word in data))
 if highlighted:
 output.write('[[ENDHIGHLIGHT]]')
 return output.getvalue()

Notes:

  1. Any portion of the logic of a function thats at all complex should be broken out into its own function.
  2. Python itertools has many useful tools for moving the logic out of your function into it making your code clearer
answered May 28, 2011 at 1:29
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.