Return to Answer

Made more Pythonic, incorporating suggestions from @codesparkle

edited Apr 28, 2014 at 6:44

145.5k
22
190
478

from collections import namedtuple
from itertools import product
import re
WordPos = namedtuple('WordPos', ['start', 'end'])
def find_all_words(text, words):
 """
 For each word in the list, find all positions in which they
 appear in the text. Results are returned as a dict, with
 the words as keys. Each value is a list, with a WordPos
 object to mark each occurrence of the word in the text.
 """
 WordPos = namedtuple('WordPos', ['start', 'end'])
  def positions(text, word):
 word_re = re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
 return [WordPos(match.start(), match.end()) for match in
 reword_re.finditer(word_re, text, re.IGNORECASE)]
 return dict(({ word,: positions(text, word)) for word in words) }
def cluster(found_words):
 """
 Given a dict resulting from find_all_words(), pick the
 occurrence of each word that minimizes the length of the
 substring of text that contains them all.
 The result is a list of WordPos objects,object onethat perrepresents searchthe word.span of
 the cluster. If any of the words does not appear in the text, then this function returns None.
 """
 def spanbounds(combo):
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return end - WordPos(start
 shortest_span, best_combo = None, Noneend)
 positions = found_words.values()
 combo_bounds = [bounds(combo) for combo in product(*positions)]
 if combo_bounds:
 length# =Find span(the shortest combo)
 ifreturn shortest_spanmin(combo_bounds, iskey=lambda Nonespan: orspan.end length- <span.start)
 shortest_span else:
 # At least one shortest_span,search best_comboterm =was length,not combo
found
 return best_comboNone
def excerpt(text, combocombo_bounds):
 """
 Take the substring of text corresponding to the combination
 of word occurrences represented bygiven combospan.
 """
 if not combocombo_bounds:
 return None
 start =return min(wordtext[combo_bounds.start for word in combo)
 end =: max(wordcombo_bounds.end for word in combo)
 return text[start:end]
test_text = """The George Washington Bridge in New York City..."""
test_terms = 'Landmark City Bridge'.split()
print(excerpt(test_text, cluster(find_all_words(test_text, test_terms))))

from collections import namedtuple
from itertools import product
import re
def find_all_words(text, words):
 """
 For each word in the list, find all positions in which they
 appear in the text. Results are returned as a dict, with
 the words as keys. Each value is a list, with a WordPos
 object to mark each occurrence of the word in the text.
 """
 WordPos = namedtuple('WordPos', ['start', 'end'])
  def positions(text, word):
 word_re = r'\b' + re.escape(word) + r'\b'
 return [WordPos(match.start(), match.end()) for match in
 re.finditer(word_re, text, re.IGNORECASE)]
 return dict((word, positions(text, word)) for word in words)
def cluster(found_words):
 """
 Given a dict resulting from find_all_words(), pick the
 occurrence of each word that minimizes the length of the
 substring of text that contains them all.
 The result is a list of WordPos objects, one per search word.
 If any of the words does not appear in the text, then this function returns None.
 """
 def span(combo):
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return end - start
 shortest_span, best_combo = None, None
 positions = found_words.values()
 for combo in product(*positions):
 length = span(combo)
 if shortest_span is None or length < shortest_span:
 shortest_span, best_combo = length, combo

 return best_combo
def excerpt(text, combo):
 """
 Take the substring of text corresponding to the combination
 of word occurrences represented by combo.
 """
 if not combo:
 return None
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return text[start:end]
test_text = """The George Washington Bridge in New York City..."""
test_terms = 'Landmark City Bridge'.split()
print(excerpt(test_text, cluster(find_all_words(test_text, test_terms))))

from collections import namedtuple
from itertools import product
import re
WordPos = namedtuple('WordPos', ['start', 'end'])
def find_all_words(text, words):
 """
 For each word in the list, find all positions in which they
 appear in the text. Results are returned as a dict, with
 the words as keys. Each value is a list, with a WordPos
 object to mark each occurrence of the word in the text.
 """
 def positions(text, word):
 word_re = re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
 return [WordPos(match.start(), match.end()) for match in
 word_re.finditer(text)]
 return { word: positions(text, word) for word in words }
def cluster(found_words):
 """
 Given a dict resulting from find_all_words(), pick the
 occurrence of each word that minimizes the length of the
 substring of text that contains them all.
 The result is a WordPos object that represents the span of
 the cluster. If any of the words does not appear in the text, then this function returns None.
 """
 def bounds(combo):
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return WordPos(start, end)
 positions = found_words.values()
 combo_bounds = [bounds(combo) for combo in product(*positions)]
 if combo_bounds:
 # Find the shortest combo
 return min(combo_bounds, key=lambda span: span.end - span.start)
  else:
 # At least one search term was not found
 return None
def excerpt(text, combo_bounds):
 """
 Take the substring of text corresponding to the given span.
 """
 if not combo_bounds:
 return None
 return text[combo_bounds.start : combo_bounds.end]
test_text = """The George Washington Bridge in New York City..."""
test_terms = 'Landmark City Bridge'.split()
print(excerpt(test_text, cluster(find_all_words(test_text, test_terms))))

Modified handling when a search word is not found

Source Link

edited Apr 28, 2014 at 4:14

200_success

edited Apr 28, 2014 at 4:14

200_success

145.5k
22
190
478

from collections import namedtuple
from itertools import product
import re
def find_all_words(text, words):
 """
 For each word in the list, find all positions in which they
 appear in the text. Results are returned as a dict, with
 the words as keys. Each value is a list, with a WordPos
 object to mark each occurrence of the word in the text.
 """
 WordPos = namedtuple('WordPos', ['start', 'end'])
 def positions(text, word):
 word_re = r'\b' + re.escape(word) + r'\b'
 return [WordPos(match.start(), match.end()) for match in
 re.finditer(word_re, text, re.IGNORECASE)]
 return dict((word, positions(text, word)) for word in words)
def cluster(found_words):
 """
 Given a dict resulting from find_all_words(), pick the
 occurrence of each word that minimizes the length of the
 substring of text that contains them all.
 The result is a list of WordPos objects, one per search word.
 If any of the words does not appear in the text, then this
 function returns None.
 """
 def span(combo):
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return end - start
 if not found_words:
 return []
 shortest_span, best_combo = None, None
 positions = found_words.values()
 for combo in product(*positions):
 length = span(combo)
 if shortest_span is None or length < shortest_span:
 shortest_span, best_combo = length, combo
 return best_combo
def excerpt(text, combo):
 """
 Take the substring of text corresponding to the combination
 of word occurrences represented by combo.
 """
 if not combo:
 return None
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return text[start:end]
test_text = """The George Washington Bridge in New York City..."""
test_terms = 'Landmark City Bridge'.split()
print(excerpt(test_text, cluster(find_all_words(test_text, test_terms))))

from collections import namedtuple
from itertools import product
import re
def find_all_words(text, words):
 """
 For each word in the list, find all positions in which they
 appear in the text. Results are returned as a dict, with
 the words as keys. Each value is a list, with a WordPos
 object to mark each occurrence of the word in the text.
 """
 WordPos = namedtuple('WordPos', ['start', 'end'])
 def positions(text, word):
 word_re = r'\b' + re.escape(word) + r'\b'
 return [WordPos(match.start(), match.end()) for match in
 re.finditer(word_re, text, re.IGNORECASE)]
 return dict((word, positions(text, word)) for word in words)
def cluster(found_words):
 """
 Given a dict resulting from find_all_words(), pick the
 occurrence of each word that minimizes the length of the
 substring of text that contains them all.
 The result is a list of WordPos objects, one per search word.
 """
 def span(combo):
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return end - start
 if not found_words:
 return []
 shortest_span, best_combo = None, None
 positions = found_words.values()
 for combo in product(*positions):
 length = span(combo)
 if shortest_span is None or length < shortest_span:
 shortest_span, best_combo = length, combo
 return best_combo
def excerpt(text, combo):
 """
 Take the substring of text corresponding to the combination
 of word occurrences represented by combo.
 """
 if not combo:
 return None
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return text[start:end]
test_text = """The George Washington Bridge in New York City..."""
test_terms = 'Landmark City Bridge'.split()
print(excerpt(test_text, cluster(find_all_words(test_text, test_terms))))

from collections import namedtuple
from itertools import product
import re
def find_all_words(text, words):
 """
 For each word in the list, find all positions in which they
 appear in the text. Results are returned as a dict, with
 the words as keys. Each value is a list, with a WordPos
 object to mark each occurrence of the word in the text.
 """
 WordPos = namedtuple('WordPos', ['start', 'end'])
 def positions(text, word):
 word_re = r'\b' + re.escape(word) + r'\b'
 return [WordPos(match.start(), match.end()) for match in
 re.finditer(word_re, text, re.IGNORECASE)]
 return dict((word, positions(text, word)) for word in words)
def cluster(found_words):
 """
 Given a dict resulting from find_all_words(), pick the
 occurrence of each word that minimizes the length of the
 substring of text that contains them all.
 The result is a list of WordPos objects, one per search word.
 If any of the words does not appear in the text, then this
 function returns None.
 """
 def span(combo):
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return end - start
 shortest_span, best_combo = None, None
 positions = found_words.values()
 for combo in product(*positions):
 length = span(combo)
 if shortest_span is None or length < shortest_span:
 shortest_span, best_combo = length, combo
 return best_combo
def excerpt(text, combo):
 """
 Take the substring of text corresponding to the combination
 of word occurrences represented by combo.
 """
 if not combo:
 return None
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return text[start:end]
test_text = """The George Washington Bridge in New York City..."""
test_terms = 'Landmark City Bridge'.split()
print(excerpt(test_text, cluster(find_all_words(test_text, test_terms))))

Source Link

answered Apr 28, 2014 at 4:09

200_success

answered Apr 28, 2014 at 4:09

200_success

145.5k
22
190
478

I would agree with the recruiter: this code does not make a good first impression.

Red flags

Any one of these issues could be grounds for disqualification within the first few seconds of reading.

Too much code. If someone gives you a programming puzzle, chances are that they expect a short, elegant solution. Nobody wants to read a long, rambling answer, so they would not ask a question that requires a lot of code to solve.

The bulk of your code is in one long function. I couldn't point to a chunk of code within that function and say what that chunk's purpose is. The comments that you left are worse-than-useless junk: they are all disabled print statements for debugging.

You have 65 lines, excluding comments and blank lines. My proposed solution below has about half that.
Global variables. It is widely agreed that global variables are to be used only as a last resort. Here, there's absolutely no justification for them. Not only do they make it hard to reason about the code by making side-effects non-localized, they indicate that your function's interface is sloppy: it's unclear what the function's inputs and outputs are.
Too many variables. Within FindShortest(), we have:
1. CurrentTerm
2. MatchIndexes (global?)
3. LeftBorder (global)
4. RightBorder (global)
5. FinalLeftBorder (global)
6. FinalRightBorder (global)
7. SearchTerms
8. OptimalRightBorderFound
9. OldLeftBorder
10. OldRightBorder
The human mind can keep track of about seven things at once. Ideally, you should only have about three variables per function.
Non-idiomatic Python. You used regular expressions, which is good. Other than that, your answer is written not much differently from a C solution. You need to demonstrate your ability to think at a more abstract level.
Non-standard naming. You should call your function find_shortest, and its parameter current_term, and similarly for all variables. UpperCase identifiers look like class names. See PEP 8, which is the standard style guide for Python.

The naming of your functions is particularly important, since its effects extend beyond your own code. You've just indicated that you will burden your future colleagues with non-standard naming.

Yellow flags

These are also serious issues. They are just not as obvious as the red flags at first glance.

Lots of free-floating preparatory code. There's a lot of code between opening the file and calling FindShortest(). Why does it do, and why is it not packaged in functions too?
Careless concatenation. RegexVariable = r"\W" + Term + r"\W" trusts that Term contains no characters that have special meaning within regular expressions. From that one line of careless concatenation, I would infer that you would probably write code that is vulnerable to SQL injection, cross-site scripting, arbitrary command execution, header-splitting attacks, etc.
Irresponsible recursion. When a function calls itself, that is recursion. However, recursion should be used responsibly: there should be invariants, well-defined function inputs and outputs, a base case, and a recursive case. But you don't have any of those elements, so effectively you have a weird goto.

Proposed solution

For comparison, here's what I came up with. It's optimized more for simplicity than performance — it's usually beneficial to convey that goal to your interviewer.

Note that it accomplishes the task by chaining three functions, each with well defined inputs and outputs.

from collections import namedtuple
from itertools import product
import re
def find_all_words(text, words):
 """
 For each word in the list, find all positions in which they
 appear in the text. Results are returned as a dict, with
 the words as keys. Each value is a list, with a WordPos
 object to mark each occurrence of the word in the text.
 """
 WordPos = namedtuple('WordPos', ['start', 'end'])
 def positions(text, word):
 word_re = r'\b' + re.escape(word) + r'\b'
 return [WordPos(match.start(), match.end()) for match in
 re.finditer(word_re, text, re.IGNORECASE)]
 return dict((word, positions(text, word)) for word in words)
def cluster(found_words):
 """
 Given a dict resulting from find_all_words(), pick the
 occurrence of each word that minimizes the length of the
 substring of text that contains them all.
 The result is a list of WordPos objects, one per search word.
 """
 def span(combo):
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return end - start
 if not found_words:
 return []
 shortest_span, best_combo = None, None
 positions = found_words.values()
 for combo in product(*positions):
 length = span(combo)
 if shortest_span is None or length < shortest_span:
 shortest_span, best_combo = length, combo
 return best_combo
def excerpt(text, combo):
 """
 Take the substring of text corresponding to the combination
 of word occurrences represented by combo.
 """
 if not combo:
 return None
 start = min(word.start for word in combo)
 end = max(word.end for word in combo)
 return text[start:end]
test_text = """The George Washington Bridge in New York City..."""
test_terms = 'Landmark City Bridge'.split()
print(excerpt(test_text, cluster(find_all_words(test_text, test_terms))))

lang-py