#!/usr/bin/env python # coding: utf-8 # # Palindromic Panama Sentences # # I [previously](http://norvig.com/palindrome.html) wrote a program to find long palindromic sentences. Starting with the sentence: # # A man, a plan, a canal, Panama # # the program added words to the middle, one comma-separated phrase at a time, backtracking when a choice didn't work out. The program is able to find palindromes of over 15,000 words that look like: # # A man, a plan, a caddy, Ore, Lee, ..., Steele, Roydd, a canal, Panama. # # In this notebook I revisit the problem, but this time progressing one *letter* at a time rather than one phrase. That should allow me, at each choice point, to pick the letter that is most likely to lead to a complete phrase, # perhaps leading to longer palindromes. # # # Phrase Dictionary # # First, I will build a dictionary of phrases. I start with a file, [npdict.txt](npdict.txt), which lists noun phrases —proper nouns like `'Panama'`, plural nouns like `'dogs'`, and two-word noun phrases like `'a plan'`. Capitalization, whitespace and punctuation don't count in palindromes, just letters, so I use the function `letters` to extract just the letters of each phrase and create a dict of entries like `{'donaldeknuth': 'Donald E. Knuth'}`. In addition, to decide what letter to choose next, I will want to know if a given sequence of letters forms a *prefix* or *suffix* of other words. For example, if I have formed the partial phrase `'am'`, then adding `'a'` gives us `'ama'`, which is a prefix of over 1000 phrases in the dictionary, but adding `'d'` gives us `'amd'`, which is not a prefix of any phrase, and thus can be rejected. # # # In[1]: get_ipython().system(' [ -e npdict.txt ] || curl -O http://norvig.com/npdict.txt') # In[1]: from collections import Counter, deque import re class PhraseDict(dict): """A dictionary of {letters: phrase}, such as {'donaldeknuth': 'Donald E. Knuth'}, with: .prefixes: Counter of {'pre': n} where n is the number of keys that start with 'pre' .suffixes: Counter of {'xes': n} where n is the number of keys that end with 'xes'""" def __init__(self, phrases): for phrase in phrases: phrase = phrase.strip() self[letters(phrase)] = phrase self.prefixes = Counter(x for p in self for x in prefixes(p)) self.suffixes = Counter(x for p in self for x in suffixes(p)) def prefixes(phrase): return [phrase[:i] for i in range(1, len(phrase) + 1)] def suffixes(phrase): return [phrase[-i:] for i in range(1, len(phrase) + 1)] def letters(phrase, sub=re.compile(r'[\W]+').sub): "Remove all the non-letters from phrase; return lowercase version." return sub('', phrase).lower() def test1(): assert prefixes('hello') == ['h', 'he', 'hel', 'hell', 'hello'] assert suffixes('hello') == ['o', 'lo', 'llo', 'ello', 'hello'] assert letters('a man') == 'aman' assert letters('an elk') == 'anelk' assert letters('Mr. T') == 'mrt' assert letters('Donald E. Knuth') == 'donaldeknuth' assert len(DICT) == 125512 assert 'panama' in DICT assert 'aman' in DICT assert 'threemen' not in DICT assert DICT['acanal'] == 'a canal' return 'ok' get_ipython().run_line_magic('time', "DICT = PhraseDict(open('npdict.txt'))") test1() # # Search for Panama Palindromes # # Now the `Panama` class will search for a palindrome. I'll show the code first, then explain it. # In[2]: import time class Panama: """Panama represents a palindrome, or a state in searching for one. It has .left and .right to hold the phrases that are chosen, and .L and .R to hold the current partial phrases in the middle (still working on these). Also, a .set of all complete phrases, and the .dict of allowable phrases to choose from.""" def __init__(self, left=['aman', 'aplan'], L='aca', R='', right=['acanal', 'panama'], dict=DICT): assert cat(left + [L]) == cat([R] + right)[::-1] self.left = list(left) # list of complete phrases on left self.L = L # an incomplete phrase on left self.R = R # an incomplete phrase on right self.right = deque(right) # deque of complete phrases on right self.dict = dict # a {letters: actual_phrase} mapping self.set = set(left + right) # a set of all complete phrases in palindrome self.best = [] # list of phrases in longest palindrome found self.Nshown = 0 # the number of phrases shown in the previous printout self.i = 0 # the number of steps taken in the search self.check() def __str__(self): return self.original_phrases(self.best) def original_phrases(self, phrases): return ', '.join(self.dict[phrase] for phrase in phrases) def search(self, steps=10**5): """Depth-first search for palindromes. From the current state, find all applicable actions. Do the first one, and put on the stack reminders to undo it and try the others, but first search deeper from the result of the first action.""" self.t0 = time.time() stack = [self.applicable_actions()] for self.i in range(steps): if not stack: return command = stack.pop() if isinstance(command, UndoCommand): self.undo(command) elif command: act = command.pop() self.do(act) self.check() stack.extend([command, UndoCommand(act), self.applicable_actions()]) def do(self, act): "Modify the current state by adding a letter, or finishing a phrase." if act == ',': # finish phrase on left self.set.add(self.L) self.left.append(self.L) self.L = '' elif act == ';': # finish phrase on right self.set.add(self.R) self.right.appendleft(self.R) self.R = '' else: # add a letter self.L = self.L + act self.R = act + self.R def undo(self, act): "Modify the current state by undoing an action that was previously done." if act == ',': # unfinish phrase on left assert self.L == '' self.L = self.left.pop() self.set.remove(self.L) elif act == ';': # unfinish phrase on right assert self.R == '' self.R = self.right.popleft() self.set.remove(self.R) else: # remove a letter self.L = self.L[:-1] self.R = self.R[1:] def check(self): "Check to see if current state is a palindrome, and if so, record it and maybe print." if not self.is_palindrome(): return N = len(self.left) + len(self.right) if N> len(self.best): self.best = self.left + list(self.right) if N - self.Nshown> 1000 or N> 16000: self.Nshown = N print(self.report()) def report(self): N = len(self.best) nwords = N + sum(self.dict[p].count(' ') for p in self.best) nletters = sum(len(p) for p in self.best) return ('Pal: {:6,d} phrases, {:6,d} words, {:6,d} letters ({:,d} steps, {} seconds)' .format(N, nwords, nletters, self.i+1, round(time.time() - self.t0))) def applicable_actions(self): L, R, D = self.L, self.R, self.dict actions = [] def score(A): return D.prefixes[L+A] * D.suffixes[A+R] if self.is_allowed(L): actions.append(',') if self.is_allowed(R): actions.append(';') for A in sorted(alphabet, key=score): if score(A)> 0: actions.append(A) return actions def is_allowed(self, phrase): return phrase in self.dict and phrase not in self.set def is_palindrome(self): "Is this a palindrome? (Does any extra .L or .R match the other side?)" return ((self.L == '' and self.left[-1].endswith(self.R)) or (self.R == '' and self.right[0].startswith(self.L))) alphabet = 'abcdefghijklmnopqrstuvwxyz' cat = ''.join UndoCommand = str DoCommand = list def test2(): p1 = Panama() assert p1.is_palindrome() assert str(p1) == 'a man, a plan, a canal, Panama' p2 = Panama(['aman','aplan'], 'acadd','dd', ['acanal', 'panama']) assert not p2.is_palindrome() p3 = Panama(['maya'], '', '', ['ayam']) assert p3.is_palindrome() assert str(p3) == 'Maya, a yam' return 'ok' test2() # The main point of the `Panama` class is to allow the `.search()` method to search for a long palindrome. The [overall strategy](http://norvig.com/pal-alg.html#v3) is explained elsewhere. The search # adds a lett1er at a time to both left and right side, backtracking when necessary. The state of the search is represented by the four fields `.left, .L, .R, .right`, for example: # # A man, a plan, a cam, a yak, a yam, a canal, Panama # # gets encoded into the four fields like this: # # .left: ['aman', 'aplan', 'acam', 'ayak'] # .L: '' # .R: 'k' # .right: ['ayam', 'acanal', 'panama'] # # We always maintain the invariant: # # cat(left + [L]) == cat([R] + right)[::-1] # # The `search` method would be more straightforward if we could write it is a recursive function. But Python does not perform well when recursing to 100 million levels deep. So instead I manually manage a *stack* of *commands* that tell the search what to do and to undo. A command can be a `DoCommand`, which consists of a list of letters describing all the possible actions we could take at this point. Actions include letters that could be added (only the letters that make a legitimate prefix of a known phrase in `.L` and a legitimate suffix of a known phrase in `.R`). Actions can also include the character `','`, which is used to dignal that `.L` is a complete word and should be moved ontot he `.left` list, or `';'` to signal the same for `.R/.right`. A command can also be a `UndoCommand`, which consists of a single character; one that was previously chosen to be done. So the list of characters in a `DoCommand` constitutes a choice point; we first choose one, and continue deeper from there, but we put on the command stack an `UndoCommand` to reverse the effects of the action, and put back the remaining characters to try instead, if the first character doesn't work out. Note that we pop characters off the end of a `DoCommand`, so the last character is the first one tried. # # Let's see how it works: # In[3]: p = Panama() get_ipython().run_line_magic('time', 'p.search(200000)') # In[5]: p.report() # That seemed to go ok; let's try a longer search (100 million steps): # In[6]: p = Panama() get_ipython().run_line_magic('time', 'p.search(10**8)') # In[7]: str(p) # Success! We found a palindrome that is longer than any found before. We can do about 50,000 steps per second. I'll stop here, but there are many things to try; feel free to explore on your own: # # - I order the actions by the product of the number of prefixes and number of suffixes formed. In other words, choose first the character that forms the most possible completions. But maybe, at least in the early goings, I should choose more difficult combinations of letters, to use up the "hard" phrases and save the "easy" phrases for later. # - I'm not sure when to end a word and when to try to continue to a longer word; experimentation here would be useful. # - The program is deterministic, and thus always finds the same plaindrome. that's boring; can some randomness be introduced? # - Can we make more interesting phrases? Include determiners other than "a" and "an"; include adjectives, etc. # - The counts of prefixes and suffixes include all the phrases in the dictionary. But we are not allowed to repeat a phrase, so can we modify the code to subtract one from the counts every time a phrase is used (and add onr back in when we backtrack over the phrase)? # # Perhaps you can find other areas to explore. # # # Indefinite Articles # # What if every noun phrase (except "Panama") has to start with an indefinite article, as in "an X" or "a Y"? Let's see: # In[21]: get_ipython().system(' [ -e anpdict-short.txt ] || curl -O http://norvig.com/anpdict-short.txt') # In[22]: DICT2 = PhraseDict(open('anpdict-short.txt')) # In[23]: p = Panama(dict=DICT2) get_ipython().run_line_magic('time', 'p.search(10**8)') p.report() # In[24]: str(p)