Rolling Hash algorithm

Question 1

I would like any ideas or suggestions on how to enhance the code for rolling hash algorithm and make if faster. Even if it is a change in one line. I tried to add a list, because I thought it would be more efficient, but it didn't work properly. So any ideas on how to enhance it. I would appreciate the assist.

import time
class RollingHash:
def __init__(self, text, sizeWord):
 self.text = text
 self.hash = 0
 self.sizeWord = sizeWord
 for i in range(0, sizeWord):
 # ord maps the character to a number
 # subtract out the ASCII value of "a" to start the indexing at zero
 self.hash += (ord(self.text[i]) - ord("a") + 1) * (26 ** (sizeWord - i - 1)) # change the first / with *
 # start index of current window
 self.window_start = 0
 # end of index window
 self.window_end = sizeWord
def move_window(self):
 if self.window_end <= len(self.text) - 1:
 # remove left letter from hash value
 self.hash -= (ord(self.text[self.window_start]) - ord("a") + 1) * 26 ** (self.sizeWord - 1)
 self.hash *= 26
 self.hash += ord(self.text[self.window_end]) - ord("a") + 1
 self.window_start += 1
 self.window_end += 1
def window_text(self):
 return self.text[self.window_start:self.window_end]
def rabin_karp(word, text):
if word == "" or text == "":
 return None
if len(word) > len(text):
 return None
rolling_hash = RollingHash(text, len(word))
word_hash = RollingHash(word, len(word))
# word_hash.move_window()
for i in range(len(text) - len(word) + 1):
 if rolling_hash.hash == word_hash.hash:
 if rolling_hash.window_text() == word:
 print(i)
 rolling_hash.move_window()
return 'Pattern length: ' , len(word)
text = input("Text: ")
word = input("Pattern: ")
t1 = time.clock()
results = rabin_karp(word, text)
print(results)
t2 = time.clock()
print('Run Time: %0.5f ms' % ((t2 - t1) * 1000.0))

Question 2

Welcome to Code Review! What Python version are you using?

Question 3

(3.x, guessing from print().)

Question 4

I am using version 3.6.2.

Question 5

See also: Karp-Rabin String Matching, Rabin Karp algorithm improvement for string contains A to Z only, and Text search in Python for an idea how to avoid adverse effects with longer words.

Question 6

You can do as a decent compiler would: precompute some expressions, and reduce the "strength" of others.
For worse rather than better, most opportunities for this are in the constructor.
Part of the performance of on object depends on its interface - I've included an alternative function that finds the (position of the) next match, if any.
Benchmarking is difficult, micro or not: beware "printing" and use help.

import timeit
class RollingHash:
 '''A rolling hash for a window of constant length into a text,
 both specified at construction.
 '''
 adjust = ord("a") - 1
 alphabet = 26
 def __init__(self, text, size_word):
 '''Set up a rolling hash for a window of size_word into text.'''
 self.text = text
 if len(text) < size_word:
 self.hash = None
 return
 rk = 0
 for c in text[:size_word]:
 rk = rk * self.alphabet + ord(c) - self.adjust
 self.hash = rk
 self.pos = -1
 self.window_start = 0
 self.window_end = size_word
 self.multiplier = RollingHash.alphabet ** (size_word - 1)
 def move_window(self):
 '''Advance window by one position.'''
 if self.window_end < len(self.text):
 # remove left letter from hash value
 self.hash = \
 (self.hash - (ord(self.text[self.window_start])
 - RollingHash.adjust) * self.multiplier) \
 * RollingHash.alphabet \
 + ord(self.text[self.window_end]) - RollingHash.adjust
 self.window_start += 1
 self.window_end += 1
 def window_text(self):
 '''Return current window text.'''
 return self.text[self.window_start:self.window_end]
 def match(self, other):
 '''Return position of next match, or none.'''
 roll = self.hash
 text = self.text
 # "local copies" may help or hinder readability and performance
 start = self.window_start
 end = self.window_end
 limit = len(self.text)
 result = None
 while end < limit:
 if self.pos < other.hash == roll \
 and other.text == text[start:end] \
 and self.pos < start:
 result = self.pos = start
 break;
 roll = (roll - (ord(text[start])
 - RollingHash.adjust) * self.multiplier) \
 * RollingHash.alphabet \
 + ord(text[end]) - RollingHash.adjust
 start += 1
 end += 1
 self.window_start = start
 self.window_end = end
 return result
verbose = True
def rabin_karp(word, text):
 '''Print indexes of matches for word in text.'''
 if word == "" or text == "":
 return None
 size_word = len(word)
 if size_word > len(text):
 return None
 rolling_hash = RollingHash(text, size_word)
 word_hash = RollingHash(word, size_word)
 for i in range(len(text) - size_word + 1):
 if rolling_hash.hash == word_hash.hash:
 if rolling_hash.window_text() == word:
 if verbose:
 print(i)
 else:
 print(rolling_hash.window_text(), '<>', word, "at", i)
 rolling_hash.move_window()
 return 'Pattern length: ', size_word
def karp_rabin(word, text):
 '''Print indexes of matches for word in text.'''
 size_word = len(word)
 if not 0 < size_word <= len(text):
 return None
 rolling_hash = RollingHash(text, size_word)
 word_hash = RollingHash(word, size_word)
 while True:
 position = rolling_hash.match(word_hash)
 if position is None:
 return 'Pattern length: ', size_word
 if verbose:
 print(position)
if __name__ == '__main__':
 text = input("Text: ")
 word = input("Pattern: ")
 print(rabin_karp(word, text))
 print(karp_rabin(word, text))
 verbose = False
 glob = globals()
 # have a look at timeit.Timer.repeat() and autorange(), too
 print(timeit.timeit('results = rabin_karp(word, text)',
 globals=glob, number=9999))
 print(timeit.timeit('results = karp_rabin(word, text)',
 globals=glob, number=9999))

Question 7

Thank you so much! , i compared the two codes run time, and it's faster

Question 8

Things. take. time.

greybeard greybeard 7,3713 gold badges21 silver badges55 bronze badges · Accepted Answer · 2017-10-26 15:40:47Z

You can do as a decent compiler would: precompute some expressions, and reduce the "strength" of others.
For worse rather than better, most opportunities for this are in the constructor.
Part of the performance of on object depends on its interface - I've included an alternative function that finds the (position of the) next match, if any.
Benchmarking is difficult, micro or not: beware "printing" and use help.

import timeit
class RollingHash:
 '''A rolling hash for a window of constant length into a text,
 both specified at construction.
 '''
 adjust = ord("a") - 1
 alphabet = 26
 def __init__(self, text, size_word):
 '''Set up a rolling hash for a window of size_word into text.'''
 self.text = text
 if len(text) < size_word:
 self.hash = None
 return
 rk = 0
 for c in text[:size_word]:
 rk = rk * self.alphabet + ord(c) - self.adjust
 self.hash = rk
 self.pos = -1
 self.window_start = 0
 self.window_end = size_word
 self.multiplier = RollingHash.alphabet ** (size_word - 1)
 def move_window(self):
 '''Advance window by one position.'''
 if self.window_end < len(self.text):
 # remove left letter from hash value
 self.hash = \
 (self.hash - (ord(self.text[self.window_start])
 - RollingHash.adjust) * self.multiplier) \
 * RollingHash.alphabet \
 + ord(self.text[self.window_end]) - RollingHash.adjust
 self.window_start += 1
 self.window_end += 1
 def window_text(self):
 '''Return current window text.'''
 return self.text[self.window_start:self.window_end]
 def match(self, other):
 '''Return position of next match, or none.'''
 roll = self.hash
 text = self.text
 # "local copies" may help or hinder readability and performance
 start = self.window_start
 end = self.window_end
 limit = len(self.text)
 result = None
 while end < limit:
 if self.pos < other.hash == roll \
 and other.text == text[start:end] \
 and self.pos < start:
 result = self.pos = start
 break;
 roll = (roll - (ord(text[start])
 - RollingHash.adjust) * self.multiplier) \
 * RollingHash.alphabet \
 + ord(text[end]) - RollingHash.adjust
 start += 1
 end += 1
 self.window_start = start
 self.window_end = end
 return result
verbose = True
def rabin_karp(word, text):
 '''Print indexes of matches for word in text.'''
 if word == "" or text == "":
 return None
 size_word = len(word)
 if size_word > len(text):
 return None
 rolling_hash = RollingHash(text, size_word)
 word_hash = RollingHash(word, size_word)
 for i in range(len(text) - size_word + 1):
 if rolling_hash.hash == word_hash.hash:
 if rolling_hash.window_text() == word:
 if verbose:
 print(i)
 else:
 print(rolling_hash.window_text(), '<>', word, "at", i)
 rolling_hash.move_window()
 return 'Pattern length: ', size_word
def karp_rabin(word, text):
 '''Print indexes of matches for word in text.'''
 size_word = len(word)
 if not 0 < size_word <= len(text):
 return None
 rolling_hash = RollingHash(text, size_word)
 word_hash = RollingHash(word, size_word)
 while True:
 position = rolling_hash.match(word_hash)
 if position is None:
 return 'Pattern length: ', size_word
 if verbose:
 print(position)
if __name__ == '__main__':
 text = input("Text: ")
 word = input("Pattern: ")
 print(rabin_karp(word, text))
 print(karp_rabin(word, text))
 verbose = False
 glob = globals()
 # have a look at timeit.Timer.repeat() and autorange(), too
 print(timeit.timeit('results = rabin_karp(word, text)',
 globals=glob, number=9999))
 print(timeit.timeit('results = karp_rabin(word, text)',
 globals=glob, number=9999))

Thank you so much! , i compared the two codes run time, and it's faster

Stack Exchange Network

Rolling Hash algorithm

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Hot Network Questions

Rolling Hash algorithm

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Linked

Related

Hot Network Questions