Speeding up a Python code with lots of regex substitution

I authored a piece of code that was merged into the nltk codebase. It is full of regex substitutions:

import re
from six import text_type
from nltk.tokenize.api import TokenizerI


class ToktokTokenizer(TokenizerI):
    """
    This is a Python port of tok-tok.pl from
    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

    >>> toktok = ToktokTokenizer()
    >>> text = u'Is 9.5 or 525,600 my favorite number?'
    >>> print(toktok.tokenize(text, return_str=True))
    Is 9.5 or 525,600 my favorite number ?
    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
    >>> print(toktok.tokenize(text, return_str=True))
    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
    >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
    >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
    >>> assert toktok.tokenize(text, return_str=True) == expected
    >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
    True
    """

    # Replace non-breaking spaces with normal spaces.
    NON_BREAKING = re.compile(u"\u00A0"), " "

    # Pad some funky punctuation.
    FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
    # Pad more funky punctuation.
    FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r" \1 "
    # Pad en dash and em dash.
    EN_EM_DASHES = re.compile(u'([–—])'), r" \1 "

    # Replace problematic characters with numeric character references.
    AMPERCENT = re.compile('& '), '&amp; '
    TAB = re.compile('\t'), ' &#9; '
    PIPE = re.compile(r'\|'), ' &#124; '

    # Pad numbers with commas to keep them from further tokenization.
    COMMA_IN_NUM = re.compile(r'(?<!,)([,،])(?![,\d])'), r' \1 '

    # Just pad problematic (often neurotic) hyphen/single quote, etc.
    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r' \1 '
    # Group ` ` stupid quotes ' ' into a single token.
    STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
    STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "

    # Don't tokenize a period unless it ends the line and isn't
    # preceded by another period, e.g.
    # "something ..." -> "something ..."
    # "something." -> "something ."
    FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
    # Don't tokenize a period unless it ends the line, e.g.
    # " ... stuff." -> "... stuff ."
    FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
    # Treat continuous commas as fake German, Czech, etc.: „
    MULTI_COMMAS = re.compile(r'(,{2,})'), r' \1 '
    # Treat continuous dashes as a fake en dash, etc.
    MULTI_DASHES = re.compile(r'(-{2,})'), r' \1 '
    # Treat multiple periods as a single thing (e.g. an ellipsis).
    MULTI_DOTS = re.compile(r'(\.{2,})'), r' \1 '

    # This is the \p{Open_Punctuation} from Perl's perluniprops,
    # see http://perldoc.perl.org/perluniprops.html
    OPEN_PUNCT = text_type(u'([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d'
                           u'\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772'
                           u'\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983'
                           u'\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993'
                           u'\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26'
                           u'\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016'
                           u'\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39'
                           u'\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b'
                           u'\ufe5d\uff08\uff3b\uff5b\uff5f\uff62')
    # This is the \p{Close_Punctuation} from Perl's perluniprops.
    CLOSE_PUNCT = text_type(u')]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a'
                            u'\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6'
                            u'\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988'
                            u'\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998'
                            u'\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009'
                            u'\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b'
                            u'\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c'
                            u'\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e'
                            u'\uff09\uff3d\uff5d\uff60\uff63')
    # This is the \p{Currency_Symbol} from Perl's perluniprops.
    CURRENCY_SYM = text_type(u'$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb'
                             u'\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3'
                             u'\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab'
                             u'\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3'
                             u'\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838'
                             u'\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6')

    # Pad spaces after opening punctuation.
    OPEN_PUNCT_RE = re.compile(u'([{}])'.format(OPEN_PUNCT)), r'\1 '
    # Pad spaces before closing punctuation.
    CLOSE_PUNCT_RE = re.compile(u'([{}])'.format(CLOSE_PUNCT)), r'\1 '
    # Pad spaces after currency symbols.
    CURRENCY_SYM_RE = re.compile(u'([{}])'.format(CURRENCY_SYM)), r'\1 '

    # Use for tokenizing URL-unfriendly characters: [:/?#]
    URL_FOE_1 = re.compile(r':(?!//)'), r' : '    # in Perl: s{:(?!//)}{ : }g;
    URL_FOE_2 = re.compile(r'\?(?!\S)'), r' ? '   # in Perl: s{\?(?!\S)}{ ? }g;
    # in Perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
    URL_FOE_3 = re.compile(r'(:\/\/)[\S+\.\S+\/\S+][\/]'), ' / '
    URL_FOE_4 = re.compile(r' /'), r' / '         # s{ /}{ / }g;

    # Left/right strip, i.e. remove leading/trailing spaces.
    # These strip regexes should NOT be used;
    # use str.lstrip(), str.rstrip() or str.strip() instead.
    # (They are kept here for reference to the original tok-tok.pl code.)
    LSTRIP = re.compile(r'^ +'), ''
    RSTRIP = re.compile(r'\s+$'), '\n'
    # Merge multiple spaces.
    ONE_SPACE = re.compile(r' {2,}'), ' '

    TOKTOK_REGEXES = [NON_BREAKING, FUNKY_PUNCT_1,
                      URL_FOE_1, URL_FOE_2, URL_FOE_3, URL_FOE_4,
                      AMPERCENT, TAB, PIPE,
                      OPEN_PUNCT_RE, CLOSE_PUNCT_RE,
                      MULTI_COMMAS, COMMA_IN_NUM, FINAL_PERIOD_2,
                      PROB_SINGLE_QUOTES, STUPID_QUOTES_1, STUPID_QUOTES_2,
                      CURRENCY_SYM_RE, EN_EM_DASHES, MULTI_DASHES, MULTI_DOTS,
                      FINAL_PERIOD_1, FINAL_PERIOD_2, ONE_SPACE]

    def tokenize(self, text, return_str=False):
        text = text_type(text)  # Convert input string to unicode.
        for regexp, substitution in self.TOKTOK_REGEXES:
            text = regexp.sub(substitution, text)
        # Finally, strip leading/trailing spaces
        # and convert the output string to unicode.
        text = text_type(text.strip())
        return text if return_str else text.split()
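
For scale, here is a quick baseline with the standard library's timeit, assuming the class is importable from nltk.tokenize.toktok as in the merged code (the sentence and call count are arbitrary; extrapolate to 10**9 calls):

import timeit

setup = '''
from nltk.tokenize.toktok import ToktokTokenizer
toktok = ToktokTokenizer()
text = u'Is 9.5 or 525,600 my favorite number?'
'''

# Time 100,000 tokenize() calls on one short sentence.
seconds = timeit.timeit('toktok.tokenize(text)', setup=setup, number=100000)
print('{:.2f} s for 100,000 calls'.format(seconds))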

Is there a way to make the substitutions faster? E.g.:

  • Combine the chain of regexes into one super regex (see the sketch after this list)
  • Combine some of the regexes
  • Code it in Cython (but Cython regexes are slow, no?)
  • Run the regex substitution in Julia and wrap the Julia code in Python
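
For the first bullet, here is a minimal sketch of a partial merge. Only rules that share the exact same replacement string can be collapsed safely; FUNKY_PUNCT_1, FUNKY_PUNCT_2 and EN_EM_DASHES all substitute r" \1 ", so their character sets (copied from the code above) can be joined into one character class and applied in a single pass:

import re

# Character sets copied from FUNKY_PUNCT_1, FUNKY_PUNCT_2 and
# EN_EM_DASHES above; re.escape handles the regex metacharacters
# (], ), [, { ...) that appear inside them.
FUNKY_1 = u'،;؛¿!"])}»›”؟¡%٪°±©®।॥…'
FUNKY_2 = u'({["“‘„‚«‹「『'
DASHES = u'–—'

# One compiled pattern replaces three separate .sub() passes.
PAD_BOTH = re.compile(u'([{}])'.format(re.escape(FUNKY_1 + FUNKY_2 + DASHES)))

def pad_funky(text):
    return PAD_BOTH.sub(r' \1 ', text)

print(pad_funky(u'«Hello» – world!'))
# Any leftover double spaces are collapsed later by the ONE_SPACE rule.

The OPEN_PUNCT_RE/CLOSE_PUNCT_RE/CURRENCY_SYM_RE trio uses a different replacement (r'\1 ') and so has to stay a separate pass, but the same trick collapses those three into one as well.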

The typical use case for the tokenize() function is a single input string, but if the same function is called 1,000,000,000 times it is rather slow, and the GIL means a single process works through the sentences one at a time.
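
On the GIL point: processes rather than threads are the usual way out, since each worker process has its own interpreter and GIL. A minimal sketch with the standard library's multiprocessing, assuming the sentences are independent and the class is importable as nltk.tokenize.toktok.ToktokTokenizer (adjust the import to wherever the merged code lives):

from multiprocessing import Pool

from nltk.tokenize.toktok import ToktokTokenizer  # assumed module path

toktok = ToktokTokenizer()

def tokenize_one(sentence):
    # Runs inside a worker process, so the regex cascade executes
    # on all cores in parallel instead of one sentence at a time.
    return toktok.tokenize(sentence)

if __name__ == '__main__':
    sentences = [u'Is 9.5 or 525,600 my favorite number?'] * 100000
    pool = Pool(processes=4)
    # A generous chunksize keeps the inter-process messaging overhead down.
    results = pool.map(tokenize_one, sentences, chunksize=1000)
    pool.close()
    pool.join()
    print(results[0])

Pickling inputs and outputs between processes is not free, so for 10**9 sentences it is worth having each worker read from and write to files directly rather than shipping every sentence through the pool.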

The aim of this question is to find ways to speed up Python code that consists mostly of regex substitutions, especially when the tokenize() function is run 1,000,000,000+ times.

If Cython/Julia or any other faster language + wrapper is suggested, it would be good to include a one-regex example of how the regex would be written in Cython/Julia/other languages, along with a suggestion of how the wrapper would look.

Thanks in advance!

alvas