I am working on a text normalizer. It works just fine with small text files but takes a very long time with large text files such as 5 MB or more.
Is there anything I can change in the code to make it run faster on large text files? My guess would be that the problem is somewhere in __preprocess(tmp) or __prenormalise(text)?
# -*- coding: utf-8 -*-
import re
import sys
import json
import os
import codecs
import copy
from num2words import num2words
from text_unidecode import unidecode
import argparse
class TextNormaliser:
    def __init__(self, debug=False):
        """
        Args:
            debug (bool, optional): Debug mode
        """
        self.debug = debug
        self.abbreviations = {}
        self.acronyms = {}
        self.currencies = {}
        self.months = [
            'january', 'february', 'march', 'april', 'may', 'june', 'july',
            'august', 'september', 'october', 'november', 'december']
        self.number_scale = [
            'thousand', 'thousands', 'million', 'millions',
            'billion', 'billions', 'trillion', 'trillions']
        path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(path, 'resources', 'abbreviations.json')) as jf:
            self.abbreviations = json.load(jf)
        with open(os.path.join(path, 'resources', 'acronyms.json')) as jf:
            self.acronyms = json.load(jf)
        with open(os.path.join(path, 'resources', 'currencies.json')) as jf:
            self.currencies = json.load(jf)
        with open(os.path.join(path, 'resources', 'domains.json')) as jf:
            self.domains = json.load(jf)
    def normalise(self, text):
        """Normalise text.

        The function covers numbers, email addresses, ascii characters, etc.

        Args:
            text (str): Input string
        Returns:
            textn (str): Normalised text
            tokens ([tuples]): List of tuples to track back normalisation
        Examples:
            >>> textn, tokens = tn.normalise("My email is, a@b.com.")
            tokens: (Original, Normalised, Display)
            my email is a at b dot com
            [('My', ['my'], 'My'), ('email', ['email'], 'email'),
             ('is,', ['is'], 'is'),
             ('a@b.com.', ['a', 'at', 'b', 'dot', 'com'], 'a@b.com')]
        """
        return self.__normalise(text)
    def normalise_file(self, path):
        """Normalise text from a file.

        The function covers numbers, email addresses, ascii characters, etc.

        Args:
            path (str): Path to a file
        Returns:
            textn (str): Normalised text, or None if the file does not exist
            tokens ([tuples]): List of tuples to track back normalisation,
                or None if the file does not exist
        Raises:
            Exception: If the file cannot be read
        Examples:
            >>> textn = tn.normalise_file('./trans.txt')
        """
        try:
            if os.path.isfile(path):
                with codecs.open(path, encoding='utf-8') as f:
                    return self.__normalise(f.readline())
            else:
                return None, None
        except Exception as e:
            raise Exception('ERR Normalise_file: {}'.format(e))
    def __normalise(self, text):
        text = self.__prenormalise(text)
        tmp = []
        for idx, t in enumerate(text.split()):
            tmp.append((t, idx))
        original = copy.deepcopy(tmp)
        # Preprocessing
        tokens = self.__preprocess(tmp)
        # Convert to result format
        ret_text, ret_tokens = self.__generate_results(original, tokens)
        return ret_text, ret_tokens
    def __prenormalise(self, text):
        text = text.replace('\n', '').replace('\r', '')
        text = re.sub(r'\b\?\b', ' ', text)
        text = re.sub(r'\b\!\b', ' ', text)
        text = re.sub(r'\b\"\b', ' ', text)
        text = re.sub(r'\b\--\b', ' ', text)
        chars = list(text)
        for i, c in enumerate(chars):
            if i < 1 or i > len(chars)-1:
                continue
            if c == ',':
                if not(chars[i-1].isnumeric() and
                       chars[i-1].isnumeric()):
                    chars[i] = ', '
        text = ''.join(chars)
        return text
    def __preprocess(self, tokens):
        # Remove spaces and some special encoding
        for idx, t in enumerate(tokens):
            i = t[1]
            t = t[0]
            t = t.replace('&amp;', '&')
            hints = ['[Music]', '[Laughter]', '[Applause]']
            for hint in hints:
                t = t.replace(hint, '')
            del tokens[idx]
            tokens.insert(idx, (t.strip(), i))
        # Remove last dot
        if len(tokens):
            if tokens[-1][0].endswith('.'):
                i = tokens[-1][1]
                t = tokens[-1][0]
                del tokens[-1]
                tokens.append((t[:-1], i))
        return tokens
    def __rstrip(self, token):
        for i in range(5):
            if len(token):
                if token[-1] in [',', '.', ';', '!', '?', ':', '"']:
                    token = token[:-1]
                else:
                    break
        return token

    def __lstrip(self, token):
        for i in range(5):
            if len(token):
                if token[0] in [',', '.', ';', '!', '?', ':', '"', '\'']:
                    token = token[1:]
                else:
                    break
        return token
    def __generate_results(self, original, normalised):
        words = []
        for t in normalised:
            if len(t[0]):
                words.append(t[0])
        text = ' '.join(words)
        tokens = []
        if len(original):
            for t in original:
                idx = t[1]
                words = []
                for t2 in normalised:
                    if idx == t2[1]:
                        words.append(t2[0])
                display_text = self.__rstrip(t[0])
                display_text = self.__lstrip(display_text)
                tokens.append((t[0], words, display_text))
        else:
            tokens.append(('', '', ''))
        return text, tokens
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--textfile', type=str, required=True,
                        help='input directory or file')
    args = parser.parse_args()
    tn = TextNormaliser(False)
    with open(args.textfile) as fd:
        lines = fd.readlines()
        for line in lines:
            line = line.strip()
            normalised, tokens = tn.normalise(line)
            print(normalised)
2 Answers
I'd suggest doing some profiling, or simply using timeit to measure which part of the code takes the longest, and then focusing on that:
from timeit import default_timer as timer
start = timer()
text = self.__prenormalise(text) # for example
end = timer()
print('__prenormalise took', end - start) # time in seconds
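If timing individual blocks by hand gets tedious, the standard library's cProfile gives a per-function breakdown in one go. A minimal sketch, assuming the TextNormaliser class above is available in the same script and big.txt is a hypothetical sample input file:
import cProfile
import pstats

tn = TextNormaliser(False)
with open('big.txt', encoding='utf-8') as fd:
    text = fd.read()

profiler = cProfile.Profile()
profiler.enable()
for line in text.splitlines():      # mirror the way the script is actually used
    tn.normalise(line.strip())
profiler.disable()

# print the ten entries with the highest cumulative time
pstats.Stats(profiler).sort_stats('cumulative').print_stats(10)
The cumulative column will point straight at whichever of __prenormalise, __preprocess or __generate_results dominates.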
I'm suspicious about the __prenormalise method. You're doing several simple replacements there, which can be merged into a single one:
text = re.sub(r"\b(\?|\!|\"|--|\n)\b", " ", text)
Even better, use re.compile to compile the pattern once, somewhere outside the function, so it is compiled only once. If you put it inside the function, it would get compiled every time the function is executed:
# merge the simple replacements into a single compiled pattern
RE_DIACRITICS = re.compile(r"\b(\?|\!|\"|--|\n)\b")
then in the function you can use that compiled regex:
# use inside your methods like this
text = RE_DIACRITICS.sub(" ", text)
However, the slowest part is probably the loop, where you iterate through the whole text one character at a time:
for i, c in enumerate(chars):
    if i < 1 or i > len(chars)-1:
        continue
    if c == ',':
        if not(chars[i-1].isnumeric() and
               chars[i-1].isnumeric()):
            chars[i] = ', '
text = ''.join(chars)
The condition if i < 1 or i > len(chars)-1
is executed every time but it matters only during the first and last iteration. So you can throw it away and iterate only through a slice starting at the 2nd character and ending at the last-but-one:
for i, c in enumerate(chars[1:-1]):
However, that is still slow. What you want to do is replace a comma between two non-digits with the same comma plus a space, right? That can be done with a straightforward regex substitution instead of stepping through the text one character at a time. So the whole loop for i, c in enumerate(chars):
can be replaced with this regex:
# replace comma between non-numbers with comma + space
text = re.sub(r"(?<!\d)(,)(?!\d)", r"\g<1> ", text)
This regex uses a negative lookbehind and a negative lookahead, which you can find in the re module documentation. It looks for a comma that is neither preceded nor followed by a digit, and replaces it with the first matched group (the comma itself) plus a space. For working with regexes I recommend regex101.com, which can visualise results in real time. Here's the regex from above: https://regex101.com/r/tccMoA/1
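Putting those pieces together, the rewritten __prenormalise could look roughly like this; a sketch only, with the compiled patterns living at module level (the names are mine) and the comma rule written without the backreference, since the replacement is just a comma plus space:
# module level, compiled once at import time
RE_PUNCT = re.compile(r"\b(\?|\!|\"|--)\b")
RE_COMMA = re.compile(r"(?<!\d),(?!\d)")

    def __prenormalise(self, text):
        # drop line breaks first, so they don't need to be in RE_PUNCT
        text = text.replace('\n', '').replace('\r', '')
        text = RE_PUNCT.sub(' ', text)
        # add a space after any comma that is not between two digits
        text = RE_COMMA.sub(', ', text)
        return text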
- mehio hatab (Sep 11, 2020): Thanks for your answer. I did everything you said, but I'm having trouble understanding where to put the re.compile and the re.finditer. Is there a way to contact you privately? Much appreciated.
- yedpodtrzitko (Sep 11, 2020): I updated my answer further.
- mehio hatab (Sep 11, 2020): I tried the timeit you told me about, it's very useful, and I discovered that __generate_results and __preprocess are also taking a lot of time; I'm not sure why. I hope you can help with that? The changes you made to __prenormalise were excellent, it's much faster now!
__preprocess()
tokens is a list of (text, index) tuples, so it looks like idx and i will always be the same value. enumerate is not needed; just use i.
# Remove spaces and some special encoding
for t, i in tokens:
    t = t.replace('&amp;', '&')
    hints = ['[Music]', '[Laughter]', '[Applause]']
    for hint in hints:
        t = t.replace(hint, '')
tokens is a list. Don't delete and then insert a new value; just replace it in place as the last line of the loop above. Also, the temporary i and t aren't needed when removing the last dot.
    tokens[i] = (t.strip(), i)

if len(tokens) and tokens[-1][0].endswith('.'):
    tokens[-1] = (tokens[-1][0][:-1], tokens[-1][1])
return tokens
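Taken together, and keeping the original interface, the whole method could look roughly like this (a sketch; hoisting the hints to a module-level constant is my own tweak, and I'm assuming the '&' replacement was originally decoding '&amp;'):
HINTS = ('[Music]', '[Laughter]', '[Applause]')

    def __preprocess(self, tokens):
        # Remove hint markers and decode '&amp;'; the stored index doubles
        # as the list position, so each entry can be replaced in place.
        for t, i in tokens:
            t = t.replace('&amp;', '&')
            for hint in HINTS:
                t = t.replace(hint, '')
            tokens[i] = (t.strip(), i)
        # Drop a trailing dot on the very last token
        if tokens and tokens[-1][0].endswith('.'):
            tokens[-1] = (tokens[-1][0][:-1], tokens[-1][1])
        return tokens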
__rstrip() and __lstrip()
The Python str type already has rstrip() and lstrip() methods that do this. Triple quotes can be used to enclose a string containing both kinds of quotes.
def __rstrip(self, token):
    return token.rstrip(''',.;!?:'"''')

def __lstrip(self, token):
    return token.lstrip(''',.;!?:'"''')
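One behavioural difference worth knowing: with a character-set argument, rstrip()/lstrip() remove any run of those characters, not at most five like the original loops; for the tokens handled here that should not matter. A quick check:
>>> '"hello?!..."'.strip(''',.;!?:'"''')
'hello'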
__generate_results()
The for t in normalised: loop can be a generator expression:
text = ' '.join(w for w, _ in normalised if w)
It looks like the for t in original: loop could be replaced by itertools.groupby(), grouping by the index stored in normalised.
The way you are using __rstrip() and __lstrip(), you can just use the strip() method; it strips from both ends of the string.
from itertools import groupby

tokens = []
for idx, group in groupby(normalised, key=lambda t: t[1]):
    words = [w for w, _ in group]
    display_text = original[idx][0].strip(''',.;!?:'"''')
    tokens.append((original[idx][0], words, display_text))
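One thing to keep in mind: groupby() only groups consecutive items with equal keys, which is exactly the shape of normalised here, since the indices come out in order. A tiny illustration with made-up data:
from itertools import groupby

normalised = [('a', 0), ('at', 0), ('b', 0), ('dot', 0), ('com', 0), ('is', 1)]
for idx, group in groupby(normalised, key=lambda t: t[1]):
    print(idx, [w for w, _ in group])
# 0 ['a', 'at', 'b', 'dot', 'com']
# 1 ['is']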
main code
An open file is already an iterable, so you don't need to read it all in and then iterate over it.
with open(args.textfile) as fd:
    for line in fd:
        ...
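Applied to the question's __main__ block, that becomes roughly:
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--textfile', type=str, required=True,
                        help='input directory or file')
    args = parser.parse_args()
    tn = TextNormaliser(False)
    with open(args.textfile) as fd:
        for line in fd:  # stream the file instead of readlines()
            normalised, tokens = tn.normalise(line.strip())
            print(normalised)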