Python function to find specific regex in the text of an XML document

Question 1

I'm writing code that, starting from an XML file:

- stores the indices of the child elements of a tag and the child elements themselves as keys and values in a dictionary (function get_xml_by_tag_names);
- deletes the keys whose values contain a certain string (the specific text size) and puts these keys and the corresponding values into a second dictionary (def search_delete_append);
- joins, for each dictionary, the dict values and extracts their text (def main);
- replaces certain characters with "" (def main);
- counts the occurrences of the regexes I specify (def find_regex).

The main function is the problematic part, and I need help cleaning it up. There are too many regexes, and I am considering creating a function for each regex inside the main function. Would that be a good option?

import re
from xml.dom import minidom
from xml.etree import ElementTree as ET
def get_xml_by_tag_names(xml_path, tag_name_1, tag_name_2):
    """Map the index of each qualifying ``tag_name_1`` element to its XML.

    The document is parsed with ``minidom``. An element named
    ``tag_name_1`` is recorded only when it contains at least one
    ``tag_name_2`` element; the others are skipped, so the keys of the
    result may have gaps.

    Args:
        xml_path: File name or file-like object handed to ``minidom.parse``.
        tag_name_1: Name of the outer elements to collect.
        tag_name_2: Name of the element that must occur inside an outer
            element for it to be kept.

    Returns:
        dict mapping the position of the element among all ``tag_name_1``
        elements to a one-item list holding the element serialized with
        ``toxml()``.
    """
    xml_tree = minidom.parse(xml_path)
    data = {}
    for idx, outer_node in enumerate(xml_tree.getElementsByTagName(tag_name_1)):
        # One existence check is enough: looping over the inner matches only
        # re-serialized the same outer node and overwrote the same entry.
        if outer_node.getElementsByTagName(tag_name_2):
            data[idx] = [outer_node.toxml()]
    return data
def find_regex(regex, text):
    """Count the matches of ``regex`` across a collection of strings.

    Args:
        regex: Pattern string or compiled pattern, passed to ``re.findall``.
        text: Iterable of strings to search.

    Returns:
        The total number of matches found over all strings. The same total
        is printed together with the pattern, so existing callers that only
        relied on the console output keep working.
    """
    total = sum(len(re.findall(regex, chunk)) for chunk in text)
    print("The number of {} matches is ".format(regex), total)
    return total
def find_regex_fasi(regex, text):
    """Count the matches of ``regex`` in a single string.

    Args:
        regex: Pattern string or compiled pattern, passed to ``re.findall``.
        text: The string to search.

    Returns:
        The number of matches. The count is also printed, as before, so
        callers that ignore the return value are unaffected.
    """
    count = len(re.findall(regex, text))
    print("Numero di corpo minore è", count)
    return count
def search_delete_append(dizionario, dizionariofasi, marker="7.489"):
    """Move the entries whose values mention ``marker`` into a second dict.

    Both dictionaries are modified in place.

    Args:
        dizionario: dict whose values are iterables of strings. Every key
            with at least one value containing ``marker`` is removed.
        dizionariofasi: dict that receives the removed keys. Each key maps
            to the matching string itself (the last one, if several match),
            not to the whole list.
        marker: Substring that selects the entries to move. Defaults to
            the value that used to be hard-coded.

    Returns:
        None.
    """
    matched = {}
    for key, values in dizionario.items():
        for value in values:
            if marker in value:
                # Collect per key so that a key with several matching values
                # is deleted once; queuing it twice raised KeyError.
                matched[key] = value
    # Mutate only after iteration has finished.
    for key, value in matched.items():
        dizionariofasi[key] = value
        del dizionario[key]
# Characters removed from the extracted text before the patterns are applied.
_PUNCTUATION_TABLE = str.maketrans("", "", "<>.,;-!:’?=|()")

# Counted first on both texts.
_FASE_12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT")

# Patterns counted on both the main text and the "fasi" text, in report order.
_COMMON_PATTERNS = (
    ("12T_leo", re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T")),
    ("base_2", re.compile(r"\]\s([\w\s]+)\s[→]\sT")),  # ] words → T
    ("base_3", re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT")),  # ] words da cui T
    ("12", re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])")),  # ] 1 words 2 words (excludes T)
    ("prima_12", re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])")),  # ] prima 1 words 2 words (excludes T)
    ("prima_123", re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)")),
    ("prima_123T", re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT")),  # prima 1 words 2 words 3 T
    ("prima_1freccia2", re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])")),  # ] prima 1 word → 2 word
)

# Patterns counted only on the "fasi" text, in report order.
_FASI_ONLY_PATTERNS = (
    ("123T_OPZ2", re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ")),
    ("123T", re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT")),
    ("1FRECCIA23T", re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)")),
    ("123FRECCIAT", re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT")),
    ("FRECCIA1F2FT", re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)")),
    ("PRIMA_1FRECCIA23T", re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)")),
    ("PRIMA_123FRECCIAT", re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T")),
    ("PRIMA_FRECCIA1F2FT", re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)")),
    ("PRIMA_1FRECCIA2", re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)")),
    ("PRIMA_12345T", re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")),
    ("PRIMA_12345T_OPZ2", re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)")),
    ("12345T", re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")),
)

# Patterns that were present but disabled in the original script; kept for
# reference and not applied:
#   prima:  r"\]\s*prima(?!\S)"
#   da:     r"\]\s*da(?!\S)"
#   da_cui: r"\]\s*([\w\s]+)\s*da\scui"
#   sps:    r"\]\s*([\w\s]+)\s*sps"
#   su:     r"\]\s*([\w\s]+)\s*su"
#   as:     r"\]\s*([\w\s]+)\s*as"
#   ins:    r"\]\s*([\w\s]+)\s*ins"
#   segue:  r"\]\s*([\w\s]+)\s*segue"


def _extract_clean_text(xml_fragment):
    """Join the text of the direct ``text`` children and strip punctuation."""
    root = ET.fromstring(xml_fragment)
    # An empty <text/> element has ``text`` set to None, which ``join``
    # rejects; treat it as an empty string instead of crashing.
    joined = ' '.join(node.text or '' for node in root.findall('text'))
    return joined.translate(_PUNCTUATION_TABLE)


def _report_matches(named_patterns, texts):
    """Run ``find_regex`` for each (name, pattern) pair, in order."""
    for _name, pattern in named_patterns:
        find_regex(pattern, texts)


def main():
    """Count the phase patterns in ``output2.xml`` and print the results.

    The ``new_line`` elements that contain ``text`` elements are loaded and
    split in two groups by ``search_delete_append``. For each group the text
    of the ``text`` children is extracted, punctuation is stripped, the
    resulting list is printed, and the number of matches of every pattern is
    reported through ``find_regex``.
    """
    dict_fasi = {}
    data = get_xml_by_tag_names('output2.xml', 'new_line', 'text')
    search_delete_append(data, dict_fasi)

    # Entries that stayed in ``data``.
    testo = [_extract_clean_text(' '.join(value)) for value in data.values()]
    print(testo)
    find_regex(_FASE_12T, testo)
    _report_matches(_COMMON_PATTERNS, testo)

    # Entries moved to ``dict_fasi``; each value is a single XML string.
    myxml_fasi = ' '.join(dict_fasi.values())
    find_regex_fasi(re.compile(r"10\.238"), myxml_fasi)  # how many CM are there?

    testo_fasi = [_extract_clean_text(fragment) for fragment in dict_fasi.values()]
    print(testo_fasi)
    find_regex(_FASE_12T, testo_fasi)
    _report_matches(_FASI_ONLY_PATTERNS, testo_fasi)
    _report_matches(_COMMON_PATTERNS, testo_fasi)
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
 main()

I know it's half in Italian right now, but I need to keep it for now for my clarity.

Question 2

Hey. Could you reduce this to a minimal working example? Perhaps include a valid minimal input and the corresponding output too?

Question 3

Should I provide it even if everything is working?

Question 4

We should only use as many variables as needed

For example,

values = [x for x in dict_fasi.values()]
myxml_fasi = ' '.join(values)

could be

myxml_fasi = ' '.join(dict_fasi.values())

We can reduce the number of strings created

for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
 tmpstring2 = tmpstring2.replace(to_remove, "")

could be

# Build the lookup set once instead of once per character. ")" is included
# so that the result matches what the replace() loop removes.
unwanted = set("|=?-<>’(!.:,;)")
tmpstring2 = ''.join(c for c in tmpstring2 if c not in unwanted)

The loop version creates a new string on every iteration. N.B.: once < and > have been deleted individually, no <> can remain in the text, so that entry is redundant.

Separating input/output from processing functions

I'd try to limit interaction with the world to as few functions as possible. For example I would not expect a function named find_regex_fasi to print anything to the console (or elsewhere). I'd make it return its results and do the printing inside main.

find_fase_12T_leo = re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T")
#find_prima = re.compile(r"\]\s*prima(?!\S)")
find_fase_base_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT") # ] parole → T
find_fase_base_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT") # ] parole da cui T
find_fase_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])") # ] 1 parole 2 parole (esclude T)
find_fase_prima_12 = re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])") # ] prima 1 parole 2 parole (esclude T)
find_fase_prima_123 = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)")
find_fase_prima_123T = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT") #prima 1 parole 2 parole 3t
find_fase_prima_1freccia2 = re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])") #] prima 1 parola → 2 parola
FIND_FASE12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT")
FIND_FASE123T_OPZ2 = re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ")
FIND_FASE123T = re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT")
FIND_FASE_123FRECCIAT = re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT")
FIND_FASE_1FRECCIA23T = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)")
FIND_FASE_FRECCIA1F2FT = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)")
FIND_FASE_PRIMA_123FRECCIAT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T")
FIND_FASE_PRIMA_1FRECCIA23T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)")
FIND_FASE_PRIMA_FRECCIA1F2FT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)")
FIND_FASE_PRIMA_1FRECCIA2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)")
FIND_FASE_PRIMA_12345T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
FIND_FASE_PRIMA_12345T_OPZ2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)")
FIND_FASE_12345T = re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
#find_da = re.compile(r"\]\s*da(?!\S)")
#find_da_cui = re.compile(r"\]\s*([\w\s]+)\s*da\scui")
#find_sps = re.compile(r"\]\s*([\w\s]+)\s*sps")
#find_su = re.compile(r"\]\s*([\w\s]+)\s*su")
#find_as = re.compile(r"\]\s*([\w\s]+)\s*as")
#find_ins = re.compile(r"\]\s*([\w\s]+)\s*ins")
#find_segue = re.compile(r"\]\s*([\w\s]+)\s*segue")
find_regex(FIND_FASE12T, testo)
find_regex(find_fase_12T_leo, testo)
#find_regex(find_prima, testo)
find_regex(find_fase_base_2, testo)
find_regex(find_fase_base_3, testo)
find_regex(find_fase_12, testo)
find_regex(find_fase_prima_12, testo)
find_regex(find_fase_prima_123, testo)
find_regex(find_fase_prima_123T, testo)
find_regex(find_fase_prima_1freccia2, testo)
#find_regex(find_da, testo)
#find_regex(find_da_cui, testo)
#find_regex(find_sps, testo)
#find_regex(find_su, testo)
#find_regex(find_as, testo)
#find_regex(find_ins, testo)
#find_regex(find_segue, testo)

can become something like

find_phase_regexes = {
 k: re.compile(v) for k, v in {
 "12T_leo": r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T",
 "prima": r"\]\s*prima(?!\S)",
 "base_2": r"\]\s([\w\s]+)\s[→]\sT", # ] parole → T
 "base_3": r"\]\s*([\w\s]+)\s*da\scui\sT", # ] parole da cui T
 "12": r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])", # ] 1 parole 2 parole (esclude T)
 "prima_12": r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])", # ] prima 1 parole 2 parole (esclude T)
 "prima_123": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)",
 "prima_123T": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT", #prima 1 parole 2 parole 3t
 "prima_1freccia2": r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])", #] prima 1 parola → 2 parola
 "12T": r"\]\s1\s([\w\s]+)\s2\sT",
 "123T_OPZ2": r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ",
 "123T": r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT",
 "123FRECCIAT": r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT",
 "1FRECCIA23T": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)",
 "FRECCIA1F2FT": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)",
 "PRIMA_123FRECCIAT": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T",
 "PRIMA_1FRECCIA23T": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)",
 "PRIMA_FRECCIA1F2FT": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)",
 "PRIMA_1FRECCIA2": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)",
 "PRIMA_12345T": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT",
 "PRIMA_12345T_OPZ2": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)",
 "12345T": r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT",
 }.items()
}
# Only the compiled patterns are needed here, so iterate the values.
for pattern in find_phase_regexes.values():
    find_regex(pattern, testo)

Question 5

str.maketrans and str.translate

str.translate() with a table built once by str.maketrans() is much faster than:

 for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
 tmpstring = tmpstring.replace(to_remove, "")

Instead:

# create the table once at the beginning of main or globally
table = str.maketrans({c:None for c in "<>.,;-!:’?=|()"})
# then do this instead of the for-loop
tmpstring = tmpstring.translate(table)

Eman Yalpsid (reputation 1,569; 11 silver badges, 16 bronze badges) · Answer 1 · 2020-05-03 16:18:18Z

We should only use as many variables as needed

For example,

values = [x for x in dict_fasi.values()]
myxml_fasi = ' '.join(values)

could be

myxml_fasi = ' '.join(dict_fasi.values())

We can reduce the number of strings created

for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
 tmpstring2 = tmpstring2.replace(to_remove, "")

could be

# Build the lookup set once instead of once per character. ")" is included
# so that the result matches what the replace() loop removes.
unwanted = set("|=?-<>’(!.:,;)")
tmpstring2 = ''.join(c for c in tmpstring2 if c not in unwanted)

The loop version creates a new string on every iteration. N.B.: once < and > have been deleted individually, no <> can remain in the text, so that entry is redundant.

Separating input/output from processing functions

I'd try to limit interaction with the world to as few functions as possible. For example I would not expect a function named find_regex_fasi to print anything to the console (or elsewhere). I'd make it return its results and do the printing inside main.

find_fase_12T_leo = re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T")
#find_prima = re.compile(r"\]\s*prima(?!\S)")
find_fase_base_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT") # ] parole → T
find_fase_base_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT") # ] parole da cui T
find_fase_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])") # ] 1 parole 2 parole (esclude T)
find_fase_prima_12 = re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])") # ] prima 1 parole 2 parole (esclude T)
find_fase_prima_123 = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)")
find_fase_prima_123T = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT") #prima 1 parole 2 parole 3t
find_fase_prima_1freccia2 = re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])") #] prima 1 parola → 2 parola
FIND_FASE12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT")
FIND_FASE123T_OPZ2 = re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ")
FIND_FASE123T = re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT")
FIND_FASE_123FRECCIAT = re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT")
FIND_FASE_1FRECCIA23T = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)")
FIND_FASE_FRECCIA1F2FT = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)")
FIND_FASE_PRIMA_123FRECCIAT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T")
FIND_FASE_PRIMA_1FRECCIA23T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)")
FIND_FASE_PRIMA_FRECCIA1F2FT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)")
FIND_FASE_PRIMA_1FRECCIA2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)")
FIND_FASE_PRIMA_12345T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
FIND_FASE_PRIMA_12345T_OPZ2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)")
FIND_FASE_12345T = re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
#find_da = re.compile(r"\]\s*da(?!\S)")
#find_da_cui = re.compile(r"\]\s*([\w\s]+)\s*da\scui")
#find_sps = re.compile(r"\]\s*([\w\s]+)\s*sps")
#find_su = re.compile(r"\]\s*([\w\s]+)\s*su")
#find_as = re.compile(r"\]\s*([\w\s]+)\s*as")
#find_ins = re.compile(r"\]\s*([\w\s]+)\s*ins")
#find_segue = re.compile(r"\]\s*([\w\s]+)\s*segue")
find_regex(FIND_FASE12T, testo)
find_regex(find_fase_12T_leo, testo)
#find_regex(find_prima, testo)
find_regex(find_fase_base_2, testo)
find_regex(find_fase_base_3, testo)
find_regex(find_fase_12, testo)
find_regex(find_fase_prima_12, testo)
find_regex(find_fase_prima_123, testo)
find_regex(find_fase_prima_123T, testo)
find_regex(find_fase_prima_1freccia2, testo)
#find_regex(find_da, testo)
#find_regex(find_da_cui, testo)
#find_regex(find_sps, testo)
#find_regex(find_su, testo)
#find_regex(find_as, testo)
#find_regex(find_ins, testo)
#find_regex(find_segue, testo)

can become something like

find_phase_regexes = {
 k: re.compile(v) for k, v in {
 "12T_leo": r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T",
 "prima": r"\]\s*prima(?!\S)",
 "base_2": r"\]\s([\w\s]+)\s[→]\sT", # ] parole → T
 "base_3": r"\]\s*([\w\s]+)\s*da\scui\sT", # ] parole da cui T
 "12": r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])", # ] 1 parole 2 parole (esclude T)
 "prima_12": r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])", # ] prima 1 parole 2 parole (esclude T)
 "prima_123": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)",
 "prima_123T": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT", #prima 1 parole 2 parole 3t
 "prima_1freccia2": r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])", #] prima 1 parola → 2 parola
 "12T": r"\]\s1\s([\w\s]+)\s2\sT",
 "123T_OPZ2": r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ",
 "123T": r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT",
 "123FRECCIAT": r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT",
 "1FRECCIA23T": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)",
 "FRECCIA1F2FT": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)",
 "PRIMA_123FRECCIAT": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T",
 "PRIMA_1FRECCIA23T": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)",
 "PRIMA_FRECCIA1F2FT": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)",
 "PRIMA_1FRECCIA2": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)",
 "PRIMA_12345T": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT",
 "PRIMA_12345T_OPZ2": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)",
 "12345T": r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT",
 }.items()
}
# Only the compiled patterns are needed here, so iterate the values.
for pattern in find_phase_regexes.values():
    find_regex(pattern, testo)

RootTwo (reputation 10.6k; 1 gold badge, 14 silver badges, 30 bronze badges) · Answer 2 · 2020-05-05 07:05:24Z

str.maketrans and str.translate

str.translate() with a table built once by str.maketrans() is much faster than:

 for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
 tmpstring = tmpstring.replace(to_remove, "")

Instead:

# create the table once at the beginning of main or globally
table = str.maketrans({c:None for c in "<>.,;-!:’?=|()"})
# then do this instead of the for-loop
tmpstring = tmpstring.translate(table)

Stack Exchange Network

Python function to find specific regex in the text of an XML document

2 Answers 2

str.maketrans and str.translate

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Python function to find specific regex in the text of an XML document

2 Answers 2

str.maketrans and str.translate

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions