I'm writing a code that, starting from an XML file:
- stores the index of child elements of a tag and the child elements as
key, values in a dictionary (function
get_xml_by_tag_names
); - deletes keys whose values contain a certain string (the specific text
size) and puts these keys and the corresponding values into a second
dictionary (def
search_delete_append
); - joins, for each dictionary, the dict values and extracts their
text(def
main
); - replaces certain values with "" (def
main
); - counts the occurrences of specific regex I specify (def
find_regex
).
The main
function is problematic, as I need help cleaning it up, the regex are too many and I want to create a function for each regex inside the main function. Would it be a good option?
import re
from xml.dom import minidom
from xml.etree import ElementTree as ET
def get_xml_by_tag_names(xml_path, tag_name_1, tag_name_2):
data = {}
xml_tree = minidom.parse(xml_path)
item_group_nodes = xml_tree.getElementsByTagName(tag_name_1)
for idx, item_group_node in enumerate(item_group_nodes):
cl_compile_nodes = item_group_node.getElementsByTagName(tag_name_2)
for _ in cl_compile_nodes:
data[idx]=[item_group_node.toxml()]
return data
def find_regex(regex, text):
lista = []
for x in text:
matches_prima = re.findall(regex, x)
lunghezza = len(matches_prima)
lista.append(lunghezza)
print("The number of {} matches is ".format(regex), sum(lista))
def find_regex_fasi(regex, text):
matches_fasi = re.findall(regex, text)
print("Numero di corpo minore è", len(matches_fasi))
def search_delete_append(dizionario, dizionariofasi):
deletekeys = []
insertvalues = []
for k in dizionario:
for v in dizionario[k]:
if "7.489" in v:
deletekeys.append(k)
dizionariofasi[k] = v
for item in deletekeys:
del dizionario[item]
def main():
dict_fasi = {}
data = get_xml_by_tag_names('output2.xml', 'new_line', 'text')
search_delete_append(data, dict_fasi)
testo = []
for value in data.values():
myxml = ' '.join(value)
tree = ET.fromstring(myxml)
tmpstring = ' '.join(text.text for text in tree.findall('text'))
for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
tmpstring = tmpstring.replace(to_remove, "")
testo.append(tmpstring)
#testo = ''.join(testo)
print(testo)
find_fase_12T_leo = re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T")
#find_prima = re.compile(r"\]\s*prima(?!\S)")
find_fase_base_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT") # ] parole → T
find_fase_base_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT") # ] parole da cui T
find_fase_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])") # ] 1 parole 2 parole (esclude T)
find_fase_prima_12 = re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])") # ] prima 1 parole 2 parole (esclude T)
find_fase_prima_123 = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)")
find_fase_prima_123T = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT") #prima 1 parole 2 parole 3t
find_fase_prima_1freccia2 = re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])") #] prima 1 parola → 2 parola
FIND_FASE12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT")
FIND_FASE123T_OPZ2 = re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ")
FIND_FASE123T = re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT")
FIND_FASE_123FRECCIAT = re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT")
FIND_FASE_1FRECCIA23T = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)")
FIND_FASE_FRECCIA1F2FT = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)")
FIND_FASE_PRIMA_123FRECCIAT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T")
FIND_FASE_PRIMA_1FRECCIA23T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)")
FIND_FASE_PRIMA_FRECCIA1F2FT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)")
FIND_FASE_PRIMA_1FRECCIA2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)")
FIND_FASE_PRIMA_12345T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
FIND_FASE_PRIMA_12345T_OPZ2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)")
FIND_FASE_12345T = re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
#find_da = re.compile(r"\]\s*da(?!\S)")
#find_da_cui = re.compile(r"\]\s*([\w\s]+)\s*da\scui")
#find_sps = re.compile(r"\]\s*([\w\s]+)\s*sps")
#find_su = re.compile(r"\]\s*([\w\s]+)\s*su")
#find_as = re.compile(r"\]\s*([\w\s]+)\s*as")
#find_ins = re.compile(r"\]\s*([\w\s]+)\s*ins")
#find_segue = re.compile(r"\]\s*([\w\s]+)\s*segue")
find_regex(FIND_FASE12T, testo)
find_regex(find_fase_12T_leo, testo)
#find_regex(find_prima, testo)
find_regex(find_fase_base_2, testo)
find_regex(find_fase_base_3, testo)
find_regex(find_fase_12, testo)
find_regex(find_fase_prima_12, testo)
find_regex(find_fase_prima_123, testo)
find_regex(find_fase_prima_123T, testo)
find_regex(find_fase_prima_1freccia2, testo)
#find_regex(find_da, testo)
#find_regex(find_da_cui, testo)
#find_regex(find_sps, testo)
#find_regex(find_su, testo)
#find_regex(find_as, testo)
#find_regex(find_ins, testo)
#find_regex(find_segue, testo)
#################
testo_fasi = []
values = [x for x in dict_fasi.values()]
myxml_fasi = ' '.join(values)
find_CM = re.compile(r"10\.238")
find_regex_fasi(find_CM, myxml_fasi) #quanti CM ci sono?
#print(myxml_fasi)
for x in dict_fasi.values():
xxx= ''.join(x)
tree2 = ET.fromstring(xxx)
tmpstring2 = ' '.join(text.text for text in tree2.findall('text'))
for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
tmpstring2 = tmpstring2.replace(to_remove, "")
testo_fasi.append(tmpstring2)
#testo_fasi = ''.join(testo_fasi)
print(testo_fasi)
find_regex(FIND_FASE12T, testo_fasi)
find_regex(FIND_FASE123T_OPZ2, testo_fasi)
find_regex(FIND_FASE123T, testo_fasi)
find_regex(FIND_FASE_1FRECCIA23T, testo_fasi)
find_regex(FIND_FASE_123FRECCIAT, testo_fasi)
find_regex(FIND_FASE_FRECCIA1F2FT, testo_fasi)
find_regex(FIND_FASE_PRIMA_1FRECCIA23T, testo_fasi)
find_regex(FIND_FASE_PRIMA_123FRECCIAT, testo_fasi)
find_regex(FIND_FASE_PRIMA_FRECCIA1F2FT, testo_fasi)
find_regex(FIND_FASE_PRIMA_1FRECCIA2, testo_fasi)
find_regex(FIND_FASE_PRIMA_12345T, testo_fasi)
find_regex(FIND_FASE_PRIMA_12345T_OPZ2, testo_fasi)
find_regex(FIND_FASE_12345T, testo_fasi)
find_regex(find_fase_12T_leo, testo_fasi)
#find_regex(find_prima, testo_fasi)
find_regex(find_fase_base_2, testo_fasi)
find_regex(find_fase_base_3, testo_fasi)
find_regex(find_fase_12, testo_fasi)
find_regex(find_fase_prima_12, testo_fasi)
find_regex(find_fase_prima_123, testo_fasi)
find_regex(find_fase_prima_123T, testo_fasi)
find_regex(find_fase_prima_1freccia2, testo_fasi)
#find_regex(find_da, testo_fasi)
#find_regex(find_da_cui, testo_fasi)
#find_regex(find_sps, testo_fasi)
#find_regex(find_su, testo_fasi)
#find_regex(find_as, testo_fasi)
#find_regex(find_ins, testo_fasi)
#find_regex(find_segue, testo_fasi)
if __name__ == "__main__":
main()
I know it's half in Italian right now, but I need to keep it for now for my clarity.
-
\$\begingroup\$ Hey. Could you reduce this to a minimal working example? Perhaps include a valid minimal input and the corresponding output too? \$\endgroup\$Eman Yalpsid– Eman Yalpsid2020年05月03日 14:31:51 +00:00Commented May 3, 2020 at 14:31
-
\$\begingroup\$ Should I provide it even if everything is working? \$\endgroup\$Anna– Anna2020年05月03日 14:34:05 +00:00Commented May 3, 2020 at 14:34
2 Answers 2
We should only use as many variables as needed
For example,
values = [x for x in dict_fasi.values()]
myxml_fasi = ' '.join(values)
could be
myxml_fasi = ' '.join(dict_fasi.values())
We can reduce the number of strings created
for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
tmpstring2 = tmpstring2.replace(to_remove, "")
could be
tmpstring2 = ''.join(c for c in tmpstring2
if c not in set("|=?-<>’(!.:,;"))
The first creates a new string with each iteration.
N.b. after deleting <
and >
, there won't be any <>
in the text.
Separating input/output from processing functions
I'd try to limit interaction with the world to as few functions as possible. For example I would not expect a function named find_regex_fasi
to print anything to the console (or elsewhere). I'd make it return its results and do the printing inside main
.
find_fase_12T_leo = re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T")
#find_prima = re.compile(r"\]\s*prima(?!\S)")
find_fase_base_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT") # ] parole → T
find_fase_base_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT") # ] parole da cui T
find_fase_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])") # ] 1 parole 2 parole (esclude T)
find_fase_prima_12 = re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])") # ] prima 1 parole 2 parole (esclude T)
find_fase_prima_123 = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)")
find_fase_prima_123T = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT") #prima 1 parole 2 parole 3t
find_fase_prima_1freccia2 = re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])") #] prima 1 parola → 2 parola
FIND_FASE12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT")
FIND_FASE123T_OPZ2 = re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ")
FIND_FASE123T = re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT")
FIND_FASE_123FRECCIAT = re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT")
FIND_FASE_1FRECCIA23T = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)")
FIND_FASE_FRECCIA1F2FT = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)")
FIND_FASE_PRIMA_123FRECCIAT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T")
FIND_FASE_PRIMA_1FRECCIA23T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)")
FIND_FASE_PRIMA_FRECCIA1F2FT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)")
FIND_FASE_PRIMA_1FRECCIA2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)")
FIND_FASE_PRIMA_12345T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
FIND_FASE_PRIMA_12345T_OPZ2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)")
FIND_FASE_12345T = re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
#find_da = re.compile(r"\]\s*da(?!\S)")
#find_da_cui = re.compile(r"\]\s*([\w\s]+)\s*da\scui")
#find_sps = re.compile(r"\]\s*([\w\s]+)\s*sps")
#find_su = re.compile(r"\]\s*([\w\s]+)\s*su")
#find_as = re.compile(r"\]\s*([\w\s]+)\s*as")
#find_ins = re.compile(r"\]\s*([\w\s]+)\s*ins")
#find_segue = re.compile(r"\]\s*([\w\s]+)\s*segue")
find_regex(FIND_FASE12T, testo)
find_regex(find_fase_12T_leo, testo)
#find_regex(find_prima, testo)
find_regex(find_fase_base_2, testo)
find_regex(find_fase_base_3, testo)
find_regex(find_fase_12, testo)
find_regex(find_fase_prima_12, testo)
find_regex(find_fase_prima_123, testo)
find_regex(find_fase_prima_123T, testo)
find_regex(find_fase_prima_1freccia2, testo)
#find_regex(find_da, testo)
#find_regex(find_da_cui, testo)
#find_regex(find_sps, testo)
#find_regex(find_su, testo)
#find_regex(find_as, testo)
#find_regex(find_ins, testo)
#find_regex(find_segue, testo)
can become something like
find_phase_regexes = {
k: re.compile(v) for k, v in {
"12T_leo": r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T",
"prima": r"\]\s*prima(?!\S)",
"base_2": r"\]\s([\w\s]+)\s[→]\sT", # ] parole → T
"base_3": r"\]\s*([\w\s]+)\s*da\scui\sT", # ] parole da cui T
"12": r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])", # ] 1 parole 2 parole (esclude T)
"prima_12": r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])", # ] prima 1 parole 2 parole (esclude T)
"prima_123": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)",
"prima_123T": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT", #prima 1 parole 2 parole 3t
"prima_1freccia2": r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])", #] prima 1 parola → 2 parola
"12T": r"\]\s1\s([\w\s]+)\s2\sT",
"123T_OPZ2": r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ",
"123T": r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT",
"123FRECCIAT": r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT",
"1FRECCIA23T": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)",
"FRECCIA1F2FT": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)",
"PRIMA_123FRECCIAT": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T",
"PRIMA_1FRECCIA23T": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)",
"PRIMA_FRECCIA1F2FT": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)",
"PRIMA_1FRECCIA2": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)",
"PRIMA_12345T": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT",
"PRIMA_12345T_OPZ2": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)",
"12345T": r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT",
}.items()
}
for k, v in find_phase_regexes.items():
find_regex(v, testo)
str.maketrans and str.translate
str.maketrans()
is much faster than:
for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
tmpstring = tmpstring.replace(to_remove, "")
Instead:
# create the table once at the beginning of main or globally
table = str.maketrans({c:None for c in "<>.,;-!:’?=|()"})
# then do this instead of the for-loop
tmpstring = tmpstring.translate(table)