1
\$\begingroup\$

I'm writing a code that, starting from an XML file:

  • stores the index of child elements of a tag and the child elements as key, values in a dictionary (function get_xml_by_tag_names);
  • deletes keys whose values contain a certain string (the specific text size) and puts these keys and the corresponding values into a second dictionary (def search_delete_append);
  • joins, for each dictionary, the dict values and extracts their text(def main);
  • replaces certain values with "" (def main);
  • counts the occurrences of specific regex I specify (def find_regex).

The main function is problematic, as I need help cleaning it up, the regex are too many and I want to create a function for each regex inside the main function. Would it be a good option?

import re
from xml.dom import minidom
from xml.etree import ElementTree as ET
def get_xml_by_tag_names(xml_path, tag_name_1, tag_name_2):
 data = {}
 xml_tree = minidom.parse(xml_path)
 item_group_nodes = xml_tree.getElementsByTagName(tag_name_1)
 for idx, item_group_node in enumerate(item_group_nodes):
 cl_compile_nodes = item_group_node.getElementsByTagName(tag_name_2)
 for _ in cl_compile_nodes:
 data[idx]=[item_group_node.toxml()]
 return data
def find_regex(regex, text):
 lista = []
 for x in text:
 matches_prima = re.findall(regex, x)
 lunghezza = len(matches_prima)
 lista.append(lunghezza)
 print("The number of {} matches is ".format(regex), sum(lista))
def find_regex_fasi(regex, text):
 matches_fasi = re.findall(regex, text)
 print("Numero di corpo minore è", len(matches_fasi))
def search_delete_append(dizionario, dizionariofasi):
 deletekeys = []
 insertvalues = []
 for k in dizionario:
 for v in dizionario[k]:
 if "7.489" in v:
 deletekeys.append(k)
 dizionariofasi[k] = v
 for item in deletekeys:
 del dizionario[item]
def main():
 dict_fasi = {}
 data = get_xml_by_tag_names('output2.xml', 'new_line', 'text')
 search_delete_append(data, dict_fasi)
 testo = []
 for value in data.values():
 myxml = ' '.join(value)
 tree = ET.fromstring(myxml)
 tmpstring = ' '.join(text.text for text in tree.findall('text'))
 for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
 tmpstring = tmpstring.replace(to_remove, "")
 testo.append(tmpstring)
 #testo = ''.join(testo)
 print(testo)
 find_fase_12T_leo = re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T")
 #find_prima = re.compile(r"\]\s*prima(?!\S)")
 find_fase_base_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT") # ] parole → T
 find_fase_base_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT") # ] parole da cui T
 find_fase_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])") # ] 1 parole 2 parole (esclude T)
 find_fase_prima_12 = re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])") # ] prima 1 parole 2 parole (esclude T)
 find_fase_prima_123 = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)")
 find_fase_prima_123T = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT") #prima 1 parole 2 parole 3t
 find_fase_prima_1freccia2 = re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])") #] prima 1 parola → 2 parola
 FIND_FASE12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT")
 FIND_FASE123T_OPZ2 = re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ")
 FIND_FASE123T = re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT")
 FIND_FASE_123FRECCIAT = re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT")
 FIND_FASE_1FRECCIA23T = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)")
 FIND_FASE_FRECCIA1F2FT = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)")
 FIND_FASE_PRIMA_123FRECCIAT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T")
 FIND_FASE_PRIMA_1FRECCIA23T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)")
 FIND_FASE_PRIMA_FRECCIA1F2FT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)")
 FIND_FASE_PRIMA_1FRECCIA2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)")
 FIND_FASE_PRIMA_12345T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
 FIND_FASE_PRIMA_12345T_OPZ2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)")
 FIND_FASE_12345T = re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
 #find_da = re.compile(r"\]\s*da(?!\S)")
 #find_da_cui = re.compile(r"\]\s*([\w\s]+)\s*da\scui")
 #find_sps = re.compile(r"\]\s*([\w\s]+)\s*sps")
 #find_su = re.compile(r"\]\s*([\w\s]+)\s*su")
 #find_as = re.compile(r"\]\s*([\w\s]+)\s*as")
 #find_ins = re.compile(r"\]\s*([\w\s]+)\s*ins")
 #find_segue = re.compile(r"\]\s*([\w\s]+)\s*segue")
 find_regex(FIND_FASE12T, testo)
 find_regex(find_fase_12T_leo, testo)
 #find_regex(find_prima, testo)
 find_regex(find_fase_base_2, testo)
 find_regex(find_fase_base_3, testo)
 find_regex(find_fase_12, testo)
 find_regex(find_fase_prima_12, testo)
 find_regex(find_fase_prima_123, testo)
 find_regex(find_fase_prima_123T, testo)
 find_regex(find_fase_prima_1freccia2, testo)
 #find_regex(find_da, testo)
 #find_regex(find_da_cui, testo)
 #find_regex(find_sps, testo)
 #find_regex(find_su, testo)
 #find_regex(find_as, testo)
 #find_regex(find_ins, testo)
 #find_regex(find_segue, testo)
 #################
 testo_fasi = []
 values = [x for x in dict_fasi.values()]
 myxml_fasi = ' '.join(values)
 find_CM = re.compile(r"10\.238")
 find_regex_fasi(find_CM, myxml_fasi) #quanti CM ci sono?
 #print(myxml_fasi)
 for x in dict_fasi.values():
 xxx= ''.join(x)
 tree2 = ET.fromstring(xxx)
 tmpstring2 = ' '.join(text.text for text in tree2.findall('text'))
 for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
 tmpstring2 = tmpstring2.replace(to_remove, "")
 testo_fasi.append(tmpstring2)
 #testo_fasi = ''.join(testo_fasi)
 print(testo_fasi)
 find_regex(FIND_FASE12T, testo_fasi)
 find_regex(FIND_FASE123T_OPZ2, testo_fasi)
 find_regex(FIND_FASE123T, testo_fasi)
 find_regex(FIND_FASE_1FRECCIA23T, testo_fasi)
 find_regex(FIND_FASE_123FRECCIAT, testo_fasi)
 find_regex(FIND_FASE_FRECCIA1F2FT, testo_fasi)
 find_regex(FIND_FASE_PRIMA_1FRECCIA23T, testo_fasi)
 find_regex(FIND_FASE_PRIMA_123FRECCIAT, testo_fasi)
 find_regex(FIND_FASE_PRIMA_FRECCIA1F2FT, testo_fasi)
 find_regex(FIND_FASE_PRIMA_1FRECCIA2, testo_fasi)
 find_regex(FIND_FASE_PRIMA_12345T, testo_fasi)
 find_regex(FIND_FASE_PRIMA_12345T_OPZ2, testo_fasi)
 find_regex(FIND_FASE_12345T, testo_fasi)
 find_regex(find_fase_12T_leo, testo_fasi)
 #find_regex(find_prima, testo_fasi)
 find_regex(find_fase_base_2, testo_fasi)
 find_regex(find_fase_base_3, testo_fasi)
 find_regex(find_fase_12, testo_fasi)
 find_regex(find_fase_prima_12, testo_fasi)
 find_regex(find_fase_prima_123, testo_fasi)
 find_regex(find_fase_prima_123T, testo_fasi)
 find_regex(find_fase_prima_1freccia2, testo_fasi)
 #find_regex(find_da, testo_fasi)
 #find_regex(find_da_cui, testo_fasi)
 #find_regex(find_sps, testo_fasi)
 #find_regex(find_su, testo_fasi)
 #find_regex(find_as, testo_fasi)
 #find_regex(find_ins, testo_fasi)
 #find_regex(find_segue, testo_fasi)
if __name__ == "__main__":
 main()

I know it's half in Italian right now, but I need to keep it for now for my clarity.

Jamal
35.2k13 gold badges134 silver badges238 bronze badges
asked May 3, 2020 at 13:47
\$\endgroup\$
2
  • \$\begingroup\$ Hey. Could you reduce this to a minimal working example? Perhaps include a valid minimal input and the corresponding output too? \$\endgroup\$ Commented May 3, 2020 at 14:31
  • \$\begingroup\$ Should I provide it even if everything is working? \$\endgroup\$ Commented May 3, 2020 at 14:34

2 Answers 2

3
\$\begingroup\$

We should only use as many variables as needed

For example,

values = [x for x in dict_fasi.values()]
myxml_fasi = ' '.join(values)

could be

myxml_fasi = ' '.join(dict_fasi.values())

We can reduce the number of strings created

for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
 tmpstring2 = tmpstring2.replace(to_remove, "")

could be

tmpstring2 = ''.join(c for c in tmpstring2
 if c not in set("|=?-<>’(!.:,;"))

The first creates a new string with each iteration. N.b. after deleting < and >, there won't be any <> in the text.


Separating input/output from processing functions

I'd try to limit interaction with the world to as few functions as possible. For example I would not expect a function named find_regex_fasi to print anything to the console (or elsewhere). I'd make it return its results and do the printing inside main.


find_fase_12T_leo = re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T")
#find_prima = re.compile(r"\]\s*prima(?!\S)")
find_fase_base_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT") # ] parole → T
find_fase_base_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT") # ] parole da cui T
find_fase_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])") # ] 1 parole 2 parole (esclude T)
find_fase_prima_12 = re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])") # ] prima 1 parole 2 parole (esclude T)
find_fase_prima_123 = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)")
find_fase_prima_123T = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT") #prima 1 parole 2 parole 3t
find_fase_prima_1freccia2 = re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])") #] prima 1 parola → 2 parola
FIND_FASE12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT")
FIND_FASE123T_OPZ2 = re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ")
FIND_FASE123T = re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT")
FIND_FASE_123FRECCIAT = re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT")
FIND_FASE_1FRECCIA23T = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)")
FIND_FASE_FRECCIA1F2FT = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)")
FIND_FASE_PRIMA_123FRECCIAT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T")
FIND_FASE_PRIMA_1FRECCIA23T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)")
FIND_FASE_PRIMA_FRECCIA1F2FT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)")
FIND_FASE_PRIMA_1FRECCIA2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)")
FIND_FASE_PRIMA_12345T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
FIND_FASE_PRIMA_12345T_OPZ2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)")
FIND_FASE_12345T = re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
#find_da = re.compile(r"\]\s*da(?!\S)")
#find_da_cui = re.compile(r"\]\s*([\w\s]+)\s*da\scui")
#find_sps = re.compile(r"\]\s*([\w\s]+)\s*sps")
#find_su = re.compile(r"\]\s*([\w\s]+)\s*su")
#find_as = re.compile(r"\]\s*([\w\s]+)\s*as")
#find_ins = re.compile(r"\]\s*([\w\s]+)\s*ins")
#find_segue = re.compile(r"\]\s*([\w\s]+)\s*segue")
find_regex(FIND_FASE12T, testo)
find_regex(find_fase_12T_leo, testo)
#find_regex(find_prima, testo)
find_regex(find_fase_base_2, testo)
find_regex(find_fase_base_3, testo)
find_regex(find_fase_12, testo)
find_regex(find_fase_prima_12, testo)
find_regex(find_fase_prima_123, testo)
find_regex(find_fase_prima_123T, testo)
find_regex(find_fase_prima_1freccia2, testo)
#find_regex(find_da, testo)
#find_regex(find_da_cui, testo)
#find_regex(find_sps, testo)
#find_regex(find_su, testo)
#find_regex(find_as, testo)
#find_regex(find_ins, testo)
#find_regex(find_segue, testo)

can become something like

find_phase_regexes = {
 k: re.compile(v) for k, v in {
 "12T_leo": r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T",
 "prima": r"\]\s*prima(?!\S)",
 "base_2": r"\]\s([\w\s]+)\s[→]\sT", # ] parole → T
 "base_3": r"\]\s*([\w\s]+)\s*da\scui\sT", # ] parole da cui T
 "12": r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])", # ] 1 parole 2 parole (esclude T)
 "prima_12": r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])", # ] prima 1 parole 2 parole (esclude T)
 "prima_123": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)",
 "prima_123T": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT", #prima 1 parole 2 parole 3t
 "prima_1freccia2": r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])", #] prima 1 parola → 2 parola
 "12T": r"\]\s1\s([\w\s]+)\s2\sT",
 "123T_OPZ2": r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ",
 "123T": r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT",
 "123FRECCIAT": r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT",
 "1FRECCIA23T": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)",
 "FRECCIA1F2FT": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)",
 "PRIMA_123FRECCIAT": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T",
 "PRIMA_1FRECCIA23T": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)",
 "PRIMA_FRECCIA1F2FT": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)",
 "PRIMA_1FRECCIA2": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)",
 "PRIMA_12345T": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT",
 "PRIMA_12345T_OPZ2": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)",
 "12345T": r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT",
 }.items()
}
for k, v in find_phase_regexes.items():
 find_regex(v, testo)
answered May 3, 2020 at 16:18
\$\endgroup\$
2
\$\begingroup\$

str.maketrans and str.translate

str.maketrans() is much faster than:

 for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"):
 tmpstring = tmpstring.replace(to_remove, "")

Instead:

# create the table once at the beginning of main or globally
table = str.maketrans({c:None for c in "<>.,;-!:’?=|()"})
# then do this instead of the for-loop
tmpstring = tmpstring.translate(table)
answered May 5, 2020 at 7:05
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.