This code inputs a weighted edgelist, positive and negative sentiment seed words, and a target word. The program computes the sum of the weights along shortest paths from the seed words to the target and from the target to the seed words, as it generates 9 output values.
The program is very slow. Running large edgelist files takes days, rather than minutes or seconds.
How can this program be sped up?
from tkinter import Tk, X, Y, TOP, BOTTOM, LEFT, RIGHT, BOTH, END
from tkinter import filedialog, messagebox
from tkinter.ttk import Frame, Button, Entry, Label, Progressbar
import os, glob, time
import pandas as pd
root = Tk()
root.geometry("600x400+300+300")
def read_edge_list(filename):
edges = {}
words = set()
with open(filename) as fp:
lines = fp.readlines()
for line in lines:
token = line.split()
if len(token) != 3:
continue
word1 = token[0]
word2 = token[1]
freq = token[2]
words = words | {word1, word2}
if not word1 in edges.keys():
edges[word1] = {}
if not word2 in edges[word1]:
edges[word1][word2] = {}
edges[word1][word2] = freq
return edges, words
def read_sentiment(filename):
with open(filename, encoding='utf-8-sig') as fp:
lines = fp.readlines()
words = {line.strip() for line in lines}
return words
def read_target_word():
word = input("Please input target word: ")
return word
def run_shortest_path_algorithm(edges, positive, negative, target):
positivedict = {}
negativedict = {}
for source in positive:
dist1 = dijkstra(edges, source, target)
dist2 = dijkstra(edges, target, source)
if dist1 and dist2:
positivedict[source] = dist1 + dist2
for source in negative:
dist1 = dijkstra(edges, source, target)
dist2 = dijkstra(edges, target, source)
if dist1 and dist2:
negativedict[source] = dist1 + dist2
return positivedict, negativedict
def calculate_statistics_summary(positivedict, negativedict,
positivewords, negativewords):
numpositive = len(positivedict)
numnegative = len(negativedict)
actualnumpositive = len(positivewords)
actualnumnegative = len(negativewords)
sumpositive = sum(positivedict.values())
sumnegative = sum(negativedict.values())
if actualnumpositive == 0:
s1 = 0
else:
s1 = sumpositive / actualnumpositive
if actualnumnegative == 0:
s2 = 0
else:
s2 = sumnegative / actualnumnegative
if numnegative == 0:
s3 = 0
else:
s3 = s1 * numpositive / numnegative
if s2 == 0:
s4 = 0
else:
s4 = s3 / s2
if numpositive == 0:
s5 = 0
else:
s5 = sumpositive / numpositive
if numnegative == 0:
s6 = 0
else:
s6 = sumnegative / numnegative
if numnegative == 0:
s7 = 0
else:
s7 = s5 * numpositive / numnegative
if s6 == 0:
s8 = 0
else:
s8 = s7 / s6
s9 = s3 - s2
return [s1, s2, s3, s4, s5, s6, s7, s8, s9]
def write_output_file():
pass
def dijkstra(graph, start, end):
shortest_paths = {start: (None, 0)}
current_node = start
visited = set()
while current_node != end:
visited.add(current_node)
if current_node not in graph:
destinations = []
else:
destinations = graph[current_node].keys()
weight_to_current_node = shortest_paths[current_node][1]
for next_node in destinations:
weight = int(graph[current_node][next_node]) +
weight_to_current_node
if next_node not in shortest_paths:
shortest_paths[next_node] = (current_node, weight)
else:
current_shortest_weight = shortest_paths[next_node][1]
if current_shortest_weight > weight:
shortest_paths[next_node] = (current_node, weight)
next_destinations = {node: shortest_paths[node] for node in
shortest_paths if node not in visited}
if not next_destinations:
return None
current_node = min(next_destinations, key=lambda k:
next_destinations[k][1])
#path = []
#while current_node is not None:
#path.append(current_node)
#next_node = shortest_paths[current_node][0]
#current_node = next_node
#path = path[::-1]
#return path
return shortest_paths[end][1]
class SentimentWindow(Frame):
def __init__(self):
super().__init__()
self.initUI()
self.initPositiveDir = None
self.initNegativeDir = None
self.initSaveDir = None
self.summary = pd.DataFrame(columns=['S1', 'S2', 'S3', 'S4',
'S5', 'S6', 'S7', 'S8', 'S9'])
def initUI(self):
self.master.title("Sentiment")
self.pack(fill=BOTH, expand=True, padx=15, pady=15)
frmEdges = Frame(self)
frmEdges.pack(fill=X, expand=True)
lblEdges = Label(frmEdges, text="Select the directory of edge
list.")
lblEdges.pack(expand=True, fill=X, side=TOP, pady=2)
frmEdgesPath = Frame(frmEdges)
frmEdgesPath.pack(expand=True, fill=X, side=BOTTOM, pady=2)
self.entEdgesPath = Entry(frmEdgesPath, width=60)
self.entEdgesPath.pack(expand=True, fill=X, side=LEFT)
btnEdgesPath = Button(frmEdgesPath, width=20, text="Load
Edges", command=self.loadEdges)
btnEdgesPath.pack(expand=True, side=RIGHT)
frmPositive = Frame(self)
frmPositive.pack(fill=X, expand=True)
lblPositive = Label(frmPositive, text="Select the positive
file.")
lblPositive.pack(expand=True, fill=X, side=TOP, pady=2)
frmPositivePath = Frame(frmPositive)
frmPositivePath.pack(expand=True, fill=X, side=BOTTOM, pady=2)
self.entPositivePath = Entry(frmPositivePath, width=60)
self.entPositivePath.pack(expand=True, fill=X, side=LEFT)
btnPositivePath = Button(frmPositivePath, width=20, text="Load
Positive", command=self.loadPositive)
btnPositivePath.pack(expand=True, side=RIGHT)
frmNegative = Frame(self)
frmNegative.pack(fill=X, expand=True)
lblNegative = Label(frmNegative, text="Select the negative
file.")
lblNegative.pack(expand=True, fill=X, side=TOP, pady=2)
frmNegativePath = Frame(frmNegative)
frmNegativePath.pack(expand=True, fill=X, side=BOTTOM, pady=2)
self.entNegativePath = Entry(frmNegativePath, width=60)
self.entNegativePath.pack(expand=True, fill=X, side=LEFT)
btnNegativePath = Button(frmNegativePath, width=20, text="Load
Negative", command=self.loadNegative)
btnNegativePath.pack(expand=True, side=RIGHT)
frmTarget = Frame(self)
frmTarget.pack(fill=X, expand=True)
lblTarget = Label(frmTarget, text="Input the target word.")
lblTarget.pack(expand=True, fill=X, side=TOP, pady=2)
self.entTarget = Entry(frmTarget)
self.entTarget.pack(fill=X, expand=True, pady=2)
frmRun = Frame(self)
frmRun.pack(fill=X, expand=True, pady=20)
self.proRun = Progressbar(frmRun, value=0)
self.proRun.pack(fill=X, expand=True, side=LEFT)
btnRun = Button(frmRun, text = "Run", width=20,
command=self.run)
btnRun.pack(side=RIGHT, padx=20)
def loadEdges(self):
edgesFolderName = filedialog.askdirectory()
if edgesFolderName:
self.entEdgesPath.delete(0, END)
self.entEdgesPath.insert(0, edgesFolderName)
def loadPositive(self):
if self.initPositiveDir is None:
self.initPositiveDir = "/"
positiveFileName =
filedialog.askopenfilename(initialdir=self.initPositiveDir,
title="Open Positive File", filetypes=(("Text file", "*.txt"),))
if positiveFileName:
self.initPositiveDir = positiveFileName
self.entPositivePath.delete(0, END)
self.entPositivePath.insert(0, positiveFileName)
def loadNegative(self):
if self.initNegativeDir is None:
self.initNegativeDir = "/"
negativeFileName =
filedialog.askopenfilename(initialdir=self.initNegativeDir,
title="Open Positive File", filetypes=(("Text file", "*.txt"),))
if negativeFileName:
self.initNegativeDir = negativeFileName
self.entNegativePath.delete(0, END)
self.entNegativePath.insert(0, negativeFileName)
def run(self):
edgesFolderName = self.entEdgesPath.get()
if not os.path.isdir(edgesFolderName):
messagebox.showerror("Invalid Path", "The directory of
edge list is invalid.")
return
positiveFileName = self.entPositivePath.get()
if not os.path.isfile(positiveFileName):
messagebox.showerror("Invalid Path", "The positive
filename is invalid.")
return
negativeFileName = self.entNegativePath.get()
if not os.path.isfile(negativeFileName):
messagebox.showerror("Invalid Path", "The negative
filename is invalid.")
return
targetWord = self.entTarget.get()
if targetWord is None or len(targetWord) <= 0:
messagebox.showerror("No Target", "Please input the target
word.")
os.chdir(edgesFolderName)
edgefiles = glob.glob("*.pr")
if len(edgefiles) <= 0:
messagebox.showerror("No Edge File", "Cannot find the edge
files.")
positivewords = read_sentiment(positiveFileName)
negativewords = read_sentiment(negativeFileName)
self.summary.drop(self.summary.index, inplace=True)
self.proRun["value"] = 0.0
self.proRun.update()
root.config(cursor="wait")
root.update()
time.sleep(0.300)
for index, edgefile in enumerate(edgefiles):
edges, words = read_edge_list(edgefile)
if targetWord not in words:
messagebox.showerror("Invalid Target", "Target does
not exist in " + edgefile)
else:
possiblepositive = positivewords & words
possiblenegative = negativewords & words
positivedict, negativedict = \
run_shortest_path_algorithm(edges,
possiblepositive, possiblenegative, targetWord)
statistics_summary =
calculate_statistics_summary(positivedict, negativedict,
positivewords, negativewords)
self.summary.loc[edgefile] = statistics_summary
self.proRun["value"] = 100 * (index + 1) / len(edgefiles)
self.proRun.update()
root.config(cursor="")
if self.summary.shape[0] > 0:
self.summary.loc['mean'] = self.summary.mean()
self.summary.loc['std'] = self.summary.std()
if self.initSaveDir is None:
self.initSaveDir = "/"
outputFile =
filedialog.asksaveasfilename(initialdir=self.initSaveDir, title="Save
Summary File", filetypes=(("Text file", "*.txt"),))
self.initSaveDir = outputFile
if outputFile:
with open(outputFile, 'w') as outfp:
self.summary.to_string(outfp)
app = SentimentWindow()
root.mainloop()
Some data files for this program:
-
Set the target word to
bp
.
Here's the code in a file.
-
\$\begingroup\$ How are these files supposed to be opened? \$\endgroup\$T145– T1452019年11月10日 20:56:15 +00:00Commented Nov 10, 2019 at 20:56
-
1\$\begingroup\$ For all the comments in the code, I can't tell whether it tries to implement a standard algorithm for shortest path or something home-grown. (I can take a guess given the function names that it uses one as a partial solution.) \$\endgroup\$greybeard– greybeard2019年11月10日 23:48:37 +00:00Commented Nov 10, 2019 at 23:48
-
\$\begingroup\$ It uses Dijkstra's algorithm. Wondering if using igraph's implementation would be better. \$\endgroup\$James Danowski– James Danowski2019年11月11日 03:39:51 +00:00Commented Nov 11, 2019 at 3:39
-
\$\begingroup\$ It appears to me that your Dijkstra's algorithm implementation is not using a priority queue? \$\endgroup\$Andrew Au– Andrew Au2019年11月11日 06:47:53 +00:00Commented Nov 11, 2019 at 6:47
-
\$\begingroup\$ How would a priority queue work? \$\endgroup\$James Danowski– James Danowski2019年11月12日 19:13:52 +00:00Commented Nov 12, 2019 at 19:13
1 Answer 1
Just going to comment on style
Method Naming
Method names should be in snake_case
. You got it right with the class names, as they should be PascalCase
, but method names are in snake_case
.
Ternary Operations
Your huge block of if statements in calculate_statistics_summary
can be reduced to one line each, utilizing the ternary operator:
s1 = 0 if actualnumpositive == 0 else sumpositive / actualnumpositive
s2 = 0 if actualnumnegative == 0 else sumnegative / actualnumnegative
s3 = 0 if numnegative == 0 else s1 * numpositive / numnegative
s4 = 0 if s2 == 0 else s3 / s2
s5 = 0 if numpositive == 0 else sumpositive / numpositive
s6 = 0 if numnegative == 0 else sumnegative / numnegative
s7 = 0 if numnegative == 0 else s5 * numpositive / numnegative
s8 = 0 if s6 == 0 else s7 / s6
s9 = s3 - s2
Type Hints
You should use type hints to make it clear what types of parameters are acceptable to methods, and what type are returned, if any. For example:
def calculate_statistics_summary(positivedict, negativedict, positivewords, negativewords):
can be this (added spacing to make it readable)
def calculate_statistics_summary(
positivedict: dict,
negativedict: dict,
positivewords: set,
negativewords: set
) -> List[float]:
Now it's much clearer what types are passed to the method, and that a list containing floats (from my interpretation) is being returned.