python program for computing shortest paths

Question 1

This code inputs a weighted edgelist, positive and negative sentiment seed words, and a target word. The program computes the sum of the weights along shortest paths from the seed words to the target and from the target to the seed words, as it generates 9 output values.

The program is very slow. Running large edgelist files takes days, rather than minutes or seconds.

How can this program be sped up?

from tkinter import Tk, X, Y, TOP, BOTTOM, LEFT, RIGHT, BOTH, END
from tkinter import filedialog, messagebox
from tkinter.ttk import Frame, Button, Entry, Label, Progressbar
import os, glob, time
import pandas as pd
root = Tk()
root.geometry("600x400+300+300")
def read_edge_list(filename):
 edges = {}
 words = set()
 with open(filename) as fp:
 lines = fp.readlines()
 
 for line in lines:
 token = line.split()
 
 if len(token) != 3:
 continue
 
 word1 = token[0]
 word2 = token[1]
 freq = token[2]
 
 words = words | {word1, word2}
 
 if not word1 in edges.keys():
 edges[word1] = {}
 
 if not word2 in edges[word1]:
 edges[word1][word2] = {}
 
 edges[word1][word2] = freq
 
 return edges, words
def read_sentiment(filename):
 with open(filename, encoding='utf-8-sig') as fp:
 lines = fp.readlines()
 
 words = {line.strip() for line in lines}
 
 return words
def read_target_word():
 word = input("Please input target word: ")
 return word
def run_shortest_path_algorithm(edges, positive, negative, target):
 positivedict = {}
 negativedict = {}
 for source in positive:
 dist1 = dijkstra(edges, source, target)
 dist2 = dijkstra(edges, target, source)
 if dist1 and dist2:
 positivedict[source] = dist1 + dist2
 
 for source in negative:
 dist1 = dijkstra(edges, source, target)
 dist2 = dijkstra(edges, target, source)
 if dist1 and dist2:
 negativedict[source] = dist1 + dist2
 
 return positivedict, negativedict
def calculate_statistics_summary(positivedict, negativedict, 
positivewords, negativewords):
 numpositive = len(positivedict)
 numnegative = len(negativedict)
 actualnumpositive = len(positivewords)
 actualnumnegative = len(negativewords)
 sumpositive = sum(positivedict.values())
 sumnegative = sum(negativedict.values())
 if actualnumpositive == 0:
 s1 = 0
 else:
 s1 = sumpositive / actualnumpositive
 
 if actualnumnegative == 0:
 s2 = 0
 else:
 s2 = sumnegative / actualnumnegative
 
 if numnegative == 0:
 s3 = 0
 else:
 s3 = s1 * numpositive / numnegative
 
 if s2 == 0:
 s4 = 0
 else:
 s4 = s3 / s2
 
 if numpositive == 0:
 s5 = 0
 else:
 s5 = sumpositive / numpositive
 
 if numnegative == 0:
 s6 = 0
 else:
 s6 = sumnegative / numnegative
 
 if numnegative == 0:
 s7 = 0
 else:
 s7 = s5 * numpositive / numnegative
 
 if s6 == 0:
 s8 = 0
 else:
 s8 = s7 / s6
 
 s9 = s3 - s2
 return [s1, s2, s3, s4, s5, s6, s7, s8, s9]
def write_output_file():
 pass
def dijkstra(graph, start, end):
 shortest_paths = {start: (None, 0)}
 current_node = start
 visited = set()
 while current_node != end:
 visited.add(current_node)
 
 if current_node not in graph:
 destinations = []
 else:
 destinations = graph[current_node].keys()
 
 weight_to_current_node = shortest_paths[current_node][1]
 
 for next_node in destinations:
 weight = int(graph[current_node][next_node]) + 
weight_to_current_node
 if next_node not in shortest_paths:
 shortest_paths[next_node] = (current_node, weight)
 else:
 current_shortest_weight = shortest_paths[next_node][1]
 if current_shortest_weight > weight:
 shortest_paths[next_node] = (current_node, weight)
 
 next_destinations = {node: shortest_paths[node] for node in 
shortest_paths if node not in visited}
 if not next_destinations:
 return None
 
 current_node = min(next_destinations, key=lambda k: 
next_destinations[k][1])
 
 #path = []
 #while current_node is not None:
 #path.append(current_node)
 #next_node = shortest_paths[current_node][0]
 #current_node = next_node
 
 #path = path[::-1]
 #return path
 return shortest_paths[end][1]
class SentimentWindow(Frame):
 def __init__(self):
 super().__init__()
 
 self.initUI()
 
 self.initPositiveDir = None
 self.initNegativeDir = None
 self.initSaveDir = None
 
 self.summary = pd.DataFrame(columns=['S1', 'S2', 'S3', 'S4', 
'S5', 'S6', 'S7', 'S8', 'S9'])
 
 def initUI(self):
 self.master.title("Sentiment")
 self.pack(fill=BOTH, expand=True, padx=15, pady=15)
 
 frmEdges = Frame(self)
 frmEdges.pack(fill=X, expand=True)
 
 lblEdges = Label(frmEdges, text="Select the directory of edge 
list.")
 lblEdges.pack(expand=True, fill=X, side=TOP, pady=2)
 
 frmEdgesPath = Frame(frmEdges)
 frmEdgesPath.pack(expand=True, fill=X, side=BOTTOM, pady=2)
 
 self.entEdgesPath = Entry(frmEdgesPath, width=60)
 self.entEdgesPath.pack(expand=True, fill=X, side=LEFT)
 
 btnEdgesPath = Button(frmEdgesPath, width=20, text="Load 
Edges", command=self.loadEdges)
 btnEdgesPath.pack(expand=True, side=RIGHT)
 
 frmPositive = Frame(self)
 frmPositive.pack(fill=X, expand=True)
 
 lblPositive = Label(frmPositive, text="Select the positive 
file.")
 lblPositive.pack(expand=True, fill=X, side=TOP, pady=2)
 
 frmPositivePath = Frame(frmPositive)
 frmPositivePath.pack(expand=True, fill=X, side=BOTTOM, pady=2)
 
 self.entPositivePath = Entry(frmPositivePath, width=60)
 self.entPositivePath.pack(expand=True, fill=X, side=LEFT)
 
 btnPositivePath = Button(frmPositivePath, width=20, text="Load 
Positive", command=self.loadPositive)
 btnPositivePath.pack(expand=True, side=RIGHT)
 
 frmNegative = Frame(self)
 frmNegative.pack(fill=X, expand=True)
 
 lblNegative = Label(frmNegative, text="Select the negative 
file.")
 lblNegative.pack(expand=True, fill=X, side=TOP, pady=2)
 
 frmNegativePath = Frame(frmNegative)
 frmNegativePath.pack(expand=True, fill=X, side=BOTTOM, pady=2)
 
 self.entNegativePath = Entry(frmNegativePath, width=60)
 self.entNegativePath.pack(expand=True, fill=X, side=LEFT)
 
 btnNegativePath = Button(frmNegativePath, width=20, text="Load 
Negative", command=self.loadNegative)
 btnNegativePath.pack(expand=True, side=RIGHT)
 
 frmTarget = Frame(self)
 frmTarget.pack(fill=X, expand=True)
 
 lblTarget = Label(frmTarget, text="Input the target word.")
 lblTarget.pack(expand=True, fill=X, side=TOP, pady=2)
 
 self.entTarget = Entry(frmTarget)
 self.entTarget.pack(fill=X, expand=True, pady=2)
 
 frmRun = Frame(self)
 frmRun.pack(fill=X, expand=True, pady=20)
 
 self.proRun = Progressbar(frmRun, value=0)
 self.proRun.pack(fill=X, expand=True, side=LEFT)
 
 btnRun = Button(frmRun, text = "Run", width=20, 
command=self.run)
 btnRun.pack(side=RIGHT, padx=20)
 
 
 def loadEdges(self):
 edgesFolderName = filedialog.askdirectory()
 
 if edgesFolderName:
 self.entEdgesPath.delete(0, END)
 self.entEdgesPath.insert(0, edgesFolderName)
 
 def loadPositive(self):
 if self.initPositiveDir is None:
 self.initPositiveDir = "/"
 
 positiveFileName = 
filedialog.askopenfilename(initialdir=self.initPositiveDir, 
title="Open Positive File", filetypes=(("Text file", "*.txt"),))
 
 if positiveFileName:
 self.initPositiveDir = positiveFileName
 self.entPositivePath.delete(0, END)
 self.entPositivePath.insert(0, positiveFileName)
 def loadNegative(self):
 if self.initNegativeDir is None:
 self.initNegativeDir = "/"
 
 negativeFileName = 
filedialog.askopenfilename(initialdir=self.initNegativeDir, 
title="Open Positive File", filetypes=(("Text file", "*.txt"),))
 
 if negativeFileName:
 self.initNegativeDir = negativeFileName
 self.entNegativePath.delete(0, END)
 self.entNegativePath.insert(0, negativeFileName)
 def run(self):
 edgesFolderName = self.entEdgesPath.get()
 if not os.path.isdir(edgesFolderName):
 messagebox.showerror("Invalid Path", "The directory of 
edge list is invalid.")
 return
 
 positiveFileName = self.entPositivePath.get()
 if not os.path.isfile(positiveFileName):
 messagebox.showerror("Invalid Path", "The positive 
filename is invalid.")
 return
 
 negativeFileName = self.entNegativePath.get()
 if not os.path.isfile(negativeFileName):
 messagebox.showerror("Invalid Path", "The negative 
filename is invalid.")
 return
 
 targetWord = self.entTarget.get()
 if targetWord is None or len(targetWord) <= 0:
 messagebox.showerror("No Target", "Please input the target 
word.")
 
 os.chdir(edgesFolderName)
 edgefiles = glob.glob("*.pr")
 
 if len(edgefiles) <= 0:
 messagebox.showerror("No Edge File", "Cannot find the edge 
files.")
 positivewords = read_sentiment(positiveFileName)
 
 negativewords = read_sentiment(negativeFileName)
 
 self.summary.drop(self.summary.index, inplace=True)
 self.proRun["value"] = 0.0
 self.proRun.update()
 root.config(cursor="wait")
 root.update()
 time.sleep(0.300)
 
 for index, edgefile in enumerate(edgefiles):
 edges, words = read_edge_list(edgefile)
 
 if targetWord not in words:
 messagebox.showerror("Invalid Target", "Target does 
not exist in " + edgefile)
 else:
 possiblepositive = positivewords & words
 possiblenegative = negativewords & words
 
 positivedict, negativedict = \
 run_shortest_path_algorithm(edges, 
possiblepositive, possiblenegative, targetWord)
 
 statistics_summary = 
calculate_statistics_summary(positivedict, negativedict, 
positivewords, negativewords)
 self.summary.loc[edgefile] = statistics_summary
 self.proRun["value"] = 100 * (index + 1) / len(edgefiles)
 self.proRun.update()
 
 root.config(cursor="") 
 
 if self.summary.shape[0] > 0:
 self.summary.loc['mean'] = self.summary.mean()
 self.summary.loc['std'] = self.summary.std()
 
 if self.initSaveDir is None:
 self.initSaveDir = "/"
 
 outputFile = 
filedialog.asksaveasfilename(initialdir=self.initSaveDir, title="Save 
Summary File", filetypes=(("Text file", "*.txt"),))
 self.initSaveDir = outputFile
 
 if outputFile:
 with open(outputFile, 'w') as outfp:
 self.summary.to_string(outfp)
 
app = SentimentWindow()
root.mainloop()

Some data files for this program:

small edgelists
a big file that will run for days
the negative seed file
the positive seed file

Set the target word to bp.

Here's the code in a file.

Question 2

How are these files supposed to be opened?

Question 3

For all the comments in the code, I can't tell whether it tries to implement a standard algorithm for shortest path or something home-grown. (I can take a guess given the function names that it uses one as a partial solution.)

Question 4

It uses Dijkstra's algorithm. Wondering if using igraph's implementation would be better.

Question 5

It appears to me that your Dijkstra's algorithm implementation is not using a priority queue?

Question 6

How would a priority queue work?

Question 7

Just going to comment on style

Method Naming

Method names should be in snake_case. You got it right with the class names, as they should be PascalCase, but method names are in snake_case.

Ternary Operations

Your huge block of if statements in calculate_statistics_summary can be reduced to one line each, utilizing the ternary operator:

s1 = 0 if actualnumpositive == 0 else sumpositive / actualnumpositive
s2 = 0 if actualnumnegative == 0 else sumnegative / actualnumnegative
s3 = 0 if numnegative == 0 else s1 * numpositive / numnegative
s4 = 0 if s2 == 0 else s3 / s2
s5 = 0 if numpositive == 0 else sumpositive / numpositive
s6 = 0 if numnegative == 0 else sumnegative / numnegative
s7 = 0 if numnegative == 0 else s5 * numpositive / numnegative
s8 = 0 if s6 == 0 else s7 / s6
s9 = s3 - s2

Type Hints

You should use type hints to make it clear what types of parameters are acceptable to methods, and what type are returned, if any. For example:

def calculate_statistics_summary(positivedict, negativedict, positivewords, negativewords):

can be this (added spacing to make it readable)

def calculate_statistics_summary(
 positivedict: dict,
 negativedict: dict,
 positivewords: set,
 negativewords: set
) -> List[float]:

Now it's much clearer what types are passed to the method, and that a list containing floats (from my interpretation) is being returned.

Ben A 10.8k5 gold badges40 silver badges103 bronze badges · Answer 1 · 2019-11-10 20:33:08Z

Just going to comment on style

Method Naming

Method names should be in snake_case. You got it right with the class names, as they should be PascalCase, but method names are in snake_case.

Ternary Operations

Your huge block of if statements in calculate_statistics_summary can be reduced to one line each, utilizing the ternary operator:

s1 = 0 if actualnumpositive == 0 else sumpositive / actualnumpositive
s2 = 0 if actualnumnegative == 0 else sumnegative / actualnumnegative
s3 = 0 if numnegative == 0 else s1 * numpositive / numnegative
s4 = 0 if s2 == 0 else s3 / s2
s5 = 0 if numpositive == 0 else sumpositive / numpositive
s6 = 0 if numnegative == 0 else sumnegative / numnegative
s7 = 0 if numnegative == 0 else s5 * numpositive / numnegative
s8 = 0 if s6 == 0 else s7 / s6
s9 = s3 - s2

Type Hints

You should use type hints to make it clear what types of parameters are acceptable to methods, and what type are returned, if any. For example:

def calculate_statistics_summary(positivedict, negativedict, positivewords, negativewords):

can be this (added spacing to make it readable)

def calculate_statistics_summary(
 positivedict: dict,
 negativedict: dict,
 positivewords: set,
 negativewords: set
) -> List[float]:

Now it's much clearer what types are passed to the method, and that a list containing floats (from my interpretation) is being returned.

Stack Exchange Network

python program for computing shortest paths

1 Answer 1

Method Naming

Ternary Operations

Type Hints

You must log in to answer this question.

Hot Network Questions

python program for computing shortest paths

1 Answer 1

Method Naming

Ternary Operations

Type Hints

You must log in to answer this question.

Related

Hot Network Questions