Code Review

added 1373 characters in body
Jamal
from __future__ import division
from multiprocessing import Pool
from scipy.stats import entropy
import codecs
import os
import scipy
import scipy.sparse

# --- data & parameters --- #
frequenciesAdjectives = codecs.open('frequencies.txt', 'r', 'utf-8')
results = codecs.open('results.txt', 'w', 'utf-8')
temporaryFilesPath = "/temp/"
cores = 16

# --- functions --- #
# calculate Jensen-Shannon divergence of two probability row vectors
def JSD(p, q):
    m = 0.5 * (p + q)
    jsd = 0.5 * (entropy(p, m) + entropy(q, m))
    return jsd
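# Note: with two arguments, scipy.stats.entropy(p, m) returns the Kullback-Leibler
# divergence KL(p || m) (normalizing its inputs), and it uses the natural log by
# default, so this JSD comes out in nats rather than bits; pass base=2 for bits.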
# calculate JSD for every i-th noun with every other noun and write to a temp file,
# where i is the number of cores used, and return the filepath
def getJSDs(n):
    # open temporary file
    filename = "temp" + str(n).zfill(2)
    fullpath = temporaryFilesPath + filename
    temp = codecs.open(fullpath, "w", "utf-8")
    # shortcut write function
    tempwrite = temp.write

    # calculate JSD for each noun pair and write to temporary file
    for index, noun1 in enumerate(nouns[n::cores]):
        index = index*cores + n
        first = sparseMatrix.getrow(index).toarray()[0]
        tempwrite("here")
        for index2, noun2 in enumerate(nouns[index:]):
            index2 += index
            second = sparseMatrix.getrow(index2).toarray()[0]
            divergence = JSD(first, second)
            tempwrite(u"{noun1}\t{noun2}\t{divergence}\n".format(**locals()))
    temp.close()
    return fullpath

# --- processing --- #
# set up parameter variables for sparse matrix
nouns = []
adjectiveIDs = {}  # column id of each adjective in matrix
frequencies = []   # non-zero entries in matrix
positions = []     # column id of the frequency in the corresponding position in "frequencies"
indices = [0]      # frequencies[indices[i]:indices[i+1]] = non-zero entries of row i of the matrix

# ignore file header
frequenciesAdjectives.readline()

# incrementally collect sparse matrix parameters (turning frequencies into probabilities)
for line in frequenciesAdjectives:
    line = line.strip().lower().split("\t")
    noun = line[0]
    nouns.append(noun)
    adjectiveList = [pair.split(" ") for pair in line[2:]]
    total = sum([int(frequency) for _, frequency in adjectiveList])
    for pair in adjectiveList:
        adjective, frequency = pair
        probability = int(frequency)/total
        position = adjectiveIDs.setdefault(adjective, len(adjectiveIDs))
        frequencies.append(probability)
        positions.append(position)
    indices.append(len(frequencies))

# turn lists into arrays
frequencies = scipy.array(frequencies)
positions = scipy.array(positions)
indices = scipy.array(indices)

# create sparse matrix from parameter arrays and delete arrays
sparseMatrix = scipy.sparse.csr_matrix((frequencies, positions, indices), shape=(len(nouns), len(adjectiveIDs)))
del frequencies, positions, indices, adjectiveIDs

# calculate JSDs in parallel and get list of temporary files
pool = Pool()
tempFiles = pool.map(getJSDs, range(cores))
pool.close()
pool.join()

# shortcut results.write and write header
resultswrite = results.write
resultswrite(u"noun1\tnoun2\tjensenShannonDivergence\n")

# combine temporary files into results file and delete them
for path in tempFiles:
    tempfile = codecs.open(path, "r", "utf-8")
    for line in tempfile:
        resultswrite(line)
    tempfile.close()
    os.remove(path)
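For reference, the csr_matrix((data, indices, indptr)) constructor used above builds row i from data[indptr[i]:indptr[i+1]], placed at the columns given by indices[indptr[i]:indptr[i+1]]. A tiny self-contained toy example (made-up numbers, not the real frequency data):

import scipy.sparse

data    = [0.5, 0.5, 1.0]  # non-zero probabilities
columns = [0, 2, 1]        # column ids of those values
indptr  = [0, 2, 3]        # row 0 -> data[0:2], row 1 -> data[2:3]
toy = scipy.sparse.csr_matrix((data, columns, indptr), shape=(2, 3))
print(toy.toarray())
# [[ 0.5  0.   0.5]
#  [ 0.   1.   0. ]]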
Rollback to Revision 4
Jamal

--- EDIT --- (apparently I'm not supposed to answer my own post without others having answered - noted)

@TheBlackCat, thanks, this package looks interesting and like a good way to get fine-grained parallelization, but it seemed like a bit much for this purpose.

@Veedrac, I'll remember to profile before posting here next time. I rewrote the entire thing; it's now two orders of magnitude (500x or so!) faster. Key changes included: using a list of tuples (noun, adjectiveDictionary) instead of the sparse matrix and dropping the 0 elements completely; using math.log instead of scipy.log (huge difference!); switching from Python 2 to Python 3; and not dividing the tasks up manually, instead using a generator and the queue that comes with imap_unordered, for which it was crucial to find the right chunksize.
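To illustrate the math.log vs. scipy.log point, here is a minimal timing sketch (not from the original post; scipy.log was essentially NumPy's array-aware log, so numpy.log2 stands in for it, and the exact numbers are machine-dependent):

import timeit
from math import log2
import numpy as np

x = 0.37
n = 100000
# math.log2 is a plain C call on a scalar; the NumPy ufunc pays conversion
# and dispatch overhead on every scalar call.
t_math = timeit.timeit(lambda: log2(x), number=n)
t_numpy = timeit.timeit(lambda: np.log2(x), number=n)
print("math.log2 : %.3f s" % t_math)
print("numpy.log2: %.3f s" % t_numpy)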

Here's the new code:

#!/usr/bin/env python3
from multiprocessing import Pool
from collections import Counter
from math import log2
# from numba import jit

# --- data & parameters --- #
frequenciesAdjectives = open('/home/christian/results/gender/frequenciesAdjectivesGivenNouns_UK.txt', 'r')
results = open('/home/christian/results/gender/JensenShannonDivergences_ukWaC.txt', 'w')
cores = 16

# --- functions --- #
# calculates Jensen-Shannon divergence from a tuple of two nouns and their
# associated adjective probabilities in dictionaries p and q
def JSD(nounTuple):
    noun1, noun2, p, q = nounTuple
    jsd = 0.0
    m = p + q
    for key in m:
        m_key = 0.5 * m[key]
        if key in p:
            p_key = p[key]
            jsd += 0.5 * p_key * log2(p_key/m_key)
        if key in q:
            q_key = q[key]
            jsd += 0.5 * q_key * log2(q_key/m_key)
    return noun1, noun2, jsd

# yields every (noun, noun2, adjectives, adjectives2) pair, roughly len(tuples)**2 / 2 jobs
def jobGenerator(tuples):
    for index, (noun, adjectives) in enumerate(tuples):
        for noun2, adjectives2 in tuples[index:]:
            yield noun, noun2, adjectives, adjectives2

# --- processing --- #
# ignore header
frequenciesAdjectives.readline()

# make list of tuples of nouns and dictionaries containing their preceding adjective frequencies
nounAdjectives = []
for line in frequenciesAdjectives:
    adjectives = Counter()
    line = line.strip().lower().split("\t")
    noun = line[0]
    adjectiveList = [pair.split(" ") for pair in line[2:]]
    frequencySum = sum(int(frequency) for _, frequency in adjectiveList)
    for adjective, frequency in adjectiveList:
        probability = int(frequency)/frequencySum
        adjectives[adjective] = probability
    nounAdjectives.append((noun, adjectives))

# make generator of (noun, noun2, adjectives, adjectives2)-tuples
jobs = jobGenerator(nounAdjectives)

# shortcut results.write and write header
resultswrite = results.write
resultswrite(u"noun1\tnoun2\tjensenShannonDivergence")

# calculate JSDs in parallel and write to file
pool = Pool(cores)
for noun1, noun2, jsd in pool.imap_unordered(JSD, jobs, chunksize=500000):
    resultswrite(u"\n{noun1}\t{noun2}\t{jsd}".format_map(locals()))
pool.close()
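A quick sanity check of the JSD function above on hand-made toy distributions (not data from the post): identical inputs give 0, and because of log2 the divergence is measured in bits.

from collections import Counter

a = Counter({"red": 0.5, "old": 0.5})
b = Counter({"red": 0.5, "new": 0.5})
print(JSD(("nounA", "nounA", a, a)))  # ('nounA', 'nounA', 0.0)
print(JSD(("nounA", "nounB", a, b)))  # ('nounA', 'nounB', 0.5) - half the probability mass differs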

added 3299 characters in body

Replaced original code with new code from a self-answer
Jamal

added 2 characters in body
ferada

Post Reopened by TheCoffeeCup, ferada, Community Bot, Ethan Bierlein

added 3783 characters in body

Post Closed as "Not suitable for this site" by SirPython, Quill, rolfl