Code Review

added 1373 characters in body
Jamal
from __future__ import division
from multiprocessing import Pool
from scipy.stats import entropy
import codecs
import os
import scipy
import scipy.sparse

# --- data & parameters --- #
frequenciesAdjectives = codecs.open('frequencies.txt', 'r', 'utf-8')
results = codecs.open('results.txt', 'w', 'utf-8')
temporaryFilesPath = "/temp/"
cores = 16

# --- functions --- #
# calculate Jensen-Shannon divergence of two probability row vectors
def JSD(p, q):
    m = 0.5 * (p + q)
    jsd = 0.5 * (entropy(p, m) + entropy(q, m))
    return jsd
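# Note: with two arguments, scipy.stats.entropy(p, m) returns the Kullback-Leibler
# divergence KL(p || m) (normalizing its inputs), and it uses the natural log by
# default, so this JSD comes out in nats rather than bits; pass base=2 for bits.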
# calculate JSD for every i-th noun with every other noun and write to a temp file,
# where i is the number of cores used, and return the filepath
def getJSDs(n):
    # open temporary file
    filename = "temp" + str(n).zfill(2)
    fullpath = temporaryFilesPath + filename
    temp = codecs.open(fullpath, "w", "utf-8")
    # shortcut write function
    tempwrite = temp.write

    # calculate JSD for each noun pair and write to temporary file
    for index, noun1 in enumerate(nouns[n::cores]):
        index = index*cores + n
        first = sparseMatrix.getrow(index).toarray()[0]
        tempwrite("here")
        for index2, noun2 in enumerate(nouns[index:]):
            index2 += index
            second = sparseMatrix.getrow(index2).toarray()[0]
            divergence = JSD(first, second)
            tempwrite(u"{noun1}\t{noun2}\t{divergence}\n".format(**locals()))
    temp.close()
    return fullpath

# --- processing --- #
# set up parameter variables for sparse matrix
nouns = []
adjectiveIDs = {}  # column id of each adjective in matrix
frequencies = []   # non-zero entries in matrix
positions = []     # column id of the frequency in the corresponding position in "frequencies"
indices = [0]      # frequencies[indices[i]:indices[i+1]] = non-zero entries of row i of the matrix

# ignore file header
frequenciesAdjectives.readline()

# incrementally collect sparse matrix parameters (turning frequencies into probabilities)
for line in frequenciesAdjectives:
    line = line.strip().lower().split("\t")
    noun = line[0]
    nouns.append(noun)
    adjectiveList = [pair.split(" ") for pair in line[2:]]
    total = sum([int(frequency) for _, frequency in adjectiveList])
    for pair in adjectiveList:
        adjective, frequency = pair
        probability = int(frequency)/total
        position = adjectiveIDs.setdefault(adjective, len(adjectiveIDs))
        frequencies.append(probability)
        positions.append(position)
    indices.append(len(frequencies))

# turn lists into arrays
frequencies = scipy.array(frequencies)
positions = scipy.array(positions)
indices = scipy.array(indices)

# create sparse matrix from parameter arrays and delete arrays
sparseMatrix = scipy.sparse.csr_matrix((frequencies, positions, indices), shape=(len(nouns), len(adjectiveIDs)))
del frequencies, positions, indices, adjectiveIDs

# calculate JSDs in parallel and get list of temporary files
pool = Pool()
tempFiles = pool.map(getJSDs, range(cores))
pool.close()
pool.join()

# shortcut results.write and write header
resultswrite = results.write
resultswrite(u"noun1\tnoun2\tjensenShannonDivergence\n")

# combine temporary files into results file and delete them
for path in tempFiles:
    tempfile = codecs.open(path, "r", "utf-8")
    for line in tempfile:
        resultswrite(line)
    tempfile.close()
    os.remove(path)
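For reference, the csr_matrix((data, indices, indptr)) constructor used above builds row i from data[indptr[i]:indptr[i+1]], placed at the columns given by indices[indptr[i]:indptr[i+1]]. A tiny self-contained toy example (made-up numbers, not the real frequency data):

import scipy.sparse

data    = [0.5, 0.5, 1.0]  # non-zero probabilities
columns = [0, 2, 1]        # column ids of those values
indptr  = [0, 2, 3]        # row 0 -> data[0:2], row 1 -> data[2:3]
toy = scipy.sparse.csr_matrix((data, columns, indptr), shape=(2, 3))
print(toy.toarray())
# [[ 0.5  0.   0.5]
#  [ 0.   1.   0. ]]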
Rollback to Revision 4
Jamal

--- EDIT --- (apparently I'm not supposed to answer my own post without others having answered - noted)

@TheBlackCat, thanks, this package looks interesting and like a good way to get fine-grained parallelization, but it seemed like a bit much for this purpose.

@Veedrac, I'll remember to profile before posting here next time. I rewrote the entire thing; it's now two orders of magnitude (500x or so!) faster. Key changes included: using a list of tuples (noun, adjectiveDictionary) instead of the sparse matrix and dropping the 0 elements completely; using math.log instead of scipy.log (huge difference!); switching from Python 2 to Python 3; and not dividing the tasks up manually, instead using a generator and the queue that comes with imap_unordered, for which it was crucial to find the right chunksize.
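To illustrate the math.log vs. scipy.log point, here is a minimal timing sketch (not from the original post; scipy.log was essentially NumPy's array-aware log, so numpy.log2 stands in for it, and the exact numbers are machine-dependent):

import timeit
from math import log2
import numpy as np

x = 0.37
n = 100000
# math.log2 is a plain C call on a scalar; the NumPy ufunc pays conversion
# and dispatch overhead on every scalar call.
t_math = timeit.timeit(lambda: log2(x), number=n)
t_numpy = timeit.timeit(lambda: np.log2(x), number=n)
print("math.log2 : %.3f s" % t_math)
print("numpy.log2: %.3f s" % t_numpy)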

Here's the new code:

#!/usr/bin/env python3
from multiprocessing import Pool
from collections import Counter
from math import log2
# from numba import jit

# --- data & parameters --- #
frequenciesAdjectives = open('/home/christian/results/gender/frequenciesAdjectivesGivenNouns_UK.txt', 'r')
results = open('/home/christian/results/gender/JensenShannonDivergences_ukWaC.txt', 'w')
cores = 16

# --- functions --- #
# calculates Jensen-Shannon divergence from a tuple of two nouns and their
# associated adjective probabilities in dictionaries p and q
def JSD(nounTuple):
    noun1, noun2, p, q = nounTuple
    jsd = 0.0
    m = p + q
    for key in m:
        m_key = 0.5 * m[key]
        if key in p:
            p_key = p[key]
            jsd += 0.5 * p_key * log2(p_key/m_key)
        if key in q:
            q_key = q[key]
            jsd += 0.5 * q_key * log2(q_key/m_key)
    return noun1, noun2, jsd

# yields every (noun, noun2, adjectives, adjectives2) pair, roughly len(tuples)**2 / 2 jobs
def jobGenerator(tuples):
    for index, (noun, adjectives) in enumerate(tuples):
        for noun2, adjectives2 in tuples[index:]:
            yield noun, noun2, adjectives, adjectives2

# --- processing --- #
# ignore header
frequenciesAdjectives.readline()

# make list of tuples of nouns and dictionaries containing their preceding adjective frequencies
nounAdjectives = []
for line in frequenciesAdjectives:
    adjectives = Counter()
    line = line.strip().lower().split("\t")
    noun = line[0]
    adjectiveList = [pair.split(" ") for pair in line[2:]]
    frequencySum = sum(int(frequency) for _, frequency in adjectiveList)
    for adjective, frequency in adjectiveList:
        probability = int(frequency)/frequencySum
        adjectives[adjective] = probability
    nounAdjectives.append((noun, adjectives))

# make generator of (noun, noun2, adjectives, adjectives2)-tuples
jobs = jobGenerator(nounAdjectives)

# shortcut results.write and write header
resultswrite = results.write
resultswrite(u"noun1\tnoun2\tjensenShannonDivergence")

# calculate JSDs in parallel and write to file
pool = Pool(cores)
for noun1, noun2, jsd in pool.imap_unordered(JSD, jobs, chunksize=500000):
    resultswrite(u"\n{noun1}\t{noun2}\t{jsd}".format_map(locals()))
pool.close()
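A quick sanity check of the JSD function above on hand-made toy distributions (not data from the post): identical inputs give 0, and because of log2 the divergence is measured in bits.

from collections import Counter

a = Counter({"red": 0.5, "old": 0.5})
b = Counter({"red": 0.5, "new": 0.5})
print(JSD(("nounA", "nounA", a, a)))  # ('nounA', 'nounA', 0.0)
print(JSD(("nounA", "nounB", a, b)))  # ('nounA', 'nounB', 0.5) - half the probability mass differs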

added 3299 characters in body

Replaced original code with new code from a self-answer
Jamal

added 2 characters in body
ferada

Post Reopened by TheCoffeeCup, ferada, Community Bot, Ethan Bierlein

added 3783 characters in body

Post Closed as "Not suitable for this site" by SirPython, Quill, rolfl