How would I go about making this script much faster? Essentially it reads from a file and the slowest part is populating the words. In the words file there are over 100k words and I was looking for a way to speed it up since this script is going to be used to populate the database with over 200 languages.
import os, django
os.environ.setdefault("DJANGO_SETTINGS_MODULE","eureka.settings")
django.setup()
from django.contrib.auth.models import User
from django.contrib import admin
from wordDictionary.models import Genus, Word, Feature, Dimension, Language, Lemma, Family, TagSet, POS
import multiprocessing
# Dimensions
def dimensionPop():
dimData = open("data/models/dimensions.txt","r")
for x in dimData:
a = x.split("\n")
dimName = a[0]
# Create Object
nextDim = Dimension(name=dimName)
nextDim.save()
#print(f":{nextDim.name}:")
dimData.close()
print("Dimension done")
# Features
def featurePop():
featData = open("data/models/features.txt","r")
for x in featData:
line = x.split(";")
featName = line[1]
dimName = line[0]
# Create Object
nextFeature = Feature(name=featName)
dimObject = Dimension.objects.get(name=dimName)
nextFeature.dimension = dimObject
#print(f"{nextFeature.dimension.name}")
nextFeature.save()
featData.close()
print("Feature done")
# Part of Speech
def posPop():
posData = open("data/models/POS.txt","r")
for x in posData:
line = x.split(";")
posName = line[1]
# Create Object
nextPOS = POS(name=posName)
#print(f"{nextPOS.name}")
nextPOS.save()
posData.close()
print("Part of Speech done")
# Genus
def genusPop():
genusData = open("data/models/genus.txt","r")
for x in genusData:
genusName = x.split("\n")[0]
# Create Object
nextGenus = Genus(name=genusName)
#print(f":{nextGenus.name}:")
nextGenus.save()
genusData.close()
print("Genus done")
# Family
def familyPop():
famData = open("data/models/families.txt","r")
for x in famData:
FamilyName = x.split(";")[0]
# Create Object
nextFamily = Family(name=FamilyName)
#print(f":{nextFamily.name}:")
nextFamily.save()
famData.close()
print("Family done")
def languagePop():
#Populate only english for now
nextLang = Language(name="English")
nextLang.walsCode = "eng"
nextLang.genus = Genus.objects.get(name="Germanic")
nextLang.family = Family.objects.get(name="Indo-European")
nextLang.save()
print("Language done")
def lemmaPop():
lemmaData = open("data/models/lemmas.txt","r",encoding="utf8")
for x in lemmaData:
x = x.split("\n")
lemmaName = x[0]
nextLemma = Lemma(name=lemmaName)
langName = Language.objects.get(name="English")
nextLemma.language = langName
posName = POS.objects.get(name="Verb")
nextLemma.pos = posName
nextLemma.save()
lemmaData.close()
print("Lemma done")
findFeature={}
def readAppendix():
fileContent = open("data/models/features.txt","r")
for row in fileContent:
rowWords = row.split(";")
dimension = rowWords[0]
feature = rowWords[1]
label =(rowWords[2].rstrip()).upper()
findFeature[label]=feature # assign feature to label
fileContent.close()
print("\nStarting with words...")
usedTagset = {}
def wordPop():
wordData = open("data/langs/English.txt","r",encoding="utf8")
it = 0
for line in wordData:
it += 1
if it % 1000 :
print(f"> {it}...")
rowContent = line.split()
if(len(rowContent)>=3): # checks if line is valid
tagsetName = rowContent[-1]
tagSetObject = None
try:
if usedTagset[tagsetName] == 1:
someTagset = TagSet.objects.get(name=tagsetName)
tagSetObject = someTagset
except KeyError:
usedTagset[tagsetName]=1
tagSetObject = TagSet(name=tagsetName)
rootWord = rowContent[0]
currWordList = rowContent[1:-1] # it can be more than a single words
currWord = ""
for temp in currWordList:
currWord += temp + " "
currWord = currWord[:-1] # remove last space
allLabels = tagsetName.split(";") # last block of words corrensponds to allLabels
for currLabel in allLabels:
try:
currFeature = findFeature[currLabel.upper()]
featObject = Feature.objects.get(name=currFeature)
tagSetObject.features.add(featObject)
except KeyError:
print(f"{currLabel} label doesn't exist.")
# print(tagSetObject.features.all())
tagSetObject.save()
# Defining the Word/Form
wordObject = Word(name=currWord)
lemmaObject = Lemma.objects.get(name=rootWord)
wordObject.lemma = lemmaObject
wordObject.tagset = tagSetObject
wordObject.language = lemmaObject.language
# print(f"{wordObject.name} : {wordObject.lemma} : {wordObject.tagset} : {wordObject.language}")
wordObject.save()
wordData.close()
# * uncomment below to populate !!in order!! *
dimensionPop()
featurePop()
genusPop()
posPop()
familyPop()
languagePop()
lemmaPop()
readAppendix()
wordPop()
#
# processes = []
# for _ in range(16):
# p = multiprocessing.Process(target=wordPop)
# p.start()
# processes.append(p)
# for proc in processes:
# process.join()
# Just in case it goes wrong
def emptyDatabase():
Word.objects.all().delete()
Lemma.objects.all().delete()
TagSet.objects.all().delete()
Language.objects.all().delete()
Family.objects.all().delete()
Dimension.objects.all().delete()
Genus.objects.all().delete()
POS.objects.all().delete()
print("Database is empty...")
# emptyDatabase()
```
1 Answer 1
Just looking at wordPop
since that's where you say the most time is being taken:
I think your validity check is broken. There's an
if
statement to make sure the line is valid, but noelse
, and the code after theif
uses variables defined within the conditional block.Use
" ".join(currWordList)
instead of manually concatenating a bunch of strings. That'll save you the creation of a bunch of intermediate string objects.You're storing values of
1
inusedTagset
, but that's meaningless. What you're actually doing each time is fetching the object from the database. Sure, that's a "fast" lookup if thename
field is indexed, but nowhere near as fast as just fetching the object from the dictionary. Instead, consider something like the following:if tagsetName not in usedTagset: usedTagset[tagsetName] = TagSet.objects.create(name=tagsetName) tagSetObject = usedTagset[tagsetName]
Same thing for the
findFeature
chunk in theallLabels
loop. If you're just doing a couple lookups,.get
on an indexed column is super fast... but that's "fast" relative to database speeds. In this context, "fast" means a few milliseconds or most of a ms, most likely. That's completely intolerable when you're doing a loop of hundreds of thousands or millions of rows. Fetch the objects once, cache them in memory, and do direct lookups using a dictionary. You can initialize the cache as follows:usedTagset = {tag.name: tag for tag in TagSet.objects.all()}
and then, as discussed above, save any newly created objects as you create them rather than re-fetch them each time. If you're not sure if something already exists in the database, use
get_or_create
:if tagsetName not in usedTagset: # get_or_create returns a tuple, the first element of which is the ORM object usedTagset[tagsetName] = TagSet.objects.get_or_create(name=tagsetName)[0] tagSetObject = usedTagset[tagsetName]
Cutting the DB and associated driver/network/ORM overhead out of the loop will likely be all you need to get reasonably performant code.
I'm also a fan of
...objects.create
. It's more explicit that constructing the object one bit at a time.lemma = lemma_lookup[rootWord] Word.objects.create( name=currWord, lemma=lemma, tagset=tagSetObject, language=lemma.language )
And finally, not performance related, but for the love of other Python developers, please follow PEP8 naming conventions, particularly using lower-case underscore-separated names for variables; e.g.,
word_object
instead ofwordObject
, andtag_set_object
instead oftagSetObject
.
tqdm
is a great module for reporting progress on a loop with very little overhead. Might be a good alternative to the manual iteration tracking in your word loop. \$\endgroup\$