Speed up Python Script to populate MongoDB

Question 1

How would I go about making this script much faster? Essentially it reads from a file and the slowest part is populating the words. In the words file there are over 100k words and I was looking for a way to speed it up since this script is going to be used to populate the database with over 200 languages.

import os, django
os.environ.setdefault("DJANGO_SETTINGS_MODULE","eureka.settings")
django.setup()
from django.contrib.auth.models import User
from django.contrib import admin
from wordDictionary.models import Genus, Word, Feature, Dimension, Language, Lemma, Family, TagSet, POS
import multiprocessing
# Dimensions
def dimensionPop():
 dimData = open("data/models/dimensions.txt","r")
 for x in dimData:
 a = x.split("\n")
 dimName = a[0]
 # Create Object 
 nextDim = Dimension(name=dimName)
 nextDim.save()
 #print(f":{nextDim.name}:")
 dimData.close()
 print("Dimension done")
# Features
def featurePop():
 featData = open("data/models/features.txt","r")
 for x in featData:
 line = x.split(";")
 featName = line[1]
 dimName = line[0]
 # Create Object 
 nextFeature = Feature(name=featName)
 dimObject = Dimension.objects.get(name=dimName)
 nextFeature.dimension = dimObject 
 #print(f"{nextFeature.dimension.name}")
 nextFeature.save()
 featData.close()
 print("Feature done")
# Part of Speech
def posPop():
 posData = open("data/models/POS.txt","r")
 for x in posData:
 line = x.split(";")
 posName = line[1]
 # Create Object 
 nextPOS = POS(name=posName)
 #print(f"{nextPOS.name}")
 nextPOS.save()
 posData.close()
 print("Part of Speech done")
# Genus
def genusPop():
 genusData = open("data/models/genus.txt","r")
 for x in genusData:
 genusName = x.split("\n")[0]
 # Create Object 
 nextGenus = Genus(name=genusName)
 #print(f":{nextGenus.name}:")
 nextGenus.save()
 genusData.close()
 print("Genus done")
# Family
def familyPop():
 famData = open("data/models/families.txt","r")
 for x in famData:
 FamilyName = x.split(";")[0]
 # Create Object 
 nextFamily = Family(name=FamilyName)
 #print(f":{nextFamily.name}:")
 nextFamily.save()
 famData.close()
 print("Family done")
def languagePop():
 #Populate only english for now
 nextLang = Language(name="English")
 nextLang.walsCode = "eng"
 nextLang.genus = Genus.objects.get(name="Germanic")
 nextLang.family = Family.objects.get(name="Indo-European")
 nextLang.save()
 print("Language done")
def lemmaPop():
 lemmaData = open("data/models/lemmas.txt","r",encoding="utf8")
 for x in lemmaData:
 x = x.split("\n")
 lemmaName = x[0]
 nextLemma = Lemma(name=lemmaName)
 langName = Language.objects.get(name="English")
 nextLemma.language = langName
 posName = POS.objects.get(name="Verb")
 nextLemma.pos = posName
 nextLemma.save()
 lemmaData.close()
 print("Lemma done")
findFeature={}
def readAppendix():
 fileContent = open("data/models/features.txt","r")
 for row in fileContent:
 rowWords = row.split(";")
 dimension = rowWords[0]
 feature = rowWords[1]
 label =(rowWords[2].rstrip()).upper()
 findFeature[label]=feature # assign feature to label
 fileContent.close()
 print("\nStarting with words...")
usedTagset = {}
def wordPop():
 wordData = open("data/langs/English.txt","r",encoding="utf8")
 it = 0
 for line in wordData:
 it += 1
 if it % 1000 :
 print(f"> {it}...")
 rowContent = line.split()
 if(len(rowContent)>=3): # checks if line is valid
 tagsetName = rowContent[-1]
 tagSetObject = None
 try:
 if usedTagset[tagsetName] == 1:
 someTagset = TagSet.objects.get(name=tagsetName)
 tagSetObject = someTagset
 except KeyError:
 usedTagset[tagsetName]=1
 tagSetObject = TagSet(name=tagsetName)
 rootWord = rowContent[0]
 currWordList = rowContent[1:-1] # it can be more than a single words
 currWord = ""
 for temp in currWordList:
 currWord += temp + " "
 currWord = currWord[:-1] # remove last space
 allLabels = tagsetName.split(";") # last block of words corrensponds to allLabels
 for currLabel in allLabels:
 try:
 currFeature = findFeature[currLabel.upper()]
 featObject = Feature.objects.get(name=currFeature)
 tagSetObject.features.add(featObject)
 except KeyError:
 print(f"{currLabel} label doesn't exist.")
 # print(tagSetObject.features.all())
 tagSetObject.save()
 # Defining the Word/Form
 wordObject = Word(name=currWord)
 lemmaObject = Lemma.objects.get(name=rootWord)
 wordObject.lemma = lemmaObject
 wordObject.tagset = tagSetObject
 wordObject.language = lemmaObject.language
 # print(f"{wordObject.name} : {wordObject.lemma} : {wordObject.tagset} : {wordObject.language}")
 wordObject.save()
 wordData.close()
# * uncomment below to populate !!in order!! *
dimensionPop()
featurePop()
genusPop()
posPop()
familyPop()
languagePop()
lemmaPop()
readAppendix()
wordPop()
#
# processes = []
# for _ in range(16):
# p = multiprocessing.Process(target=wordPop)
# p.start()
# processes.append(p)
# for proc in processes:
# process.join()
# Just in case it goes wrong
def emptyDatabase():
 Word.objects.all().delete()
 Lemma.objects.all().delete()
 TagSet.objects.all().delete()
 Language.objects.all().delete()
 Family.objects.all().delete()
 Dimension.objects.all().delete()
 Genus.objects.all().delete()
 POS.objects.all().delete()
 print("Database is empty...")
# emptyDatabase()
```

Question 2

btw tqdm is a great module for reporting progress on a loop with very little overhead. Might be a good alternative to the manual iteration tracking in your word loop.

Question 3

Just looking at wordPop since that's where you say the most time is being taken:

I think your validity check is broken. There's an if statement to make sure the line is valid, but no else, and the code after the if uses variables defined within the conditional block.
Use " ".join(currWordList) instead of manually concatenating a bunch of strings. That'll save you the creation of a bunch of intermediate string objects.
You're storing values of 1 in usedTagset, but that's meaningless. What you're actually doing each time is fetching the object from the database. Sure, that's a "fast" lookup if the name field is indexed, but nowhere near as fast as just fetching the object from the dictionary. Instead, consider something like the following:
```
if tagsetName not in usedTagset:
 usedTagset[tagsetName] = TagSet.objects.create(name=tagsetName)
tagSetObject = usedTagset[tagsetName]
```
Same thing for the findFeature chunk in the allLabels loop. If you're just doing a couple lookups, .get on an indexed column is super fast... but that's "fast" relative to database speeds. In this context, "fast" means a few milliseconds or most of a ms, most likely. That's completely intolerable when you're doing a loop of hundreds of thousands or millions of rows. Fetch the objects once, cache them in memory, and do direct lookups using a dictionary. You can initialize the cache as follows:
```
usedTagset = {tag.name: tag for tag in TagSet.objects.all()}
```
and then, as discussed above, save any newly created objects as you create them rather than re-fetch them each time. If you're not sure if something already exists in the database, use get_or_create:
```
if tagsetName not in usedTagset:
 # get_or_create returns a tuple, the first element of which is the ORM object
 usedTagset[tagsetName] = TagSet.objects.get_or_create(name=tagsetName)[0]
tagSetObject = usedTagset[tagsetName]
```

Cutting the DB and associated driver/network/ORM overhead out of the loop will likely be all you need to get reasonably performant code.

I'm also a fan of ...objects.create. It's more explicit that constructing the object one bit at a time.

lemma = lemma_lookup[rootWord]
Word.objects.create(
 name=currWord,
 lemma=lemma,
 tagset=tagSetObject,
 language=lemma.language
)

And finally, not performance related, but for the love of other Python developers, please follow PEP8 naming conventions, particularly using lower-case underscore-separated names for variables; e.g., word_object instead of wordObject, and tag_set_object instead of tagSetObject.

scnerd scnerdscnerd 2,0607 silver badges12 bronze badges · Accepted Answer · 2020-04-28 16:20:18Z

Just looking at wordPop since that's where you say the most time is being taken:

I think your validity check is broken. There's an if statement to make sure the line is valid, but no else, and the code after the if uses variables defined within the conditional block.
Use " ".join(currWordList) instead of manually concatenating a bunch of strings. That'll save you the creation of a bunch of intermediate string objects.
You're storing values of 1 in usedTagset, but that's meaningless. What you're actually doing each time is fetching the object from the database. Sure, that's a "fast" lookup if the name field is indexed, but nowhere near as fast as just fetching the object from the dictionary. Instead, consider something like the following:
```
if tagsetName not in usedTagset:
 usedTagset[tagsetName] = TagSet.objects.create(name=tagsetName)
tagSetObject = usedTagset[tagsetName]
```
Same thing for the findFeature chunk in the allLabels loop. If you're just doing a couple lookups, .get on an indexed column is super fast... but that's "fast" relative to database speeds. In this context, "fast" means a few milliseconds or most of a ms, most likely. That's completely intolerable when you're doing a loop of hundreds of thousands or millions of rows. Fetch the objects once, cache them in memory, and do direct lookups using a dictionary. You can initialize the cache as follows:
```
usedTagset = {tag.name: tag for tag in TagSet.objects.all()}
```
and then, as discussed above, save any newly created objects as you create them rather than re-fetch them each time. If you're not sure if something already exists in the database, use get_or_create:
```
if tagsetName not in usedTagset:
 # get_or_create returns a tuple, the first element of which is the ORM object
 usedTagset[tagsetName] = TagSet.objects.get_or_create(name=tagsetName)[0]
tagSetObject = usedTagset[tagsetName]
```

Cutting the DB and associated driver/network/ORM overhead out of the loop will likely be all you need to get reasonably performant code.

I'm also a fan of ...objects.create. It's more explicit that constructing the object one bit at a time.

lemma = lemma_lookup[rootWord]
Word.objects.create(
 name=currWord,
 lemma=lemma,
 tagset=tagSetObject,
 language=lemma.language
)

And finally, not performance related, but for the love of other Python developers, please follow PEP8 naming conventions, particularly using lower-case underscore-separated names for variables; e.g., word_object instead of wordObject, and tag_set_object instead of tagSetObject.

Stack Exchange Network

Speed up Python Script to populate MongoDB

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Speed up Python Script to populate MongoDB

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions