I'm building a Django (1.8.12) application that parses a .bc3 file (Standard Interchange Format for Databases of Construction and Real Estate) and loads all the data into the database (PostgreSQL 9.3.9).
A .bc3 file looks like this, and a common one has more than 2000 concepts (those records that start with ~C).
To sum up, the user uploads the file and the webapp in a short period of time is able to insert the data into the database to start working on it.
Models
class Concept(models.Model):
    """A single BC3 concept record (~C): root, chapter, or elementary concept.

    ``code`` is the primary key, so lookups such as
    ``Concept.objects.get(code=...)`` are single-row, indexed hits.
    """
    code = models.CharField(_('code'), max_length=20, primary_key=True)
    # Flags describing the node's role in the decomposition tree; they are
    # derived from the raw code suffix ('##' = root, '#' = chapter) and from
    # whether the code appears as a key in the decompositions dict.
    root = models.BooleanField(_('is it root'), default=False)
    chapter = models.BooleanField(_('is it chapter'), default=False)
    parent = models.BooleanField(_('is it parent'), default=False)
    unit = models.CharField(_('unit'), blank=True, max_length=3)
    summary = models.CharField(_('summary'), blank=True, max_length=100)
    price = models.DecimalField(_('price'), max_digits=12, decimal_places=3,
                                null=True, blank=True)
    date = models.DateField(_('creation date'), null=True, blank=True)
    concept_type = models.CharField(_('concept type'), max_length=3, blank=True)

    def __str__(self):
        return '%s: %s' % (self.code, self.summary)
class Deco(models.Model):
    """A decomposition edge (~D record): links a parent Concept to a child.

    ``related_name='decos'`` lets a parent reach its children via
    ``concept.decos.all()``.
    """
    parent_concept = models.ForeignKey(Concept, null=True, blank=True,
                                       related_name='decos')
    concept = models.ForeignKey(Concept, null=True, blank=True)
    # Quantities parsed from the ~D record; empty fields in the file are
    # stored as 0.000 by the loader.
    factor = models.DecimalField(max_digits=12, decimal_places=3,
                                 default=Decimal('0.000'))
    efficiency = models.DecimalField(max_digits=12, decimal_places=3,
                                     default=Decimal('0.000'))

    def __str__(self):
        return '%s: %s' % (self.parent_concept, self.concept)
bc3parser.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Parses bc3 files and insert all the data into the database."""
import re
from enkendas.models import Version, Concept, Deco, Text
from .utils import optional_codes, parse_dates
# regex stuff
# parsers stuff
# Module-level accumulators: filled record by record while the file is
# parsed, then flushed to the database in bulk afterwards.
concepts = {}
# decos maps a parent concept code to a list of
# (child_code, factor, efficiency) string tuples, for example:
decos = {}
# decos = {'PER02': [('Qexcav', '1', '231.13'), ('Qzanj', '1', '34.5'),
#          ('Qexcav2', '1', '19.07'), ('Qrelltras', '1', '19.07')],
#          ...
#          'Qexcav': [('MMMT.3c', '1', '0.045'), ('O01OA070', '1', '0.054'),
#          ('M07CB030', '1', '0.036'), ('%0300', '1', '0.03')]}
def dispatch_record(record):
    """
    Dispatch every record.
    Check the first character of the record and send it to the proper
    function. Empty records and records whose first character has no
    parser (e.g. 'M', 'E') are silently ignored.
    """
    if record.startswith('D'):
        parse_decomp(record)
    elif record.startswith('V'):
        parse_version(record)
    elif record.startswith('C'):
        parse_concept(record)
    elif record.startswith('T'):
        parse_text(record)
    else:
        pass
def parse_file(file):
    """
    Parse the whole file.
    file is a generator returned by file.chunks(chunk_size=80000) in views.py.

    Records are separated by '~' and may straddle a chunk boundary, which is
    what incomplete_record is meant to bridge.
    """
    while True:
        try:
            record = ''
            # NOTE(review): incomplete_record is re-initialised on every
            # loop iteration, so a record split across two chunks appears
            # to be lost — confirm with a file larger than one chunk.
            incomplete_record = ''
            # Iterates over the file sent by the user.
            byte_string = next(file)
            byte_stripped_string = byte_string.strip()
            # .bc3 files use Latin-1, hence the explicit decode.
            string = byte_stripped_string.decode(encoding='ISO-8859-1')
            # List of records.
            durty_strings_list = string.split('~')
            # Check if one chunk in chunks is complete.
            if durty_strings_list[-1] != '' and incomplete_record != '':
                incomplete_record = incomplete_record + durty_strings_list.pop(-1)
                dispatch_record(incomplete_record)
                incomplete_record = ''
            elif durty_strings_list[-1] != '' and incomplete_record == '':
                incomplete_record = durty_strings_list.pop(-1)
            for durty_string in durty_strings_list:
                stripped_string = durty_string.strip()
                if durty_string == '':
                    record = record + ''
                # TODO: I didn't create a regex for 'M' and 'E' records yet.
                elif durty_string[0] == 'M' or durty_string[0] == 'E':
                    continue
                if record != '':
                    # Dispatch the previous record.
                    dispatch_record(record)
                    # Reset the used record.
                    record = ''
                    # Assign the current record.
                    record = stripped_string
                else:
                    record = record + stripped_string
        except StopIteration as e:
            # End of the upload: flush whatever record is still pending.
            dispatch_record(record)
            break
# --- Phase 2: turn the accumulated `concepts` dict into model instances. ---
concept_instances = []
for key_code, data in concepts.items():
    code = key_code
    root = chapter = parent = False
    # A trailing '##' in the raw code marks the root concept; a single
    # trailing '#' marks a chapter. The suffix is stripped from the stored code.
    if len(key_code) > 2 and key_code[-2:] == '##':
        root = True
        code = key_code[:-2]
    elif len(key_code) > 1 and key_code[-1:] == '#':
        chapter = True
        code = key_code[:-1]
    # A concept is a parent iff it has a decomposition entry.
    if code in decos:
        parent = True
    concept = Concept(code=code, root=root, chapter=chapter, parent=parent,
                      unit=data['unit'], summary=data['summary'],
                      price=data['price'], date=data['date'],
                      concept_type=data['concept_type'])
    concept_instances.append(concept)
# One INSERT round-trip for all concepts.
Concept.objects.bulk_create(concept_instances)
# --- Phase 3: build the Deco rows linking parents to their children. ---
deco_instances = []
cobjs_storage = {}
for concept in Concept.objects.all():
    if concept.parent is False:
        continue
    dec = decos[concept.code]
    for child, factor, efficiency in dec:
        if child == '':
            continue
        # Empty quantities in the file default to zero.
        if factor == '':
            factor = '0.000'
        if efficiency == '':
            efficiency = '0.000'
        # To avoid extra queries.
        # NOTE(review): this per-child .get() still runs inside a nested
        # loop and is the likely source of most of the SQL queries.
        if child in cobjs_storage:
            cobj = cobjs_storage[child]
        else:
            cobj = Concept.objects.get(code=child)
            cobjs_storage.update({child: cobj})
        deco = Deco(parent_concept=concept, concept=cobj,
                    factor=float(factor), efficiency=float(efficiency))
        deco_instances.append(deco)
    decos.pop(concept.code, None)
# One INSERT round-trip for all decomposition rows.
Deco.objects.bulk_create(deco_instances)
Process
Parsing the .bc3 file uploaded by the user. Everything is working as expected.
Instantiating the Concept model. I save the instances in concept_instances = [c1, c2, c3... cn].
Inserting Concept instances into the database. In order to speed up the load I use the bulk_create(concept_instances) method.
Instantiating the Deco model. I save the instances in deco_instances = [d1, d2, d3... dn]. But, to do that I need to retrieve each Concept object from the database because of the parent_concept and concept fields.
Inserting Deco instances into the database. As before, to speed up the load I use the bulk_create(deco_instances) method.
Bottleneck
The whole process on the .bc3 file mentioned earlier is taking too long (95,230 ms) because I'm doing 1278 SQL queries, even though inserting the 1276 Concept objects takes just 693 ms and the 2826 Deco objects only 289 ms.
Research
I read some Stack Overflow questions and the Django official documentation about Database access optimization, but I didn't find any useful improvement for this case.
My Assumption
I think this line of code is the main problem, but in my opinion it is absolutely necessary.
Questions
- Is it possible to create Deco objects without getting every Concept object from the database first?
- Is running tasks in the background the only approach to follow?
- Am I missing something?
1 Answer 1
An important aspect when doing optimisations is profiling. You should really start with that instead of asking random strangers on the internet.
Anyhow, let me take a quick look.
Filtering
for concept in Concept.objects.all():
if concept.parent is False:
continue
...
This seems a bit redundant, why not just
for concept in Concept.objects.filter(parent=True):
...
Many queries
I took a close look at the line you indicated might be troublesome. You have not profiled (I assume), but it looks suspicious because it performs a query in a loop.
So basically the code looks like this:
for concept in Concept.objects.all():
...
for child, factor, efficiency in dec:
...
if child in cobjs_storage:
cobj = cobjs_storage[child]
else:
cobj = Concept.objects.get(code=child)
cobjs_storage.update({child: cobj})
...
So, ideally, you'd want to make sure that cobjs_storage
contains as much as possible. One way to do that would be to add the following before the first for
loop above:
# Pre-fetch required objects.
# decos maps a parent code to a *list* of (child, factor, efficiency)
# tuples, so a nested comprehension is needed to reach the child codes.
needs_prefetch = set(child for dec in decos.values() for child, __, __ in dec)
for cobj in Concept.objects.filter(code__in=needs_prefetch):
    # (fixed: the original assigned the misspelled name `codj`,
    # which would raise NameError)
    cobjs_storage[cobj.code] = cobj
It's a bit hacky, perhaps, but it should lower the number of queries, and as such improve results.
[edit: I just found a better way] Using in_bulk
(https://docs.djangoproject.com/en/1.9/ref/models/querysets/#django.db.models.query.QuerySet.in_bulk) you can rewrite it a bit:
# Pre-fetch required objects with a single query.
# decos maps a parent code to a *list* of (child, factor, efficiency)
# tuples, so flatten the lists before collecting the child codes.
needs_prefetch = set(child for dec in decos.values() for child, __, __ in dec)
# in_bulk() returns {primary_key: instance}, i.e. {code: Concept} here,
# which is exactly the shape cobjs_storage already uses.
cobjs_storage.update(Concept.objects.in_bulk(needs_prefetch))
Also, make sure to add any created Concept
object to the cobjs_storage
after creating, so that you don't incur a database hit for that.
Dispatch
def dispatch_record(record):
    """
    Dispatch every record.
    Check the first character of the record and send it to the proper function.
    """
    # Each branch tests .startswith() in turn; empty records and records
    # with an unrecognised first character fall through to the no-op else.
    if record.startswith('D'):
        parse_decomp(record)
    elif record.startswith('V'):
        parse_version(record)
    elif record.startswith('C'):
        parse_concept(record)
    elif record.startswith('T'):
        parse_text(record)
    else:
        pass
This is not as expensive as a database hit, but it's still someplace that might need some optimisation, or at least a bit of a refactoring to make it cleaner.
# Build the table once at import time instead of rebuilding the dict on
# every call; record types map directly to their parser functions.
DISPATCH_TABLE = {
    'D': parse_decomp,
    'V': parse_version,
    'C': parse_concept,
    'T': parse_text,
}


def dispatch_record(record):
    """Send *record* to the parser matching its first character.

    Empty records (IndexError on ``record[0]``) and records whose first
    character has no registered parser (KeyError) are silently ignored,
    matching the original if/elif chain's else branch.
    """
    try:
        parser = DISPATCH_TABLE[record[0]]
    except (IndexError, KeyError):
        return
    parser(record)
This makes it easier to add extra parsers, and the .startswith()
is no longer called multiple times.
Parsing files
The following piece of code is quite suspect.
while True:
try:
record = ''
incomplete_record = ''
# Iterates over the file sent by the user.
byte_string = next(file)
byte_stripped_string = byte_string.strip()
string = byte_stripped_string.decode(encoding='ISO-8859-1')
# List of records.
durty_strings_list = string.split('~')
# Check if one chunk in chunks is complete.
if durty_strings_list[-1] != '' and incomplete_record != '':
incomplete_record = incomplete_record + durty_strings_list.pop(-1)
dispatch_record(incomplete_record)
incomplete_record = ''
elif durty_strings_list[-1] != '' and incomplete_record == '':
incomplete_record = durty_strings_list.pop(-1)
for durty_string in durty_strings_list:
stripped_string = durty_string.strip()
if durty_string == '':
record = record + ''
# TODO: I didn't create a regex for 'M' and 'E' records yet.
elif durty_string[0] == 'M' or durty_string[0] == 'E':
continue
if record != '':
# Dispatch the previous record.
dispatch_record(record)
# Reset the used record.
record = ''
# Assign the current record.
record = stripped_string
else:
record = record + stripped_string
except StopIteration as e:
dispatch_record(record)
break
First of all, it's quite long, but there is one thing I would very much like to comment on. If possible, do not use while
loops when a for
loop suffices. But there is actually a lot more going on. Let me walk you through a few refactorings I'd like to suggest.
First, the code just before the except StopIteration
:
if record != '':
# Dispatch the previous record.
dispatch_record(record)
# Reset the used record.
record = ''
# Assign the current record.
record = stripped_string
else:
record = record + stripped_string
In the else
, you know record == ''
, and '' + stripped_string
is always the same as stripped_string
.
if record != '':
# Dispatch the previous record.
dispatch_record(record)
# Reset the used record.
record = ''
# Assign the current record.
record = stripped_string
else:
record = stripped_string
In both branches, the last line is the same, so we can move it out, and drop the else
which is now empty.
if record != '':
# Dispatch the previous record.
dispatch_record(record)
# Reset the used record.
record = ''
# Assign the current record.
record = stripped_string
This makes the record = ''
in the if redundant.
if record != '':
# Dispatch the previous record.
dispatch_record(record)
# Assign the current record.
record = stripped_string
Already so much cleaner.
for durty_string in durty_strings_list:
stripped_string = durty_string.strip()
if durty_string == '':
record = record + ''
# TODO: I didn't create a regex for 'M' and 'E' records yet.
elif durty_string[0] == 'M' or durty_string[0] == 'E':
continue
if record != '':
# Dispatch the previous record.
dispatch_record(record)
# Assign the current record.
record = stripped_string
The record = record + ''
is a bit useless. Because we already know it's a string, we can modify the elif
a bit.
for durty_string in durty_strings_list:
stripped_string = durty_string.strip()
if durty_string and durty_string[0] == 'M' or durty_string[0] == 'E':
continue
if record != '':
# Dispatch the previous record.
dispatch_record(record)
# Assign the current record.
record = stripped_string
(I broke PEP8 here, but I'm going to fix that now.)
for durty_string in durty_strings_list:
stripped_string = durty_string.strip()
if durty_string and durty_string[0] in ('M', 'E'):
continue
if record != '':
# Dispatch the previous record.
dispatch_record(record)
# Assign the current record.
record = stripped_string
Marginally better. I have a bit more overview now, and I'd really like to get rid of the try
/except
, so let me see what's necessary for that.
while True:
try:
...1
byte_string = next(file)
...2
except StopIteration as e:
dispatch_record(record)
break
I did the hard work at looking over the rest of the code (the ...1
and ...2
), and I feel confident that those parts won't throw a StopIteration
. So let's factor those out.
while True:
...1
try:
byte_string = next(file)
except StopIteration as e:
dispatch_record(record)
break
...2
Now, to continue I need to elaborate on ...1
a bit, filling it in again
while True:
record = ''
incomplete_record = ''
try:
# Iterates over the file sent by the user.
byte_string = next(file)
except StopIteration as e:
dispatch_record(record)
break
...2
We can move incomplete_record
to after the try/except
.
while True:
record = ''
try:
# Iterates over the file sent by the user.
byte_string = next(file)
except StopIteration as e:
dispatch_record(record)
break
incomplete_record = ''
...2
I'd like to do the same for the record
, but it's used in the except
clause. But, it's still ''
at that point, so let's cheat a bit and substitute that by hand.
while True:
try:
# Iterates over the file sent by the user.
byte_string = next(file)
except StopIteration as e:
dispatch_record('')
break
record = ''
incomplete_record = ''
...2
Looking at dispatch_record
we see that ''
is handled as pass
. So it does nothing. Let's remove that call.
while True:
try:
# Iterates over the file sent by the user.
byte_string = next(file)
except StopIteration as e:
break
record = ''
incomplete_record = ''
...2
And this is a fairly common pattern, so common in fact that this is the basis of the for
loop.
for byte_string in file:
record = ''
incomplete_record = ''
...2
Let me zoom out again.
for byte_string in file:
record = ''
incomplete_record = ''
byte_stripped_string = byte_string.strip()
string = byte_stripped_string.decode(encoding='ISO-8859-1')
# List of records.
durty_strings_list = string.split('~')
# Check if one chunk in chunks is complete.
if durty_strings_list[-1] != '' and incomplete_record != '':
incomplete_record = incomplete_record + durty_strings_list.pop(-1)
dispatch_record(incomplete_record)
incomplete_record = ''
elif durty_strings_list[-1] != '' and incomplete_record == '':
incomplete_record = durty_strings_list.pop(-1)
for durty_string in durty_strings_list:
stripped_string = durty_string.strip()
if durty_string and durty_string[0] in ('M', 'E'):
continue
if record != '':
# Dispatch the previous record.
dispatch_record(record)
# Assign the current record.
record = stripped_string
Because incomplete_record = ''
is inside the loop, it always gets reset. Are you sure you have tried the algorithm with larger files, and tested that it is correct?
There are more reasons why I think your code is broken, for instance the handling of dispatch_record
, and where the assignments take place.
Rewriting parse_file
.
What parse_file
should do is the following:
Iterate over all the records in
file
(separated by~
), and callparse_record
on all of them.
Assuming memory was infinite (or just 'large enough'), you could just do
for record in file.read().split('~'):
dispatch_record(record)
But from your code, I assume it's not 'large enough', and we get it in chunks.
def parse_file(chunks):
    """Dispatch every '~'-separated record found in *chunks*.

    *chunks* yields byte strings (e.g. from Django's ``file.chunks()``).
    A record may straddle a chunk boundary, so the trailing partial piece
    of each chunk is carried over and prepended to the next one.
    """
    partial_record = ''
    for chunk in chunks:
        # Fixed two bugs from the draft: it referenced the undefined name
        # `byte_string`, and it split the raw bytes (`chunk`) instead of
        # the decoded text, leaving the decode as dead code.
        stripped_chunk = chunk.strip()
        string = stripped_chunk.decode(encoding='ISO-8859-1')
        records = string.split('~')
        # Prepend the carried-over partial record to the first record.
        records[0] = partial_record + records[0]
        # Keep the (possibly incomplete) last record for the next chunk.
        partial_record = records.pop(-1)
        for record in records:
            dispatch_record(record)
    # If we still have data left, it's a full record, but just at
    # the end of the file.
    if partial_record != '':
        dispatch_record(partial_record)
Ideally, you'd split out the parsing of the ~
-chunked blocks from the iteration, but this is good enough for now, I think.
-
\$\begingroup\$ First of all, thank you so much for taking the time to answer my question. I've learned a lot. I'm using Django-Debug-Toolbar and I can see 1276 queries coming from
cobj = Concept.objects.get(code=child)
, each taking about 60 ms. I did this:
for concept in Concept.objects.all(): if concept.parent is False: continue
because I need concept later in
dec = decos[concept.code]
. \$\endgroup\$oubiga– oubiga2016年04月05日 14:27:50 +00:00Commented Apr 5, 2016 at 14:27 -
\$\begingroup\$ Did my suggested change (pre-fetching) decrease the number of queries? \$\endgroup\$Sjoerd Job Postmus– Sjoerd Job Postmus2016年04月05日 16:07:07 +00:00Commented Apr 5, 2016 at 16:07
-
\$\begingroup\$ Unfortunately, it increased +1 query. The new hit is
cobjs_storage.update(Concept.objects.in_bulk(needs_prefetch))
, which took 63 ms. \$\endgroup\$oubiga– oubiga2016年04月05日 21:17:00 +00:00Commented Apr 5, 2016 at 21:17
code
is not the primary key for the Concept
model? \$\endgroup\$