Here is my code for importing from a .csv file into a Neo4j graph using py2neo and Cypher statements. I've noticed that it slows down significantly the bigger the graph gets; it takes several seconds just to process ~10,000 lines. I'm curious whether there are any glaring mistakes or improvements to be made.
Before doing any of the import, I created indexes for every node label and property, based on this blog.
The machine I'm working on runs Windows 7 with 65 GB of RAM, so it should be more than capable of handling a large graph. Uploading a 20-million-line CSV took weeks, so there must be something that can be improved.
I am currently stuck on Windows and can't switch over to Linux. Unfortunately, I also can't use Jexp's batch importer.
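For reference, the index statements looked roughly like this (a sketch using the labels and key properties from the MERGE queries below; the full list followed the blog):

from py2neo import neo4j

graph_db = neo4j.GraphDatabaseService("http://localhost:7474/db/data/")
# One single-property index per label, matching the properties
# used in the MERGE statements below.
for label, prop in [("Character", "Character"), ("House", "House"),
                    ("Territory", "Territory"), ("Region", "Region")]:
    neo4j.CypherQuery(graph_db, "CREATE INDEX ON :%s(%s)" % (label, prop)).execute()

Note that these are single-property indexes, so a MERGE that matches on several properties at once can use them for at most one of those properties.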
import csv
import sys
import os
from py2neo import neo4j, node, rel, cypher
import time


def main():
    f = "C:file_path/file.csv"
    graph_db = neo4j.GraphDatabaseService("http://localhost:7474/db/data/")
    with open(f, 'r+') as in_file:
        reader = csv.reader(in_file, delimiter=',')
        next(reader, None)  # skip headers
        batch = neo4j.WriteBatch(graph_db)
        try:
            i = 0
            j = 0
            for row in reader:
                if row:
                    if (i == 10000):
                        print j, "processed"
                        i = 0
                    i += 1
                    j += 1
                    character = strip(row[0])
                    first_name = strip(row[1])
                    last_name = strip(row[2])
                    actor = strip(row[3])
                    character_birth = strip(row[4])
                    character_death = strip(row[5])
                    allegiance = strip(row[6])
                    house = strip(row[7])
                    territory = strip(row[8])
                    region = strip(row[9])
                    query = neo4j.CypherQuery(graph_db,
                        """CYPHER 2.0
                        merge (character:Character {Character: {a}, First_Name: {b}, Last_Name: {c}, Actor: {d}, Birth: {e}, Death: {f}})
                        merge (house:House {House: {g}, Allegiance: {h}})
                        merge (territory:Territory {Territory: {i}})
                        merge (region:Region {Region: {j}})
                        merge (character)-[:Of_House {House: {k}}]-(house)-[:Is_From]->(territory)-[:Is_In]->(region)
                        """)
                    result = query.execute(a=character, b=first_name, c=last_name,
                                           d=actor, e=character_birth, f=character_death,
                                           g=house, h=allegiance, i=territory,
                                           j=region, k=house)
        except Exception as e:
            print e, row, reader.line_num  # print the line number to fix any data errors in the file


def strip(string):
    # Replaces non-ASCII characters in a cell with spaces.
    return ''.join([c if 0 < ord(c) < 128 else ' ' for c in string])


if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time() - start
    print "Time to complete:", end
1 Answer
Here is the updated solution with batching. I found that the request times out if the batch size is much larger than 1000; unfortunately, I'm not sure what causes that:
import csv
from py2neo import Graph
import time


def main():
    f = "C:file_path/file.csv"
    graph = Graph("http://localhost:7474/db/data/")
    with open(f, 'r+') as in_file:
        reader = csv.reader(in_file, delimiter=',')
        next(reader, None)  # skip headers
        batch = graph.cypher.begin()
        try:
            i = 0
            j = 0
            for row in reader:
                if row:
                    character = strip(row[0])
                    first_name = strip(row[1])
                    last_name = strip(row[2])
                    actor = strip(row[3])
                    character_birth = strip(row[4])
                    character_death = strip(row[5])
                    allegiance = strip(row[6])
                    house = strip(row[7])
                    territory = strip(row[8])
                    region = strip(row[9])
                    query = """
                    merge (character:Character {Character: {a}, First_Name: {b}, Last_Name: {c}, Actor: {d}, Birth: {e}, Death: {f}})
                    merge (house:House {House: {g}, Allegiance: {h}})
                    merge (territory:Territory {Territory: {i}})
                    merge (region:Region {Region: {j}})
                    merge (character)-[:Of_House {House: {k}}]-(house)-[:Is_From]->(territory)-[:Is_In]->(region)
                    """
                    batch.append(query, {"a": character, "b": first_name, "c": last_name,
                                         "d": actor, "e": character_birth, "f": character_death,
                                         "g": house, "h": allegiance, "i": territory,
                                         "j": region, "k": house})
                    i += 1
                    j += 1
                    batch.process()
                    if (i == 1000):  # submits a batch every 1000 lines read
                        batch.commit()
                        print j, "lines processed"
                        i = 0
                        batch = graph.cypher.begin()
            else:  # runs when the loop finishes: submits the remainder of the lines read
                batch.commit()
                print j, "lines processed"
        except Exception as e:
            print e, row, reader.line_num  # print the line number to fix any data errors in the file


def strip(string):
    # Replaces non-ASCII characters in a cell with spaces.
    return ''.join([c if 0 < ord(c) < 128 else ' ' for c in string])


if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time() - start
    print "Time to complete:", end
Comments on the question:

You call query.execute() once per row; is it possible to batch those queries and perform them later? I worked on a project which utilized RabbitMQ, and batching query executions gave us quite a nice speed boost: rather than making 30 million network calls, we only made 3000 network calls using a 10k batch size. EDIT: Seems it is possible: py2neo.org/2.0/batch.html

Substitute query.append() for query.execute() and move the execute statement inside the i == 10000 loop?