This is a follow-up to previous post, code is significantly better thanks to comments. The problem is that now it takes 0.25 seconds to iterate 1 row of VehicleEvents through 95000 rows of GPS information. At this rate, with 370k rows of VehicleEvents, it will take me days and I was wondering if you can find a better way.
import csv
from openpyxl.utils.datetime import from_excel, to_ISO8601
import datetime
import timeit
tic = timeit.default_timer()
for x in range(1,2): # here the code will read from 34 csv files containing GPS informations into a list of lists
csvfilepath = "GpsData{}.csv".format(x)
# Here the gps data is loaded into list of lists gpsdata and the timestamp is converted to ISO8601
with open(csvfilepath, "r") as f:
reader = csv.reader(f)
headers = next(reader)
gpsdata = list(list(rec) for rec in csv.reader(f, delimiter=','))
for row in gpsdata:
try:
GpsTimestamp = datetime.datetime.strptime(row[10], '%m/%d/%Y %H:%M')
except:
GpsTimestamp = datetime.datetime.strptime(row[10], '%Y-%m-%d %H:%M:%S')
row[10] = to_ISO8601(dt=GpsTimestamp)
driving = 0
idle = 0
working = 0
prev_job_end = ['2219-12-16T05:02:10Z']
dists = [[], [], [], []] #this list of lists will capture the distances in the various states
vhfilepath = "VehicleEvents.csv"
# Capturing the headers in vehicle list
with open(vhfilepath, "r") as f:
reader = csv.reader(f)
headers = next(reader)
vehiclist = csv.DictReader(open(vhfilepath, "r"))
for r in vehiclist:
JobID = r["ID"]
vehicle_no = r['Vhcl']
mode = r['Mode']
Engineon = to_ISO8601(dt=from_excel(value=float(r["Engineon"])))
Engineoff = to_ISO8601(dt=from_excel(value=float(r["Engineoff"])))
try:
WorkStart = to_ISO8601(dt=from_excel(value=float(r["WorkStart"])))
WorkEnd = to_ISO8601(dt=from_excel(value=float(r["WorkEnd"])))
except:
WorkStart = 2000
WorkEnd = 2000
try:
ParkStart = to_ISO8601(dt=from_excel(value=float(r["ParkStart"])))
ParkEnd = to_ISO8601(dt=from_excel(value=float(r["ParkEnd"])))
except:
ParkStart = 2000
ParkEnd = 2000
driving = idle = working = 0.0
for i in range(len(gpsdata)): #I go through all rows of gps list
if i % 930000 == 0 and i != 0: # Keeping track of the program
toc = timeit.default_timer()
print('processing algorithm: {}'.format(toc - tic))
print('we are at row {}'.format(r["ID"]))
if vehicle_no == gpsdata[0][2] and Engineon <= gpsdata[i][10] <= Engineoff: #I want to exclude if the vehicle was off at the gps timestamp
c1 = gpsdata[i][10]
c2 = gpsdata[i][8]
if mode == 'DIS' :
if ParkStart <= c1 <= ParkEnd:
driving += float(c2)
if Engineon <= c1 <= ParkStart:
idle += float(c2)
else:
if ParkEnd <= c1 <= Engineoff :
driving += float(c2)
if Engineon <= c1 <= ParkEnd:
idle += float(c2)
elif vehicle_no == gpsdata[0][2] and Engineon >= gpsdata[i][10] >= prev_job_end[-1] :
working += float(gpsdata[i][8])
prev_job_end.append(Engineoff)
toc = timeit.default_timer()
dists[0].append(JobID)
dists[1].append(driving)
dists[2].append(idle)
dists[3].append(working)
driving = idle = working = 0.0
with open("outfile{}.csv".format(x), 'w') as outfile:
outfile_writer = csv.writer(outfile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
outfile_writer.writerow((dists[0], dists[1], dists[2], dists[3])),
tac = timeit.default_timer()
print('exporting {}'.format(tac - toc))
Sample data:
GpsData = AA,ddate,gps_id,latitude,longitude,speed,accuracy,bearing,distance,ip,Timestamp 6631060599,20191216,V001,50.94269749,71.30314074,0.61,16.687,120.2,4564.41,,12/16/2019 0:00 6631065030,20191216,V001,50.9437751,71.30016186,7.46,33.374,0,48356.2,,12/16/2019 4:31 6631065153,20191216,V001,50.94291191,71.30116373,4.04,24.272,31.6,12747.4,,12/16/2019 19:07 6631060271,20191217,V001,50.94866914,71.30378451,4.49,4.551,0,8368.56,,12/17/2019 17:10 6631060630,20191218,V001,50.94821482,71.30276768,5.16,50.061,27.7,28869.4,,12/18/2019 4:54 6631069096,20191219,V001,50.94534268,71.30259522,2.11,24.272,63.1,9506.68,,12/19/2019 2:55 6631059877,20191219,V001,50.94332513,71.30145316,1.72,10.619,32.4,14834.6,,12/19/2019 17:13 6631068956,20191219,V001,50.94716712,71.30282306,3.14,21.238,300.4,11038.9,,12/19/2019 21:33 6631067712,20191220,V002,50.93724855,71.29651212,9.71,18.204,206,96770.7,,12/20/2019 16:22 6631065674,20191221,V002,50.9467955,71.30202786,6.96,15.008,31.1,28784.4,,12/21/2019 0:07 6631061274,20191221,V002,50.94207924,71.29937677,7.86,30.34,194.3,43064.3,,12/21/2019 14:15 6631063673,20191221,V002,50.94591501,71.30136721,0.83,22.755,293,27036.3,,12/21/2019 18:37 6631064438,20191221,V002,50.94743564,71.30240991,0.56,39.442,259.7,3525.12,,12/21/2019 22:25 6631060673,20191222,V002,50.94580669,71.30148016,5.55,13.653,204.3,53280,,12/22/2019 3:26 6631068423,20191222,V002,50.91783065,71.28983391,6.24,28.823,2.5,42947.8,,12/22/2019 4:14 6631066879,20191222,V002,50.94432872,71.30065772,6.25,19.721,209.8,36728.9,,12/22/2019 11:44 6631068174,20191223,V002,50.93346341,71.29449935,9.26,25.789,28.8,42795.9,,12/23/2019 4:02 VehicleEvents = ID,Vhcl,Mode,CID,Engineon,Engineoff,WorkStart,WorkEnd,ParkStart,ParkEnd 85807835,V001,Manual,AAA5846129341,43815.08135,43815.08938,,,43815.07334,43815.08211 85809668,V001,Auto,AAA8022407504,43815.08938,43815.10535,43815.08938,43815.10535,43815.08938,43815.09535 85810976,V001,Auto,AAA0571518764,43815.10538,43815.11505,43815.10538,43815.11505,43815.10959,43815.11505 85813025,V001,Manual,AAA3189634914,43815.11506,43815.12703,43815.11506,43815.12703,43815.11506,43815.12303 85813028,V001,Manual,AAA1940741282,43815.11506,43815.12703,,,43815.11506,43815.12372 85815305,V001,Manual,AAA1894455904,43815.12705,43815.14505,43815.12705,43815.14692,43815.13366,43815.14505 85815467,V001,Auto,AAA9538532026,43815.12705,43815.14692,,,43815.13361,43815.14692 85821410,V001,Auto,AAA8391952906,43815.14696,43815.20984,43815.14696,43815.20984,43815.14696,43815.15206 85873358,V001,Manual,AAA4922964611,43815.72992,43815.74645,43815.72992,43815.74645,43815.73586,43815.74645 85875020,V001,Manual,AAA6039158858,43815.74646,43815.76461,43815.74646,43815.76601,43815.75975,43815.76461 85875137,V001,Manual,AAA7495366053,43815.74646,43815.76601,,,43815.75972,43815.76601 85877825,V001,Auto,AAA7638509608,43815.76602,43815.79272,43815.76602,43815.79429,43815.76602,43815.77079 85877942,V001,Auto,AAA1265572219,43815.76602,43815.79429,,,43815.76602,43815.76985 85879040,V001,Manual,AAA2968711840,43815.79431,43815.80431,43815.79431,43815.80431,43815.79431,43815.80127 85882028,V001,Manual,AAA7692514875,43815.80432,43815.83537,43815.80432,43815.83537,43815.80432,43815.82109 85884230,V001,Manual,AAA4674654439,43815.83538,43815.85745,43815.83538,43815.85745,43815.83538,43815.84685 85885460,V001,Auto,AAA8107186366,43815.85748,43815.86895,43815.85748,43815.86895,43815.86016,43815.86895 85885994,V001,Auto,AAA5796012701,43815.86916,43815.875,43815.86916,43815.875,43815.87164,43815.875 85886981,V001,Manual,AAA8719492664,43815.87502,43815.88547,43815.87502,43815.88547,43815.8795,43815.88547 85890116,V001,Manual,AAA2865936367,43815.88549,43815.91355,43815.88549,43815.91355,43815.89722,43815.9073 85890119,V001,Manual,AAA1887612592,43815.88549,43815.91355,,,43815.88549,43815.8972 85891310,V001,Auto,AAA1144467605,43815.91358,43815.92514,43815.91358,43815.92514,43815.91856,43815.92514 85892144,V001,Auto,AAA3719694551,43815.92516,43815.93397,43815.92516,43815.93523,43815.92922,43815.93397
2 Answers 2
I find the code presented here notably more readable than the previous iteration - the main apparent difference is meaningful naming of variables (most not in snake_case, yet).
It looks like you want the results in one file per file of GpsData:
such should be specified explicitly, as should be whether output records need to stay in the order of jobs in the vehicle event file (with "prev"job_end
taken care of).
You read the vehicle event file once per file of GpsData - if you don't expect it to change, don't:
read it once before even touching the GpsData. Into a dict
with "Vhcl" as key (see below)
Given csv.DictReader
's ability to handle fieldnames, don't open code that.
You don't use the list of prev_job_end
s: just keep one.
Why convert datetimes to ISO8601?
(running out of time.
Checking every combination of ×ばつevent record most likely kills your time performance, vectorised&filtered or not.
My current take: event list per vehicle, ordered by timestamp
got fed up with all the indexing and conversion, introduced classes. Work in progress:
classes & reading vehicles
#@functools.total_ordering()
class Gps:
def __init__(self, distance, timestamp):
self.distance = distance
self.timestamp = timestamp
def __str__(self):
return "Gps(" + str(self.distance) + ", " + str(self.timestamp) + ')'
def __eq__(self, other):
return self.timestamp == other.timestamp
def __lt__(self, other):
return self.timestamp < other.timestamp
def __le__(self, other):
return self.timestamp <= other.timestamp
def __ge__(self, other):
return self.timestamp >= other.timestamp
def __gt__(self, other):
return self.timestamp > other.timestamp
class Vehicle_event:
def __init__(self, mode, engine_on, engine_off,
work_start, work_end, park_start, park_end):
self.mode = mode
self.engine_on = engine_on
self.engine_off = engine_off
self.work_start = work_start
self.work_end = work_end
self.park_start = park_start
self.park_end = park_end
def __str__(self):
return '<'+self.mode+", ".join(("",
str(self.engine_on), str(self.engine_off),
str(self.work_start), str(self.work_end),
str(self.park_start), str(self.park_end)))+'>'
class Vehicle:
def __init__(self, vid):
self.id = vid
self.events = []
self.gps = []
reading vehicle events:
EVENTDEFAULT = 2000 # whatever is appropriate
def event2timestamp(index, row, default=None):
if default:
try:
return from_excel(float(row[index] # .strip()
)).timestamp()
except:
return default
return from_excel(float(row[index] # .strip()
)).timestamp()
def read_records(csv_path, key_name):
""" Read records from the CSV file named into lists that are values of a
dict keyed by the values of column key_name.
Return this directory.
"""
with open(csv_path, "r") as f:
records = dict() # defaultdict(list)
rows = csv.reader(f)
header = next(rows)
KEY_INDEX = header.index(key_name)
# JOB_ID = header.index("ID")
# VEHICLE_NO = header.index('Vhcl')
MODE = header.index('Mode')
ENGINE_ON = header.index("Engineon")
ENGINE_OFF = header.index("Engineoff")
WORK_START = header.index("WorkStart")
WORK_END = header.index("WorkEnd")
PARK_START = header.index("ParkStart")
PARK_END = header.index("ParkEnd")
for row in rows:
vid = row[KEY_INDEX]
vehicle = records.get(vid)
if None is vehicle:
vehicle = Vehicle(vid)
records[vid] = vehicle
vehicle.events.append(Vehicle_event(
row[MODE],
event2timestamp(ENGINE_ON, row),
event2timestamp(ENGINE_OFF, row),
event2timestamp(WORK_START, row, EVENTDEFAULT),
event2timestamp(WORK_END, row, EVENTDEFAULT),
event2timestamp(PARK_START, row, EVENTDEFAULT),
event2timestamp(PARK_END, row, EVENTDEFAULT)))
return records
def read_vehicle_events():
return read_records("VehicleEvents.csv", 'Vhcl')
vehicles = read_vehicle_events()
outermost loop over GPS files, inner loops over vehicles and events:
for x in range(1, 2): # 34): # handle CSV files containing GPS records
csvfilepath = "GpsData{}.csv".format(x)
# Here the GPS data is appended to lists of Gps
with open(csvfilepath, "r") as f:
reader = csv.reader(f)
headers = next(reader)
COLUMNS = len(headers)
ID = headers.index('gps_id')
DISTANCE = headers.index('distance')
TIMESTAMP = headers.index('Timestamp')
eventless = defaultdict(int)
for rec in reader:
if len(rec) < COLUMNS:
continue
vid = rec[ID]
vehicle = vehicles.get(vid)
if None is vehicle:
eventless[vid] += 1
else:
timestamp = rec[TIMESTAMP]
try:
gps_timestamp = datetime.datetime.strptime(timestamp, '%m/%d/%Y %H:%M')
except:
gps_timestamp = datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
vehicle.gps.append(Gps(float(rec[DISTANCE]),
gps_timestamp.timestamp()))
print(eventless)
dists = [[]]*4 # distances in the various states
for vehicle in vehicles.values():
vehicle.gps = sorted(vehicle.gps)
prev_job_end = datetime.datetime(2019, 12, 15, 5, 2, 10).timestamp()
driving = idle = working = 0.0
first, beyond = 0, len(vehicle.gps)
for job in vehicle.events:
first = bisect_left(vehicle.gps, Gps(0, prev_job_end), 0, # first?
beyond)
beyond = bisect_right(vehicle.gps, Gps(0, job.engine_off), first)
for g in range(first, beyond): # all Gps from previous Engineoff to current
gps = vehicle.gps[g]
timestamp = gps.timestamp
distance = gps.distance
if timestamp < job.engine_on:
if prev_job_end <= timestamp:
working += distance
elif job.mode == 'DIS':
if timestamp <= job.park_start:
idle += distance
elif timestamp <= job.park_end:
driving += distance
else:
if job.park_end <= timestamp:
driving += distance
else:
idle += distance
prev_job_end = job.engine_off
vehicle.events.clear()
dists[0].append(vehicle.id)
dists[1].append(driving)
dists[2].append(idle)
dists[3].append(working)
toc = timeit.default_timer()
with open("outfile{}.csv".format(x), 'w') as outfile:
outfile_writer = csv.writer(outfile, delimiter=",", quotechar='"',
quoting=csv.QUOTE_MINIMAL)
outfile_writer.writerow(dists)
tac = timeit.default_timer()
print('exporting {}'.format(tac - toc))
)
-
\$\begingroup\$ This implementation takes 6.6 seconds per row, however since the rows are sorted, this may change I did not manage to use the bisect functions because I didn't understand the documentation However I did understand what you said and made an if -> continue which cut processing time a lot I convert to ISO8601 because I know not of a better way to compare datetimes I think I will let it run for a few hours and see if it gets faster, then compare with the previous one \$\endgroup\$Paul– Paul2020年01月13日 20:40:25 +00:00Commented Jan 13, 2020 at 20:40
With 90k gps records and 370k vehicle records, the inner loop of the current code runs 33.3 Billion times. To get significant speed ups will require a different approach.
I would suggest converting everything to an event: gps event, engineon event, engineoff event, and so on. Put it all into a big queue (list) and sort it by time stamp. It looks like the gps data and vehicle events are already sorted, or nearly so. On my machine, it takes less than a second to sort a million records. Could use a heap instead of sorting.
Then loop through the events once, updating the status of the vehicles and jobs, and collecting data, as you go.
'2019-...'
(and adjust .25 as need arises). \$\endgroup\$gpsdata[0][2]
? Shouldn't that begps_data[i][GPS_ID]
? \$\endgroup\$