I am new to programming, and am using Python to take wind data and simulate future wind profiles. The code as written takes a while to execute and I was hoping someone could suggest ways to make my code more efficient...
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
def boot(data, block=10, size=100):
length = len(data)
new = np.zeros(size*block)
for i in range(size):
x = random.randint(0, length-block-1)
new[i*block: (i+1) * block] = data[x: x+block]
return new
def sumx(data, x):
length = data.shape[0]
new = np.zeros(length//x)
for i in range(length//x):
new[i] = data[x*i:x*i+x].sum()
return new
def capsum(x):
length = x.shape[0]
new = np.zeros(length)
new[0] = x[0]
for i in range(1, length):
if new[i-1] + x[i] > 1:
new[i] = 1
elif new[i-1] + x[i] < 0:
new[i] = 0
else:
new[i] = new[i-1] + x[i]
return new
def differences(wind, load, total_wind):
length = len(load)
wind_diff = wind.diff(1)
sim = boot(wind_diff[1:], 5000, 20)
sum_sim = capsum(sim)
sum_sim = sum_sim[:length] * total_wind
net_load = load[:length] - sum_sim
return net_load, sum_sim
def monte(x, wind, load, total_wind):
wind_energy = np.zeros(x)
sim_max = np.zeros(x)
sim_min = np.zeros(x)
sim_std = np.zeros(x)
sim_mean = np.zeros(x)
sim_5 = np.zeros(x)
sim_10 = np.zeros(x)
sim_15 = np.zeros(x)
sim_20 = np.zeros(x)
sim_25 = np.zeros(x)
sim_30 = np.zeros(x)
sim_35 = np.zeros(x)
sim_40 = np.zeros(x)
sim_45 = np.zeros(x)
sim_50 = np.zeros(x)
sim_55 = np.zeros(x)
sim_60 = np.zeros(x)
sim_65 = np.zeros(x)
sim_70 = np.zeros(x)
sim_75 = np.zeros(x)
sim_80 = np.zeros(x)
sim_85 = np.zeros(x)
sim_90 = np.zeros(x)
sim_95 = np.zeros(x)
sim_96 = np.zeros(x)
sim_97 = np.zeros(x)
sim_98 = np.zeros(x)
sim_99 = np.zeros(x)
sim_100 = np.zeros(x)
for i in range(x):
net_load, sim_wind = differences(wind, load, total_wind)
len_wind = len(sim_wind)
wind_energy[i] = sim_wind.mean() * len_wind
ramp = pd.Series(net_load).diff(1)
ramp = ramp[1:]
sim_max[i] = net_load.max()
sim_min[i] = net_load.min()
sim_std[i] = ramp.std()
sim_mean[i] = ramp.mean()
sim_5[i] = np.percentile(ramp, 5)
sim_10[i] = np.percentile(ramp, 10)
sim_15[i] = np.percentile(ramp, 15)
sim_20[i] = np.percentile(ramp, 20)
sim_25[i] = np.percentile(ramp, 25)
sim_30[i] = np.percentile(ramp, 30)
sim_35[i] = np.percentile(ramp, 35)
sim_40[i] = np.percentile(ramp, 40)
sim_45[i] = np.percentile(ramp, 45)
sim_50[i] = np.percentile(ramp, 50)
sim_55[i] = np.percentile(ramp, 55)
sim_60[i] = np.percentile(ramp, 60)
sim_65[i] = np.percentile(ramp, 65)
sim_70[i] = np.percentile(ramp, 70)
sim_75[i] = np.percentile(ramp, 75)
sim_80[i] = np.percentile(ramp, 80)
sim_85[i] = np.percentile(ramp, 85)
sim_90[i] = np.percentile(ramp, 90)
sim_95[i] = np.percentile(ramp, 95)
sim_96[i] = np.percentile(ramp, 96)
sim_97[i] = np.percentile(ramp, 97)
sim_98[i] = np.percentile(ramp, 98)
sim_99[i] = np.percentile(ramp, 99)
sim_100[i] = np.percentile(ramp, 100)
return (wind_energy.mean(), sim_max.mean(), sim_min.mean(), sim_std.mean(), sim_mean.mean(), sim_5.mean(), sim_10.mean(), sim_15.mean(),
sim_20.mean(), sim_25.mean(), sim_30.mean(), sim_35.mean(), sim_40.mean(), sim_45.mean(), sim_50.mean(), sim_55.mean(),
sim_60.mean(), sim_65.mean(), sim_70.mean(), sim_75.mean(), sim_80.mean(), sim_85.mean(), sim_90.mean(), sim_95.mean(),
sim_96.mean(), sim_97.mean(), sim_98.mean(), sim_99.mean(), sim_100.mean())
if __name__ == '__main__':
data2 = pd.read_csv('Wind_Locations.csv')
length = len(data2.CH)
#Convert Genivar data from hourly to 5 minute
CH_5min = pd.Series(np.empty(length*12) * np.nan)
for i, row in enumerate(data2.CH):
CH_5min[i*12] = data2.CH[i]
CH_5min = CH_5min.interpolate()
BV_5min = pd.Series(np.empty(length*12) * np.nan)
for i, row in enumerate(data2.BV):
BV_5min[i*12] = data2.BV[i]
BV_5min = BV_5min.interpolate()
IH_5min = pd.Series(np.empty(length*12) * np.nan)
for i, row in enumerate(data2.IH):
IH_5min[i*12] = data2.IH[i]
IH_5min = IH_5min.interpolate()
RG_5min = pd.Series(np.empty(length*12) * np.nan)
for i, row in enumerate(data2.RG):
RG_5min[i*12] = data2.RG[i]
RG_5min = RG_5min.interpolate()
WA_5min = pd.Series(np.empty(length*12) * np.nan)
for i, row in enumerate(data2.WA):
WA_5min[i*12] = data2.WA[i]
WA_5min = WA_5min.interpolate()
WY_5min = pd.Series(np.empty(length*12) * np.nan)
for i, row in enumerate(data2.WY):
WY_5min[i*12] = data2.WY[i]
WY_5min = WY_5min.interpolate()
#Convert Genivar data from hourly to 15 minute
CH_15min = pd.Series(np.empty(length*4) * np.nan)
for i, row in enumerate(data2.CH):
CH_15min[i*4] = data2.CH[i]
CH_15min = CH_15min.interpolate()
BV_15min = pd.Series(np.empty(length*4) * np.nan)
for i, row in enumerate(data2.BV):
BV_15min[i*4] = data2.BV[i]
BV_15min = BV_15min.interpolate()
IH_15min = pd.Series(np.empty(length*4) * np.nan)
for i, row in enumerate(data2.IH):
IH_15min[i*4] = data2.IH[i]
IH_15min = IH_15min.interpolate()
RG_15min = pd.Series(np.empty(length*4) * np.nan)
for i, row in enumerate(data2.RG):
RG_15min[i*4] = data2.RG[i]
RG_15min = RG_15min.interpolate()
WA_15min = pd.Series(np.empty(length*4) * np.nan)
for i, row in enumerate(data2.WA):
WA_15min[i*4] = data2.WA[i]
WA_15min = WA_15min.interpolate()
WY_15min = pd.Series(np.empty(length*4) * np.nan)
for i, row in enumerate(data2.WY):
WY_15min[i*4] = data2.WY[i]
WY_15min = WY_15min.interpolate()
data = pd.read_csv('5MinuteData.csv')
CH_mean = data2.CH.mean()
BV_mean = data2.BV.mean()
IH_mean = data2.IH.mean()
RG_mean = data2.RG.mean()
WA_mean = data2.WA.mean()
WY_mean = data2.WY.mean()
wind_correlations = np.corrcoef([data2.CH, data2.BV, data2.IH, data2.RG, data2.WA, data2.WY] )
length = len(data.LOAD)/3
load_5min = data.LOAD
load_15min = pd.Series(np.empty(length) * np.nan)
for i, row in enumerate(data.LOAD[:(length)]):
load_15min[i] = (data.LOAD.iloc[i*3] + data.LOAD.iloc[i*3+1] + data.LOAD.iloc[i*3+2]) / 3
load_max = data.LOAD.max()
year10 = 4418
year20 = 4908
year30 = 5531
year40 = 6036
inc10_load = year10 - load_max
inc20_load = year20 - load_max
inc30_load = year30 - load_max
inc40_load = year40 - load_max
results = {}
#15 min Simulation
for year, load_inc in [(10, inc10_load), (20, inc20_load), (30, inc30_load), (40, inc40_load)]:
load = load_15min + load_inc
for winds, name in [([CH_15min], 'CH'), ([CH_15min, BV_15min], 'CH, BV'), ([CH_15min, RG_15min], 'CH, RG'),
([CH_15min, BV_15min, IH_15min], 'CH, BV, IH'), ([CH_15min, RG_15min, WA_15min], 'CH, RG, WA'),
([CH_15min, BV_15min, IH_15min, RG_15min], 'CH, BV, IH, RG'),([CH_15min, BV_15min, IH_15min, RG_15min, WA_15min], 'CH, BV, IH, RG, WA'),
([CH_15min, BV_15min, IH_15min, RG_15min, WA_15min, WY_15min], 'CH, BV, IH, RG, WA, WY') ]:
length = len(winds)
winds[0] = winds[0] / winds[0].max()
total = winds[0]
for i in range(1, length):
winds[i] = winds[i] / winds[i].max()
total += winds[i]
wind = total / len(winds)
for wind_percent in [0, .05, .1, .15, .2, .25, .3, .35, .4, .45, .5]: #evaluates various wind penetrations
print name
print 'Year ', year
print 'Load inc ', load_inc
print 'percent wind ', wind_percent
#base = data.LOAD - wind * data.LOAD.max()*wind_percent
#print summary(base)
result = monte(1, wind, load, load.max()*wind_percent)
print result
results[name + '-' + str(wind_percent)] = result
results = pd.DataFrame(results, index=['Wind Energy', 'NL max', 'NL min', 'NL ramp std', 'NL ramp mean', 'NL ramp 5', 'NL ramp 10',
'NL ramp 15', 'NL ramp 20', 'NL ramp 25', 'NL ramp 30', 'NL ramp 35', 'NL ramp 40',
'NL ramp 45', 'NL ramp 50', 'NL ramp 55', 'NL ramp 60', 'NL ramp 65', 'NL ramp 70',
'NL ramp 75', 'NL ramp 80', 'NL ramp 85', 'NL ramp 90', 'NL ramp 95', 'NL ramp 96',
'NL ramp 97','NL ramp 98','NL ramp 99','NL ramp 100'])
results.to_csv(str(year) + 'Year' + ' Peak Load ' + str(load.max()) + '-' + 'results.csv')
print '-----RESULTS-----'
print results
2 Answers 2
I want to speak towards some style and general Python improvements (of which there are quite a few) that you can make.
Repeated Code
Whenever you have to define a handful of variables that are all basically identical (or generated identically), you can simplify this with another structure. In your case, your sim_xx
variables in monte
can be all placed into a list
:
sims = [np.zeros(x)]*24
With this change (and some iteration tricks), your whole function gets slimmed down to this:
def monte(x, wind, load, total_wind):
wind_energy = np.zeros(x)
sim_max = np.zeros(x)
sim_min = np.zeros(x)
sim_std = np.zeros(x)
sim_mean = np.zeros(x)
sims = [np.zeros(x)]*24
for i in range(x):
net_load, sim_wind = differences(wind, load, total_wind)
len_wind = len(sim_wind)
wind_energy[i] = sim_wind.mean() * len_wind
ramp = pd.Series(net_load).diff(1)
ramp = ramp[1:]
sim_max[i] = net_load.max()
sim_min[i] = net_load.min()
sim_std[i] = ramp.std()
sim_mean[i] = ramp.mean()
# Assign the values for each `sim`. This also generates
# values in `val` of 5, 10, 15, ..., 98, 99, 100.
val = 5
for sim in sims:
sim[i] = np.percentile(ramp, val)
val += 5 if val < 95 else 1
return tuple(arr.mean() for arr in [wind_energy, sim_max, sim_min,
sim_std, sim_mean] + sims)
Note: There are more optimizations that I would suggest (use/return a
dict
). However, your return value is used in a parameter to aDataFrame
. I do not know how they would affect the structure of your code.
You can implement this same idea later on in your code, e.g. when you are dealing with your CH_5min
, BV_5min
, etc. values.
Pull Code into Functions
In my code above, I have a nice loop that gives counts 5, 10, 15, 20, ..., 98, 99, 100
. This would be useful in other sections of your code. So the best thing to do is pull it into a function (more specifically a generator):
def get_values(start=5, stop=100, threshold=95, less_than=5, greater_than=1):
while start <= stop:
yield start
start += less_than if start < threshold else greater_than
This function will yield
values incremented by a certain value until a specific threshold then increment values by a different value. It can be used like xrange
and instead of doing this to create your indexes:
results = pd.DataFrame(results, index=['Wind Energy', 'NL max', 'NL min', 'NL ramp std', 'NL ramp mean', 'NL ramp 5', 'NL ramp 10',
'NL ramp 15', 'NL ramp 20', 'NL ramp 25', 'NL ramp 30', 'NL ramp 35', 'NL ramp 40',
'NL ramp 45', 'NL ramp 50', 'NL ramp 55', 'NL ramp 60', 'NL ramp 65', 'NL ramp 70',
'NL ramp 75', 'NL ramp 80', 'NL ramp 85', 'NL ramp 90', 'NL ramp 95', 'NL ramp 96',
'NL ramp 97','NL ramp 98','NL ramp 99','NL ramp 100'])
you can do this:
indices = ['Wind Energy', 'NL max', 'NL min', 'NL ramp std',
'NL ramp mean'] + ['NL ramp {}'.format(val) for val in get_values()]
results = pd.DataFrame(results, index=indices)
Style Pointers
- Take a look at PEP8, the official Python style guide. It will help your code look cleaner.
Use descriptive variable names.
x
tells us nothing about what it holds. Always err on the side of being to descriptive than being too terse.Also, shy away from using capital letters. Convention says that variables are
lowercase_with_underscores
. The only time capital letters are used in conventional Python is for class names (PascalCase
) and constants (ALL_CAPS
).Use
format
when creating strings with variable information. This is in the Style Pointers because the benefit of using string formatting over string concatenation is debatable. However, usingformat
(as I have in above sections) makes your code more readable. Another example:# Your original code... results.to_csv(str(year) + 'Year' + ' Peak Load ' + str(load.max()) + '-' + 'results.csv') # becomes this. results.to_csv('{}Year Peak Load {}-results.csv'.format(year, load.max()))
Try to keep your line length less than 80 characters. Long lines are especially irksome for users with small monitors (or portrait monitors).
-
\$\begingroup\$ Great review, but nowadays 120 chars seems like a much more reasonable limit. \$\endgroup\$Adam– Adam2014年06月17日 08:20:29 +00:00Commented Jun 17, 2014 at 8:20
-
\$\begingroup\$ @codesparkle I agree moreso with the 120 limit instead of the 80. I was mainly quoting PEP8 at that point. \$\endgroup\$BeetDemGuise– BeetDemGuise2014年06月17日 11:39:14 +00:00Commented Jun 17, 2014 at 11:39
A couple of observations:
You can factor out the initialisation step:
BV_5min = pd.Series(np.empty(length*4) * np.nan)
...
To:
empty_array = np.empty(length*4)
empty_array[:] = np.nan
BV_5min = pd.Series(empty_array)
...
BV_15min = pd.Series(empty_array)
...
Next, the rolling average you calculate here:
for i, row in enumerate(data.LOAD[:(length)]):
load_15min[i] = (data.LOAD.iloc[i*3] + data.LOAD.iloc[i*3+1] + data.LOAD.iloc[i*3+2]) / 3
Can probably be replaced by pd.rolling_mean
:
load_15min[i] = pd.rolling_mean(data.LOAD, 3)
Also, you can use something like timeit to pinpoint what is most time consuming so you can focus on that.
I did not execute your code and verify my sugestions, So there may be sintax errors, please check my suggestions.
In any case, I hope they help.
Explore related questions
See similar questions with these tags.
Wind_scaled
actually was not used in the code that gets called from'___main___'
. I have removed it to avoid confusion. Thanks for pointing it out \$\endgroup\$boot
? \$\endgroup\$