Taking wind data and simulating future wind profiles

Question 1

I am new to programming, and am using Python to take wind data and simulate future wind profiles. The code as written takes a while to execute and I was hoping someone could suggest ways to make my code more efficient...

from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
def boot(data, block=10, size=100):
 length = len(data)
 new = np.zeros(size*block)
 for i in range(size):
 x = random.randint(0, length-block-1)
 new[i*block: (i+1) * block] = data[x: x+block]
 return new 
def sumx(data, x):
 length = data.shape[0]
 new = np.zeros(length//x)
 for i in range(length//x):
 new[i] = data[x*i:x*i+x].sum()
 return new
def capsum(x):
 length = x.shape[0]
 new = np.zeros(length)
 new[0] = x[0]
 for i in range(1, length):
 if new[i-1] + x[i] > 1:
 new[i] = 1
 elif new[i-1] + x[i] < 0:
 new[i] = 0
 else:
 new[i] = new[i-1] + x[i]
 return new
def differences(wind, load, total_wind):
 length = len(load)
 wind_diff = wind.diff(1)
 sim = boot(wind_diff[1:], 5000, 20)
 sum_sim = capsum(sim)
 sum_sim = sum_sim[:length] * total_wind
 net_load = load[:length] - sum_sim
 return net_load, sum_sim
def monte(x, wind, load, total_wind):
 wind_energy = np.zeros(x)
 sim_max = np.zeros(x)
 sim_min = np.zeros(x)
 sim_std = np.zeros(x)
 sim_mean = np.zeros(x)
 sim_5 = np.zeros(x)
 sim_10 = np.zeros(x)
 sim_15 = np.zeros(x)
 sim_20 = np.zeros(x)
 sim_25 = np.zeros(x)
 sim_30 = np.zeros(x)
 sim_35 = np.zeros(x)
 sim_40 = np.zeros(x)
 sim_45 = np.zeros(x)
 sim_50 = np.zeros(x)
 sim_55 = np.zeros(x)
 sim_60 = np.zeros(x)
 sim_65 = np.zeros(x)
 sim_70 = np.zeros(x)
 sim_75 = np.zeros(x)
 sim_80 = np.zeros(x)
 sim_85 = np.zeros(x)
 sim_90 = np.zeros(x)
 sim_95 = np.zeros(x)
 sim_96 = np.zeros(x)
 sim_97 = np.zeros(x)
 sim_98 = np.zeros(x)
 sim_99 = np.zeros(x)
 sim_100 = np.zeros(x)
 for i in range(x):
 net_load, sim_wind = differences(wind, load, total_wind)
 len_wind = len(sim_wind)
 wind_energy[i] = sim_wind.mean() * len_wind 
 ramp = pd.Series(net_load).diff(1)
 ramp = ramp[1:]
 sim_max[i] = net_load.max()
 sim_min[i] = net_load.min()
 sim_std[i] = ramp.std()
 sim_mean[i] = ramp.mean()
 sim_5[i] = np.percentile(ramp, 5)
 sim_10[i] = np.percentile(ramp, 10) 
 sim_15[i] = np.percentile(ramp, 15)
 sim_20[i] = np.percentile(ramp, 20)
 sim_25[i] = np.percentile(ramp, 25)
 sim_30[i] = np.percentile(ramp, 30)
 sim_35[i] = np.percentile(ramp, 35)
 sim_40[i] = np.percentile(ramp, 40)
 sim_45[i] = np.percentile(ramp, 45)
 sim_50[i] = np.percentile(ramp, 50)
 sim_55[i] = np.percentile(ramp, 55)
 sim_60[i] = np.percentile(ramp, 60)
 sim_65[i] = np.percentile(ramp, 65)
 sim_70[i] = np.percentile(ramp, 70)
 sim_75[i] = np.percentile(ramp, 75)
 sim_80[i] = np.percentile(ramp, 80)
 sim_85[i] = np.percentile(ramp, 85)
 sim_90[i] = np.percentile(ramp, 90)
 sim_95[i] = np.percentile(ramp, 95)
 sim_96[i] = np.percentile(ramp, 96)
 sim_97[i] = np.percentile(ramp, 97)
 sim_98[i] = np.percentile(ramp, 98)
 sim_99[i] = np.percentile(ramp, 99)
 sim_100[i] = np.percentile(ramp, 100)
 return (wind_energy.mean(), sim_max.mean(), sim_min.mean(), sim_std.mean(), sim_mean.mean(), sim_5.mean(), sim_10.mean(), sim_15.mean(), 
 sim_20.mean(), sim_25.mean(), sim_30.mean(), sim_35.mean(), sim_40.mean(), sim_45.mean(), sim_50.mean(), sim_55.mean(),
 sim_60.mean(), sim_65.mean(), sim_70.mean(), sim_75.mean(), sim_80.mean(), sim_85.mean(), sim_90.mean(), sim_95.mean(), 
 sim_96.mean(), sim_97.mean(), sim_98.mean(), sim_99.mean(), sim_100.mean())
if __name__ == '__main__':
 data2 = pd.read_csv('Wind_Locations.csv')
 length = len(data2.CH)
 #Convert Genivar data from hourly to 5 minute 
 CH_5min = pd.Series(np.empty(length*12) * np.nan)
 for i, row in enumerate(data2.CH):
 CH_5min[i*12] = data2.CH[i]
 CH_5min = CH_5min.interpolate()
 BV_5min = pd.Series(np.empty(length*12) * np.nan)
 for i, row in enumerate(data2.BV):
 BV_5min[i*12] = data2.BV[i]
 BV_5min = BV_5min.interpolate()
 IH_5min = pd.Series(np.empty(length*12) * np.nan)
 for i, row in enumerate(data2.IH):
 IH_5min[i*12] = data2.IH[i]
 IH_5min = IH_5min.interpolate()
 RG_5min = pd.Series(np.empty(length*12) * np.nan)
 for i, row in enumerate(data2.RG):
 RG_5min[i*12] = data2.RG[i]
 RG_5min = RG_5min.interpolate()
 WA_5min = pd.Series(np.empty(length*12) * np.nan)
 for i, row in enumerate(data2.WA):
 WA_5min[i*12] = data2.WA[i]
 WA_5min = WA_5min.interpolate()
 WY_5min = pd.Series(np.empty(length*12) * np.nan)
 for i, row in enumerate(data2.WY):
 WY_5min[i*12] = data2.WY[i]
 WY_5min = WY_5min.interpolate()
 #Convert Genivar data from hourly to 15 minute 
 CH_15min = pd.Series(np.empty(length*4) * np.nan)
 for i, row in enumerate(data2.CH):
 CH_15min[i*4] = data2.CH[i]
 CH_15min = CH_15min.interpolate()
 BV_15min = pd.Series(np.empty(length*4) * np.nan)
 for i, row in enumerate(data2.BV):
 BV_15min[i*4] = data2.BV[i]
 BV_15min = BV_15min.interpolate()
 IH_15min = pd.Series(np.empty(length*4) * np.nan)
 for i, row in enumerate(data2.IH):
 IH_15min[i*4] = data2.IH[i]
 IH_15min = IH_15min.interpolate()
 RG_15min = pd.Series(np.empty(length*4) * np.nan)
 for i, row in enumerate(data2.RG):
 RG_15min[i*4] = data2.RG[i]
 RG_15min = RG_15min.interpolate()
 WA_15min = pd.Series(np.empty(length*4) * np.nan)
 for i, row in enumerate(data2.WA):
 WA_15min[i*4] = data2.WA[i]
 WA_15min = WA_15min.interpolate()
 WY_15min = pd.Series(np.empty(length*4) * np.nan)
 for i, row in enumerate(data2.WY):
 WY_15min[i*4] = data2.WY[i]
 WY_15min = WY_15min.interpolate()
 data = pd.read_csv('5MinuteData.csv')
 CH_mean = data2.CH.mean()
 BV_mean = data2.BV.mean()
 IH_mean = data2.IH.mean()
 RG_mean = data2.RG.mean()
 WA_mean = data2.WA.mean()
 WY_mean = data2.WY.mean()
 wind_correlations = np.corrcoef([data2.CH, data2.BV, data2.IH, data2.RG, data2.WA, data2.WY] )
 length = len(data.LOAD)/3
 load_5min = data.LOAD
 load_15min = pd.Series(np.empty(length) * np.nan)
 for i, row in enumerate(data.LOAD[:(length)]):
 load_15min[i] = (data.LOAD.iloc[i*3] + data.LOAD.iloc[i*3+1] + data.LOAD.iloc[i*3+2]) / 3
 load_max = data.LOAD.max()
 year10 = 4418
 year20 = 4908
 year30 = 5531
 year40 = 6036
 inc10_load = year10 - load_max 
 inc20_load = year20 - load_max
 inc30_load = year30 - load_max
 inc40_load = year40 - load_max
 results = {}
 #15 min Simulation
 for year, load_inc in [(10, inc10_load), (20, inc20_load), (30, inc30_load), (40, inc40_load)]:
 load = load_15min + load_inc 
 for winds, name in [([CH_15min], 'CH'), ([CH_15min, BV_15min], 'CH, BV'), ([CH_15min, RG_15min], 'CH, RG'),
 ([CH_15min, BV_15min, IH_15min], 'CH, BV, IH'), ([CH_15min, RG_15min, WA_15min], 'CH, RG, WA'), 
 ([CH_15min, BV_15min, IH_15min, RG_15min], 'CH, BV, IH, RG'),([CH_15min, BV_15min, IH_15min, RG_15min, WA_15min], 'CH, BV, IH, RG, WA'),
 ([CH_15min, BV_15min, IH_15min, RG_15min, WA_15min, WY_15min], 'CH, BV, IH, RG, WA, WY') ]: 
 length = len(winds)
 winds[0] = winds[0] / winds[0].max()
 total = winds[0]
 for i in range(1, length):
 winds[i] = winds[i] / winds[i].max()
 total += winds[i]
 wind = total / len(winds)
 for wind_percent in [0, .05, .1, .15, .2, .25, .3, .35, .4, .45, .5]: #evaluates various wind penetrations
 print name
 print 'Year ', year
 print 'Load inc ', load_inc
 print 'percent wind ', wind_percent
 #base = data.LOAD - wind * data.LOAD.max()*wind_percent
 #print summary(base)
 result = monte(1, wind, load, load.max()*wind_percent)
 print result
 results[name + '-' + str(wind_percent)] = result
 results = pd.DataFrame(results, index=['Wind Energy', 'NL max', 'NL min', 'NL ramp std', 'NL ramp mean', 'NL ramp 5', 'NL ramp 10', 
 'NL ramp 15', 'NL ramp 20', 'NL ramp 25', 'NL ramp 30', 'NL ramp 35', 'NL ramp 40', 
 'NL ramp 45', 'NL ramp 50', 'NL ramp 55', 'NL ramp 60', 'NL ramp 65', 'NL ramp 70',
 'NL ramp 75', 'NL ramp 80', 'NL ramp 85', 'NL ramp 90', 'NL ramp 95', 'NL ramp 96',
 'NL ramp 97','NL ramp 98','NL ramp 99','NL ramp 100']) 
 results.to_csv(str(year) + 'Year' + ' Peak Load ' + str(load.max()) + '-' + 'results.csv')
 print '-----RESULTS-----'
 print results

Question 2

This is quite impressive for someone who's "new to programming"! Good job, I'm anxious for a good review

Question 3

How have you tried to improve it so far? Have you profiled the code?

Question 4

Sorry, that Wind_scaled actually was not used in the code that gets called from '___main___'. I have removed it to avoid confusion. Thanks for pointing it out

Question 5

Can you check the indentation in boot ?

Question 6

Good eye Josay, indent is fixedm missed that when putting in my question

Question 7

I want to speak towards some style and general Python improvements (of which there are quite a few) that you can make.

Repeated Code

Whenever you have to define a handful of variables that are all basically identical (or generated identically), you can simplify this with another structure. In your case, your sim_xx variables in monte can be all placed into a list:

sims = [np.zeros(x)]*24

With this change (and some iteration tricks), your whole function gets slimmed down to this:

def monte(x, wind, load, total_wind):
 wind_energy = np.zeros(x)
 sim_max = np.zeros(x)
 sim_min = np.zeros(x)
 sim_std = np.zeros(x)
 sim_mean = np.zeros(x)
 sims = [np.zeros(x)]*24
 for i in range(x):
 net_load, sim_wind = differences(wind, load, total_wind)
 len_wind = len(sim_wind)
 wind_energy[i] = sim_wind.mean() * len_wind 
 ramp = pd.Series(net_load).diff(1)
 ramp = ramp[1:]
 sim_max[i] = net_load.max()
 sim_min[i] = net_load.min()
 sim_std[i] = ramp.std()
 sim_mean[i] = ramp.mean()
 # Assign the values for each `sim`. This also generates
 # values in `val` of 5, 10, 15, ..., 98, 99, 100.
 val = 5
 for sim in sims:
 sim[i] = np.percentile(ramp, val)
 val += 5 if val < 95 else 1
 return tuple(arr.mean() for arr in [wind_energy, sim_max, sim_min,
 sim_std, sim_mean] + sims)

Note: There are more optimizations that I would suggest (use/return a dict). However, your return value is used in a parameter to a DataFrame. I do not know how they would affect the structure of your code.

You can implement this same idea later on in your code, e.g. when you are dealing with your CH_5min, BV_5min, etc. values.

Pull Code into Functions

In my code above, I have a nice loop that gives counts 5, 10, 15, 20, ..., 98, 99, 100. This would be useful in other sections of your code. So the best thing to do is pull it into a function (more specifically a generator):

def get_values(start=5, stop=100, threshold=95, less_than=5, greater_than=1):
 while start <= stop:
 yield start
 start += less_than if start < threshold else greater_than

This function will yield values incremented by a certain value until a specific threshold then increment values by a different value. It can be used like xrange and instead of doing this to create your indexes:

results = pd.DataFrame(results, index=['Wind Energy', 'NL max', 'NL min', 'NL ramp std', 'NL ramp mean', 'NL ramp 5', 'NL ramp 10', 
 'NL ramp 15', 'NL ramp 20', 'NL ramp 25', 'NL ramp 30', 'NL ramp 35', 'NL ramp 40', 
 'NL ramp 45', 'NL ramp 50', 'NL ramp 55', 'NL ramp 60', 'NL ramp 65', 'NL ramp 70',
 'NL ramp 75', 'NL ramp 80', 'NL ramp 85', 'NL ramp 90', 'NL ramp 95', 'NL ramp 96',
 'NL ramp 97','NL ramp 98','NL ramp 99','NL ramp 100'])

you can do this:

indices = ['Wind Energy', 'NL max', 'NL min', 'NL ramp std',
 'NL ramp mean'] + ['NL ramp {}'.format(val) for val in get_values()]
results = pd.DataFrame(results, index=indices)

Style Pointers

Take a look at PEP8, the official Python style guide. It will help your code look cleaner.
Use descriptive variable names. x tells us nothing about what it holds. Always err on the side of being to descriptive than being too terse.

Also, shy away from using capital letters. Convention says that variables are lowercase_with_underscores. The only time capital letters are used in conventional Python is for class names (PascalCase) and constants (ALL_CAPS).
Use format when creating strings with variable information. This is in the Style Pointers because the benefit of using string formatting over string concatenation is debatable. However, using format (as I have in above sections) makes your code more readable. Another example:
```
# Your original code...
results.to_csv(str(year) + 'Year' + ' Peak Load ' + str(load.max()) + '-' + 'results.csv') 
# becomes this.
results.to_csv('{}Year Peak Load {}-results.csv'.format(year, load.max()))
```
Try to keep your line length less than 80 characters. Long lines are especially irksome for users with small monitors (or portrait monitors).

Question 8

Great review, but nowadays 120 chars seems like a much more reasonable limit.

Question 9

@codesparkle I agree moreso with the 120 limit instead of the 80. I was mainly quoting PEP8 at that point.

Question 10

A couple of observations:

You can factor out the initialisation step:

BV_5min = pd.Series(np.empty(length*4) * np.nan)
...

To:

empty_array = np.empty(length*4)
empty_array[:] = np.nan
BV_5min = pd.Series(empty_array)
...
BV_15min = pd.Series(empty_array)
...

Next, the rolling average you calculate here:

for i, row in enumerate(data.LOAD[:(length)]):
 load_15min[i] = (data.LOAD.iloc[i*3] + data.LOAD.iloc[i*3+1] + data.LOAD.iloc[i*3+2]) / 3

Can probably be replaced by pd.rolling_mean:

load_15min[i] = pd.rolling_mean(data.LOAD, 3)

Also, you can use something like timeit to pinpoint what is most time consuming so you can focus on that.

I did not execute your code and verify my sugestions, So there may be sintax errors, please check my suggestions.

In any case, I hope they help.

BeetDemGuise BeetDemGuise 4,21612 silver badges29 bronze badges · Answer 1 · 2014-06-16 17:40:40Z

I want to speak towards some style and general Python improvements (of which there are quite a few) that you can make.

Repeated Code

Whenever you have to define a handful of variables that are all basically identical (or generated identically), you can simplify this with another structure. In your case, your sim_xx variables in monte can be all placed into a list:

sims = [np.zeros(x)]*24

With this change (and some iteration tricks), your whole function gets slimmed down to this:

def monte(x, wind, load, total_wind):
 wind_energy = np.zeros(x)
 sim_max = np.zeros(x)
 sim_min = np.zeros(x)
 sim_std = np.zeros(x)
 sim_mean = np.zeros(x)
 sims = [np.zeros(x)]*24
 for i in range(x):
 net_load, sim_wind = differences(wind, load, total_wind)
 len_wind = len(sim_wind)
 wind_energy[i] = sim_wind.mean() * len_wind 
 ramp = pd.Series(net_load).diff(1)
 ramp = ramp[1:]
 sim_max[i] = net_load.max()
 sim_min[i] = net_load.min()
 sim_std[i] = ramp.std()
 sim_mean[i] = ramp.mean()
 # Assign the values for each `sim`. This also generates
 # values in `val` of 5, 10, 15, ..., 98, 99, 100.
 val = 5
 for sim in sims:
 sim[i] = np.percentile(ramp, val)
 val += 5 if val < 95 else 1
 return tuple(arr.mean() for arr in [wind_energy, sim_max, sim_min,
 sim_std, sim_mean] + sims)

Note: There are more optimizations that I would suggest (use/return a dict). However, your return value is used in a parameter to a DataFrame. I do not know how they would affect the structure of your code.

You can implement this same idea later on in your code, e.g. when you are dealing with your CH_5min, BV_5min, etc. values.

Pull Code into Functions

In my code above, I have a nice loop that gives counts 5, 10, 15, 20, ..., 98, 99, 100. This would be useful in other sections of your code. So the best thing to do is pull it into a function (more specifically a generator):

def get_values(start=5, stop=100, threshold=95, less_than=5, greater_than=1):
 while start <= stop:
 yield start
 start += less_than if start < threshold else greater_than

This function will yield values incremented by a certain value until a specific threshold then increment values by a different value. It can be used like xrange and instead of doing this to create your indexes:

results = pd.DataFrame(results, index=['Wind Energy', 'NL max', 'NL min', 'NL ramp std', 'NL ramp mean', 'NL ramp 5', 'NL ramp 10', 
 'NL ramp 15', 'NL ramp 20', 'NL ramp 25', 'NL ramp 30', 'NL ramp 35', 'NL ramp 40', 
 'NL ramp 45', 'NL ramp 50', 'NL ramp 55', 'NL ramp 60', 'NL ramp 65', 'NL ramp 70',
 'NL ramp 75', 'NL ramp 80', 'NL ramp 85', 'NL ramp 90', 'NL ramp 95', 'NL ramp 96',
 'NL ramp 97','NL ramp 98','NL ramp 99','NL ramp 100'])

you can do this:

indices = ['Wind Energy', 'NL max', 'NL min', 'NL ramp std',
 'NL ramp mean'] + ['NL ramp {}'.format(val) for val in get_values()]
results = pd.DataFrame(results, index=indices)

Style Pointers

Take a look at PEP8, the official Python style guide. It will help your code look cleaner.
Use descriptive variable names. x tells us nothing about what it holds. Always err on the side of being to descriptive than being too terse.

Also, shy away from using capital letters. Convention says that variables are lowercase_with_underscores. The only time capital letters are used in conventional Python is for class names (PascalCase) and constants (ALL_CAPS).
Use format when creating strings with variable information. This is in the Style Pointers because the benefit of using string formatting over string concatenation is debatable. However, using format (as I have in above sections) makes your code more readable. Another example:
```
# Your original code...
results.to_csv(str(year) + 'Year' + ' Peak Load ' + str(load.max()) + '-' + 'results.csv') 
# becomes this.
results.to_csv('{}Year Peak Load {}-results.csv'.format(year, load.max()))
```
Try to keep your line length less than 80 characters. Long lines are especially irksome for users with small monitors (or portrait monitors).

Great review, but nowadays 120 chars seems like a much more reasonable limit.
@codesparkle I agree moreso with the 120 limit instead of the 80. I was mainly quoting PEP8 at that point.

Pablo Pablo 951 silver badge10 bronze badges · Answer 2 · 2014-06-16 16:03:19Z

A couple of observations:

You can factor out the initialisation step:

BV_5min = pd.Series(np.empty(length*4) * np.nan)
...

To:

empty_array = np.empty(length*4)
empty_array[:] = np.nan
BV_5min = pd.Series(empty_array)
...
BV_15min = pd.Series(empty_array)
...

Next, the rolling average you calculate here:

for i, row in enumerate(data.LOAD[:(length)]):
 load_15min[i] = (data.LOAD.iloc[i*3] + data.LOAD.iloc[i*3+1] + data.LOAD.iloc[i*3+2]) / 3

Can probably be replaced by pd.rolling_mean:

load_15min[i] = pd.rolling_mean(data.LOAD, 3)

Also, you can use something like timeit to pinpoint what is most time consuming so you can focus on that.

I did not execute your code and verify my sugestions, So there may be sintax errors, please check my suggestions.

In any case, I hope they help.

Stack Exchange Network

Taking wind data and simulating future wind profiles

2 Answers 2

Repeated Code

Pull Code into Functions

Style Pointers

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Taking wind data and simulating future wind profiles

2 Answers 2

Repeated Code

Pull Code into Functions

Style Pointers

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions