Code Summary
The following code is a data science script I've been working on that cross-validates a fixed effect model. I'm moving from R to Python and would appreciate feedback on the code below.
The code does the following:
Split data into train and test using a custom function that groups/clusters the data
Estimate a linear fixed effect model with train and test data
Calculate RMSE and tstat to verify independence of residuals
Prints RMSE, SE, and tstat from cross-validation exercise.
Note: the code downloads a remote data set, so the code can be run on its own.
Code
from urllib import request
from scipy import stats
import pandas as pd
import numpy as np
import statsmodels.api as sm
# Top-level status message emitted at import time, before any function runs.
print("Defining functions......")
def main():
    """
    Estimate baseline and degree day regressions via cross-validation.

    Downloads the corn-yield data set, cross-validates a baseline WLS
    fixed-effect model and a degree-day model, and collects the results.

    Returns:
        pandas.DataFrame with columns Regression, RMSE, se, t-stat, and
        change (fractional change in RMSE relative to the baseline row).
    """
    # Download remote data set from GitHub.
    # NOTE(review): unpickling data fetched over the network is unsafe if the
    # source is ever untrusted -- a plain format (CSV/parquet) would be safer.
    file_url = ("https://github.com/johnwoodill/corn_yield_pred/"
                "raw/master/data/full_data.pickle")
    print("Downloading custom data set from: ")
    print(file_url)
    request.urlretrieve(file_url, "full_data.pickle")
    cropdat = pd.read_pickle("full_data.pickle")

    # County fixed-effect dummies are identical for both models; build once
    # instead of recomputing get_dummies for each regression.
    fe_group = pd.get_dummies(cropdat.fips)

    # Baseline WLS regression cross-validation with FE and trends.
    print("Estimating Baseline Regression")
    basedat = cropdat[['ln_corn_yield', 'trend', 'trend_sq', 'corn_acres']]
    regdat = pd.concat([basedat, fe_group], axis=1)
    base_rmse, base_se, base_tstat = felm_cv(regdat, cropdat['trend'])

    # Degree-day regression cross-validation.
    print("Estimating Degree Day Regression")
    dddat = cropdat[['ln_corn_yield', 'dday0_10C', 'dday10_30C', 'dday30C',
                     'prec', 'prec_sq', 'trend', 'trend_sq', 'corn_acres']]
    regdat = pd.concat([dddat, fe_group], axis=1)
    ddreg_rmse, ddreg_se, ddreg_tstat = felm_cv(regdat, cropdat['trend'])

    # Assemble results as a DataFrame.
    fdat = pd.DataFrame(
        {'Regression': ['Baseline', 'Degree Day'],
         'RMSE': [base_rmse, ddreg_rmse],
         'se': [base_se, ddreg_se],
         't-stat': [base_tstat, ddreg_tstat]},
        columns=['Regression', 'RMSE', 'se', 't-stat'])

    # Fractional change in RMSE relative to the baseline (first) row.
    fdat['change'] = (fdat['RMSE'] - fdat['RMSE'].iloc[0]) / fdat['RMSE'].iloc[0]
    return fdat
def felm_rmse(y_train, x_train, weights, y_test, x_test):
    """
    Fit a WLS model on training data and evaluate it on test data.

    Arguments:
        y_train: dependent variable - full or training data
        x_train: covariates - full or training data
        weights: observation weights for WLS
        y_test: dependent variable - test data
        x_test: covariates - test data

    Returns:
        Tuple of (test-set RMSE, Welch t-statistic comparing training
        residuals against test residuals).
    """
    # Fit the weighted least-squares model on the training sample.
    fitted = sm.WLS(y_train, x_train, weights=weights).fit()

    # Out-of-sample residuals: observed minus predicted test values.
    predicted = fitted.predict(x_test)
    test_resid = y_test[:] - predicted.values

    # Welch's t-test (unequal variances) checks whether train and test
    # residuals look like draws from populations with the same mean,
    # i.e. whether the residuals are independent of the split.
    t_stat = stats.ttest_ind(fitted.resid, test_resid, equal_var=False)[0]

    rmse = np.sqrt(np.mean(test_resid ** 2))
    return (rmse, t_stat)
def gc_kfold_cv(data, group, begin, end):
    """
    Custom group/cluster data split for cross-validation of panel data.

    Ensures whole groups are held out together so that train and test
    residuals can be checked for independence.

    Arguments:
        data: DataFrame to split
        group: Series of group/cluster labels (row-aligned with data)
        begin: first group label (inclusive) assigned to the test set
        end: one past the last group label assigned to the test set

    Return:
        dict with key 1 -> training DataFrame and key 2 -> test DataFrame
        (test rows are those whose group label lies in range(begin, end)).
    """
    # Attach the group labels as a column so we can filter on them.
    data = data.assign(group=group.values)

    # Compute the membership mask once; rows in [begin, end) form the test
    # set and everything else is the training set.
    in_test = data['group'].isin(range(begin, end))
    test = data[in_test]
    train = data[~in_test]

    # Keep the original return contract: a dict keyed 1 (train) / 2 (test).
    return {1: train, 2: test}
def felm_cv(regdata, group):
    """
    Cross-validate a WLS fixed-effect model with a rolling group split.

    Arguments:
        regdata: regression data (dep. variable, covariates, weights, FE dummies)
        group: Series of cluster labels used to form the test folds

    Returns:
        Tuple of (mean RMSE, standard deviation of RMSE, mean t-stat)
        across all folds.
    """
    retrmse = []
    rettstat = []
    # Roll the test window across the clusters: fold j holds out group
    # labels [j, j + 4) and trains on the rest.
    for j in range(1, 28):
        tset = gc_kfold_cv(regdata, group, j, j + 4)
        # tset[1] is the training split, tset[2] the test split.
        y_train = tset[1].ln_corn_yield
        x_train = tset[1].drop(['ln_corn_yield', 'corn_acres'], axis=1)
        weights = tset[1].corn_acres
        y_test = tset[2].ln_corn_yield
        x_test = tset[2].drop(['ln_corn_yield', 'corn_acres'], axis=1)
        # Score this fold and collect the statistics.
        inrmse, t_stat = felm_rmse(y_train, x_train, weights, y_test, x_test)
        retrmse.append(inrmse)
        rettstat.append(t_stat)
    # BUG FIX: the original returned np.mean(t_stat) -- the t-stat of the
    # last fold only -- instead of the mean over all collected folds.
    # Also moved the return out of the loop (no `if j == 27` guard needed).
    return (np.mean(retrmse), np.std(retrmse), np.mean(rettstat))
if __name__ == "__main__":
    RDAT = main()
    print(RDAT)
    # Print results. Column order in RDAT: 0 Regression, 1 RMSE, 2 se,
    # 3 t-stat, 4 change.
    print("---Results--------------------------------------------")
    # BUG FIX: the baseline t-stat was printed from the RMSE column
    # (iloc[0, 1]); it now reads the t-stat column (iloc[0, 3]).
    print("Baseline: ", round(RDAT.iloc[0, 1], 2), "(RMSE)",
          round(RDAT.iloc[0, 2], 2), "(se)",
          round(RDAT.iloc[0, 3], 3), "(t-stat)")
    # BUG FIX: the degree-day s.e. was printed from the baseline row
    # (iloc[0, 2]); it now reads the degree-day row (iloc[1, 2]).
    print("Degree Day: ", round(RDAT.iloc[1, 1], 2), "(RMSE)",
          round(RDAT.iloc[1, 2], 2), "(se)",
          round(RDAT.iloc[1, 3], 2), "(t-stat)")
    print("------------------------------------------------------")
    print("% Change from Baseline: ", round(RDAT.iloc[1, 4], 4)*100, "%")
    print("------------------------------------------------------")
2 Answers
A first analysis. Once I have more time, I'll try to look at what happens with the data exactly; these are some remarks about the general code quality:
pickle
from the python documentation:
Warning
The pickle module is not secure against erroneous or maliciously constructed data. Never unpickle data received from an untrusted or unauthenticated source.
If this is partially processed data, another intermediary format like feather or parquet or so might be more appropriate
Functions
I would make even more functions, instead of cramming everything into `main`. Actually, almost everywhere you do a `print('<doing this>')`, I would make a separate function:
- `fetch_data`
- `baseline_regression`
- `degree_day`
- ...
caching
Instead of downloading the file each time you do the analysis, you might try to cache it
dataframe indexing
sometimes you use `df['<key>']`, sometimes `df.<key>`. Try to be consistent.
return values
in `gc_kfold_cv` you use 5 lines to put the result in a dictionary, with `1` and `2` as keys. Simpler would be to return a tuple `test, train` or a dict `{'test': test, 'train': train}`.
keyword arguments
in `tset[2].drop(['ln_corn_yield', 'corn_acres'], 1)`, the significance of the `1` is unclear, so better use `axis=1` or `axis='columns'`.
In general the code looks great and clean; I saw just a couple of things that looked strange, marked with ##!##:
def gc_kfold_cv(data, group, begin, end):
    """
    Custom group/cluster data split for cross-validation of panel data.
    (Ensure groups are clustered and train and test residuals are independent)
    Arguments:
        data: data to filter with 'trend'
        group: group to cluster
        begin: start of cluster
        end: end of cluster
    Return:
        Return test and train data for Group-by-Cluster Cross-validation method
    """
    # Get group data
    data = data.assign(group=group.values)
    # Filter test and train based on begin and end
    test = data[data['group'].isin(range(begin, end))]
    train = data[~data['group'].isin(range(begin, end))]
    # Return train and test
    dfs = {}
    tsets = [train, test]
    # Combine train and test to return dfs
    ##!## In the felm_cv method you already changed the definition
    ##!## of the loop; here you can do the same.
    for val in range(2):
        dfs[val + 1] = tsets[val]
    return dfs
def felm_cv(regdata, group):
    """
    Cross-validate WLS FE model
    Arguments:
        regdata: regression data
        group: group fixed effect
    Returns:
        return mean RMSE, standard error, and mean tstat from ttest
    """
    # Collect per-fold RMSE and t-stats while rolling the test window.
    retrmse = []
    rettstat = []
    for j in range(1, 28):
        # Get test and training data; keys: 1 = train, 2 = test
        tset = gc_kfold_cv(regdata, group, j, j + 4)
        # Separate y_train, x_train, y_test, x_test, and weights
        y_train = tset[1].ln_corn_yield
        x_train = tset[1].drop(['ln_corn_yield', 'corn_acres'], 1)
        weights = tset[1].corn_acres
        y_test = tset[2].ln_corn_yield
        x_test = tset[2].drop(['ln_corn_yield', 'corn_acres'], 1)
        # Get RMSE and tstat from train and test data
        inrmse, t_stat = felm_rmse(y_train, x_train, weights, y_test, x_test)
        # Append RMSE and tstats to return
        retrmse.append(inrmse)
        rettstat.append(t_stat)
    ##!## If you want to do something at the end of a loop, you
    ##!## just need to write the code outside it (one indentation level lower)
    # NOTE(review): np.mean(t_stat) averages only the last fold's t-stat;
    # rettstat holds all folds -- this likely should be np.mean(rettstat).
    return (np.mean(retrmse), np.std(retrmse), np.mean(t_stat))
-
\$\begingroup\$ Great, thank you. Good point on the last loop. How do you feel about including `__main__` in data science scripts like this? If I'm not loading it as a module, is it OK to leave it off? \$\endgroup\$ — Amstell, commented Aug 28, 2018 at 14:00
\$\begingroup\$ If you are sure that you will never call it as a module you "could", but it is good practice and style to leave it like this. \$\endgroup\$ — ImanolUr, commented Aug 29, 2018 at 8:04