Commit 4311a1b

author

Algorithmica

authored

Add files via upload

1 parent ebf3c26 commit 4311a1bCopy full SHA for 4311a1b

File tree

7 files changed

+335

-0

lines changed

2019-october/18.time series forecasting

7 files changed

+335

-0

lines changed

`‎2019-october/18.time series forecasting/timeseries-ar(gridsearch).py‎`

Lines changed: 84 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,84 @@`
	`1`	`+import math`
	`2`	`+import pandas as pd`
	`3`	`+import numpy as np`
	`4`	`+import os`
	`5`	`+from sklearn import metrics`
	`6`	`+from sklearn.model_selection import TimeSeriesSplit`
	`7`	`+from statsmodels.tsa import ar_model`
	`8`	`+import matplotlib.pyplot as plt`
	`9`	`+`
	`10`	`+def grid_search_best_model_timeseries_ar(df, grid, cv):`
	`11`	`+ best_param = None`
	`12`	`+ best_score = np.infty`
	`13`	`+ tsp = TimeSeriesSplit(n_splits=cv)`
	`14`	`+`
	`15`	`+ for param in grid.get('lags'):`
	`16`	`+ scores = []`
	`17`	`+ for train_ind, test_ind in tsp.split(df):`
	`18`	`+ train_data = df.iloc[train_ind]`
	`19`	`+ test_data = df.iloc[test_ind]`
	`20`	`+ try:`
	`21`	`+ #print(train_data, test_data)`
	`22`	`+ estimator = ar_model.AutoReg(train_data, lags=param)`
	`23`	`+ res = estimator.fit()`
	`24`	`+ #print(res.params)`
	`25`	`+ #get out of sample predictions with test data start and end`
	`26`	`+ pred = estimator.predict(res.params, test_data.index[0], test_data.index[-1])`
	`27`	`+ #print(pred)`
	`28`	`+ y_pred = pred.values.reshape(-1)`
	`29`	`+ y_test = test_data.values.reshape(-1)`
	`30`	`+ score = math.sqrt(metrics.mean_squared_error(y_test, y_pred))`
	`31`	`+ scores.append(score)`
	`32`	`+ except:`
	`33`	`+ pass`
	`34`	`+ #print(scores)`
	`35`	`+ if len(scores) > 0 and np.mean(scores) < best_score :`
	`36`	`+ best_score = np.mean(scores)`
	`37`	`+ best_param = param`
	`38`	`+`
	`39`	`+ if best_param is not None:`
	`40`	`+ estimator = ar_model.AutoReg(df, lags=best_param)`
	`41`	`+ res = estimator.fit()`
	`42`	`+ print("best parameters:" + str(best_param))`
	`43`	`+ print("validation rmse:" + str(best_score))`
	`44`	`+ #get insample predictions with start and end indices`
	`45`	`+ predictions = estimator.predict(res.params, start=0, end=df.shape[0]-1 )`
	`46`	`+ y_pred = predictions.values.reshape(-1)`
	`47`	`+ y_train = df.values.reshape(-1)[best_param:]`
	`48`	`+ train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))`
	`49`	`+ print("train rmse:" + str(train_rmse))`
	`50`	`+ return estimator, res`
	`51`	`+ else:`
	`52`	`+ return None, None`
	`53`	`+`
	`54`	`+path = 'F:/'`
	`55`	`+df = pd.read_csv(os.path.join(path,'uk-deaths-from-bronchitis-emphys.csv'))`
	`56`	`+df.info()`
	`57`	`+`
	`58`	`+df.columns = ['timestamp', 'y']`
	`59`	`+df.index = pd.to_datetime(df['timestamp'], format='%Y-%m')`
	`60`	`+df.index.freq = 'MS'`
	`61`	`+df.drop('timestamp', axis=1, inplace=True)`
	`62`	`+`
	`63`	`+#grid search and get final model with best parameters`
	`64`	`+ar_grid = { 'lags':[2,3,4,5] }`
	`65`	`+estimator, res = grid_search_best_model_timeseries_ar(df, ar_grid, 3)`
	`66`	`+print(res.params)`
	`67`	`+print(res.summary())`
	`68`	`+plt.plot(df)`
	`69`	`+plt.figure()`
	`70`	`+res.resid.plot()`
	`71`	`+`
	`72`	`+#get predictions for future(implicit intervals based on freq of train data)`
	`73`	`+start_index = pd.datetime(1980, 1, 1)`
	`74`	`+end_index = pd.datetime(1990, 12, 1)`
	`75`	`+pred = estimator.predict(res.params, start_index, end_index)`
	`76`	`+print(pred)`
	`77`	`+`
	`78`	`+#get predictions for future(explicit intervals)`
	`79`	`+index = pd.date_range('1-1-1980', '12-1-1990', freq='MS')`
	`80`	`+pred = estimator.predict(res.params, index[0], index[-1])`
	`81`	`+print(pred)`
	`82`	`+`
	`83`	`+plt.figure()`
	`84`	`+plt.plot(pred)`

`‎2019-october/18.time series forecasting/timeseries-arima(gridsearch).py‎`

Lines changed: 90 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,90 @@`
	`1`	`+import math`
	`2`	`+import pandas as pd`
	`3`	`+import numpy as np`
	`4`	`+import os`
	`5`	`+from sklearn import metrics`
	`6`	`+from sklearn.model_selection import TimeSeriesSplit`
	`7`	`+from itertools import product`
	`8`	`+from statsmodels.tsa import arima_model`
	`9`	`+import matplotlib.pyplot as plt`
	`10`	`+`
	`11`	`+def grid_search_best_model_timeseries_arima(df, grid, cv):`
	`12`	`+ keys, values = zip(*grid.items())`
	`13`	`+ params =[]`
	`14`	`+ for v in product(*values):`
	`15`	`+ params.append(tuple(v))`
	`16`	`+`
	`17`	`+ print(params)`
	`18`	`+ best_param = None`
	`19`	`+ best_score = np.infty`
	`20`	`+ tsp = TimeSeriesSplit(n_splits=cv)`
	`21`	`+`
	`22`	`+ for param in params:`
	`23`	`+ scores = []`
	`24`	`+ for train_ind, test_ind in tsp.split(df):`
	`25`	`+ train_data = df.iloc[train_ind]`
	`26`	`+ test_data = df.iloc[test_ind]`
	`27`	`+ try:`
	`28`	`+ #print(train_data, test_data)`
	`29`	`+ estimator = arima_model.ARIMA(train_data, order=param)`
	`30`	`+ res = estimator.fit()`
	`31`	`+ #print(res.params)`
	`32`	`+ #get out of sample predictions with test data start and end`
	`33`	`+ y_pred = estimator.predict(res.params, test_data.index[0], test_data.index[-1])`
	`34`	`+ #print(y_pred)`
	`35`	`+ y_test = test_data.values.reshape(-1)`
	`36`	`+ score = math.sqrt(metrics.mean_squared_error(y_test, y_pred))`
	`37`	`+ scores.append(score)`
	`38`	`+ except:`
	`39`	`+ pass`
	`40`	`+ #print(scores)`
	`41`	`+ if len(scores) > 0 and np.mean(scores) < best_score :`
	`42`	`+ best_score = np.mean(scores)`
	`43`	`+ best_param = param`
	`44`	`+`
	`45`	`+ if best_param is not None:`
	`46`	`+ estimator = arima_model.ARIMA(df, order=best_param)`
	`47`	`+ res = estimator.fit()`
	`48`	`+ print("best parameters:" + str(best_param))`
	`49`	`+ print("validation rmse:" + str(best_score))`
	`50`	`+ #get insample predictions with start and end indices`
	`51`	`+ y_pred = estimator.predict(res.params, start=0, end=df.shape[0]-1 )`
	`52`	`+ y_train = df.values.reshape(-1)`
	`53`	`+ train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))`
	`54`	`+ print("train rmse:" + str(train_rmse))`
	`55`	`+ return estimator, res`
	`56`	`+ else:`
	`57`	`+ return None, None`
	`58`	`+`
	`59`	`+path = 'F:/'`
	`60`	`+df = pd.read_csv(os.path.join(path,'uk-deaths-from-bronchitis-emphys.csv'))`
	`61`	`+df.info()`
	`62`	`+`
	`63`	`+df.columns = ['timestamp', 'y']`
	`64`	`+df.index = pd.to_datetime(df['timestamp'], format='%Y-%m')`
	`65`	`+df.index.freq = 'MS'`
	`66`	`+df.drop('timestamp', axis=1, inplace=True)`
	`67`	`+`
	`68`	`+#grid search and get final model with best parameters`
	`69`	`+arima_grid = { 'p':[0,1,5,7], 'd':[0,1,2], 'q':[0,1,2,5] }`
	`70`	`+estimator, res = grid_search_best_model_timeseries_arima(df, arima_grid, 3)`
	`71`	`+print(res.params)`
	`72`	`+print(res.summary())`
	`73`	`+plt.plot(df)`
	`74`	`+plt.figure()`
	`75`	`+res.resid.plot()`
	`76`	`+`
	`77`	`+#get predictions for future(implicit intervals based on freq of train data)`
	`78`	`+start_index = pd.datetime(1980, 1, 1)`
	`79`	`+end_index = pd.datetime(1990, 12, 1)`
	`80`	`+pred = estimator.predict(res.params, start_index, end_index)`
	`81`	`+print(pred)`
	`82`	`+`
	`83`	`+#get predictions for future(explicit intervals)`
	`84`	`+index = pd.date_range('1-1-1980', '12-1-1990', freq='MS')`
	`85`	`+pred = estimator.predict(res.params, index[0], index[-1])`
	`86`	`+pred = pd.Series(pred, index=index)`
	`87`	`+print(pred)`
	`88`	`+`
	`89`	`+plt.figure()`
	`90`	`+plt.plot(pred)`

`‎2019-october/18.time series forecasting/timeseries-arma(gridsearch).py‎`

Lines changed: 90 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,90 @@`
	`1`	`+import math`
	`2`	`+import pandas as pd`
	`3`	`+import numpy as np`
	`4`	`+import os`
	`5`	`+from sklearn import metrics`
	`6`	`+from sklearn.model_selection import TimeSeriesSplit`
	`7`	`+from itertools import product`
	`8`	`+from statsmodels.tsa import arima_model`
	`9`	`+import matplotlib.pyplot as plt`
	`10`	`+`
	`11`	`+def grid_search_best_model_timeseries_arma(df, grid, cv):`
	`12`	`+ keys, values = zip(*grid.items())`
	`13`	`+ params =[]`
	`14`	`+ for v in product(*values):`
	`15`	`+ params.append(tuple(v))`
	`16`	`+`
	`17`	`+ print(params)`
	`18`	`+ best_param = None`
	`19`	`+ best_score = np.infty`
	`20`	`+ tsp = TimeSeriesSplit(n_splits=cv)`
	`21`	`+`
	`22`	`+ for param in params:`
	`23`	`+ scores = []`
	`24`	`+ for train_ind, test_ind in tsp.split(df):`
	`25`	`+ train_data = df.iloc[train_ind]`
	`26`	`+ test_data = df.iloc[test_ind]`
	`27`	`+ try:`
	`28`	`+ #print(train_data, test_data)`
	`29`	`+ estimator = arima_model.ARMA(train_data, order=param)`
	`30`	`+ res = estimator.fit()`
	`31`	`+ #print(res.params)`
	`32`	`+ #get out of sample predictions with test data start and end`
	`33`	`+ y_pred = estimator.predict(res.params, test_data.index[0], test_data.index[-1])`
	`34`	`+ #print(y_pred)`
	`35`	`+ y_test = test_data.values.reshape(-1)`
	`36`	`+ score = math.sqrt(metrics.mean_squared_error(y_test, y_pred))`
	`37`	`+ scores.append(score)`
	`38`	`+ except:`
	`39`	`+ pass`
	`40`	`+ #print(scores)`
	`41`	`+ if len(scores) > 0 and np.mean(scores) < best_score :`
	`42`	`+ best_score = np.mean(scores)`
	`43`	`+ best_param = param`
	`44`	`+`
	`45`	`+ if best_param is not None:`
	`46`	`+ estimator = arima_model.ARMA(df, order=best_param)`
	`47`	`+ res = estimator.fit()`
	`48`	`+ print("best parameters:" + str(best_param))`
	`49`	`+ print("validation rmse:" + str(best_score))`
	`50`	`+ #get insample predictions with start and end indices`
	`51`	`+ y_pred = estimator.predict(res.params, start=0, end=df.shape[0]-1 )`
	`52`	`+ y_train = df.values.reshape(-1)`
	`53`	`+ train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))`
	`54`	`+ print("train rmse:" + str(train_rmse))`
	`55`	`+ return estimator, res`
	`56`	`+ else:`
	`57`	`+ return None, None`
	`58`	`+`
	`59`	`+path = 'F:/'`
	`60`	`+df = pd.read_csv(os.path.join(path,'uk-deaths-from-bronchitis-emphys.csv'))`
	`61`	`+df.info()`
	`62`	`+`
	`63`	`+df.columns = ['timestamp', 'y']`
	`64`	`+df.index = pd.to_datetime(df['timestamp'], format='%Y-%m')`
	`65`	`+df.index.freq = 'MS'`
	`66`	`+df.drop('timestamp', axis=1, inplace=True)`
	`67`	`+`
	`68`	`+#grid search and get final model with best parameters`
	`69`	`+arma_grid = { 'p':[0,1,2,3,5,7], 'q':[1,2,3,5,7] }`
	`70`	`+estimator, res = grid_search_best_model_timeseries_arma(df, arma_grid, 3)`
	`71`	`+print(res.params)`
	`72`	`+print(res.summary())`
	`73`	`+plt.plot(df)`
	`74`	`+plt.figure()`
	`75`	`+res.resid.plot()`
	`76`	`+`
	`77`	`+#get predictions for future(implicit intervals based on freq of train data)`
	`78`	`+start_index = pd.datetime(1980, 1, 1)`
	`79`	`+end_index = pd.datetime(1990, 12, 1)`
	`80`	`+pred = estimator.predict(res.params, start_index, end_index)`
	`81`	`+print(pred)`
	`82`	`+`
	`83`	`+#get predictions for future(explicit intervals)`
	`84`	`+index = pd.date_range('1-1-1980', '12-1-1990', freq='MS')`
	`85`	`+pred = estimator.predict(res.params, index[0], index[-1])`
	`86`	`+pred = pd.Series(pred, index=index)`
	`87`	`+print(pred)`
	`88`	`+`
	`89`	`+plt.figure()`
	`90`	`+plt.plot(pred)`

`‎2019-october/18.time series forecasting/timeseries-cross validation.py‎`

Lines changed: 23 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,23 @@`
	`1`	`+import pandas as pd`
	`2`	`+import os`
	`3`	`+from sklearn.model_selection import TimeSeriesSplit`
	`4`	`+import statsmodels`
	`5`	`+print(statsmodels.__version__)`
	`6`	`+`
	`7`	`+path = 'F:/'`
	`8`	`+df = pd.read_csv(os.path.join(path,'uk-deaths-from-bronchitis-emphys.csv'))`
	`9`	`+df.info()`
	`10`	`+`
	`11`	`+df.columns = ['timestamp', 'y']`
	`12`	`+df.index = pd.to_datetime(df['timestamp'], format='%Y-%m')`
	`13`	`+df.drop('timestamp', axis=1, inplace=True)`
	`14`	`+`
	`15`	`+tsp = TimeSeriesSplit(n_splits=3)`
	`16`	`+`
	`17`	`+for train_ind, test_ind in tsp.split(df):`
	`18`	`+ print(train_ind, test_ind)`
	`19`	`+`
	`20`	`+for train_ind, test_ind in tsp.split(df):`
	`21`	`+ train_data = df.iloc[train_ind]`
	`22`	`+ test_data = df.iloc[test_ind]`
	`23`	`+ print(train_data, test_data)`

`‎2019-october/18.time series forecasting/timeseries-model building and evaluation.py‎`

Lines changed: 48 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,48 @@`
	`1`	`+import math`
	`2`	`+import pandas as pd`
	`3`	`+import numpy as np`
	`4`	`+import os`
	`5`	`+from sklearn import metrics`
	`6`	`+from sklearn.model_selection import TimeSeriesSplit`
	`7`	`+from statsmodels.tsa import ar_model`
	`8`	`+`
	`9`	`+path = 'F:/'`
	`10`	`+df = pd.read_csv(os.path.join(path,'uk-deaths-from-bronchitis-emphys.csv'))`
	`11`	`+df.info()`
	`12`	`+`
	`13`	`+df.columns = ['timestamp', 'y']`
	`14`	`+df.index = pd.to_datetime(df['timestamp'], format='%Y-%m').copy()`
	`15`	`+df.index.freq = 'MS'`
	`16`	`+df.drop('timestamp', axis=1, inplace=True)`
	`17`	`+`
	`18`	`+#build model`
	`19`	`+estimator = ar_model.AutoReg(df, lags=5)`
	`20`	`+res = estimator.fit()`
	`21`	`+print(res.params)`
	`22`	`+print(res.model)`
	`23`	`+print(res.summary())`
	`24`	`+`
	`25`	`+#using model`
	`26`	`+predictions = estimator.predict(res.params, start=0, end=df.shape[0]-1 )`
	`27`	`+print(predictions)`
	`28`	`+y_pred = predictions.values.reshape(-1)`
	`29`	`+y_train = df.values.reshape(-1)[5:]`
	`30`	`+train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))`
	`31`	`+print(train_rmse)`
	`32`	`+`
	`33`	`+#evaluate model`
	`34`	`+tsp = TimeSeriesSplit(n_splits=3)`
	`35`	`+scores = []`
	`36`	`+`
	`37`	`+for train_ind, test_ind in tsp.split(df):`
	`38`	`+ train_data = df.iloc[train_ind]`
	`39`	`+ test_data = df.iloc[test_ind]`
	`40`	`+ estimator = ar_model.AutoReg(df, lags=5)`
	`41`	`+ res = estimator.fit()`
	`42`	`+ pred = estimator.predict(res.params,test_data.index[0], test_data.index[-1])`
	`43`	`+ y_pred = pred.values.reshape(-1)`
	`44`	`+ y_test = test_data.values.reshape(-1)`
	`45`	`+ score = math.sqrt(metrics.mean_squared_error(y_test, y_pred))`
	`46`	`+ scores.append(score)`
	`47`	`+print(scores)`
	`48`	`+print(np.mean(scores))`

`‎2019-october/18.time series forecasting/timeseries-preprocessing.py‎`

1.66 KB

Binary file not shown.

`‎2019-october/18.time series forecasting/timeseries-properties.py‎`

2.45 KB

Binary file not shown.

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 4311a1b

File tree

7 files changed

7 files changed

`‎2019-october/18.time series forecasting/timeseries-ar(gridsearch).py‎`

`‎2019-october/18.time series forecasting/timeseries-arima(gridsearch).py‎`

`‎2019-october/18.time series forecasting/timeseries-arma(gridsearch).py‎`

`‎2019-october/18.time series forecasting/timeseries-cross validation.py‎`

`‎2019-october/18.time series forecasting/timeseries-model building and evaluation.py‎`

`‎2019-october/18.time series forecasting/timeseries-preprocessing.py‎`

`‎2019-october/18.time series forecasting/timeseries-properties.py‎`

0 commit comments