Commit b7ffb7d
Author: Algorithmica
Commit message: Add files via upload
1 parent 04f29aa commit b7ffb7d

File tree: 8 files changed, +807 additions, -0 deletions
Lines changed: 111 additions & 0 deletions

import sys
path = 'E://New Folder/utils'
sys.path.append(path)

import common_utils as utils
import regression_utils as rutils
from sklearn import metrics, model_selection, neighbors, linear_model, decomposition, manifold
import math
import pandas as pd
import os
import seaborn as sns
import numpy as np

def log_rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_log_error(y_orig, y_pred))

def rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_error(y_orig, y_pred))


path = 'E://house-prices'
house_train = pd.read_csv(os.path.join(path, "train.csv"))
house_train.shape
house_train.info()

house_test = pd.read_csv(os.path.join(path, "test.csv"))
house_test.shape
house_test.info()

#stack train and test so imputation and encoding see the same categories
house = pd.concat((house_train, house_test), axis=0)
house.shape
house.info()

print(utils.get_continuous_features(house))
print(utils.get_non_continuous_features(house))

#exploratory plots (note: distplot and FacetGrid's size= are deprecated in
#newer seaborn; use histplot/displot and height= there)
sns.distplot(house_train['SalePrice'])
sns.countplot(x='GarageType', data=house_train)
sns.jointplot(x="SalePrice", y="YearBuilt", data=house_train)
sns.FacetGrid(house_train, hue="YearBuilt", size=8).map(sns.kdeplot, "SalePrice").add_legend()

sns.countplot(x='YrSold', data=house_train)
sns.jointplot(x="SalePrice", y="YrSold", data=house_train)
sns.FacetGrid(house_train, hue="YrSold", size=8).map(sns.kdeplot, "SalePrice").add_legend()

#type cast features
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house, features_to_cast)

#manual feature selection: drop the target as well (as the other scripts in
#this commit do), otherwise the NaN-imputed SalePrice column of the stacked
#frame leaks into the feature matrix
features_to_drop = ['Id', 'SalePrice']
missing_features_above_th = utils.get_features_to_drop_on_missingdata(house, 0.25)
features_to_drop.extend(missing_features_above_th)
house1 = utils.drop_features(house, features_to_drop)
house1.info()

#impute categorical features
imputable_cat_features = utils.get_non_continuous_features(house1)
cat_imputer = utils.get_categorical_imputers(house1, imputable_cat_features)
house1[imputable_cat_features] = cat_imputer.transform(house1[imputable_cat_features])

#impute continuous features
imputable_cont_features = utils.get_continuous_features(house1)
cont_imputer = utils.get_continuous_imputers(house1, imputable_cont_features)
house1[imputable_cont_features] = cont_imputer.transform(house1[imputable_cont_features])
house1.info()

#one-hot encoding of all categorical features
house2 = utils.ohe(house1, imputable_cat_features)

#scale the data
scaler = utils.get_scaler(house2)
house3 = scaler.transform(house2)
house3 = pd.DataFrame(house3, columns=house2.columns)

#the first house_train.shape[0] rows of the stacked frame are the training rows
X_train = house3[:house_train.shape[0]]
y_train = house_train['SalePrice']

#feature selection with Lasso (zero coefficients mark dropped features)
lasso_selector = linear_model.Lasso()
lasso_selector.fit(X_train, y_train)
print(lasso_selector.coef_)
utils.plot_feature_importances(lasso_selector, X_train, 40)

X_train1 = utils.select_features(lasso_selector, X_train)

#reduce features for visualization
utils.corr_heatmap(X_train1)
lpca = decomposition.PCA(n_components=0.95)
lpca.fit(X_train1)
print(np.cumsum(lpca.explained_variance_ratio_))
pca_data = lpca.transform(X_train1)

tsne = manifold.TSNE(n_components=3)
tsne_data = tsne.fit_transform(pca_data)
rutils.plot_data_3d_regression(tsne_data, y_train)

#build model with regression machine learning algorithms
#(make_scorer negates log_rmse so grid search can maximize it)
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

knn_estimator = neighbors.KNeighborsRegressor()
knn_grid = {'n_neighbors': list(range(5, 15))}
final_model = utils.grid_search_best_model(knn_estimator, knn_grid, pca_data, y_train, scoring=scoring)

#apply the same transformations to the test rows of the stacked frame
X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape

house_test['SalePrice'] = final_model.predict(pca_test_data)
house_test.to_csv(os.path.join(path, "submission.csv"), columns=["Id", "SalePrice"], index=False)
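The common_utils module imported above is the author's own helper library and is not among the diffs rendered on this page. As a reading aid, here is a minimal, hypothetical sketch of the helpers this script calls, with names and signatures inferred purely from the call sites; the imputation strategies, the non-zero-coefficient selection rule, and the cv=10 in the grid-search wrapper are assumptions, not the author's code, and plotting helpers such as plot_feature_importances and corr_heatmap are omitted:

#hypothetical reconstruction of common_utils, inferred from usage only
import numpy as np
import pandas as pd
from sklearn import impute, model_selection, preprocessing

def get_continuous_features(df):
    #treat numeric dtypes as continuous
    return df.select_dtypes(include=['number']).columns.tolist()

def get_non_continuous_features(df):
    return df.select_dtypes(exclude=['number']).columns.tolist()

def cast_to_cat(df, features):
    for f in features:
        df[f] = df[f].astype('category')

def get_features_to_drop_on_missingdata(df, threshold):
    #features whose fraction of missing values exceeds the threshold
    ratios = df.isnull().sum() / df.shape[0]
    return ratios[ratios > threshold].index.tolist()

def drop_features(df, features):
    return df.drop(columns=features, errors='ignore')

def get_categorical_imputers(df, features):
    imputer = impute.SimpleImputer(strategy='most_frequent')  #assumed strategy
    return imputer.fit(df[features])

def get_continuous_imputers(df, features):
    imputer = impute.SimpleImputer(strategy='mean')  #assumed strategy
    return imputer.fit(df[features])

def ohe(df, features):
    return pd.get_dummies(df, columns=features)

def get_scaler(df):
    return preprocessing.StandardScaler().fit(df)

def select_features(estimator, X):
    #assumed rule: keep columns with non-zero Lasso coefficients
    mask = np.abs(estimator.coef_) > 0
    return X.loc[:, mask]

def grid_search_best_model(estimator, grid, X, y, scoring):
    search = model_selection.GridSearchCV(estimator, grid, cv=10, scoring=scoring)  #cv assumed
    search.fit(X, y)
    print(search.best_params_, search.best_score_)
    return search.best_estimator_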
Lines changed: 83 additions & 0 deletions

import sys
path = 'G://New Folder/utils'
sys.path.append(path)

import common_utils as utils
import regression_utils as rutils
from sklearn import metrics, model_selection, neighbors, linear_model, decomposition, manifold, feature_selection, preprocessing, pipeline, impute, compose
import math
import pandas as pd
import os
import numpy as np

def log_rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_log_error(y_orig, y_pred))

def rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_error(y_orig, y_pred))

path = 'G://house-prices'
house_train = pd.read_csv(os.path.join(path, "train.csv"))
house_train.shape
house_train.info()

#type cast features
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house_train, features_to_cast)

#manual feature selection
features_to_drop = ['Id', 'SalePrice']
missing_features_above_th = utils.get_features_to_drop_on_missingdata(house_train, 0.25)
features_to_drop.extend(missing_features_above_th)
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.info()

#build pipeline for categorical features
#(note: newer scikit-learn renamed OneHotEncoder's sparse= to sparse_output=)
categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

#build pipeline for numerical features
numerical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler())
])

#build preprocessing pipeline for all features
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)

preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])

#build complete pipeline with feature selection and ml algorithms
complete_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('feature_selector', feature_selection.SelectFromModel(linear_model.Lasso())),
    ('pca', decomposition.PCA()),
    ('regressor', neighbors.KNeighborsRegressor())
])
pipeline_grid = {'preprocess__num__imputer__strategy': ['mean', 'median'],
                 'pca__n_components': [0.90, 0.95],
                 'regressor__n_neighbors': list(range(5, 15))}

#build model with pipeline
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
pipeline_generated = utils.grid_search_best_model(complete_pipeline, pipeline_grid, house_train1, house_train['SalePrice'], scoring=scoring)

#read test data
house_test = pd.read_csv(os.path.join(path, "test.csv"))
house_test.shape
house_test.info()
house_test['SalePrice'] = None

#apply preprocessing required before pipeline
utils.cast_to_cat(house_test, features_to_cast)
house_test1 = utils.drop_features(house_test, features_to_drop)
house_test1.info()

#get predictions on test data with constructed pipeline
house_test['SalePrice'] = np.round(pipeline_generated.predict(house_test1), 2)
house_test.to_csv(os.path.join(path, "submission.csv"), columns=["Id", "SalePrice"], index=False)
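The double-underscore keys in pipeline_grid address parameters of nested steps: 'preprocess__num__imputer__strategy' drills from the ColumnTransformer into the numerical sub-pipeline's SimpleImputer. Assuming grid_search_best_model is a thin wrapper over GridSearchCV (as sketched after the first script; the cv value below is an assumption), the equivalent direct call would be:

from sklearn import model_selection

#step name, then nested step names, then the parameter, joined by '__'
search = model_selection.GridSearchCV(
    complete_pipeline,
    pipeline_grid,
    scoring=scoring,   #greater_is_better=False means scores are negated log-RMSE
    cv=5,              #assumed; the wrapper's actual cv setting is not shown
)
search.fit(house_train1, house_train['SalePrice'])
print(-search.best_score_)   #undo the sign flip to report the best log-RMSE
best_pipeline = search.best_estimator_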
Lines changed: 97 additions & 0 deletions

import sys
path = 'G://New Folder/utils'
sys.path.append(path)

import common_utils as utils
import regression_utils as rutils
from sklearn import metrics, model_selection, ensemble, neighbors, linear_model, decomposition, manifold, feature_selection, preprocessing, pipeline, impute, compose
import math
import pandas as pd
import os
import numpy as np

def log_rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_log_error(y_orig, y_pred))

def rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_error(y_orig, y_pred))

path = 'G://house-prices'
house_train = pd.read_csv(os.path.join(path, "train.csv"))
house_train.shape
house_train.info()

#type cast features
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house_train, features_to_cast)

#manual feature selection
features_to_drop = ['Id', 'SalePrice']
missing_features_above_th = utils.get_features_to_drop_on_missingdata(house_train, 0.25)
features_to_drop.extend(missing_features_above_th)
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.info()

#build pipeline for categorical features
categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

#build pipeline for numerical features
numerical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler())
])

#build preprocessing pipeline for all features
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)

preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])

#build feature selection pipeline: FeatureUnion concatenates the outputs of
#its transformers column-wise
features_pipeline = pipeline.FeatureUnion([
    ('pca_selector', decomposition.PCA()),
    #('stats_selector', feature_selection.SelectKBest()),
    #('lasso_selector', feature_selection.SelectFromModel(linear_model.Lasso())),
    #the target is continuous, so the selector must wrap a regressor,
    #not a classifier
    ('et_selector', feature_selection.SelectFromModel(ensemble.ExtraTreesRegressor()))
])

#build complete pipeline with feature selection and ml algorithms;
#TransformedTargetRegressor fits KNN on log1p(SalePrice) and maps
#predictions back with expm1
complete_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('features', features_pipeline),
    ('tregressor', compose.TransformedTargetRegressor(
        regressor=neighbors.KNeighborsRegressor(),
        func=np.log1p, inverse_func=np.expm1))
])

pipeline_grid = {'preprocess__num__imputer__strategy': ['mean', 'median'],
                 'features__pca_selector__n_components': [2, 3],
                 #'features__stats_selector__k': [10],
                 'tregressor__regressor__n_neighbors': list(range(5, 15))
                 }

#build model with pipeline
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
pipeline_generated = utils.grid_search_best_model(complete_pipeline, pipeline_grid, house_train1, house_train['SalePrice'], scoring=scoring)

#read test data
house_test = pd.read_csv(os.path.join(path, "test.csv"))
house_test.shape
house_test.info()
house_test['SalePrice'] = None

#apply preprocessing required before pipeline
utils.cast_to_cat(house_test, features_to_cast)
house_test1 = utils.drop_features(house_test, features_to_drop)
house_test1.info()

#get predictions on test data with constructed pipeline
house_test['SalePrice'] = np.round(pipeline_generated.predict(house_test1), 2)
house_test.to_csv(os.path.join(path, "submission.csv"), columns=["Id", "SalePrice"], index=False)
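TransformedTargetRegressor fits the inner regressor on func(y) and applies inverse_func to its predictions, so the KNN model above learns log-prices while the pipeline still emits prices on the original scale. A small self-contained illustration of that round trip on synthetic data (not the housing set):

import numpy as np
from sklearn import compose, neighbors

rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(200, 3))
y = np.exp(X.sum(axis=1) / 3)          #multiplicative target, like prices

model = compose.TransformedTargetRegressor(
    regressor=neighbors.KNeighborsRegressor(),
    func=np.log1p,                     #train on log1p(y)
    inverse_func=np.expm1,             #predictions return on the price scale
)
model.fit(X, y)
print(model.predict(X[:5]))            #already in original units

#internally equivalent to doing the transform by hand:
knn = neighbors.KNeighborsRegressor().fit(X, np.log1p(y))
print(np.expm1(knn.predict(X[:5])))    #same numbers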
Lines changed: 100 additions & 0 deletions

import sys
path = 'I://New Folder/utils'
sys.path.append(path)

import common_utils as utils
import regression_utils as rutils
from sklearn import metrics, model_selection, ensemble, neighbors, linear_model, decomposition, manifold, feature_selection, preprocessing, pipeline, impute, compose
import math
import pandas as pd
import os
import numpy as np

def log_rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_log_error(y_orig, y_pred))

def rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_error(y_orig, y_pred))

path = 'I://house-prices'
house_train = pd.read_csv(os.path.join(path, "train.csv"))
house_train.shape
house_train.info()

#type cast features
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house_train, features_to_cast)

#manual feature selection
features_to_drop = ['Id', 'SalePrice']
missing_features_above_th = utils.get_features_to_drop_on_missingdata(house_train, 0.25)
features_to_drop.extend(missing_features_above_th)
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.info()

#build pipeline for categorical features
categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

#build pipeline for numerical features
numerical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler())
])

#build preprocessing pipeline for all features
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)

preprocess_pipeline = compose.ColumnTransformer([
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features)
])

#preprocess_pipeline.fit_transform(house_train1)

#build feature selection pipeline
features_pipeline = pipeline.FeatureUnion([
    ('pca_selector', decomposition.PCA()),
    #('lasso_selector', feature_selection.SelectFromModel(linear_model.Lasso())),
    #regression target, so select with a regressor rather than a classifier
    ('et_selector', feature_selection.SelectFromModel(ensemble.ExtraTreesRegressor()))
])

regressor = ensemble.RandomForestRegressor()
#build complete pipeline with feature selection and ml algorithms
complete_pipeline = pipeline.Pipeline([
    ('preprocess', preprocess_pipeline),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('features', features_pipeline),
    ('tregressor', compose.TransformedTargetRegressor(
        regressor=regressor,
        func=np.log1p, inverse_func=np.expm1))
])

pipeline_grid = {'features__pca_selector__n_components': [2, 3],
                 'tregressor__regressor__n_estimators': list(range(100, 1000, 200))
                 }

#build model with pipeline
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
pipeline_generated = utils.grid_search_best_model(complete_pipeline, pipeline_grid, house_train1, house_train['SalePrice'], scoring=scoring)

#read test data
house_test = pd.read_csv(os.path.join(path, "test.csv"))
house_test.shape
house_test.info()
house_test['SalePrice'] = None

#apply preprocessing required before pipeline
utils.cast_to_cat(house_test, features_to_cast)
house_test1 = utils.drop_features(house_test, features_to_drop)
house_test1.info()

#get predictions on test data with constructed pipeline
house_test['SalePrice'] = np.round(pipeline_generated.predict(house_test1), 2)
house_test.to_csv(os.path.join(path, "submission.csv"), columns=["Id", "SalePrice"], index=False)
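FeatureUnion runs its transformers in parallel on the same input and concatenates their outputs column-wise, so the regressor here sees the PCA components and the tree-selected raw features side by side. A quick check of that behavior on toy data (the dimensions are illustrative, not from the housing set):

import numpy as np
from sklearn import decomposition, ensemble, feature_selection, pipeline

X = np.random.default_rng(1).normal(size=(100, 20))
y = X[:, 0] * 3 + X[:, 1]              #continuous target, so select with a regressor

union = pipeline.FeatureUnion([
    ('pca_selector', decomposition.PCA(n_components=3)),
    ('et_selector', feature_selection.SelectFromModel(
        ensemble.ExtraTreesRegressor(n_estimators=50, random_state=0))),
])
Z = union.fit_transform(X, y)
#width = 3 PCA components + however many features the trees kept
print(Z.shape)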
