|
| 1 | +import sys |
| 2 | +path = 'F://New Folder/utils' |
| 3 | +sys.path.append(path) |
| 4 | + |
| 5 | +import common_utils as utils |
| 6 | +from sklearn import metrics, model_selection, ensemble, neighbors, linear_model, decomposition, manifold, feature_selection, preprocessing, pipeline, impute, compose, svm |
| 7 | +import math |
| 8 | +import pandas as pd |
| 9 | +import os |
| 10 | +import numpy as np |
| 11 | +from sklearn.externals import joblib |
| 12 | +from sklearn2pmml import sklearn2pmml |
| 13 | +from sklearn2pmml.pipeline import PMMLPipeline |
| 14 | + |
| 15 | + |
def log_rmse(y_orig, y_pred):
    """Return the root of the mean squared logarithmic error.

    Args:
        y_orig: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        ``math.sqrt`` of ``metrics.mean_squared_log_error(y_orig, y_pred)``.
    """
    msle = metrics.mean_squared_log_error(y_orig, y_pred)
    return math.sqrt(msle)
| 18 | + |
def rmse(y_orig, y_pred):
    """Return the root of the mean squared error.

    Args:
        y_orig: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        ``math.sqrt`` of ``metrics.mean_squared_error(y_orig, y_pred)``.
    """
    mse = metrics.mean_squared_error(y_orig, y_pred)
    return math.sqrt(mse)
| 21 | + |
# Load the training data. `path` now names the data directory; it is reused
# further down when the fitted model is written out.
path = 'F://house-prices'
train_csv = os.path.join(path, "train.csv")
house_train = pd.read_csv(train_csv)
house_train.shape  # bare expression: only shows a value in an interactive session
house_train.info()

# Type cast features: hand the listed columns to the project helper.
# NOTE(review): the return value is ignored, so cast_to_cat presumably
# modifies house_train in place -- confirm against common_utils.
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house_train, features_to_cast)

# Manual feature selection: drop the identifier, the target, and whatever the
# helper reports for the 0.25 threshold (presumably a missing-data fraction --
# confirm against common_utils).
missing_features_above_th = utils.get_features_to_drop_on_missingdata(house_train, 0.25)
features_to_drop = ['Id', 'SalePrice']
features_to_drop.extend(missing_features_above_th)
house_train1 = utils.drop_features(house_train, features_to_drop)
house_train1.shape  # bare expression: only shows a value in an interactive session
| 37 | + |
# Pipeline for categorical features: impute with the most frequent value,
# then one-hot encode to a dense array while ignoring unknown categories.
categorical_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')),
])
| 43 | + |
| 44 | + |
# Pipeline for numerical features: impute with SimpleImputer's default
# settings, then rescale with StandardScaler.
numerical_pipeline = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer()),
    ('scaler', preprocessing.StandardScaler()),
])
| 50 | + |
# Preprocessing for all features: the project helpers split the columns of
# house_train1 into two groups, and each group is routed to its own pipeline.
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)

preprocess_pipeline = compose.ColumnTransformer(transformers=[
    ('cat', categorical_pipeline, cat_features),
    ('num', numerical_pipeline, num_features),
])
| 59 | + |
# Build the feature selection pipeline: concatenate PCA components with the
# columns kept by a tree-based importance selector.
# The selector is fitted on the SalePrice target, which is continuous, so it
# must wrap a regressor. A classifier here would treat every distinct sale
# price as its own class and rank features by how well they separate those
# pseudo-classes.
features_pipeline = pipeline.FeatureUnion([
    ('pca_selector', decomposition.PCA()),
    ('et_selector', feature_selection.SelectFromModel(ensemble.ExtraTreesRegressor())),
])
| 65 | + |
| 66 | + |
| 67 | + |
# Final estimator; the same instance is referenced again by the PMML pipeline.
regressor = svm.LinearSVR()

# Complete pipeline with feature selection and the ML algorithm. The target
# is passed through log1p before fitting the regressor and predictions are
# mapped back with expm1.
complete_pipeline = pipeline.Pipeline(steps=[
    ('preprocess', preprocess_pipeline),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('features', features_pipeline),
    ('tregressor', compose.TransformedTargetRegressor(
        regressor=regressor,
        func=np.log1p,
        inverse_func=np.expm1,
    )),
])
| 78 | + |
# Hyper-parameter grid; keys address nested steps by their pipeline names.
pipeline_grid = {
    'features__pca_selector__n_components': [2, 3],
    'tregressor__regressor__C': [0.01, 0.1, 5, 10],
}

# Build the model with the pipeline. log_rmse is an error, so the scorer is
# created with greater_is_better=False.
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
pipeline_generated = utils.grid_search_best_model(
    complete_pipeline,
    pipeline_grid,
    house_train1,
    house_train['SalePrice'],
    scoring=scoring,
)
print(pipeline_generated)
| 87 | + |
# Persist the fitted pipeline together with the column lists used to prepare
# the training frame, all in one pickle under the data directory.
model_file = os.path.join(path, 'house_price_model_v1.pkl')
objects_to_dump = dict(
    features_to_cast=features_to_cast,
    features_to_drop=features_to_drop,
    pipeline=pipeline_generated,
)
joblib.dump(objects_to_dump, model_file)
| 94 | + |
# Build the same pipeline as a PMMLPipeline so it can be exported to PMML.
# The step names and step objects mirror complete_pipeline.
complete_pipeline_pmml = PMMLPipeline(steps=[
    ('preprocess', preprocess_pipeline),
    ('zv_filter', feature_selection.VarianceThreshold()),
    ('features', features_pipeline),
    ('tregressor', compose.TransformedTargetRegressor(
        regressor=regressor,
        func=np.log1p,
        inverse_func=np.expm1,
    )),
])

# Same hyper-parameter grid as used for the pickled model.
pipeline_grid = {
    'features__pca_selector__n_components': [2, 3],
    'tregressor__regressor__C': [0.01, 0.1, 5, 10],
}

pipeline_generated_pmml = utils.grid_search_best_model(
    complete_pipeline_pmml,
    pipeline_grid,
    house_train1,
    house_train['SalePrice'],
    scoring=scoring,
)
# NOTE(review): the PMML file name is relative, so it lands in the current
# working directory, whereas the pickle is written under `path` -- confirm
# that this is intended.
sklearn2pmml(pipeline_generated_pmml, 'house_price_model_v1.pmml', with_repr=True)
0 commit comments