
Commit 3c57156

Author: Algorithmica
Add files via upload
1 parent 94f89af commit 3c57156

File tree

3 files changed: +77 −40 lines changed

2019-october/14.dimensionality reduction/dimensionality reduction-pca1.py

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@
 print(X.corr())
 lpca = decomposition.PCA(2)
 lpca.fit(X)
+print(lpca.components_)
 print(lpca.explained_variance_)
 print(lpca.explained_variance_ratio_)
 np.cumsum(lpca.explained_variance_ratio_)
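The one-line change prints lpca.components_, the principal axes of the fitted PCA: one unit-length row vector per component, sorted by explained variance. Below is a minimal self-contained sketch of the same inspection, using synthetic NumPy data in place of the repo's custom cutils generator:

import numpy as np
from sklearn import decomposition

rng = np.random.RandomState(0)
x1 = rng.normal(size=1000)
# second feature is a noisy copy of the first, so the two are strongly correlated
X = np.column_stack([x1, x1 + 0.1 * rng.normal(size=1000)])

lpca = decomposition.PCA(2)
lpca.fit(X)
print(lpca.components_)                # principal axes, one row per component
print(lpca.explained_variance_)        # variance captured by each component
print(lpca.explained_variance_ratio_)  # same, as a fraction of total variance
print(np.cumsum(lpca.explained_variance_ratio_))  # cumulative fraction

On data like this the first ratio comes out near 1.0, which is the point the script's corr()/cumsum printout is making: strongly correlated features collapse onto a single principal axis.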
Lines changed: 6 additions & 40 deletions

@@ -1,50 +1,11 @@
 import sys
-path = 'G:/utils'
+path = 'J:/utils'
 sys.path.append(path)
 import common_utils as utils
-import pca_utils as putils
 import tsne_utils as tutils
 import classification_utils as cutils
 import clustering_utils as cl_utils
 import pandas as pd
-import numpy as np
-from sklearn import decomposition
-
-#pca effect on linearly related data
-X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_redundant=0, n_classes=2, class_sep=0, weights=[.5,.5])
-X = pd.DataFrame(X, columns=['X1', 'X2'])
-utils.plot_data_2d(X)
-print(X.corr())
-lpca = decomposition.PCA(2)
-lpca.fit(X)
-print(lpca.explained_variance_)
-print(lpca.explained_variance_ratio_)
-np.cumsum(lpca.explained_variance_ratio_)
-putils.plot_pca_result(lpca, X)
-
-#pca effect on linearly related data (1 redundant feature)
-X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=3, n_redundant=1, n_classes=2, weights=[.5,.5])
-X = pd.DataFrame(X, columns=['X1', 'X2', 'X3'])
-utils.plot_data_3d(X)
-print(X.corr())
-lpca = decomposition.PCA(2)
-lpca.fit(X)
-print(lpca.explained_variance_)
-print(lpca.explained_variance_ratio_)
-np.cumsum(lpca.explained_variance_ratio_)
-putils.plot_pca_result(lpca, X)
-
-#pca effect on linearly related data (2 redundant features)
-X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=3, n_redundant=2, n_classes=2, weights=[.5,.5])
-X = pd.DataFrame(X, columns=['X1', 'X2', 'X3'])
-utils.plot_data_3d(X)
-print(X.corr())
-lpca = decomposition.PCA(1)
-lpca.fit(X)
-print(lpca.explained_variance_)
-print(lpca.explained_variance_ratio_)
-np.cumsum(lpca.explained_variance_ratio_)
-putils.plot_pca_result(lpca, X)
 
 #tsne effect on non-linearly related data
 X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000)
@@ -58,3 +19,8 @@
 utils.plot_data_3d(X)
 tutils.plot_tsne_result(X, y, 2)
 
+#tsne effect on linearly related data (2 redundant features)
+X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=3, n_redundant=2, n_classes=2, weights=[.5,.5])
+X = pd.DataFrame(X, columns=['X1', 'X2', 'X3'])
+utils.plot_data_3d(X)
+tutils.plot_tsne_result(X, y, 2)
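The added block at the end runs t-SNE on the same linearly related, two-redundant-feature data that the deleted PCA sections used, so the two reductions can be compared on like data. Below is a self-contained sketch of that experiment; sklearn.datasets.make_classification stands in for the repo's cutils.generate_linear_synthetic_data_classification, and a plain scatter plot stands in for the custom tutils.plot_tsne_result:

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.manifold import TSNE

# 3 features, 2 of them linear combinations of the single informative one
X, y = make_classification(n_samples=1000, n_features=3, n_informative=1,
                           n_redundant=2, n_classes=2, n_clusters_per_class=1,
                           weights=[.5, .5], random_state=0)

# embed the 3-D points in 2-D; the redundant axes add no extra structure
X_2d = TSNE(n_components=2, random_state=0).fit_transform(X)
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, s=5)
plt.show()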
Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+import sys
+path = 'J://New Folder//utils'
+sys.path.append(path)
+
+import common_utils as utils
+import regression_utils as rutils
+from sklearn import metrics, model_selection, ensemble, neighbors, linear_model, decomposition, manifold, feature_selection, preprocessing, pipeline, impute, compose, svm
+import math
+import pandas as pd
+import os
+import numpy as np
+import tsne_utils as tutils
+
+def log_rmse(y_orig, y_pred):
+    return math.sqrt(metrics.mean_squared_log_error(y_orig, y_pred))
+
+def rmse(y_orig, y_pred):
+    return math.sqrt(metrics.mean_squared_error(y_orig, y_pred))
+
+path = 'J://house-prices'
+house_train = pd.read_csv(os.path.join(path, "train.csv"))
+house_train.shape
+house_train.info()
+
+#type cast features
+features_to_cast = ['MSSubClass']
+utils.cast_to_cat(house_train, features_to_cast)
+
+#manual feature selection
+features_to_drop = ['Id', 'SalePrice']
+missing_features_above_th = utils.get_features_to_drop_on_missingdata(house_train, 0.25)
+features_to_drop.extend(missing_features_above_th)
+house_train1 = utils.drop_features(house_train, features_to_drop)
+house_train1.shape
+
+#build pipeline for categorical features
+categorical_pipeline = pipeline.Pipeline([
+    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
+    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))
+])
+
+#build pipeline for numerical features
+numerical_pipeline = pipeline.Pipeline([
+    ('imputer', impute.SimpleImputer()),
+    ('scaler', preprocessing.StandardScaler())
+])
+
+#build preprocessing pipeline for all features
+cat_features = utils.get_non_continuous_features(house_train1)
+num_features = utils.get_continuous_features(house_train1)
+
+preprocess_pipeline = compose.ColumnTransformer([
+    ('cat', categorical_pipeline, cat_features),
+    ('num', numerical_pipeline, num_features)
+])
+
+preprocessed_data = preprocess_pipeline.fit_transform(house_train1)
+print(preprocessed_data.shape)
+
+viz_pipeline = pipeline.Pipeline([
+    ('preprocess', preprocess_pipeline),
+    ('tsne', manifold.TSNE(n_components=2))
+])
+
+tsne_data = viz_pipeline.fit_transform(house_train1)
+print(tsne_data.shape)
+rutils.plot_data_3d_regression(tsne_data, house_train['SalePrice'])
+
+tutils.plot_tsne_result(X, y, 2)
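The new script wires the house-prices preprocessing into a compose.ColumnTransformer: one sub-pipeline imputes and one-hot encodes the categorical columns, another imputes and scales the numerical ones, and the transformer routes each column list to its sub-pipeline. Here is a toy, self-contained sketch of the same pattern; the DataFrame below stands in for house_train1, whose real columns come from train.csv and the repo's utils helpers:

import pandas as pd
from sklearn import compose, impute, pipeline, preprocessing

# toy frame: one categorical and one numerical column, each with a missing value
df = pd.DataFrame({'Neighborhood': ['A', 'B', None, 'A'],
                   'LotArea': [8450.0, None, 11250.0, 9550.0]})

categorical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='most_frequent')),
    ('ohe', preprocessing.OneHotEncoder(handle_unknown='ignore'))
])
numerical_pipeline = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer()),        # mean imputation by default
    ('scaler', preprocessing.StandardScaler())
])
preprocess = compose.ColumnTransformer([
    ('cat', categorical_pipeline, ['Neighborhood']),
    ('num', numerical_pipeline, ['LotArea'])
], sparse_threshold=0)                          # force a dense output array
print(preprocess.fit_transform(df))

Two caveats on the committed script itself: manifold.TSNE has no separate transform method, so viz_pipeline only works through fit_transform, not as a reusable transformer; and the final tutils.plot_tsne_result(X, y, 2) call references X and y, which are never defined in this file — tsne_data and house_train['SalePrice'] look like the intended arguments.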
