Commit 94f89af

Author: Algorithmica
Add files via upload
1 parent ee365f7 commit 94f89af

3 files changed: 186 additions & 0 deletions
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
import sys
path = 'I:/utils'
sys.path.append(path)
import common_utils as utils
import pca_utils as putils
import tsne_utils as tutils
import classification_utils as cutils
import pandas as pd
import numpy as np
from sklearn import decomposition

#pca effect on linearly related data
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_redundant=0, n_classes=2, class_sep=0, weights=[.5, .5])
X = pd.DataFrame(X, columns=['X1', 'X2'])
utils.plot_data_2d(X)
print(X.corr())
lpca = decomposition.PCA(2)
lpca.fit(X)
print(lpca.explained_variance_)
print(lpca.explained_variance_ratio_)
#cumulative variance explained by the first k components
print(np.cumsum(lpca.explained_variance_ratio_))
putils.plot_pca_result(lpca, X)

#pca effect on linearly related data (1 redundant feature)
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=3, n_redundant=1, n_classes=2, weights=[.5, .5])
X = pd.DataFrame(X, columns=['X1', 'X2', 'X3'])
utils.plot_data_3d(X)
print(X.corr())
lpca = decomposition.PCA(2)
lpca.fit(X)
print(lpca.explained_variance_)
print(lpca.explained_variance_ratio_)
print(np.cumsum(lpca.explained_variance_ratio_))
putils.plot_pca_result(lpca, X)

#pca effect on linearly related data (2 redundant features)
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=3, n_redundant=2, n_classes=2, weights=[.5, .5])
X = pd.DataFrame(X, columns=['X1', 'X2', 'X3'])
utils.plot_data_3d(X)
print(X.corr())
lpca = decomposition.PCA(1)
lpca.fit(X)
print(lpca.explained_variance_)
print(lpca.explained_variance_ratio_)
print(np.cumsum(lpca.explained_variance_ratio_))
putils.plot_pca_result(lpca, X)

#pca effect on non-linearly related data
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000)
X = pd.DataFrame(X, columns=['X1', 'X2'])
utils.plot_data_2d(X)
print(X.corr())
lpca = decomposition.PCA(2)
lpca.fit(X)
print(lpca.explained_variance_)
print(lpca.explained_variance_ratio_)
print(np.cumsum(lpca.explained_variance_ratio_))
putils.plot_pca_result(lpca, X)

#compare with t-SNE on the same non-linear data
tutils.plot_tsne_result(X, y, 2)
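
The cutils, putils, and tutils helpers are local modules that are not part of this commit. As a rough sketch of what the data generator likely does (an assumption, not the repository's actual code), generate_linear_synthetic_data_classification maps almost one-to-one onto sklearn.datasets.make_classification:

#hypothetical sketch of the classification_utils generator,
#assumed to wrap sklearn.datasets.make_classification
from sklearn.datasets import make_classification

def generate_linear_synthetic_data_classification(n_samples, n_features, n_redundant, n_classes, class_sep=1.0, weights=None):
    #informative features are whatever the redundant ones leave over;
    #n_clusters_per_class=1 keeps n_classes <= 2**n_informative valid
    #even when a single informative feature remains
    return make_classification(n_samples=n_samples, n_features=n_features,
                               n_informative=n_features - n_redundant,
                               n_redundant=n_redundant, n_classes=n_classes,
                               n_clusters_per_class=1, class_sep=class_sep,
                               weights=weights)

make_classification generates redundant features as random linear combinations of the informative ones, which is why X.corr() shows strong correlations and PCA can drop a dimension with almost no loss of explained variance.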
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
import sys
path = 'G://New Folder/utils'
sys.path.append(path)

import common_utils as utils
import regression_utils as rutils
from sklearn import metrics, model_selection, ensemble, neighbors, linear_model, decomposition, manifold, feature_selection, preprocessing, pipeline, impute, compose, svm
import math
import pandas as pd
import os
import numpy as np

def log_rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_log_error(y_orig, y_pred))

def rmse(y_orig, y_pred):
    return math.sqrt(metrics.mean_squared_error(y_orig, y_pred))

path = 'I://house-prices'
house_train = pd.read_csv(os.path.join(path, "train.csv"))
print(house_train.shape)
house_train.info()

#type cast features
features_to_cast = ['MSSubClass']
utils.cast_to_cat(house_train, features_to_cast)

#manual feature selection: drop identifiers, the target, and
#features above the 0.25 missing-data threshold
features_to_drop = ['Id', 'SalePrice']
missing_features_above_th = utils.get_features_to_drop_on_missingdata(house_train, 0.25)
features_to_drop.extend(missing_features_above_th)
house_train1 = utils.drop_features(house_train, features_to_drop)
print(house_train1.shape)

#build pipeline for categorical features
categorical_pipeline = pipeline.Pipeline([
                    ('imputer', impute.SimpleImputer(strategy="most_frequent")),
                    ('ohe', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'))  #'sparse' was renamed 'sparse_output' in scikit-learn 1.2
                ])

#build pipeline for numerical features
numerical_pipeline = pipeline.Pipeline([
                    ('imputer', impute.SimpleImputer()),
                    ('scaler', preprocessing.StandardScaler())
                ])

#build preprocessing pipeline for all features
cat_features = utils.get_non_continuous_features(house_train1)
num_features = utils.get_continuous_features(house_train1)

preprocess_pipeline = compose.ColumnTransformer([
                    ('cat', categorical_pipeline, cat_features),
                    ('num', numerical_pipeline, num_features)
                ])

preprocessed_data = preprocess_pipeline.fit_transform(house_train1)
print(preprocessed_data.shape)

#n_components=0.95 keeps just enough components to explain 95% of the variance
viz_pipeline = pipeline.Pipeline([
                ('preprocess', preprocess_pipeline),
                ('pca', decomposition.PCA(n_components=0.95))
            ])

lpca_data = viz_pipeline.fit_transform(house_train1)
print(lpca_data.shape)
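
log_rmse and rmse are defined above but never called in this file. A minimal sketch of how they would plug in, assuming a RandomForestRegressor chained onto the same preprocessing-plus-PCA pipeline (the model choice is an illustration, not part of this commit):

#hypothetical continuation: fit a regressor on the pipeline output and
#score it with the rmse helper defined above
y = house_train['SalePrice']
model_pipeline = pipeline.Pipeline([
                ('preprocess', preprocess_pipeline),
                ('pca', decomposition.PCA(n_components=0.95)),
                ('rf', ensemble.RandomForestRegressor(random_state=1))
            ])
model_pipeline.fit(house_train1, y)
print(rmse(y, model_pipeline.predict(house_train1)))  #training-set rmse, optimistic by construction

house_train1 no longer contains SalePrice (it was dropped above), so the target has to come from the original house_train frame.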
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
import sys
path = 'G:/utils'
sys.path.append(path)
import common_utils as utils
import pca_utils as putils
import tsne_utils as tutils
import classification_utils as cutils
import clustering_utils as cl_utils
import pandas as pd
import numpy as np
from sklearn import decomposition

#pca effect on linearly related data
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_redundant=0, n_classes=2, class_sep=0, weights=[.5, .5])
X = pd.DataFrame(X, columns=['X1', 'X2'])
utils.plot_data_2d(X)
print(X.corr())
lpca = decomposition.PCA(2)
lpca.fit(X)
print(lpca.explained_variance_)
print(lpca.explained_variance_ratio_)
#cumulative variance explained by the first k components
print(np.cumsum(lpca.explained_variance_ratio_))
putils.plot_pca_result(lpca, X)

#pca effect on linearly related data (1 redundant feature)
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=3, n_redundant=1, n_classes=2, weights=[.5, .5])
X = pd.DataFrame(X, columns=['X1', 'X2', 'X3'])
utils.plot_data_3d(X)
print(X.corr())
lpca = decomposition.PCA(2)
lpca.fit(X)
print(lpca.explained_variance_)
print(lpca.explained_variance_ratio_)
print(np.cumsum(lpca.explained_variance_ratio_))
putils.plot_pca_result(lpca, X)

#pca effect on linearly related data (2 redundant features)
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=3, n_redundant=2, n_classes=2, weights=[.5, .5])
X = pd.DataFrame(X, columns=['X1', 'X2', 'X3'])
utils.plot_data_3d(X)
print(X.corr())
lpca = decomposition.PCA(1)
lpca.fit(X)
print(lpca.explained_variance_)
print(lpca.explained_variance_ratio_)
print(np.cumsum(lpca.explained_variance_ratio_))
putils.plot_pca_result(lpca, X)

#tsne effect on non-linearly related data
X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000)
X = pd.DataFrame(X, columns=['X1', 'X2'])
utils.plot_data_2d(X)
tutils.plot_tsne_result(X, y, 2)

#tsne effect on clustered data
X, y = cl_utils.generate_synthetic_data_3d_clusters(1000, 7, 0.01)
X = pd.DataFrame(X, columns=['X1', 'X2', 'X3'])
utils.plot_data_3d(X)
tutils.plot_tsne_result(X, y, 2)
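
tsne_utils is likewise a local module not included here. Presumably (an assumption about unseen code) plot_tsne_result embeds X with sklearn.manifold.TSNE and scatter-plots the embedding colored by label:

#hypothetical sketch of the tsne_utils helper,
#assumed to wrap sklearn.manifold.TSNE
import matplotlib.pyplot as plt
from sklearn import manifold

def plot_tsne_result(X, y, n_components):
    tsne = manifold.TSNE(n_components=n_components)
    X_embedded = tsne.fit_transform(X)
    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=10)
    plt.xlabel('tsne-1')
    plt.ylabel('tsne-2')
    plt.show()

Since t-SNE preserves local neighborhoods rather than global variance, the seven tight 3-d clusters generated above should reappear as seven well-separated groups in the 2-d embedding.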
