Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c8adb34

Browse files
author
Algorithmica
authored
Add files via upload
1 parent 0bc4d53 commit c8adb34

File tree

3 files changed

+161
-0
lines changed

3 files changed

+161
-0
lines changed
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
# Don't Overfit II (Kaggle): tune an RBF-kernel SVM on variance-filtered,
# PCA-reduced features and write a submission file.
import pandas as pd
import os
from sklearn import tree, ensemble, model_selection, preprocessing, decomposition, manifold, feature_selection, svm
import seaborn as sns
import numpy as np

import sys
sys.path.append("E:/New Folder/utils")

import classification_utils as cutils

# NOTE(review): renamed `dir` -> `data_dir`; the original shadowed the builtin dir().
data_dir = 'C:/Users/Algorithmica/Downloads/dont-overfit-ii'
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
print(train.info())
print(train.columns)

# Drop the first two columns (id and target); keep only the feature columns.
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)

# Drop zero-variance (constant) features.
variance = feature_selection.VarianceThreshold()
train2 = variance.fit_transform(train1)

# Keep enough principal components to explain 95% of the variance.
lpca = decomposition.PCA(n_components=0.95)
lpca.fit(train2)
# Fix: the cumulative explained-variance curve was computed but discarded;
# print it so the diagnostic is actually visible.
print(np.cumsum(lpca.explained_variance_ratio_))
train_pca = lpca.transform(train2)

# 3-D t-SNE embedding purely for visual inspection of class separability.
tsne = manifold.TSNE(n_components=3)
train_tsne = tsne.fit_transform(train_pca)
cutils.plot_data_3d_classification(train_tsne, y)

X_train, X_eval, y_train, y_eval = model_selection.train_test_split(train_pca, y, test_size=0.1, random_state=1)

sns.countplot(x='target', data=train)  # quick class-balance check

# Grid-search an RBF-kernel SVM over gamma and C.
kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {'gamma': [0.01, 0.1, 1, 2, 5, 10], 'C': [0.001, 0.01, 0.1, 0.5]}
final_estimator = cutils.grid_search_best_model(kernel_svm_estimator, kernel_svm_grid, X_train, y_train)

print(final_estimator.score(X_eval, y_eval))

# Apply the identical preprocessing pipeline to the test set and predict.
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
print(test.info())
print(test.columns)

test1 = test.iloc[:, 1:]           # drop only the id column (test has no target)
test2 = variance.transform(test1)  # reuse fitted transformers -- no refitting
test_pca = lpca.transform(test2)
test['target'] = final_estimator.predict(test_pca)
test.to_csv(os.path.join(data_dir, 'submission.csv'), columns=['id', 'target'], index=False)
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
# Don't Overfit II (Kaggle): compare embedded feature selection using several
# tree ensembles and an SVM, then tune an RBF-kernel SVM and write a submission.
import pandas as pd
import os
from sklearn import tree, ensemble, model_selection, preprocessing, decomposition, manifold, feature_selection, svm
import seaborn as sns
import numpy as np

import sys
sys.path.append("E:/New Folder/utils")

import classification_utils as cutils
import common_utils as utils

# NOTE(review): renamed `dir` -> `data_dir`; the original shadowed the builtin dir().
data_dir = 'C:/Users/Algorithmica/Downloads/dont-overfit-ii'
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
print(train.info())
print(train.columns)

# Drop the first two columns (id and target); keep only the feature columns.
train1 = train.iloc[:, 2:]
y = train['target'].astype(int)

# Drop zero-variance (constant) features.
variance = feature_selection.VarianceThreshold()
train2 = variance.fit_transform(train1)

# --- embedded feature selection: random forest importances ---
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {'max_depth': list(range(1, 9)), 'n_estimators': list(range(1, 300, 100))}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid, train1, y)
embedded_selector = feature_selection.SelectFromModel(rf_final_estimator, prefit=True, threshold='mean')
train3 = embedded_selector.transform(train1)
utils.plot_feature_importances(rf_final_estimator, train1, cutoff=50)

# --- embedded feature selection: extra-trees importances ---
et_estimator = ensemble.ExtraTreesClassifier()
et_grid = {'max_depth': list(range(1, 9)), 'n_estimators': list(range(1, 300, 100))}
et_final_estimator = cutils.grid_search_best_model(et_estimator, et_grid, train1, y)
embedded_selector = feature_selection.SelectFromModel(et_final_estimator, prefit=True, threshold='mean')
train3 = embedded_selector.transform(train1)
utils.plot_feature_importances(et_final_estimator, train1, cutoff=50)

# --- embedded feature selection: gradient boosting importances ---
gb_estimator = ensemble.GradientBoostingClassifier()
gb_grid = {'max_depth': [1, 2, 3], 'n_estimators': list(range(50, 300, 100)), 'learning_rate': [0.001, 0.1, 1.0]}
gb_final_estimator = cutils.grid_search_best_model(gb_estimator, gb_grid, train1, y)
embedded_selector = feature_selection.SelectFromModel(gb_final_estimator, prefit=True, threshold='mean')
X_train1 = embedded_selector.transform(train1)
utils.plot_feature_importances(gb_final_estimator, train1)

# --- embedded feature selection: SVM coefficients ---
kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {'gamma': [0.01, 0.1, 1, 2, 5, 10], 'C': [0.001, 0.01, 0.1, 0.5]}
final_estimator = cutils.grid_search_best_model(kernel_svm_estimator, kernel_svm_grid, train1, y)
embedded_selector = feature_selection.SelectFromModel(final_estimator, prefit=True, threshold='mean')
X_train1 = embedded_selector.transform(train1)
utils.plot_feature_importances(final_estimator, train1)

# BUG FIX: the original split on `train_pca`, which is never defined in this
# script (NameError at runtime). Split the variance-filtered matrix instead,
# which is what the final SVM below is trained and evaluated on.
X_train, X_eval, y_train, y_eval = model_selection.train_test_split(train2, y, test_size=0.1, random_state=1)

sns.countplot(x='target', data=train)  # quick class-balance check

# Final model: grid-search an RBF-kernel SVM over gamma and C.
kernel_svm_estimator = svm.SVC(kernel='rbf')
kernel_svm_grid = {'gamma': [0.01, 0.1, 1, 2, 5, 10], 'C': [0.001, 0.01, 0.1, 0.5]}
final_estimator = cutils.grid_search_best_model(kernel_svm_estimator, kernel_svm_grid, X_train, y_train)

print(final_estimator.score(X_eval, y_eval))

test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
print(test.info())
print(test.columns)

test1 = test.iloc[:, 1:]  # drop only the id column (test has no target)
# BUG FIX: the original piped the test data through an undefined `lpca`
# (NameError). The final SVM was trained on the variance-filtered features,
# so only the fitted VarianceThreshold transform applies here.
test2 = variance.transform(test1)
test['target'] = final_estimator.predict(test2)
test.to_csv(os.path.join(data_dir, 'submission.csv'), columns=['id', 'target'], index=False)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
# Logistic-regression demos on synthetic data: a binary problem followed by a
# four-class problem, each tuned by grid search over penalty and C.
import sys
sys.path.append("E:/New Folder/utils")

import classification_utils as cutils
from sklearn import model_selection, linear_model, dummy

# ---------- binary classification ----------
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=2, weights=[0.4,0.6], class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)

# Hold out 20% of the data for evaluation; fixed seed for reproducibility.
X_fit, X_holdout, y_fit, y_holdout = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_fit, y_fit)

# Tune regularization type and strength via accuracy-scored grid search.
logreg = linear_model.LogisticRegression()
search_space = {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1] }
best_model = cutils.grid_search_best_model(logreg, search_space, X_fit, y_fit, scoring='accuracy')
print(best_model.intercept_)
print(best_model.coef_)
cutils.plot_model_2d_classification(best_model, X_fit, y_fit)
cutils.performance_metrics_hard_binary_classification(best_model, X_holdout, y_holdout)

# ---------- multi-class classification ----------
X, y = cutils.generate_linear_synthetic_data_classification(n_samples=1000, n_features=2, n_classes=4, weights=[0.3,0.3,0.2,0.2], class_sep=1.5)
#X, y = cutils.generate_nonlinear_synthetic_data_classification2(n_samples=1000, noise=0.1)

# Same split, plot, tune, and report cycle as above, on four classes.
X_fit, X_holdout, y_fit, y_holdout = model_selection.train_test_split(X, y, test_size=0.2, random_state=1)
cutils.plot_data_2d_classification(X_fit, y_fit)

logreg = linear_model.LogisticRegression()
search_space = {'penalty':['l1', 'l2'], 'C':[0.01, 0.001, 0.1, 0.3, 0.5, 0.7, 1] }
best_model = cutils.grid_search_best_model(logreg, search_space, X_fit, y_fit, scoring='accuracy')
print(best_model.intercept_)
print(best_model.coef_)
cutils.plot_model_2d_classification(best_model, X_fit, y_fit)
cutils.performance_metrics_hard_multiclass_classification(best_model, X_holdout, y_holdout)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /