1. ISLR_Python
  2. Chapter4_classification.ipynb
Notebook

Chapter 4 - Classification

In [37]:
importpandasaspd
importnumpyasnp
importmatplotlibasmpl
importmatplotlib.pyplotasplt
importseabornassns
importsklearn.linear_modelasskl_lm
fromsklearn.discriminant_analysisimport LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
fromsklearn.metricsimport confusion_matrix, classification_report, precision_score, roc_curve, auc, log_loss
fromsklearnimport preprocessing
fromsklearnimport neighbors
fromscipyimport stats
importscikitplotasskplt
importstatsmodels.apiassm
importstatsmodels.formula.apiassmf
fromipywidgetsimport widgets
fromclassification_helperimport print_classification_statistics, plot_ROC, print_OLS_error_table, plot_classification
%matplotlib inline
plt.style.use('seaborn-white')

4.1 An Overview of Classification

Load data

In [2]:
# In R, I exported the dataset from package 'ISLR' to an Excel file
df_default = pd.read_excel('Data/Default.xlsx')
# Note: factorize() returns two objects: a label array and an array with the unique values.
# We are only interested in the first object. 
df_default['default2'] = df_default.default.factorize()[0]
df_default['student2'] = df_default.student.factorize()[0]
df_default.head(3)
Out[2]:
default student balance income default2 student2
1 No No 729.526495 44361.625074 0 0
2 No Yes 817.180407 12106.134700 0 1
3 No No 1073.549164 31767.138947 0 0

Plot data

In [3]:
fig = plt.figure(figsize=(12,5))
gs = mpl.gridspec.GridSpec(1, 4)
ax1 = plt.subplot(gs[0,:-2])
ax2 = plt.subplot(gs[0,-2])
ax3 = plt.subplot(gs[0,-1])
# Take a fraction of the samples where target value (default) is 'no'
df_no = df_default[df_default.default2 == 0].sample(frac=.08)
# Take all samples where target value is 'yes'
df_yes = df_default[df_default.default2 == 1]
ax1.scatter(df_yes.balance, df_yes.income, s=40, c='orange', marker='+', linewidths=1)
ax1.scatter(df_no.balance, df_no.income, s=40, marker='o', linewidths='1',
 edgecolors='lightblue', facecolors='white', alpha=.6)
ax1.set_ylim(ymin=0)
ax1.set_ylabel('Income')
ax1.set_xlim(xmin=-100)
ax1.set_xlabel('Balance')
c_palette = {'No':'lightblue', 'Yes':'orange'}
sns.boxplot('default', 'balance', data=df_default, orient='v', ax=ax2, palette=c_palette)
sns.boxplot('default', 'income', data=df_default, orient='v', ax=ax3, palette=c_palette)
gs.tight_layout(plt.gcf())
No description has been provided for this image

4.3 Logistic regression

scikit-learn

In [4]:
X_train = df_default.balance.values.reshape(-1,1) 
y = df_default.default2
# Create array of test data. Calculate the classification probability
# and predicted classification.
X_plot = np.arange(df_default.balance.min(), df_default.balance.max()).reshape(-1,1)
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(X_train,y)
prob = clf.predict_proba(X_plot)
fig = plt.figure()
ax = plt.axes()
# Right plot
ax.scatter(X_train, y, color='orange')
ax.plot(X_plot, prob[:,1], color='lightblue')
ax.hlines(1, xmin=ax.xaxis.get_data_interval()[0],
 xmax=ax.xaxis.get_data_interval()[1], linestyles='dashed', lw=1)
ax.hlines(0, xmin=ax.xaxis.get_data_interval()[0],
 xmax=ax.xaxis.get_data_interval()[1], linestyles='dashed', lw=1)
ax.set_ylabel('Probability of default')
ax.set_xlabel('Balance')
ax.set_yticks([0, 0.25, 0.5, 0.75, 1.])
ax.set_xlim(xmin=-100)
print(clf)
print('classes: ', clf.classes_)
print('coefficients: ', [*clf.intercept_, *clf.coef_.tolist()[0]])
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
 intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
 penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
 verbose=0, warm_start=False)
classes: [0 1]
coefficients: [-10.651330005794106, 0.0054989165568046445]
No description has been provided for this image

statsmodels

In [5]:
X_train = sm.add_constant(df_default.balance)
y = df_default.default2
est = smf.Logit(y.ravel(), X_train).fit()
est.summary2().tables[1]
Optimization terminated successfully.
 Current function value: 0.079823
 Iterations 10
Out[5]:
Coef. Std.Err. z P>|z| [0.025 0.975]
const -10.651331 0.361169 -29.491287 3.723665e-191 -11.359208 -9.943453
balance 0.005499 0.000220 24.952404 2.010855e-137 0.005067 0.005931
In [6]:
X_train = sm.add_constant(df_default.student2)
y = df_default.default2
est = smf.Logit(y, X_train).fit()
est.summary2().tables[1]
Optimization terminated successfully.
 Current function value: 0.145434
 Iterations 7
Out[6]:
Coef. Std.Err. z P>|z| [0.025 0.975]
const -3.504128 0.070713 -49.554094 0.000000 -3.642723 -3.365532
student2 0.404887 0.115019 3.520177 0.000431 0.179454 0.630320
In [7]:
X_train = sm.add_constant(df_default[['balance', 'income', 'student2']])
est = smf.Logit(y, X_train).fit()
est.summary2()
Optimization terminated successfully.
 Current function value: 0.078577
 Iterations 10
Out[7]:
Model: Logit Pseudo R-squared: 0.462
Dependent Variable: default2 AIC: 1579.5448
Date: 2018年06月23日 13:11 BIC: 1608.3862
No. Observations: 10000 Log-Likelihood: -785.77
Df Model: 3 LL-Null: -1460.3
Df Residuals: 9996 LLR p-value: 3.2575e-292
Converged: 1.0000 Scale: 1.0000
No. Iterations: 10.0000
Coef. Std.Err. z P>|z| [0.025 0.975]
const -10.8690 0.4923 -22.0793 0.0000 -11.8339 -9.9042
balance 0.0057 0.0002 24.7365 0.0000 0.0053 0.0062
income 0.0000 0.0000 0.3698 0.7115 -0.0000 0.0000
student2 -0.6468 0.2363 -2.7376 0.0062 -1.1098 -0.1837

4.4 Linear Discriminant Analysis

In [8]:
defgenerate_LDA(mean1=-2, mean2=1, mean3=2, sigma1=0.5, sigma2=0.5):
 mean1 = mean1*np.array([1, 1])
 mean2 = mean2*np.array([1, 1])
 mean3 = mean3*np.array([1, -1])
 cov = np.array([[sigma1, 0], [0, sigma2]])
 N = 500
 K = 3
 # if you sample from a t-distribution, the LDA results are really bad
 defmultivariate_t(means, S, N):
 df = 1
 m = np.asarray(means)
 d = len(means)
 x = np.random.chisquare(df, N[0])/df
 z = np.random.multivariate_normal(np.zeros(d), S, N)
 return m + z/np.sqrt(x)[:,None]
 sample1 = np.random.multivariate_normal(mean1, cov, (N,)) 
 sample2 = np.random.multivariate_normal(mean2, cov, (N,)) 
 sample3 = np.random.multivariate_normal(mean3, cov, (N,))
 
 maxX = np.max([sample1[:,0], sample2[:,0], sample3[:,0]])
 minX = np.min([sample1[:,0], sample2[:,0], sample3[:,0]])
 maxY = np.max([sample1[:,1], sample2[:,1], sample3[:,1]])
 minY = np.min([sample1[:,1], sample2[:,1], sample3[:,1]])
 # priors
 pi1 = pi2 = pi3 = N/K
 # grid of points to plot the bayes and LDA regions/lines
 N_points_grid = 200
 xx, yy = np.meshgrid(np.linspace(minX, maxX, N_points_grid), np.linspace(minY, maxY, N_points_grid))
 X = np.c_[xx.ravel(), yy.ravel()]
 # Bayes regions
 inv_cov = np.linalg.inv(cov)
 delta1_fun = lambda X: np.dot(X, np.dot(inv_cov, mean1)) - 1/2*np.dot(mean1.T, np.dot(inv_cov, mean1)) + np.log(pi1)
 delta2_fun = lambda X: np.dot(X, np.dot(inv_cov, mean2)) - 1/2*np.dot(mean2.T, np.dot(inv_cov, mean2)) + np.log(pi2)
 delta3_fun = lambda X: np.dot(X, np.dot(inv_cov, mean3)) - 1/2*np.dot(mean3.T, np.dot(inv_cov, mean3)) + np.log(pi3)
 region1 = np.logical_and(delta1_fun(X) > delta2_fun(X), delta1_fun(X) > delta3_fun(X))
 region2 = np.logical_and(delta2_fun(X) > delta1_fun(X), delta2_fun(X) > delta3_fun(X))
 region3 = np.logical_and(delta3_fun(X) > delta1_fun(X), delta3_fun(X) > delta2_fun(X))
 # LDA prediction
 est_mean1 = 1/N*np.sum(sample1, axis=0)
 est_mean2 = 1/N*np.sum(sample2, axis=0)
 est_mean3 = 1/N*np.sum(sample3, axis=0)
 est_cov = (np.cov(sample1, rowvar=False) + np.cov(sample2, rowvar=False) + np.cov(sample3, rowvar=False))/K
 inv_est_cov = np.linalg.inv(est_cov)
 est_delta1_fun = lambda X: np.dot(X, np.dot(inv_est_cov, est_mean1)) - 1/2*np.dot(est_mean1.T, np.dot(inv_est_cov, est_mean1)) + np.log(pi1)
 est_delta2_fun = lambda X: np.dot(X, np.dot(inv_est_cov, est_mean2)) - 1/2*np.dot(est_mean2.T, np.dot(inv_est_cov, est_mean2)) + np.log(pi2)
 est_delta3_fun = lambda X: np.dot(X, np.dot(inv_est_cov, est_mean3)) - 1/2*np.dot(est_mean3.T, np.dot(inv_est_cov, est_mean3)) + np.log(pi3)
 est_region1 = np.logical_and(est_delta1_fun(X) > est_delta2_fun(X), est_delta1_fun(X) > est_delta3_fun(X))
 est_region3 = np.logical_and(est_delta3_fun(X) > est_delta2_fun(X), est_delta3_fun(X) > est_delta1_fun(X))
 ### Plot Bayes regions and LDA lines
 fig = plt.figure(figsize=(8,8))
 ax = plt.subplot(1,1,1)
 # Bayes regions
 plt.contourf(xx, yy, region1.reshape(xx.shape), alpha=0.5, colors='g', levels=[0.5, 1.0])
 plt.contourf(xx, yy, region2.reshape(xx.shape), alpha=0.5, colors='orange', levels=[0.5, 1.0])
 plt.contourf(xx, yy, region3.reshape(xx.shape), alpha=0.5, colors='b', levels=[0.5, 1.0])
 # Samples
 ax.scatter(sample1[:,0], sample1[:,1], s=20, c='green', marker='o', label='1')
 ax.scatter(sample2[:,0], sample2[:,1], s=20, c='orange', marker='o', label='2')
 ax.scatter(sample3[:,0], sample3[:,1], s=20, c='blue', marker='o', label='3')
 ax.set_xlabel('X1');
 ax.set_ylabel('X2');
 # LDA lines
 plt.contour(xx, yy, est_region1.reshape(xx.shape), alpha=0.5, colors='k')
 plt.contour(xx, yy, est_region3.reshape(xx.shape), alpha=0.5, colors='k');
 
 # statistics
 pred_green = sum([np.logical_and(delta1_fun(x) > delta2_fun(x), delta1_fun(x) > delta3_fun(x)) for x in sample1])/N*100
 pred_orange = sum([np.logical_and(delta2_fun(x) > delta1_fun(x), delta2_fun(x) > delta3_fun(x)) for x in sample2])/N*100
 pred_blue = sum([np.logical_and(delta3_fun(x) > delta2_fun(x), delta3_fun(x) > delta1_fun(x)) for x in sample3])/N*100
 print('Bayes accuracy: ', np.round(pred_green, 1), np.round(pred_orange, 1), np.round(pred_blue, 1))
 est_pred_green = sum([np.logical_and(est_delta1_fun(x) > est_delta2_fun(x), est_delta1_fun(x) > est_delta3_fun(x)) for x in sample1])/N*100
 est_pred_orange = sum([np.logical_and(est_delta2_fun(x) > est_delta1_fun(x), est_delta2_fun(x) > est_delta3_fun(x)) for x in sample2])/N*100
 est_pred_blue = sum([np.logical_and(est_delta3_fun(x) > est_delta2_fun(x), est_delta3_fun(x) > est_delta1_fun(x)) for x in sample3])/N*100
 print('LDA accuracy: ', np.round(est_pred_green, 1), np.round(est_pred_orange, 1), np.round(est_pred_blue, 1))
In [9]:
interactive_plot = widgets.interactive(generate_LDA, 
 mean1=(-2,2,0.5), mean2=(-2,2,0.5), mean3=(-2,2,0.5),
 sigma1=(0.1,5,0.1), sigma2=(0.1,5,0.1),
 continuous_update=False);
output = interactive_plot.children[-1]
output.layout.height = '15cm'
interactive_plot
interactive(children=(FloatSlider(value=-2.0, description='mean1', max=2.0, min=-2.0, step=0.5), FloatSlider(v...

Default dataset

In [10]:
X = df_default[['balance', 'income', 'student2']].values
y = df_default.default2.values
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(X, y)
Out[10]:
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
 solver='svd', store_covariance=False, tol=0.0001)
In [11]:
print_classification_statistics(lda, X, y, labels=['Down', 'Up'])
plot_ROC(lda, X, y, label='LDA Classification')
Classification Report:
 precision recall f1-score support
 Down 0.974 0.998 0.986 9667
 Up 0.782 0.237 0.364 333
avg / total 0.968 0.972 0.965 10000
Confusion Matrix:
 Predicted 
 True False
Real True 0.997724 0.002276
 False 0.762763 0.237237
No description has been provided for this image

4.4.4 Quadratic Discriminant Analysis

In [12]:
defplot_QDA(mean1=-2, mean2=1, sigma1=1, sigma2=0.5):
 mean1 = mean1*np.array([1, 1])
 mean2 = mean2*np.array([1, 1])
 cov1 = np.array([[sigma1, 0], [0, sigma1]])
 cov2 = np.array([[sigma2, 0], [0, sigma2]])
 inv_cov1 = np.linalg.inv(cov1)
 inv_cov2 = np.linalg.inv(cov2)
 N = 500
 K = 2
 sample1 = np.random.multivariate_normal(mean1, cov1, (N,)) 
 sample2 = np.random.multivariate_normal(mean2, cov2, (N,)) 
 maxX = np.max([sample1[:,0], sample2[:,0]])
 minX = np.min([sample1[:,0], sample2[:,0]])
 maxY = np.max([sample1[:,1], sample2[:,1]])
 minY = np.min([sample1[:,1], sample2[:,1]])
 pi1 = pi2 = N/K
 # grid of points to plot the bayes and LDA regions/lines
 N_points_grid = 150
 xx, yy = np.meshgrid(np.linspace(minX, maxX, N_points_grid), np.linspace(minY, maxY, N_points_grid))
 X = np.c_[xx.ravel(), yy.ravel()]
 delta1_fun = lambda Xin: -1/2*((Xin-mean1).dot(inv_cov1)*(Xin-mean1)).sum(axis=1) + np.log(pi1)
 delta2_fun = lambda Xin: -1/2*((Xin-mean2).dot(inv_cov2)*(Xin-mean2)).sum(axis=1) + np.log(pi2)
 region1 = delta1_fun(X) > delta2_fun(X)
 region2 = delta2_fun(X) > delta1_fun(X)
 # prediction
 est_mean1 = 1/N*np.sum(sample1, axis=0)
 est_mean2 = 1/N*np.sum(sample2, axis=0)
 est_cov1 = np.cov(sample1, rowvar=False) 
 est_cov2 = np.cov(sample2, rowvar=False)
 inv_est_cov1 = np.linalg.inv(est_cov1)
 inv_est_cov2 = np.linalg.inv(est_cov2)
 est_delta1_fun = lambda Xin: -1/2*((Xin-est_mean1).dot(inv_est_cov1)*(Xin-est_mean1)).sum(axis=1) + np.log(pi1)
 est_delta2_fun = lambda Xin: -1/2*((Xin-est_mean2).dot(inv_est_cov2)*(Xin-est_mean2)).sum(axis=1) + np.log(pi2)
 est_region1 = est_delta1_fun(X) > est_delta2_fun(X)
 fig = plt.figure(figsize=(8,8))
 ax = plt.subplot(1,1,1)
 # Bayes regions
 plt.contourf(xx, yy, region1.reshape(xx.shape), alpha=0.5, colors='g', levels=[0.5, 1.0])
 plt.contourf(xx, yy, region2.reshape(xx.shape), alpha=0.5, colors='orange', levels=[0.5, 1.0])
 # Samples
 ax.scatter(sample1[:,0], sample1[:,1], s=20, c='green', marker='o',)
 ax.scatter(sample2[:,0], sample2[:,1], s=20, c='orange', marker='o',)
 ax.set_xlabel('X1');
 ax.set_ylabel('X2');
 # LDA lines
 plt.contour(xx, yy, est_region1.reshape(xx.shape), alpha=0.5, colors='k');
 
 pred_green = sum(delta1_fun(sample1) > delta2_fun(sample1))/N*100
 pred_orange = sum(delta2_fun(sample2) > delta1_fun(sample2))/N*100
 print('Bayes accuracy: ', np.round(pred_green, 1), np.round(pred_orange, 1))
 est_pred_green = sum(est_delta1_fun(sample1) > est_delta2_fun(sample1))/N*100
 est_pred_orange = sum(est_delta2_fun(sample2) > est_delta1_fun(sample2))/N*100
 print('LDA accuracy: ', np.round(est_pred_green, 1), np.round(est_pred_orange, 1))
In [13]:
interactive_plot = widgets.interactive(plot_QDA, 
 mean1=(-2,2,0.5), mean2=(-2,2,0.5),
 sigma1=(0.1,5,0.1), sigma2=(0.1,5,0.1),
 continuous_update=False);
output = interactive_plot.children[-1]
output.layout.height = '15cm'
interactive_plot
interactive(children=(FloatSlider(value=-2.0, description='mean1', max=2.0, min=-2.0, step=0.5), FloatSlider(v...

4.5 A Comparison of Classification Methods

Compare Logistic regression, LDA, QDA and KNN under different conditions

4.6 Lab: Logistic Regression, LDA, QDA, and KNN

4.6.1 The Stock Market Data

In [14]:
df_stock = pd.read_csv('Data/Smarket.csv', usecols=range(1,10), index_col=0, parse_dates=True)
# convert direction to binary. Up is 1, Down is 0
df_stock.replace({'Up': 1, 'Down': 0}, inplace=True)
df_stock.head()
Out[14]:
Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
Year
2001年01月01日 0.381 -0.192 -2.624 -1.055 5.010 1.1913 0.959 1
2001年01月01日 0.959 0.381 -0.192 -2.624 -1.055 1.2965 1.032 1
2001年01月01日 1.032 0.959 0.381 -0.192 -2.624 1.4112 -0.623 0
2001年01月01日 -0.623 1.032 0.959 0.381 -0.192 1.2760 0.614 1
2001年01月01日 0.614 -0.623 1.032 0.959 0.381 1.2057 0.213 1
In [15]:
df_stock.describe()
Out[15]:
Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
count 1250.000000 1250.000000 1250.000000 1250.000000 1250.00000 1250.000000 1250.000000 1250.000000
mean 0.003834 0.003919 0.001716 0.001636 0.00561 1.478305 0.003138 0.518400
std 1.136299 1.136280 1.138703 1.138774 1.14755 0.360357 1.136334 0.499861
min -4.922000 -4.922000 -4.922000 -4.922000 -4.92200 0.356070 -4.922000 0.000000
25% -0.639500 -0.639500 -0.640000 -0.640000 -0.64000 1.257400 -0.639500 0.000000
50% 0.039000 0.039000 0.038500 0.038500 0.03850 1.422950 0.038500 1.000000
75% 0.596750 0.596750 0.596750 0.596750 0.59700 1.641675 0.596750 1.000000
max 5.733000 5.733000 5.733000 5.733000 5.73300 3.152470 5.733000 1.000000
In [16]:
df_stock.corr()
Out[16]:
Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
Lag1 1.000000 -0.026294 -0.010803 -0.002986 -0.005675 0.040910 -0.026155 -0.039757
Lag2 -0.026294 1.000000 -0.025897 -0.010854 -0.003558 -0.043383 -0.010250 -0.024081
Lag3 -0.010803 -0.025897 1.000000 -0.024051 -0.018808 -0.041824 -0.002448 0.006132
Lag4 -0.002986 -0.010854 -0.024051 1.000000 -0.027084 -0.048414 -0.006900 0.004215
Lag5 -0.005675 -0.003558 -0.018808 -0.027084 1.000000 -0.022002 -0.034860 0.005423
Volume 0.040910 -0.043383 -0.041824 -0.048414 -0.022002 1.000000 0.014592 0.022951
Today -0.026155 -0.010250 -0.002448 -0.006900 -0.034860 0.014592 1.000000 0.730563
Direction -0.039757 -0.024081 0.006132 0.004215 0.005423 0.022951 0.730563 1.000000
In [17]:
# very small correlations (today and direction are obiously correlated)
corr = df_stock.corr().values
np.max(np.abs(np.triu(corr, k=1)), axis=1)
Out[17]:
array([ 0.04090991, 0.04338321, 0.04182369, 0.04841425, 0.03486008,
 0.02295096, 0.7305629 , 0. ])
In [18]:
# volume increases with year
plot = sns.boxplot(df_stock.index, df_stock['Volume'],)
plot.set_xticklabels([str(date.year) for date in df_stock.index.unique()]);
No description has been provided for this image
In [19]:
# volumne by year and direction
ax = sns.boxplot(df_stock.index, df_stock['Volume'], hue=df_stock['Direction'])
ax.set_xticklabels([str(date.year) for date in df_stock.index.unique()])
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, ["Down", "Up"]);
No description has been provided for this image

Data

In [20]:
X = df_stock[df_stock.columns.difference(['Today', 'Direction'])]
y = df_stock['Direction']
X_train = X[:'2004']
y_train = y[:'2004']
X_test = X['2005':]
y_test = y['2005':]

4.6.2 Logistic regression

Logistic regression, not test/train split

In [21]:
logistic = skl_lm.LogisticRegression(C=1e10)
logistic.fit(X, y)
print_OLS_error_table(logistic, X, y)
print_classification_statistics(logistic, X, y, labels=['Down', 'Up'])
plot_ROC(logistic, X, y, label='Logistic Classification')
# same results as statsmodels
#smLogistic = sm.Logit(y, sm.add_constant(X)).fit()
#print(smLogistic.summary())
No. Observations: 1250
Df Residuals: 1243
Df Model: 6
Log-Likelihood: -863.79
AIC: 1741.58
 Coefficients Standard Errors t values p values
Intercept -0.1259 0.241 -0.523 0.601
Lag1 -0.0731 0.050 -1.457 0.145
Lag2 -0.0423 0.050 -0.845 0.399
Lag3 0.0111 0.050 0.222 0.824
Lag4 0.0094 0.050 0.187 0.851
Lag5 0.0103 0.050 0.208 0.835
Volume 0.1354 0.158 0.855 0.393
Classification Report:
 precision recall f1-score support
 Down 0.507 0.241 0.327 602
 Up 0.526 0.782 0.629 648
avg / total 0.517 0.522 0.483 1250
Confusion Matrix:
 Predicted 
 True False
Real True 0.240864 0.759136
 False 0.217593 0.782407
No description has been provided for this image

Logistic regression, with test/train split

In [22]:
logistic_test = skl_lm.LogisticRegression(C=1e10)
logistic_test.fit(X_train, y_train)
print_OLS_error_table(logistic_test, X_train, y_train)
print_classification_statistics(logistic_test, X_test, y_test, labels=['Down', 'Up'])
plot_ROC(logistic_test, X_test, y_test, label='Logistic Classification Train/Test')
No. Observations: 998
Df Residuals: 991
Df Model: 6
Log-Likelihood: -690.55
AIC: 1395.11
 Coefficients Standard Errors t values p values
Intercept 0.1912 0.334 0.573 0.567
Lag1 -0.0542 0.052 -1.046 0.296
Lag2 -0.0458 0.052 -0.884 0.377
Lag3 0.0072 0.052 0.139 0.889
Lag4 0.0064 0.052 0.125 0.901
Lag5 -0.0042 0.051 -0.083 0.934
Volume -0.1162 0.240 -0.485 0.628
Classification Report:
 precision recall f1-score support
 Down 0.443 0.694 0.540 111
 Up 0.564 0.312 0.402 141
avg / total 0.511 0.480 0.463 252
Confusion Matrix:
 Predicted 
 True False
Real True 0.693694 0.306306
 False 0.687943 0.312057
No description has been provided for this image

4.6.3 Linear Discriminant Analysis (LDA)

In [23]:
# use only Lag1 and Lag2
X_train2 = X_train[['Lag1','Lag2']]
X_test2 = X_test[['Lag1','Lag2']]
In [24]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train2, y_train)
print('Prior probabilities of groups: ')
print(pd.DataFrame(data=lda.priors_.reshape((1,2)), columns=['Down', 'Up'], index=['']))
print()
print('Group means: ')
print(pd.DataFrame(data=lda.means_, columns=X_train2.columns, index=['Down', 'Up']))
print()
print('Coefficients of linear discriminant: ')
print(pd.DataFrame(data=lda.scalings_, columns=['LDA'], index=X_train2.columns))
print()
Prior probabilities of groups: 
 Down Up
 0.491984 0.508016
Group means: 
 Lag1 Lag2
Down 0.042790 0.033894
Up -0.039546 -0.031325
Coefficients of linear discriminant: 
 LDA
Lag1 -0.642019
Lag2 -0.513529
In [25]:
print_classification_statistics(lda, X_test2, y_test, labels=['Down', 'Up'])
plot_ROC(lda, X_test2, y_test, label='LDA Train/Test, only Lag1 and Lag2')
plot_classification(lda, X_test2, y_test)
Classification Report:
 precision recall f1-score support
 Down 0.500 0.315 0.387 111
 Up 0.582 0.752 0.656 141
avg / total 0.546 0.560 0.538 252
Confusion Matrix:
 Predicted 
 True False
Real True 0.315315 0.684685
 False 0.248227 0.751773
No description has been provided for this image
No description has been provided for this image

4.6.4 Quadratic Discriminant Analysis (QDA)

In [26]:
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train2, y_train)
print('Prior probabilities of groups: ')
print(pd.DataFrame(data=qda.priors_.reshape((1,2)), columns=['Down', 'Up'], index=['']))
print()
print('Group means: ')
print(pd.DataFrame(data=qda.means_, columns=X_train2.columns, index=['Down', 'Up']))
print()
Prior probabilities of groups: 
 Down Up
 0.491984 0.508016
Group means: 
 Lag1 Lag2
Down 0.042790 0.033894
Up -0.039546 -0.031325
In [27]:
print_classification_statistics(qda, X_test2, y_test, labels=['Down', 'Up'])
plot_ROC(qda, X_test2, y_test, label='QDA Train/Test, only Lag1 and Lag2')
plot_classification(qda, X_test2, y_test)
Classification Report:
 precision recall f1-score support
 Down 0.600 0.270 0.373 111
 Up 0.599 0.858 0.706 141
avg / total 0.599 0.599 0.559 252
Confusion Matrix:
 Predicted 
 True False
Real True 0.270270 0.729730
 False 0.141844 0.858156
No description has been provided for this image
No description has been provided for this image

4.6.5 K-Nearest Neighbors

In [28]:
n_neighbors = 3
knn = neighbors.KNeighborsClassifier(n_neighbors)
knn.fit(X_train2, y_train)
Out[28]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
 metric_params=None, n_jobs=1, n_neighbors=3, p=2,
 weights='uniform')
In [29]:
print_classification_statistics(knn, X_test2, y_test, labels=['Down', 'Up'])
plot_ROC(knn, X_test2, y_test, label='KNN Train/Test, only Lag1 and Lag2')
plot_classification(knn, X_test2, y_test)
Classification Report:
 precision recall f1-score support
 Down 0.466 0.432 0.449 111
 Up 0.577 0.610 0.593 141
avg / total 0.528 0.532 0.529 252
Confusion Matrix:
 Predicted 
 True False
Real True 0.432432 0.567568
 False 0.390071 0.609929
No description has been provided for this image
No description has been provided for this image

4.6.6 An Application to Caravan Insurance Data

KNN

In [30]:
df_caravan = pd.read_csv('Data/Caravan.csv')
df_caravan['Purchase'] = df_caravan['Purchase'].astype('category')
df_caravan.head()
Out[30]:
Unnamed: 0 MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE ... APERSONG AGEZONG AWAOREG ABRAND AZEILPL APLEZIER AFIETS AINBOED ABYSTAND Purchase
0 1 33 1 3 2 8 0 5 1 3 ... 0 0 0 1 0 0 0 0 0 No
1 2 37 1 2 2 8 1 4 1 4 ... 0 0 0 1 0 0 0 0 0 No
2 3 37 1 2 2 8 0 4 2 4 ... 0 0 0 1 0 0 0 0 0 No
3 4 9 1 3 3 3 2 3 2 4 ... 0 0 0 1 0 0 0 0 0 No
4 5 40 1 4 2 10 1 4 1 4 ... 0 0 0 1 0 0 0 0 0 No

5 rows ×ばつ 87 columns

In [31]:
df_caravan['Purchase'].value_counts()
Out[31]:
No 5474
Yes 348
Name: Purchase, dtype: int64
In [32]:
y = df_caravan.Purchase
X = df_caravan.drop('Purchase', axis=1).astype('float64')
X_scaled = preprocessing.scale(X)
In [33]:
X_test = X_scaled[:1000]
y_test = y[:1000]
X_train = X_scaled[1000:]
y_train = y[1000:]
In [34]:
for i in [1, 3, 5]:
 print(f'Using {i} neighbors')
 knn = neighbors.KNeighborsClassifier(n_neighbors=i)
 knn.fit(X_train, y_train)
 print_classification_statistics(knn, X_test, y_test, labels=['No', 'Yes'])
 #plot_ROC(knn, X_test, y_test, label='KNN')
 #skplt.metrics.plot_confusion_matrix(y_test, knn.predict(X_test), normalize=False)
 plt.show()
Using 1 neighbors
Classification Report:
 precision recall f1-score support
 No 0.948 0.937 0.943 941
 Yes 0.157 0.186 0.171 59
avg / total 0.902 0.893 0.897 1000
Confusion Matrix:
 Predicted 
 True False
Real True 0.937301 0.062699
 False 0.813559 0.186441
Using 3 neighbors
Classification Report:
 precision recall f1-score support
 No 0.946 0.979 0.962 941
 Yes 0.231 0.102 0.141 59
avg / total 0.903 0.927 0.913 1000
Confusion Matrix:
 Predicted 
 True False
Real True 0.978746 0.021254
 False 0.898305 0.101695
Using 5 neighbors
Classification Report:
 precision recall f1-score support
 No 0.944 0.993 0.968 941
 Yes 0.364 0.068 0.114 59
avg / total 0.910 0.938 0.918 1000
Confusion Matrix:
 Predicted 
 True False
Real True 0.992561 0.007439
 False 0.932203 0.067797

Logistic

In [35]:
logistic = skl_lm.LogisticRegression(C=1e10)
logistic.fit(X_train, y_train)
print_classification_statistics(logistic, X_test, y_test, labels=['No', 'Yes'])
Classification Report:
 precision recall f1-score support
 No 0.941 0.994 0.966 941
 Yes 0.000 0.000 0.000 59
avg / total 0.885 0.935 0.909 1000
Confusion Matrix:
 Predicted 
 True False
Real True 0.993624 0.006376
 False 1.000000 0.000000
In [36]:
# using 25% changes of buying instead of 50%
pred_p = logistic.predict_proba(X_test)
cm_df = pd.DataFrame({'True': y_test, 'Pred': pred_p[:,1] > .25})
cm_df.Pred.replace(to_replace={True:'Yes', False:'No'}, inplace=True)
print(classification_report(y_test, cm_df.Pred))
print(cm_df.groupby(['True', 'Pred']).size().unstack('True').T)
 precision recall f1-score support
 No 0.95 0.98 0.96 941
 Yes 0.34 0.19 0.24 59
avg / total 0.91 0.93 0.92 1000
Pred No Yes
True 
No 920 21
Yes 48 11

AltStyle によって変換されたページ (->オリジナル) /