Chapter 4 - Classification¶

4.4.4 Quadratic Discriminant Analysis

4.5 A Comparison of Classification Methods

4.6 Lab: Logistic Regression, LDA, QDA, and KNN

4.6.1 The Stock Market Data
4.6.2 Logistic regression
4.6.3 Linear Discriminant Analysis (LDA)
4.6.4 Quadratic Discriminant Analysis (QDA)
4.6.5 K-Nearest Neighbors
4.6.6 An Application to Caravan Insurance Data

In [37]:

importpandasaspd
importnumpyasnp
importmatplotlibasmpl
importmatplotlib.pyplotasplt
importseabornassns
importsklearn.linear_modelasskl_lm
fromsklearn.discriminant_analysisimport LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
fromsklearn.metricsimport confusion_matrix, classification_report, precision_score, roc_curve, auc, log_loss
fromsklearnimport preprocessing
fromsklearnimport neighbors
fromscipyimport stats
importscikitplotasskplt
importstatsmodels.apiassm
importstatsmodels.formula.apiassmf
fromipywidgetsimport widgets
fromclassification_helperimport print_classification_statistics, plot_ROC, print_OLS_error_table, plot_classification
%matplotlib inline
plt.style.use('seaborn-white')

4.1 An Overview of Classification¶

Load data¶

In [2]:

# In R, I exported the dataset from package 'ISLR' to an Excel file
df_default = pd.read_excel('Data/Default.xlsx')
# Note: factorize() returns two objects: a label array and an array with the unique values.
# We are only interested in the first object. 
df_default['default2'] = df_default.default.factorize()[0]
df_default['student2'] = df_default.student.factorize()[0]
df_default.head(3)

Out[2]:

default	student	balance	income	default2
1	No	No	729.526495	44361.625074	0
2	No	Yes	817.180407	12106.134700	1
3	No	No	1073.549164	31767.138947	0

Plot data¶

In [3]:

fig = plt.figure(figsize=(12,5))
gs = mpl.gridspec.GridSpec(1, 4)
ax1 = plt.subplot(gs[0,:-2])
ax2 = plt.subplot(gs[0,-2])
ax3 = plt.subplot(gs[0,-1])
# Take a fraction of the samples where target value (default) is 'no'
df_no = df_default[df_default.default2 == 0].sample(frac=.08)
# Take all samples where target value is 'yes'
df_yes = df_default[df_default.default2 == 1]
ax1.scatter(df_yes.balance, df_yes.income, s=40, c='orange', marker='+', linewidths=1)
ax1.scatter(df_no.balance, df_no.income, s=40, marker='o', linewidths='1',
 edgecolors='lightblue', facecolors='white', alpha=.6)
ax1.set_ylim(ymin=0)
ax1.set_ylabel('Income')
ax1.set_xlim(xmin=-100)
ax1.set_xlabel('Balance')
c_palette = {'No':'lightblue', 'Yes':'orange'}
sns.boxplot('default', 'balance', data=df_default, orient='v', ax=ax2, palette=c_palette)
sns.boxplot('default', 'income', data=df_default, orient='v', ax=ax3, palette=c_palette)
gs.tight_layout(plt.gcf())

No description has been provided for this image

4.3 Logistic regression¶

scikit-learn¶

In [4]:

X_train = df_default.balance.values.reshape(-1,1) 
y = df_default.default2
# Create array of test data. Calculate the classification probability
# and predicted classification.
X_plot = np.arange(df_default.balance.min(), df_default.balance.max()).reshape(-1,1)
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(X_train,y)
prob = clf.predict_proba(X_plot)
fig = plt.figure()
ax = plt.axes()
# Right plot
ax.scatter(X_train, y, color='orange')
ax.plot(X_plot, prob[:,1], color='lightblue')
ax.hlines(1, xmin=ax.xaxis.get_data_interval()[0],
 xmax=ax.xaxis.get_data_interval()[1], linestyles='dashed', lw=1)
ax.hlines(0, xmin=ax.xaxis.get_data_interval()[0],
 xmax=ax.xaxis.get_data_interval()[1], linestyles='dashed', lw=1)
ax.set_ylabel('Probability of default')
ax.set_xlabel('Balance')
ax.set_yticks([0, 0.25, 0.5, 0.75, 1.])
ax.set_xlim(xmin=-100)
print(clf)
print('classes: ', clf.classes_)
print('coefficients: ', [*clf.intercept_, *clf.coef_.tolist()[0]])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
 intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
 penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
 verbose=0, warm_start=False)
classes: [0 1]
coefficients: [-10.651330005794106, 0.0054989165568046445]

No description has been provided for this image

statsmodels¶

In [5]:

X_train = sm.add_constant(df_default.balance)
y = df_default.default2
est = smf.Logit(y.ravel(), X_train).fit()
est.summary2().tables[1]

Optimization terminated successfully.
 Current function value: 0.079823
 Iterations 10

Out[5]:

Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
const	-10.651331	0.361169	-29.491287	3.723665e-191	-11.359208	-9.943453
balance	0.005499	0.000220	24.952404	2.010855e-137	0.005067	0.005931

In [6]:

X_train = sm.add_constant(df_default.student2)
y = df_default.default2
est = smf.Logit(y, X_train).fit()
est.summary2().tables[1]

Optimization terminated successfully.
 Current function value: 0.145434
 Iterations 7

Out[6]:

Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
const	-3.504128	0.070713	-49.554094	0.000000	-3.642723	-3.365532
student2	0.404887	0.115019	3.520177	0.000431	0.179454	0.630320

In [7]:

X_train = sm.add_constant(df_default[['balance', 'income', 'student2']])
est = smf.Logit(y, X_train).fit()
est.summary2()

Optimization terminated successfully.
 Current function value: 0.078577
 Iterations 10

Out[7]:

Model: Logit Pseudo R-squared: 0.462

Dependent Variable: default2 AIC: 1579.5448

Date: 2018年06月23日 13:11 BIC: 1608.3862

No. Observations: 10000 Log-Likelihood: -785.77

Df Model: 3 LL-Null: -1460.3

Df Residuals: 9996 LLR p-value: 3.2575e-292

Converged: 1.0000 Scale: 1.0000

No. Iterations: 10.0000

Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
const	-10.8690	0.4923	-22.0793	0.0000	-11.8339	-9.9042
balance	0.0057	0.0002	24.7365	0.0000	0.0053	0.0062
income	0.0000	0.0000	0.3698	0.7115	-0.0000	0.0000
student2	-0.6468	0.2363	-2.7376	0.0062	-1.1098	-0.1837

4.4 Linear Discriminant Analysis¶

In [8]:

defgenerate_LDA(mean1=-2, mean2=1, mean3=2, sigma1=0.5, sigma2=0.5):
 mean1 = mean1*np.array([1, 1])
 mean2 = mean2*np.array([1, 1])
 mean3 = mean3*np.array([1, -1])
 cov = np.array([[sigma1, 0], [0, sigma2]])
 N = 500
 K = 3
 # if you sample from a t-distribution, the LDA results are really bad
 defmultivariate_t(means, S, N):
 df = 1
 m = np.asarray(means)
 d = len(means)
 x = np.random.chisquare(df, N[0])/df
 z = np.random.multivariate_normal(np.zeros(d), S, N)
 return m + z/np.sqrt(x)[:,None]
 sample1 = np.random.multivariate_normal(mean1, cov, (N,)) 
 sample2 = np.random.multivariate_normal(mean2, cov, (N,)) 
 sample3 = np.random.multivariate_normal(mean3, cov, (N,))
 
 maxX = np.max([sample1[:,0], sample2[:,0], sample3[:,0]])
 minX = np.min([sample1[:,0], sample2[:,0], sample3[:,0]])
 maxY = np.max([sample1[:,1], sample2[:,1], sample3[:,1]])
 minY = np.min([sample1[:,1], sample2[:,1], sample3[:,1]])
 # priors
 pi1 = pi2 = pi3 = N/K
 # grid of points to plot the bayes and LDA regions/lines
 N_points_grid = 200
 xx, yy = np.meshgrid(np.linspace(minX, maxX, N_points_grid), np.linspace(minY, maxY, N_points_grid))
 X = np.c_[xx.ravel(), yy.ravel()]
 # Bayes regions
 inv_cov = np.linalg.inv(cov)
 delta1_fun = lambda X: np.dot(X, np.dot(inv_cov, mean1)) - 1/2*np.dot(mean1.T, np.dot(inv_cov, mean1)) + np.log(pi1)
 delta2_fun = lambda X: np.dot(X, np.dot(inv_cov, mean2)) - 1/2*np.dot(mean2.T, np.dot(inv_cov, mean2)) + np.log(pi2)
 delta3_fun = lambda X: np.dot(X, np.dot(inv_cov, mean3)) - 1/2*np.dot(mean3.T, np.dot(inv_cov, mean3)) + np.log(pi3)
 region1 = np.logical_and(delta1_fun(X) > delta2_fun(X), delta1_fun(X) > delta3_fun(X))
 region2 = np.logical_and(delta2_fun(X) > delta1_fun(X), delta2_fun(X) > delta3_fun(X))
 region3 = np.logical_and(delta3_fun(X) > delta1_fun(X), delta3_fun(X) > delta2_fun(X))
 # LDA prediction
 est_mean1 = 1/N*np.sum(sample1, axis=0)
 est_mean2 = 1/N*np.sum(sample2, axis=0)
 est_mean3 = 1/N*np.sum(sample3, axis=0)
 est_cov = (np.cov(sample1, rowvar=False) + np.cov(sample2, rowvar=False) + np.cov(sample3, rowvar=False))/K
 inv_est_cov = np.linalg.inv(est_cov)
 est_delta1_fun = lambda X: np.dot(X, np.dot(inv_est_cov, est_mean1)) - 1/2*np.dot(est_mean1.T, np.dot(inv_est_cov, est_mean1)) + np.log(pi1)
 est_delta2_fun = lambda X: np.dot(X, np.dot(inv_est_cov, est_mean2)) - 1/2*np.dot(est_mean2.T, np.dot(inv_est_cov, est_mean2)) + np.log(pi2)
 est_delta3_fun = lambda X: np.dot(X, np.dot(inv_est_cov, est_mean3)) - 1/2*np.dot(est_mean3.T, np.dot(inv_est_cov, est_mean3)) + np.log(pi3)
 est_region1 = np.logical_and(est_delta1_fun(X) > est_delta2_fun(X), est_delta1_fun(X) > est_delta3_fun(X))
 est_region3 = np.logical_and(est_delta3_fun(X) > est_delta2_fun(X), est_delta3_fun(X) > est_delta1_fun(X))
 ### Plot Bayes regions and LDA lines
 fig = plt.figure(figsize=(8,8))
 ax = plt.subplot(1,1,1)
 # Bayes regions
 plt.contourf(xx, yy, region1.reshape(xx.shape), alpha=0.5, colors='g', levels=[0.5, 1.0])
 plt.contourf(xx, yy, region2.reshape(xx.shape), alpha=0.5, colors='orange', levels=[0.5, 1.0])
 plt.contourf(xx, yy, region3.reshape(xx.shape), alpha=0.5, colors='b', levels=[0.5, 1.0])
 # Samples
 ax.scatter(sample1[:,0], sample1[:,1], s=20, c='green', marker='o', label='1')
 ax.scatter(sample2[:,0], sample2[:,1], s=20, c='orange', marker='o', label='2')
 ax.scatter(sample3[:,0], sample3[:,1], s=20, c='blue', marker='o', label='3')
 ax.set_xlabel('X1');
 ax.set_ylabel('X2');
 # LDA lines
 plt.contour(xx, yy, est_region1.reshape(xx.shape), alpha=0.5, colors='k')
 plt.contour(xx, yy, est_region3.reshape(xx.shape), alpha=0.5, colors='k');
 
 # statistics
 pred_green = sum([np.logical_and(delta1_fun(x) > delta2_fun(x), delta1_fun(x) > delta3_fun(x)) for x in sample1])/N*100
 pred_orange = sum([np.logical_and(delta2_fun(x) > delta1_fun(x), delta2_fun(x) > delta3_fun(x)) for x in sample2])/N*100
 pred_blue = sum([np.logical_and(delta3_fun(x) > delta2_fun(x), delta3_fun(x) > delta1_fun(x)) for x in sample3])/N*100
 print('Bayes accuracy: ', np.round(pred_green, 1), np.round(pred_orange, 1), np.round(pred_blue, 1))
 est_pred_green = sum([np.logical_and(est_delta1_fun(x) > est_delta2_fun(x), est_delta1_fun(x) > est_delta3_fun(x)) for x in sample1])/N*100
 est_pred_orange = sum([np.logical_and(est_delta2_fun(x) > est_delta1_fun(x), est_delta2_fun(x) > est_delta3_fun(x)) for x in sample2])/N*100
 est_pred_blue = sum([np.logical_and(est_delta3_fun(x) > est_delta2_fun(x), est_delta3_fun(x) > est_delta1_fun(x)) for x in sample3])/N*100
 print('LDA accuracy: ', np.round(est_pred_green, 1), np.round(est_pred_orange, 1), np.round(est_pred_blue, 1))

In [9]:

interactive_plot = widgets.interactive(generate_LDA, 
 mean1=(-2,2,0.5), mean2=(-2,2,0.5), mean3=(-2,2,0.5),
 sigma1=(0.1,5,0.1), sigma2=(0.1,5,0.1),
 continuous_update=False);
output = interactive_plot.children[-1]
output.layout.height = '15cm'
interactive_plot

interactive(children=(FloatSlider(value=-2.0, description='mean1', max=2.0, min=-2.0, step=0.5), FloatSlider(v...

Default dataset

In [10]:

X = df_default[['balance', 'income', 'student2']].values
y = df_default.default2.values
lda = LinearDiscriminantAnalysis(solver='svd')
lda.fit(X, y)

Out[10]:

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
 solver='svd', store_covariance=False, tol=0.0001)

In [11]:

print_classification_statistics(lda, X, y, labels=['Down', 'Up'])
plot_ROC(lda, X, y, label='LDA Classification')

Classification Report:
 precision recall f1-score support
 Down 0.974 0.998 0.986 9667
 Up 0.782 0.237 0.364 333
avg / total 0.968 0.972 0.965 10000
Confusion Matrix:
 Predicted 
 True False
Real True 0.997724 0.002276
 False 0.762763 0.237237

No description has been provided for this image

4.4.4 Quadratic Discriminant Analysis¶

In [12]:

defplot_QDA(mean1=-2, mean2=1, sigma1=1, sigma2=0.5):
 mean1 = mean1*np.array([1, 1])
 mean2 = mean2*np.array([1, 1])
 cov1 = np.array([[sigma1, 0], [0, sigma1]])
 cov2 = np.array([[sigma2, 0], [0, sigma2]])
 inv_cov1 = np.linalg.inv(cov1)
 inv_cov2 = np.linalg.inv(cov2)
 N = 500
 K = 2
 sample1 = np.random.multivariate_normal(mean1, cov1, (N,)) 
 sample2 = np.random.multivariate_normal(mean2, cov2, (N,)) 
 maxX = np.max([sample1[:,0], sample2[:,0]])
 minX = np.min([sample1[:,0], sample2[:,0]])
 maxY = np.max([sample1[:,1], sample2[:,1]])
 minY = np.min([sample1[:,1], sample2[:,1]])
 pi1 = pi2 = N/K
 # grid of points to plot the bayes and LDA regions/lines
 N_points_grid = 150
 xx, yy = np.meshgrid(np.linspace(minX, maxX, N_points_grid), np.linspace(minY, maxY, N_points_grid))
 X = np.c_[xx.ravel(), yy.ravel()]
 delta1_fun = lambda Xin: -1/2*((Xin-mean1).dot(inv_cov1)*(Xin-mean1)).sum(axis=1) + np.log(pi1)
 delta2_fun = lambda Xin: -1/2*((Xin-mean2).dot(inv_cov2)*(Xin-mean2)).sum(axis=1) + np.log(pi2)
 region1 = delta1_fun(X) > delta2_fun(X)
 region2 = delta2_fun(X) > delta1_fun(X)
 # prediction
 est_mean1 = 1/N*np.sum(sample1, axis=0)
 est_mean2 = 1/N*np.sum(sample2, axis=0)
 est_cov1 = np.cov(sample1, rowvar=False) 
 est_cov2 = np.cov(sample2, rowvar=False)
 inv_est_cov1 = np.linalg.inv(est_cov1)
 inv_est_cov2 = np.linalg.inv(est_cov2)
 est_delta1_fun = lambda Xin: -1/2*((Xin-est_mean1).dot(inv_est_cov1)*(Xin-est_mean1)).sum(axis=1) + np.log(pi1)
 est_delta2_fun = lambda Xin: -1/2*((Xin-est_mean2).dot(inv_est_cov2)*(Xin-est_mean2)).sum(axis=1) + np.log(pi2)
 est_region1 = est_delta1_fun(X) > est_delta2_fun(X)
 fig = plt.figure(figsize=(8,8))
 ax = plt.subplot(1,1,1)
 # Bayes regions
 plt.contourf(xx, yy, region1.reshape(xx.shape), alpha=0.5, colors='g', levels=[0.5, 1.0])
 plt.contourf(xx, yy, region2.reshape(xx.shape), alpha=0.5, colors='orange', levels=[0.5, 1.0])
 # Samples
 ax.scatter(sample1[:,0], sample1[:,1], s=20, c='green', marker='o',)
 ax.scatter(sample2[:,0], sample2[:,1], s=20, c='orange', marker='o',)
 ax.set_xlabel('X1');
 ax.set_ylabel('X2');
 # LDA lines
 plt.contour(xx, yy, est_region1.reshape(xx.shape), alpha=0.5, colors='k');
 
 pred_green = sum(delta1_fun(sample1) > delta2_fun(sample1))/N*100
 pred_orange = sum(delta2_fun(sample2) > delta1_fun(sample2))/N*100
 print('Bayes accuracy: ', np.round(pred_green, 1), np.round(pred_orange, 1))
 est_pred_green = sum(est_delta1_fun(sample1) > est_delta2_fun(sample1))/N*100
 est_pred_orange = sum(est_delta2_fun(sample2) > est_delta1_fun(sample2))/N*100
 print('LDA accuracy: ', np.round(est_pred_green, 1), np.round(est_pred_orange, 1))

In [13]:

interactive_plot = widgets.interactive(plot_QDA, 
 mean1=(-2,2,0.5), mean2=(-2,2,0.5),
 sigma1=(0.1,5,0.1), sigma2=(0.1,5,0.1),
 continuous_update=False);
output = interactive_plot.children[-1]
output.layout.height = '15cm'
interactive_plot

interactive(children=(FloatSlider(value=-2.0, description='mean1', max=2.0, min=-2.0, step=0.5), FloatSlider(v...

4.5 A Comparison of Classification Methods¶

Compare Logistic regression, LDA, QDA and KNN under different conditions

4.6 Lab: Logistic Regression, LDA, QDA, and KNN¶

4.6.1 The Stock Market Data¶

In [14]:

df_stock = pd.read_csv('Data/Smarket.csv', usecols=range(1,10), index_col=0, parse_dates=True)
# convert direction to binary. Up is 1, Down is 0
df_stock.replace({'Up': 1, 'Down': 0}, inplace=True)
df_stock.head()

Out[14]:

Lag1	Lag2	Lag3	Lag4	Lag5	Volume	Today	Direction
Year
2001年01月01日	0.381	-0.192	-2.624	-1.055	5.010	1.1913	0.959	1
2001年01月01日	0.959	0.381	-0.192	-2.624	-1.055	1.2965	1.032	1
2001年01月01日	1.032	0.959	0.381	-0.192	-2.624	1.4112	-0.623	0
2001年01月01日	-0.623	1.032	0.959	0.381	-0.192	1.2760	0.614	1
2001年01月01日	0.614	-0.623	1.032	0.959	0.381	1.2057	0.213	1

In [15]:

df_stock.describe()

Out[15]:

Lag1	Lag2	Lag3	Lag4	Lag5	Volume	Today	Direction
count	1250.000000	1250.000000	1250.000000	1250.000000	1250.00000	1250.000000	1250.000000	1250.000000
mean	0.003834	0.003919	0.001716	0.001636	0.00561	1.478305	0.003138	0.518400
std	1.136299	1.136280	1.138703	1.138774	1.14755	0.360357	1.136334	0.499861
min	-4.922000	-4.922000	-4.922000	-4.922000	-4.92200	0.356070	-4.922000	0.000000
25%	-0.639500	-0.639500	-0.640000	-0.640000	-0.64000	1.257400	-0.639500	0.000000
50%	0.039000	0.039000	0.038500	0.038500	0.03850	1.422950	0.038500	1.000000
75%	0.596750	0.596750	0.596750	0.596750	0.59700	1.641675	0.596750	1.000000
max	5.733000	5.733000	5.733000	5.733000	5.73300	3.152470	5.733000	1.000000

In [16]:

df_stock.corr()

Out[16]:

Lag1	Lag2	Lag3	Lag4	Lag5	Volume	Today	Direction
Lag1	1.000000	-0.026294	-0.010803	-0.002986	-0.005675	0.040910	-0.026155	-0.039757
Lag2	-0.026294	1.000000	-0.025897	-0.010854	-0.003558	-0.043383	-0.010250	-0.024081
Lag3	-0.010803	-0.025897	1.000000	-0.024051	-0.018808	-0.041824	-0.002448	0.006132
Lag4	-0.002986	-0.010854	-0.024051	1.000000	-0.027084	-0.048414	-0.006900	0.004215
Lag5	-0.005675	-0.003558	-0.018808	-0.027084	1.000000	-0.022002	-0.034860	0.005423
Volume	0.040910	-0.043383	-0.041824	-0.048414	-0.022002	1.000000	0.014592	0.022951
Today	-0.026155	-0.010250	-0.002448	-0.006900	-0.034860	0.014592	1.000000	0.730563
Direction	-0.039757	-0.024081	0.006132	0.004215	0.005423	0.022951	0.730563	1.000000

In [17]:

# very small correlations (today and direction are obiously correlated)
corr = df_stock.corr().values
np.max(np.abs(np.triu(corr, k=1)), axis=1)

Out[17]:

array([ 0.04090991, 0.04338321, 0.04182369, 0.04841425, 0.03486008,
 0.02295096, 0.7305629 , 0. ])

In [18]:

# volume increases with year
plot = sns.boxplot(df_stock.index, df_stock['Volume'],)
plot.set_xticklabels([str(date.year) for date in df_stock.index.unique()]);

No description has been provided for this image

In [19]:

# volumne by year and direction
ax = sns.boxplot(df_stock.index, df_stock['Volume'], hue=df_stock['Direction'])
ax.set_xticklabels([str(date.year) for date in df_stock.index.unique()])
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, ["Down", "Up"]);

No description has been provided for this image

Data¶

In [20]:

X = df_stock[df_stock.columns.difference(['Today', 'Direction'])]
y = df_stock['Direction']
X_train = X[:'2004']
y_train = y[:'2004']
X_test = X['2005':]
y_test = y['2005':]

4.6.2 Logistic regression¶

Logistic regression, not test/train split¶

In [21]:

logistic = skl_lm.LogisticRegression(C=1e10)
logistic.fit(X, y)
print_OLS_error_table(logistic, X, y)
print_classification_statistics(logistic, X, y, labels=['Down', 'Up'])
plot_ROC(logistic, X, y, label='Logistic Classification')
# same results as statsmodels
#smLogistic = sm.Logit(y, sm.add_constant(X)).fit()
#print(smLogistic.summary())

No. Observations: 1250
Df Residuals: 1243
Df Model: 6
Log-Likelihood: -863.79
AIC: 1741.58
 Coefficients Standard Errors t values p values
Intercept -0.1259 0.241 -0.523 0.601
Lag1 -0.0731 0.050 -1.457 0.145
Lag2 -0.0423 0.050 -0.845 0.399
Lag3 0.0111 0.050 0.222 0.824
Lag4 0.0094 0.050 0.187 0.851
Lag5 0.0103 0.050 0.208 0.835
Volume 0.1354 0.158 0.855 0.393
Classification Report:
 precision recall f1-score support
 Down 0.507 0.241 0.327 602
 Up 0.526 0.782 0.629 648
avg / total 0.517 0.522 0.483 1250
Confusion Matrix:
 Predicted 
 True False
Real True 0.240864 0.759136
 False 0.217593 0.782407

No description has been provided for this image

Logistic regression, with test/train split¶

In [22]:

logistic_test = skl_lm.LogisticRegression(C=1e10)
logistic_test.fit(X_train, y_train)
print_OLS_error_table(logistic_test, X_train, y_train)
print_classification_statistics(logistic_test, X_test, y_test, labels=['Down', 'Up'])
plot_ROC(logistic_test, X_test, y_test, label='Logistic Classification Train/Test')

No. Observations: 998
Df Residuals: 991
Df Model: 6
Log-Likelihood: -690.55
AIC: 1395.11
 Coefficients Standard Errors t values p values
Intercept 0.1912 0.334 0.573 0.567
Lag1 -0.0542 0.052 -1.046 0.296
Lag2 -0.0458 0.052 -0.884 0.377
Lag3 0.0072 0.052 0.139 0.889
Lag4 0.0064 0.052 0.125 0.901
Lag5 -0.0042 0.051 -0.083 0.934
Volume -0.1162 0.240 -0.485 0.628
Classification Report:
 precision recall f1-score support
 Down 0.443 0.694 0.540 111
 Up 0.564 0.312 0.402 141
avg / total 0.511 0.480 0.463 252
Confusion Matrix:
 Predicted 
 True False
Real True 0.693694 0.306306
 False 0.687943 0.312057

No description has been provided for this image

4.6.3 Linear Discriminant Analysis (LDA)¶

In [23]:

# use only Lag1 and Lag2
X_train2 = X_train[['Lag1','Lag2']]
X_test2 = X_test[['Lag1','Lag2']]

In [24]:

lda = LinearDiscriminantAnalysis()
lda.fit(X_train2, y_train)
print('Prior probabilities of groups: ')
print(pd.DataFrame(data=lda.priors_.reshape((1,2)), columns=['Down', 'Up'], index=['']))
print()
print('Group means: ')
print(pd.DataFrame(data=lda.means_, columns=X_train2.columns, index=['Down', 'Up']))
print()
print('Coefficients of linear discriminant: ')
print(pd.DataFrame(data=lda.scalings_, columns=['LDA'], index=X_train2.columns))
print()

Prior probabilities of groups: 
 Down Up
 0.491984 0.508016
Group means: 
 Lag1 Lag2
Down 0.042790 0.033894
Up -0.039546 -0.031325
Coefficients of linear discriminant: 
 LDA
Lag1 -0.642019
Lag2 -0.513529

In [25]:

print_classification_statistics(lda, X_test2, y_test, labels=['Down', 'Up'])
plot_ROC(lda, X_test2, y_test, label='LDA Train/Test, only Lag1 and Lag2')
plot_classification(lda, X_test2, y_test)

Classification Report:
 precision recall f1-score support
 Down 0.500 0.315 0.387 111
 Up 0.582 0.752 0.656 141
avg / total 0.546 0.560 0.538 252
Confusion Matrix:
 Predicted 
 True False
Real True 0.315315 0.684685
 False 0.248227 0.751773

No description has been provided for this image

4.6.4 Quadratic Discriminant Analysis (QDA)¶

In [26]:

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train2, y_train)
print('Prior probabilities of groups: ')
print(pd.DataFrame(data=qda.priors_.reshape((1,2)), columns=['Down', 'Up'], index=['']))
print()
print('Group means: ')
print(pd.DataFrame(data=qda.means_, columns=X_train2.columns, index=['Down', 'Up']))
print()

Prior probabilities of groups: 
 Down Up
 0.491984 0.508016
Group means: 
 Lag1 Lag2
Down 0.042790 0.033894
Up -0.039546 -0.031325

In [27]:

print_classification_statistics(qda, X_test2, y_test, labels=['Down', 'Up'])
plot_ROC(qda, X_test2, y_test, label='QDA Train/Test, only Lag1 and Lag2')
plot_classification(qda, X_test2, y_test)

Classification Report:
 precision recall f1-score support
 Down 0.600 0.270 0.373 111
 Up 0.599 0.858 0.706 141
avg / total 0.599 0.599 0.559 252
Confusion Matrix:
 Predicted 
 True False
Real True 0.270270 0.729730
 False 0.141844 0.858156

No description has been provided for this image

4.6.5 K-Nearest Neighbors¶

In [28]:

n_neighbors = 3
knn = neighbors.KNeighborsClassifier(n_neighbors)
knn.fit(X_train2, y_train)

Out[28]:

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
 metric_params=None, n_jobs=1, n_neighbors=3, p=2,
 weights='uniform')

In [29]:

print_classification_statistics(knn, X_test2, y_test, labels=['Down', 'Up'])
plot_ROC(knn, X_test2, y_test, label='KNN Train/Test, only Lag1 and Lag2')
plot_classification(knn, X_test2, y_test)

Classification Report:
 precision recall f1-score support
 Down 0.466 0.432 0.449 111
 Up 0.577 0.610 0.593 141
avg / total 0.528 0.532 0.529 252
Confusion Matrix:
 Predicted 
 True False
Real True 0.432432 0.567568
 False 0.390071 0.609929

No description has been provided for this image

4.6.6 An Application to Caravan Insurance Data¶

KNN¶

In [30]:

df_caravan = pd.read_csv('Data/Caravan.csv')
df_caravan['Purchase'] = df_caravan['Purchase'].astype('category')
df_caravan.head()

Out[30]:

Unnamed: 0	MOSTYPE	MAANTHUI	MGEMOMV	MGEMLEEF	MOSHOOFD	MGODRK	MGODPR	MGODOV	MGODGE	...	APERSONG	AZEILPL
0	1	33	1	3	2	8	0	5	1	3	...	1	No
1	2	37	1	2	2	8	1	4	1	4	...	1	No
2	3	37	1	2	2	8	0	4	2	4	...	1	No
3	4	9	1	3	3	3	2	3	2	4	...	1	No
4	5	40	1	4	2	10	1	4	1	4	...	1	No

5 rows ×ばつ 87 columns

In [31]:

df_caravan['Purchase'].value_counts()

Out[31]:

No 5474
Yes 348
Name: Purchase, dtype: int64

In [32]:

y = df_caravan.Purchase
X = df_caravan.drop('Purchase', axis=1).astype('float64')
X_scaled = preprocessing.scale(X)

In [33]:

X_test = X_scaled[:1000]
y_test = y[:1000]
X_train = X_scaled[1000:]
y_train = y[1000:]

In [34]:

for i in [1, 3, 5]:
 print(f'Using {i} neighbors')
 knn = neighbors.KNeighborsClassifier(n_neighbors=i)
 knn.fit(X_train, y_train)
 print_classification_statistics(knn, X_test, y_test, labels=['No', 'Yes'])
 #plot_ROC(knn, X_test, y_test, label='KNN')
 #skplt.metrics.plot_confusion_matrix(y_test, knn.predict(X_test), normalize=False)
 plt.show()

Using 1 neighbors
Classification Report:
 precision recall f1-score support
 No 0.948 0.937 0.943 941
 Yes 0.157 0.186 0.171 59
avg / total 0.902 0.893 0.897 1000
Confusion Matrix:
 Predicted 
 True False
Real True 0.937301 0.062699
 False 0.813559 0.186441
Using 3 neighbors
Classification Report:
 precision recall f1-score support
 No 0.946 0.979 0.962 941
 Yes 0.231 0.102 0.141 59
avg / total 0.903 0.927 0.913 1000
Confusion Matrix:
 Predicted 
 True False
Real True 0.978746 0.021254
 False 0.898305 0.101695
Using 5 neighbors
Classification Report:
 precision recall f1-score support
 No 0.944 0.993 0.968 941
 Yes 0.364 0.068 0.114 59
avg / total 0.910 0.938 0.918 1000
Confusion Matrix:
 Predicted 
 True False
Real True 0.992561 0.007439
 False 0.932203 0.067797

Logistic¶

In [35]:

logistic = skl_lm.LogisticRegression(C=1e10)
logistic.fit(X_train, y_train)
print_classification_statistics(logistic, X_test, y_test, labels=['No', 'Yes'])

Classification Report:
 precision recall f1-score support
 No 0.941 0.994 0.966 941
 Yes 0.000 0.000 0.000 59
avg / total 0.885 0.935 0.909 1000
Confusion Matrix:
 Predicted 
 True False
Real True 0.993624 0.006376
 False 1.000000 0.000000

In [36]:

# using 25% changes of buying instead of 50%
pred_p = logistic.predict_proba(X_test)
cm_df = pd.DataFrame({'True': y_test, 'Pred': pred_p[:,1] > .25})
cm_df.Pred.replace(to_replace={True:'Yes', False:'No'}, inplace=True)
print(classification_report(y_test, cm_df.Pred))
print(cm_df.groupby(['True', 'Pred']).size().unstack('True').T)

 precision recall f1-score support
 No 0.95 0.98 0.96 941
 Yes 0.34 0.19 0.24 59
avg / total 0.91 0.93 0.92 1000
Pred No Yes
True 
No 920 21
Yes 48 11

Unnamed: 0	MOSTYPE	MAANTHUI	MGEMOMV	MGEMLEEF	MOSHOOFD	MGODRK	MGODPR	MGODOV	MGODGE	...	APERSONG	AZEILPL
0	1	33	1	3	2	8	0	5	1	3	...	1	No
1	2	37	1	2	2	8	1	4	1	4	...	1	No
2	3	37	1	2	2	8	0	4	2	4	...	1	No
3	4	9	1	3	3	3	2	3	2	4	...	1	No
4	5	40	1	4	2	10	1	4	1	4	...	1	No

Unnamed: 0	MOSTYPE	MAANTHUI	MGEMOMV	MGEMLEEF	MOSHOOFD	MGODRK	MGODPR	MGODOV	MGODGE	...	APERSONG	AZEILPL
0	1	33	1	3	2	8	0	5	1	3	...	1	No
1	2	37	1	2	2	8	1	4	1	4	...	1	No
2	3	37	1	2	2	8	0	4	2	4	...	1	No
3	4	9	1	3	3	3	2	3	2	4	...	1	No
4	5	40	1	4	2	10	1	4	1	4	...	1	No

Unnamed: 0	MOSTYPE	MAANTHUI	MGEMOMV	MGEMLEEF	MOSHOOFD	MGODRK	MGODPR	MGODOV	MGODGE	...	APERSONG	AZEILPL
0	1	33	1	3	2	8	0	5	1	3	...	1	No
1	2	37	1	2	2	8	1	4	1	4	...	1	No
2	3	37	1	2	2	8	0	4	2	4	...	1	No
3	4	9	1	3	3	3	2	3	2	4	...	1	No
4	5	40	1	4	2	10	1	4	1	4	...	1	No