I am a machine learning noob attempting to implement regularized logistic regression via Newton's method.
The example data have two features, which are expanded to 28 by taking all monomial terms of (u, v) up to degree 6.
My code converges to the correct solution of norm(theta) = 0.9384, but it takes around 500 iterations to get there, when it should only take around 15 for lambda = 10 (though the exercise is based on Matlab rather than Python). Each cycle of the parameter update is also very slow, and I am not sure exactly why. If anyone could explain why my code takes so many iterations to converge, and why each iteration is painfully slow, I would be very grateful! Please be brutally honest about my code habits as well.
The data are taken from Andrew Ng's open course, exercise 5. The problem information and data can be found there, although I have also posted the data and my code below.
X values, 2 features
0.051267,0.69956
-0.092742,0.68494
-0.21371,0.69225
-0.375,0.50219
-0.51325,0.46564
-0.52477,0.2098
-0.39804,0.034357
-0.30588,-0.19225
0.016705,-0.40424
0.13191,-0.51389
0.38537,-0.56506
0.52938,-0.5212
0.63882,-0.24342
0.73675,-0.18494
0.54666,0.48757
0.322,0.5826
0.16647,0.53874
-0.046659,0.81652
-0.17339,0.69956
-0.47869,0.63377
-0.60541,0.59722
-0.62846,0.33406
-0.59389,0.005117
-0.42108,-0.27266
-0.11578,-0.39693
0.20104,-0.60161
0.46601,-0.53582
0.67339,-0.53582
-0.13882,0.54605
-0.29435,0.77997
-0.26555,0.96272
-0.16187,0.8019
-0.17339,0.64839
-0.28283,0.47295
-0.36348,0.31213
-0.30012,0.027047
-0.23675,-0.21418
-0.06394,-0.18494
0.062788,-0.16301
0.22984,-0.41155
0.2932,-0.2288
0.48329,-0.18494
0.64459,-0.14108
0.46025,0.012427
0.6273,0.15863
0.57546,0.26827
0.72523,0.44371
0.22408,0.52412
0.44297,0.67032
0.322,0.69225
0.13767,0.57529
-0.0063364,0.39985
-0.092742,0.55336
-0.20795,0.35599
-0.20795,0.17325
-0.43836,0.21711
-0.21947,-0.016813
-0.13882,-0.27266
0.18376,0.93348
0.22408,0.77997
0.29896,0.61915
0.50634,0.75804
0.61578,0.7288
0.60426,0.59722
0.76555,0.50219
0.92684,0.3633
0.82316,0.27558
0.96141,0.085526
0.93836,0.012427
0.86348,-0.082602
0.89804,-0.20687
0.85196,-0.36769
0.82892,-0.5212
0.79435,-0.55775
0.59274,-0.7405
0.51786,-0.5943
0.46601,-0.41886
0.35081,-0.57968
0.28744,-0.76974
0.085829,-0.75512
0.14919,-0.57968
-0.13306,-0.4481
-0.40956,-0.41155
-0.39228,-0.25804
-0.74366,-0.25804
-0.69758,0.041667
-0.75518,0.2902
-0.69758,0.68494
-0.4038,0.70687
-0.38076,0.91886
-0.50749,0.90424
-0.54781,0.70687
0.10311,0.77997
0.057028,0.91886
-0.10426,0.99196
-0.081221,1.1089
0.28744,1.087
0.39689,0.82383
0.63882,0.88962
0.82316,0.66301
0.67339,0.64108
1.0709,0.10015
-0.046659,-0.57968
-0.23675,-0.63816
-0.15035,-0.36769
-0.49021,-0.3019
-0.46717,-0.13377
-0.28859,-0.060673
-0.61118,-0.067982
-0.66302,-0.21418
-0.59965,-0.41886
-0.72638,-0.082602
-0.83007,0.31213
-0.72062,0.53874
-0.59389,0.49488
-0.48445,0.99927
-0.0063364,0.99927
Y values
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
My code
import pandas as pd
import numpy as np
import math

def sigmoid(theta, x):
    return 1/(1 + math.exp(-1*theta.T.dot(x)))

def cost_function(X, y, theta):
    s = 0
    for i in range(m):
        loss = -y[i]*np.log(sigmoid(theta, X[i])) - (1-y[i])*np.log(1-sigmoid(theta, X[i]))
        s += loss
    s /= m
    s += (lamb/(2*m))*sum(theta[j]**2 for j in range(1, 28))
    return s

def gradient(theta, X, y):
    add_column = theta * (lamb/m)
    add_column[0] = 0
    a = sum((sigmoid(theta, X[i]) - y[i])*X[i] + add_column for i in range(m))/m
    return a

def hessian(theta, X, reg_matrix):
    matrix = []
    for i in range(28):
        row = []
        for j in range(28):
            cell = sum(sigmoid(theta, X[k])*(1-sigmoid(theta, X[k]))*X[k][i]*X[k][j] for k in range(m))
            row.append(cell)
        matrix.append(row)
    H = np.array(matrix)
    H = np.add(H, reg_matrix)
    return H

def newtons_method(theta, iterations):
    for i in range(iterations):
        g = gradient(theta, X, y)
        H = hessian(theta, X, reg_matrix)
        theta = theta - np.linalg.inv(H).dot(g)
        cost = cost_function(X, y, theta)
        print(cost)
    return theta

def map_feature(u, v):  # expand features according to problem instructions
    new_row = []
    new_row.append(1)
    new_row.append(u)
    new_row.append(v)
    new_row.append(u**2)
    new_row.append(u*v)
    new_row.append(v**2)
    new_row.append(u**3)
    new_row.append(u**2*v)
    new_row.append(u*v**2)
    new_row.append(v**3)
    new_row.append(u**4)
    new_row.append(u**3*v)
    new_row.append(u*v**3)
    new_row.append(v**4)
    new_row.append(u**2*v**2)
    new_row.append(u**5)
    new_row.append(u**4*v)
    new_row.append(u*v**4)
    new_row.append(v**5)
    new_row.append(u**2*v**3)
    new_row.append(u**3*v**2)
    new_row.append(u**6)
    new_row.append(u**5*v)
    new_row.append(u*v**5)
    new_row.append(v**6)
    new_row.append(u**4*v**2)
    new_row.append(u**2*v**4)
    new_row.append(u**3*v**3)
    return np.array(new_row)

with open('ex5Logx.dat', 'r') as f:
    array = []
    for line in f.readlines():
        array.append(line.strip().split(','))
for a in array:
    a[0], a[1] = float(a[0]), float(a[1].strip())
xdata = np.array(array)

with open('ex5Logy.dat', 'r') as f:
    array = []
    for line in f.readlines():
        array.append(line.strip())
for i in range(len(array)):
    array[i] = float(array[i])
ydata = np.array(array)

X_df = pd.DataFrame(xdata, columns=['score1', 'score2'])
y_df = pd.DataFrame(ydata, columns=['acceptence'])

m = len(y_df)
iterations = 1500

ones = np.ones((m, 1))  # intercept term in first column
X = np.array(X_df)
X = np.append(ones, X, axis=1)
y = np.array(y_df).flatten()

new_X = []  # prepare new array for expanded features
for i in range(m):
    new_row = map_feature(X[i][1], X[i][2])
    new_X.append(new_row)
X = np.array(new_X)

theta = np.array([0 for i in range(28)])  # initialize parameters to 0
lamb = 1  # lambda constant for regularization

reg_matrix = np.zeros((28, 28), dtype=int)  # n+1*n+1 regularization matrix
np.fill_diagonal(reg_matrix, 1)
reg_matrix[0] = 0
reg_matrix = (lamb/m)*reg_matrix

theta = newtons_method(theta, iterations)
print(np.linalg.norm(theta))
2 Answers
This is a small improvement to Maarten's answer. Instead of hard-coding the indices, you could use
import itertools
indices = list(itertools.product(range(7), repeat=2))
This is clearly shorter, and also cleaner.
Better yet,
def map_feature(u, v): # expand features according to problem instructions
return np.array([u ** i * v ** j for i, j in itertools.product(range(7), repeat=2)])
This will be even faster, as it removes the need to create the indices list.
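As the comments below point out, though, product(range(7), repeat=2) yields 49 pairs, including monomials of total degree above 6. If the order of the 28 terms does not matter, a minimal filtered sketch could look like this:

import itertools
import numpy as np

def map_feature(u, v):
    # keep only the monomials u**i * v**j with total degree i + j <= 6 (28 terms)
    return np.array([u ** i * v ** j
                     for i, j in itertools.product(range(7), repeat=2)
                     if i + j <= 6])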
- [(0, 0), (0, 1), (0, 2), (0, 3), (1, 0), (1, 1), (1, 2), (1, 3), (2, 0), (2, 1), (2, 2), (2, 3), (3, 0), (3, 1), (3, 2), (3, 3)] is not the same list as indices. I've looked into creating that list in that order with itertools, but it becomes convoluted quickly. – Maarten Fabré, Apr 3, 2018 at 7:37
- Oh, the order matters. Never mind. – Oscar Smith, Apr 3, 2018 at 14:26
- I don't know whether the order matters. OP is unclear about that, but even so, not all combinations are generated. – Maarten Fabré, Apr 3, 2018 at 14:56
- Oops, 4 should be 7, and then it's just the order. – Oscar Smith, Apr 3, 2018 at 15:43
- Are you even paying attention? There is no coefficient (6, 6). If you want to do it without regard to the order, you'd need something like itertools.chain.from_iterable(set((j, i-j) for j in range(i + 1)) for i in range(7)) – Maarten Fabré, Apr 4, 2018 at 8:04
A few tips for clarity, before you can even begin to analyze what is slow.
global state
You rely on global state in your functions instead of passing it as arguments. For example, in your cost_function, m and lamb come from the global scope.
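A minimal sketch of an explicit signature, keeping the original computation (names follow the original code):

def cost_function(X, y, theta, lamb):
    m = len(y)  # derive m from the data instead of reading a global
    s = sum(-y[i]*np.log(sigmoid(theta, X[i]))
            - (1-y[i])*np.log(1-sigmoid(theta, X[i])) for i in range(m))
    s /= m
    s += (lamb/(2*m))*sum(theta[j]**2 for j in range(1, len(theta)))
    return s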
Use pandas read_csv
Depending on exactly what the data files look like, this:
with open('ex5Logx.dat', 'r') as f:
    array = []
    for line in f.readlines():
        array.append(line.strip().split(','))
for a in array:
    a[0], a[1] = float(a[0]), float(a[1].strip())
xdata = np.array(array)

with open('ex5Logy.dat', 'r') as f:
    array = []
    for line in f.readlines():
        array.append(line.strip())
for i in range(len(array)):
    array[i] = float(array[i])
ydata = np.array(array)

X_df = pd.DataFrame(xdata, columns=['score1', 'score2'])
y_df = pd.DataFrame(ydata, columns=['acceptence'])
can be replaced by something like:
X_df = pd.read_csv(x_filename, sep=',', header=None, index_col=None).rename(columns={0: 'score1', 1: 'score2'})
y_df = pd.read_csv(y_filename, sep=',', header=None, index_col=None).rename(columns={0: 'acceptance'})
map_feature
Instead of appending values one by one to a list (which is slow), why not build the row in one go, either as a literal or via a list of indices?
def map_feature(u, v):  # expand features according to problem instructions
    indices = [
        (0, 0),
        (1, 0), (0, 1),
        (2, 0), (1, 1), (0, 2),
        (3, 0), (2, 1), (1, 2), (0, 3),
        (4, 0), (3, 1), (1, 3), (0, 4), (2, 2),
        (5, 0), (4, 1), (1, 4), (0, 5), (3, 2), (2, 3),
        (6, 0), (5, 1), (1, 5), (0, 6), (4, 2), (2, 4), (3, 3),
    ]
    return np.array([u ** i * v ** j for i, j in indices])
If needed, you could generate the indices on the fly with something like this:
def indices(n):
    yield from sorted(set(itertools.chain.from_iterable(((n - i, i), (i, n - i)) for i in range(min(n + 1, 2)))), reverse=True)
    yield from itertools.chain.from_iterable(sorted({(n - i, i), (i, n - i)}, reverse=True) for i in range(2, n//2 + 1))

idx = itertools.chain.from_iterable(indices(i) for i in range(7))
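As a quick sanity check, the generated pairs should be exactly the 28 monomial exponent pairs of total degree at most 6:

pairs = list(itertools.chain.from_iterable(indices(i) for i in range(7)))
assert len(pairs) == 28                   # same count as the literal list
assert all(i + j <= 6 for i, j in pairs)  # no monomial exceeds degree 6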
conversions
Why all the conversions between pd.DataFrame and np.array, like here?
y = np.array(y_df).flatten()
The insertion of the column of ones can be handled in place like this (DataFrame.insert modifies the frame and returns None):
X_df.insert(0, 'ones', 1)
vectorize
The map_feature is applied to every row of X after adding a column of ones. Why not do it in one go?
new_X = pd.concat((X['score1'] ** i * X['score2'] ** j for i, j in indices), axis=1)
Your cost_function can be vectorized too, especially this part:
for i in range(m):
    loss = -y[i]*np.log(sigmoid(theta, X[i])) - (1-y[i])*np.log(1-sigmoid(theta, X[i]))
    s += loss
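A minimal vectorized sketch, assuming X is the (m, 28) expanded feature matrix and y a length-m array, as in the original code:

def cost_function(X, y, theta, lamb):
    m = len(y)
    h = 1/(1 + np.exp(-X.dot(theta)))  # sigmoid of every sample at once
    loss = -(y*np.log(h) + (1 - y)*np.log(1 - h)).mean()
    reg = (lamb/(2*m))*theta[1:].dot(theta[1:])  # intercept is not regularized
    return loss + reg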
The Hessian can use some vectorization too.
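For instance, a sketch that keeps the original's scaling (the data term is not divided by m, matching the posted loop):

def hessian(theta, X, reg_matrix):
    h = 1/(1 + np.exp(-X.dot(theta)))
    w = h*(1 - h)                         # per-sample weights
    return (X.T * w).dot(X) + reg_matrix  # X^T diag(w) X, plus regularization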
Magical value
Where does the 28 come from? Is it len(indices)?
use the library's functions
reg_matrix = np.zeros((28,28),dtype=int) # n+1*n+1 regularization matrix
np.fill_diagonal(reg_matrix, 1)
reg_matrix[0] = 0
reg_matrix = (lamb/m)*reg_matrix
creates a diagonal matrix with lamb/m on the diagonal, except for the first item. NumPy has np.diagflat for exactly this:
reg_matrix = np.diagflat([0] + [lamb/m] * 27)
There are still a lot of open points, but without further clarification of what the code does and what the data look like, those will be hard to address.
- The 28 comes from the expansion of the two given features per training example to 28 per x. It was part of the problem instructions, presumably to test the effectiveness of restricting the parameters. The data were not said to represent any real quantities, but the objective is to classify an input as either 1 or 0. This answer is very helpful, thank you. I am still unable to upvote but will when possible. – user162065, Mar 4, 2018 at 0:24
- "Brutally honest"? Frank, at least: you present uncommented code. Please heed "What to (not) do when someone answers a question".