I'm writing a k nearest neighbors implementation to solve multiclass classification.
import heapq
import logging
import numpy as np
from scipy import spatial
logging.basicConfig()
class KNN(object):
    """k-nearest-neighbors multiclass classifier.

    ``similarity_func`` is a numeric key selecting a distance measure:
    1 = Euclidean distance, 2 = cosine distance.
    """

    # NOTE: despite the name, both callables are *distances* (smaller = closer),
    # which is what the nsmallest() ranking in probs() relies on.
    similarities = {
        1: lambda a, b: np.linalg.norm(a - b),
        2: lambda a, b: spatial.distance.cosine(a, b),
    }

    def __init__(self, k, similarity_func, loglevel=logging.DEBUG):
        """Configure the classifier.

        k               -- number of neighbors consulted per prediction
        similarity_func -- key into KNN.similarities (1 or 2)
        loglevel        -- logging level for this instance's logger

        Raises ValueError if similarity_func is not a known key.
        """
        self.k = k
        self.logger = logging.getLogger(type(self).__name__)
        self.logger.setLevel(loglevel)
        if similarity_func not in KNN.similarities:
            raise ValueError("Illegal similarity value {0}. Legal values are {1}".format(similarity_func, sorted(KNN.similarities.keys())))
        self.similarity_func = KNN.similarities[similarity_func]

    def train(self, X, y):
        """Memorize the training data (KNN is a lazy learner). Returns self."""
        self.training_X = X
        self.training_y = y
        # Labels are assumed to be integers 0..num_classes-1, since they are
        # used directly as indices into the vote vector in probs().
        self.num_classes = len(np.unique(y))
        self.logger.debug("There are %s classes", self.num_classes)
        return self

    def probs(self, X):
        """Return a list of per-class probability vectors, one per row of X.

        Each vector is the fraction of the k nearest training neighbors
        voting for each class.
        """
        class_probs = []
        for i, example in enumerate(X, 1):
            votes = np.zeros((self.num_classes,))
            self.logger.debug("Votes: %s", votes)
            if i % 100 == 0:
                self.logger.info("Example %s", i)
            distances = [(self.similarity_func(example, x), y)
                         for x, y in zip(self.training_X, self.training_y)]
            for _, label in heapq.nsmallest(self.k, distances, key=lambda t: t[0]):
                votes[label] += 1
            # BUG FIX: the original called an undefined `normalize`; convert
            # raw vote counts to probabilities by dividing by their total
            # (always k, so the sum is never zero for k >= 1).
            class_probs.append(votes / votes.sum())
        return class_probs

    def predict(self, X):
        """Return the predicted class label for each row of X.

        BUG FIX: np.argmax without axis= flattens the whole probability
        matrix and returns a single scalar index; axis=1 yields one label
        per example, which is what a classifier's predict must return.
        """
        return np.argmax(self.probs(X), axis=1)
I find that this implementation's `predict` is slow, and I think it could be sped up with vectorized NumPy operations, but I'm fairly inexperienced with NumPy vectorization techniques. Does anyone have suggestions for performance improvements I could make to `predict`?
1 Answer 1
I'm going to post one optimization:
Euclidean distances don't need to be computed fully!
Because the distances are only used for ranking, and the square root is a monotonically increasing function, the square root can be skipped without changing the ordering of neighbors. Therefore, the following can be used:
def squared_euclidean(x, y):
    """Return the squared Euclidean distance between vectors x and y.

    Skipping the square root is safe here because the caller only ranks
    distances, and the ranking is unchanged by a monotone transform.
    """
    delta = np.asarray(x) - np.asarray(y)
    return np.inner(delta, delta)