Because neither sklearn nor Pandas provides a straightforward and complete one-hot encoder, I decided to write one myself. Both Pandas and sklearn have a one-hot encoder, but neither offers an option to decode, and sklearn.LabelEncoder,
which does support decoding, only produces integer labels rather than one-hot vectors.
Here's the class:
import numpy as np
class OneHotEncoder:
    """
    Label encoder and one-hot encoder for a single categorical column.

    Converts between three representations of the same data:

    * categories: the original values, shape (samples, 1)
    * labels: integer positions into the fitted categories, shape (samples, 1)
    * one-hot: indicator rows, shape (samples, categories)

    The fitted categories are stored in ``self.unq`` (sorted, as returned by
    ``np.unique``) and their count in ``self.n_features``.

    Samples whose category was not seen during fitting raise ``ValueError``
    in ``transform`` and ``transform_to_labels``. NaN never compares equal to
    itself, so NaN samples are always treated as unseen there.
    All-zero one-hot rows are inverted to the first category, because
    ``np.argmax`` returns 0 for them.
    """

    def __init__(self):
        # Start unfitted: no categories are known yet.
        self.unq = np.array([])
        self.n_features = len(self.unq)

    def set_unq(self, unq):
        """
        Store the known categories and refresh the category count.
        :param unq: array of categories; position in this array is the label
        """
        self.unq = unq
        self.n_features = len(unq)

    @staticmethod
    def _assure(cond, msg):
        """
        Raise ValueError(msg) unless cond is truthy.
        """
        if not cond:
            raise ValueError(msg)

    def _fit(self, np_arr):
        """
        Learn the categories of np_arr and return its labels as a flat array.
        """
        self._assure(np_arr.shape[-1] == 1, 'Last axis must be length 1.')
        unq, idx = np.unique(np_arr, return_inverse=True)
        self.set_unq(unq)
        # The shape of the inverse depends on the NumPy version (flat, or the
        # shape of the input); flatten so that indexing never broadcasts.
        return idx.reshape(-1)

    def _lookup(self, np_arr):
        """
        Return the flat label of every sample, based on the fitted categories.
        :raises ValueError: if a sample is not among the fitted categories
        """
        self._assure(np_arr.shape[-1] == 1, 'Last axis must be length 1.')
        column = np.asarray(np_arr).reshape(-1, 1)
        if column.size == 0:
            return np.zeros(0, dtype=int)
        # (samples, 1) == (categories,) broadcasts to (samples, categories).
        matches = np.asarray(column == np.asarray(self.unq))
        # Every sample needs a match; otherwise rows would silently go
        # missing and the output would no longer line up with the input.
        known = matches.ndim == 2 and bool(matches.any(axis=1).all())
        self._assure(known, 'Samples contain categories not seen during fitting.')
        return matches.argmax(axis=1)

    def _one_hot(self, labels):
        """
        Build the (samples, categories) indicator matrix for flat labels.
        """
        encoded = np.zeros((len(labels), len(self.unq)))
        encoded[np.arange(len(labels)), labels] = 1
        return encoded

    def fit_transform(self, np_arr):
        """
        From categories to one-hot encoding. Calculate unique occurrences.
        :param np_arr: categorical data of shape (samples, 1)
        :return: one-hot encoding with shape (sample, categories)
        """
        return self._one_hot(self._fit(np_arr))

    def transform(self, np_arr):
        """
        From categories to one-hot encoding based on previous samples.
        :param np_arr: categorical data of shape (samples, 1)
        :return: one-hot encoding with shape (sample, categories)
        :raises ValueError: if a sample is not among the fitted categories
        """
        return self._one_hot(self._lookup(np_arr))

    def fit_transform_to_labels(self, np_arr):
        """
        From categories to label values. Calculate unique occurrences.
        :param np_arr: categorical data of shape (samples, 1)
        :return: label values with shape (sample, 1)
        """
        return self._fit(np_arr).reshape(-1, 1)

    def transform_to_labels(self, np_arr):
        """
        From categories to label values based on previous samples.
        :param np_arr: categorical data of shape (samples, 1)
        :return: label values with shape (sample, 1)
        :raises ValueError: if a sample is not among the fitted categories
        """
        return self._lookup(np_arr).reshape(-1, 1)

    def transform_from_labels(self, np_arr):
        """
        From label values to one-hot encoding.
        :param np_arr: label values of shape (samples, 1)
        :return: one-hot encoding with shape (samples, categories)
        """
        self._assure(np_arr.shape[-1] == 1, 'Last axis must be length 1.')
        # Flatten the column first: indexing with the (samples, 1) array
        # directly would broadcast and fill whole rows with ones.
        return self._one_hot(np_arr.reshape(-1))

    def inverse_from_labels(self, np_arr):
        """
        From label values to original categorical values.
        :param np_arr: label values of shape (samples, 1)
        :return: original categorical values with shape (samples, 1)
        """
        self._assure(np_arr.shape[-1] == 1, 'Last axis must be length 1.')
        return self.unq[np_arr]

    def inverse_to_labels(self, np_arr):
        """
        From one-hot encoding to label values.
        :param np_arr: one-hot encoding of shape (samples, categories)
        :return: label values with shape (samples, 1)
        """
        self._assure(np_arr.shape[-1] == len(self.unq), 'Inverting array must be same length as available labels.')
        return np.argmax(np_arr, axis=-1).reshape(-1, 1)

    # Original (misspelled) name, kept so that existing callers keep working.
    inverse_to_lables = inverse_to_labels

    def inverse(self, np_arr):
        """
        From one-hot encoding to original categorical values.
        :param np_arr: one-hot encoding of shape (samples, categories)
        :return: original categorical values with shape (samples, 1)
        """
        return self.inverse_from_labels(self.inverse_to_labels(np_arr))
So in short, this class combines the functionality of sklearn.LabelEncoder
and sklearn.OneHotEncoder
. The assertions are a bit redundant, I just like to keep my vectors as column vectors.
This class does work.
- Is it missing something in terms of functionality or safety?
- Could it be expanded to some different cases I haven't yet taken into account?
Here's a small snippet of using the class:
# Seven samples arranged as a column vector of shape (7, 1).
a = np.array([1,2,3,4,3,2,1]).reshape(-1, 1)
oh = OneHotEncoder()
# Learn the categories from the samples and get their integer labels.
labs = oh.fit_transform_to_labels(a)
# Turn the integer labels into one-hot rows.
encoded = oh.transform_from_labels(labs)
# Map the one-hot rows back to the original categorical values.
decoded = oh.inverse(encoded)
-
Revised version below as an answer for anyone who's interested! — Felix, Sep 2, 2018 at 18:35
2 Answers 2
variable naming
np_arr
is a bad name for a variable, especially if you use it in multiple places, each with a different meaning. Name each variable after what it holds; for example, in fit_transform
, samples
is a better name.
self.unq
is also unclear, I would replace it with self._categories
_assure
why not just use the built-in assert
. If you really want to raise a ValueError
instead of an AssertionError
, make it a top-level function instead of putting it on the class.
@property
instead of
class OneHotEncoder:
    """Excerpt of the original encoder: state kept in sync by hand via set_unq."""

    def __init__(self):
        # Unfitted state: an empty category array and a matching count of 0.
        self.unq = np.array([])
        self.n_features = len(self.unq)

    def set_unq(self, unq):
        """Store the categories and update n_features to their count."""
        self.unq = unq
        self.n_features = len(unq)
you can use the @property
decorator
class OneHotEncoder:
    """Encoder skeleton whose category count is derived from the categories."""

    def __init__(self):
        # None marks the encoder as not fitted yet.
        self._categories = None

    @property
    def categories(self):
        """The fitted categories; raises AssertionError before fitting."""
        # Raised explicitly rather than via an assert statement, so the check
        # is not stripped when Python runs with -O.
        if self._categories is None:
            raise AssertionError('Fit the encoder first')
        return self._categories

    @categories.setter
    def categories(self, categories):
        self._categories = categories

    @property
    def n_features(self):
        """Number of fitted categories, always computed from categories."""
        return len(self.categories)
and then where you would use self.set_unq(unq)
, you can do self.categories = unq
shape
Is there any particular reason why the last axis must be length 1? I find shape (n_samples,)
more intuitive than (n_samples, 1)
tests
Did you test this code?
in your example code, encoded
is
array([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])
so something went wrong there (probably to do with the extra dimension).
dtype
For the encoded array, I would use dtype=bool
, since it's a series of flags.
examples
including examples in the docstring might make it clearer what the goal of the method is
DRY
The code to fit the np_arr
is used in 2 methods, you can refactor that out
In inverse
, you have the code np.argmax(np_arr, axis=-1).reshape(-1, 1)
, which is exactly what inverse_to_lables
does, so use that method instead of repeating the code
class
Why do you need a class for this? All you use it for is passing around the categories of the encoding. Passing them around as arguments seems a lot easier; these 4 functions do about the same as your class:
def encode_index(index, n_categories: int=None):
    """
    One-hot encode a sequence of integer labels as a boolean matrix.

    :param index: 1-dimensional sequence of non-negative integer labels
    :param n_categories: number of columns; defaults to the largest label
        plus one, so that every label has a column
    :return: boolean array of shape (len(index), n_categories)
    :raises ValueError: if a label is negative
    :raises IndexError: if a label is not smaller than n_categories
    """
    index = np.asarray(index).reshape(-1)
    if index.size == 0:
        # An empty list converts to a float array, which cannot be an index.
        index = index.astype(int)
    elif index.min() < 0:
        # Negative labels would silently wrap around to the last columns.
        raise ValueError('Labels must be non-negative.')
    if n_categories is None:
        # The number of distinct labels is too small when labels have gaps
        # (e.g. [0, 2]); the largest label decides how many columns are needed.
        n_categories = int(index.max()) + 1 if index.size else 0
    shape = (len(index), n_categories)
    encoding = np.zeros(shape, dtype=bool)
    encoding[np.arange(len(index)), index] = True
    return encoding
takes a list of integers, and turns it into the encoded form
encode_index([0,1,2,1])
array([[ True, False, False], [False, True, False], [False, False, True], [False, True, False]])
and its inverse:
def decode_index(encoding):
    """
    Turn a one-hot encoding back into integer labels.

    :param encoding: array-like whose last axis holds the one-hot flags
    :return: array of labels, the position of the maximum along the last axis
        (a row without any flag set therefore yields 0)
    """
    return np.argmax(encoding, axis=-1)
the higher-level encoding can be done like this:
def encode(samples, categories=None):
    """
    One-hot encode categorical samples.

    :param samples: array-like of categorical values (flattened before use)
    :param categories: optional known categories; the position of a value in
        this sequence is its column. Defaults to the sorted unique samples.
    :return: tuple (encoding, categories), where encoding is the boolean
        one-hot matrix and categories are the categories its columns refer to
    :raises ValueError: if categories are given and a sample is not among them
    """
    samples = np.asarray(samples).reshape(-1)
    if categories is None:
        categories, index = np.unique(samples, return_inverse=True)
    elif samples.size == 0:
        index = np.zeros(0, dtype=int)
    else:
        # Labels must be positions in the *given* categories. Positions among
        # the unique samples point at the wrong columns whenever the samples
        # do not contain every category.
        matches = np.asarray(samples.reshape(-1, 1) == np.asarray(categories))
        if not (matches.ndim == 2 and matches.any(axis=1).all()):
            raise ValueError('Samples contain values missing from categories.')
        index = matches.argmax(axis=1)
    encoding = encode_index(index, len(categories))
    return encoding, categories
encode(list('abdefea'))
(array([[ True, False, False, False, False], [False, True, False, False, False], [False, False, True, False, False], [False, False, False, True, False], [False, False, False, False, True], [False, False, False, True, False], [ True, False, False, False, False]]), array(['a', 'b', 'd', 'e', 'f'], dtype='<U1'))
and the inverse:
def decode(encoding, categories):
    """
    Turn a one-hot encoding back into the original categorical values.

    :param encoding: one-hot encoding, flags on the last axis
    :param categories: categories the columns of the encoding refer to
    :return: array with the category of every encoded sample
    """
    index = decode_index(encoding)
    # asarray lets plain lists be used as categories; they cannot be indexed
    # with an array of labels directly.
    return np.asarray(categories)[index]
-
Thank you for the comprehensive answer! I learned a lot, especially about the decorator! I can't believe I managed to not include the fix in the post. I did test, and fixed the very bug you pointed out, but left the fix in my local code only... But to respond: I've heard that assertions are bad, at least in running code. As I said, the shape assertion is just to my own liking, so that the array returned has the same number of dimensions, e.g.
(samples, 1) -> (samples, features)
but I understand the point and perhaps it would be good to make it more general. — Felix, Aug 6, 2018 at 16:21
Or even accept multiple categories in the last axis! Hey that's a neat idea :D Using a class is solely for keeping the
features
state and more clearly bundling the functionality together, but functions and a module might work too. Cheers! Please do further disagree if you feel that's warranted. — Felix, Aug 6, 2018 at 16:24
Reply to self: an improved version
Here's an improved version for anyone who's interested. I managed to simplify it in lots of ways. Many suggestions from the answer were implemented, but I decided to keep the class form. Most notably I decided to ditch the assertion that the last axis should have length 1, allowing for a more general encoding scheme (still only the last axis encoded though).
class OneHotEncoder:
    """
    Simple one-hot encoder.

    Converts between categorical samples, integer labels (positions in the
    sorted fitted categories) and one-hot encodings, for arrays of any shape.

    Does not handle unseen categories: will default to the first category.
    Does not invert all-zero arrays: will default to the first category.
    Does not handle nan data.
    """

    def __init__(self):
        # None marks the encoder as not fitted yet.
        self._categories = None

    @property
    def categories(self) -> np.ndarray:
        """The fitted categories; raises ValueError before fitting."""
        if self._categories is None:
            raise ValueError('Encoder not fitted!')
        return self._categories

    @categories.setter
    def categories(self, categories) -> None:
        self._categories = categories

    @property
    def n_features(self) -> int:
        """Number of fitted categories, i.e. the width of an encoding."""
        return len(self.categories)

    def fit(self, categories: np.ndarray) -> None:
        """Store the sorted unique values of the given array as categories."""
        self.categories = np.unique(categories)

    def transform(self, samples: np.ndarray) -> np.ndarray:
        """From categorical samples to one-hot encoding, shape samples.shape + (n_features,)."""
        return self.transform_from_labels(self.transform_to_labels(samples))

    def transform_to_labels(self, samples: np.ndarray) -> np.ndarray:
        """From categorical samples to integer labels of the same shape."""
        # (categories,) == (samples, 1) broadcasts to (samples, categories);
        # each argwhere row is a (sample position, category position) pair.
        arr = np.argwhere(self.categories == samples.reshape(-1, 1))
        # Samples without a match keep the initial 0, i.e. the first category.
        labels = np.zeros((samples.size,), dtype=int)
        labels[arr[:, 0]] = arr[:, 1]
        return labels.reshape(samples.shape)

    def transform_from_labels(self, labels: np.ndarray) -> np.ndarray:
        """From integer labels to one-hot encoding, shape labels.shape + (n_features,)."""
        # Row i of the identity matrix is the one-hot vector of label i.
        return np.eye(self.n_features)[labels]

    def inverse_from_labels(self, labels: np.ndarray) -> np.ndarray:
        """From integer labels to the original categorical values."""
        return self.categories[labels]

    @staticmethod
    def inverse_to_labels(encoded: np.ndarray) -> np.ndarray:
        """From one-hot encoding to integer labels (argmax over the last axis)."""
        return np.argmax(encoded, axis=-1)

    def inverse(self, encoded: np.ndarray) -> np.ndarray:
        """From one-hot encoding to the original categorical values."""
        return self.inverse_from_labels(self.inverse_to_labels(encoded))
Next I made a subclass for some more demanding tasks.
class NanHotEncoder(OneHotEncoder):
    """
    Extension to the simple OneHotEncoder.

    Does handle NaN data, ignores unseen categories (all zero) and inverts all zero rows.
    Only accepts and returns 1-dimensional data (pd.Series) as samples (categories).

    Missing values are represented as NaN labels and as all-zero one-hot rows.
    """

    @staticmethod
    def _mask_assign(shape: tuple, mask: np.ndarray, values: np.ndarray, init: float=np.nan) -> np.ndarray:
        """Return an array of the given shape filled with init, with values written where mask is True."""
        array = np.full(shape, init)
        array[mask] = values
        return array

    def transform_from_labels(self, labels: np.ndarray) -> np.ndarray:
        """From labels (NaN = missing) to one-hot encoding; missing labels become all-zero rows."""
        nans = np.isnan(labels)
        encoded = super().transform_from_labels(labels[~nans].astype(int))
        return self._mask_assign(labels.shape + (self.n_features,), ~nans, encoded, init=0)

    def inverse_to_labels(self, encoded: np.ndarray) -> np.ndarray:
        """From one-hot encoding to labels; all-zero rows become NaN."""
        nans = np.sum(encoded, axis=-1) == 0
        # argmax runs on the values as given: truncating them to integers
        # first would turn every fractional (soft) encoding into label 0.
        inverted = super().inverse_to_labels(encoded[~nans])
        return self._mask_assign(encoded.shape[:-1], ~nans, inverted)

    # Original (misspelled) name, kept so that existing callers keep working.
    # The override must carry the correctly spelled name: inverse() calls
    # self.inverse_to_labels, which otherwise resolves to the base class and
    # decodes all-zero rows as the first category.
    inverse_to_lables = inverse_to_labels

    def transform_to_labels(self, samples: pd.Series) -> np.ndarray:
        """From categorical samples to labels; missing and unseen values become NaN."""
        mask = samples.isnull() | ~samples.isin(self.categories)
        labels = super().transform_to_labels(samples[~mask].values)
        return self._mask_assign(samples.values.shape, ~mask.values, labels)

    def inverse_from_labels(self, labels: np.ndarray) -> pd.Series:
        """From labels (NaN = missing) to a flat Series of categories; NaN stays NaN."""
        flat = np.asarray(labels, dtype=float).ravel()
        known = ~np.isnan(flat)
        # Collect the result in an object array: writing categories into a
        # float Series relies on implicit upcasting, which pandas deprecates.
        values = np.full(flat.shape, np.nan, dtype=object)
        values[known] = super().inverse_from_labels(flat[known].astype(int))
        return pd.Series(values)

    def transform(self, samples: pd.Series) -> np.ndarray:
        """From categorical samples to one-hot encoding; missing and unseen values become all-zero rows."""
        return self.transform_from_labels(self.transform_to_labels(samples))

    def inverse(self, encoded: np.ndarray) -> pd.Series:
        """From one-hot encoding to a Series of categories; all-zero rows become NaN."""
        return self.inverse_from_labels(self.inverse_to_labels(encoded))
Here's also a basic test suite to validate the functionality.
import unittest
from encoders import OneHotEncoder, NanHotEncoder
import numpy as np
import pandas as pd
def array_equal(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """
    Element-wise equality that treats NaN as equal to NaN.

    :param a: first array-like
    :param b: second array-like, broadcastable against a
    :return: boolean array, True where the elements are equal or both NaN
    """
    # asarray makes plain lists compare element-wise as well.
    a, b = np.asarray(a), np.asarray(b)
    # NaN is the only value that differs from itself, so (x != x) marks NaNs.
    return (a == b) | ((a != a) & (b != b))
class TestOneHotEncoder(unittest.TestCase):
    """Tests for OneHotEncoder, fitted on four string categories."""

    str_categories = np.array(['a', 'b', 'c', 'd'])

    def setUp(self):
        self.oh = OneHotEncoder()
        self.oh.fit(self.str_categories)

    # np.testing.assert_array_equal also compares shapes and reports the
    # mismatching elements; assertTrue(np.all(a == b)) passes for any pair of
    # arrays that merely broadcast to equal values.

    def test_fit(self):
        np.testing.assert_array_equal(self.oh.categories, self.str_categories)

    def test_transform_to_labels(self):
        samples = np.array([[['a', 'c'], ['b', 'c']], [['d', 'd'], ['a', 'd']]])
        result = np.array([[[0, 2], [1, 2]], [[3, 3], [0, 3]]])
        np.testing.assert_array_equal(self.oh.transform_to_labels(samples), result)

    def test_transform_from_labels(self):
        labels = np.array([[0, 2], [1, 3]])
        result = np.array([[[1, 0, 0, 0], [0, 0, 1, 0]], [[0, 1, 0, 0], [0, 0, 0, 1]]])
        np.testing.assert_array_equal(self.oh.transform_from_labels(labels), result)

    def test_inverse_from_labels(self):
        labels = np.array([[[0, 2], [1, 2]], [[3, 3], [0, 3]]])
        result = np.array([[['a', 'c'], ['b', 'c']], [['d', 'd'], ['a', 'd']]])
        np.testing.assert_array_equal(self.oh.inverse_from_labels(labels), result)

    def test_inverse_to_labels(self):
        encoded = np.array([[[1, 0, 0, 0], [0, 0, 1, 0]], [[0, 1, 0, 0], [0, 0, 0, 1]]])
        result = np.array([[0, 2], [1, 3]])
        np.testing.assert_array_equal(self.oh.inverse_to_labels(encoded), result)

    def test_round_trip(self):
        # Encoding and decoding again must give back the original samples.
        samples = np.array([['a', 'c'], ['d', 'b']])
        np.testing.assert_array_equal(self.oh.inverse(self.oh.transform(samples)), samples)
class TestNanHotEncoder(unittest.TestCase):
    """Tests for NanHotEncoder: NaN samples, NaN labels and unseen categories."""

    categories = np.array(['a', 'b', 'c', 'd'])

    def setUp(self):
        self.nh = NanHotEncoder()
        self.nh.fit(self.categories)

    # np.testing.assert_array_equal treats NaNs in matching positions as
    # equal and also compares shapes, so no NaN-aware helper is needed here.

    def test_transform_to_labels(self):
        samples = pd.Series(['a', 'c', np.nan, 'c', 'd', np.nan, 'a', 'd'])
        result = np.array([0, 2, np.nan, 2, 3, np.nan, 0, 3])
        np.testing.assert_array_equal(self.nh.transform_to_labels(samples), result)

    def test_transform_from_labels(self):
        labels = np.array([[0, np.nan], [np.nan, 3]])
        result = np.array([[[1, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 1]]])
        np.testing.assert_array_equal(self.nh.transform_from_labels(labels), result)

    def test_inverse_from_labels(self):
        labels = np.array([0, 2, np.nan, 2, 3, np.nan, 0, 3])
        result = pd.Series(['a', 'c', np.nan, 'c', 'd', np.nan, 'a', 'd'])
        self.assertTrue(self.nh.inverse_from_labels(labels).equals(result))

    def test_inverse_to_labels(self):
        encoded = np.array([[[1, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 1]]])
        result = np.array([[0, np.nan], [np.nan, 3]])
        # "lables" is the spelling of the method name on NanHotEncoder.
        np.testing.assert_array_equal(self.nh.inverse_to_lables(encoded), result)

    def test_novel_classes(self):
        # 'f' was never fitted and NaN is missing: both encode as all-zero rows.
        samples = pd.Series(['a', 'f', np.nan, 'd'])
        result = np.array([[1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 1]])
        np.testing.assert_array_equal(self.nh.transform(samples), result)
if __name__ == '__main__':
    # unittest.main() collects every test_* method of the TestCase classes in
    # this module, runs them and reports the outcome. Instantiating a TestCase
    # without a method name selects the non-existent 'runTest', so a suite
    # built from such instances never executes the actual tests.
    unittest.main()
This was a great learning experience. It's still not finished though. It'd be nice to be able to handle the general n-dimensional or at least the common 2D case with the NanHotEncoder as well. But time will tell, perhaps I'll edit this answer.
Cheers!