How to use sklearn imputation methods on numpy.void (record or structured array, I'm not sure) ndarray

Question 1

Code:

# Minimal reproduction: load the CSV as a structured array and pass it directly to SimpleImputer.
import numpy as np
import sklearn as skl
data = np.genfromtxt("water_potability.csv", delimiter = ",", names = True)  # names=True: field names come from the header row, giving a structured dtype
print(data)
print(data.shape)  # prints (3276,): a single axis of records, not (rows, columns)
print(type(data[0]))  # prints <class 'numpy.void'>: each element is one whole record
imputer = skl.impute.SimpleImputer()
imputer.fit_transform(data)  # raises TypeError: the structured dtype cannot be cast to float64

(part of) water_potability.csv:

ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
,204.8904554713363,20791.318980747026,7.300211873184757,368.51644134980336,564.3086541722439,10.3797830780847,86.9909704615088,2.9631353806316407,0
3.71608007538699,129.42292051494425,18630.057857970347,6.635245883862,,592.8853591348523,15.180013116357259,56.32907628451764,4.500656274942408,0
8.099124189298397,224.23625939355776,19909.541732292393,9.275883602694089,,418.6062130644815,16.868636929550973,66.42009251176368,3.0559337496641685,0
...
9.41951031641321,175.76264629629543,33155.578218312294,7.350233233214412,,432.04478304536786,11.039069688154314,69.84540029205144,3.298875498646556,1
5.1267629233515315,230.60375750846123,11983.869376336363,6.303356534249105,,402.883113121781,11.168946221056501,77.48821310275477,4.708658467526655,1
7.874671357791283,195.10229858610904,17404.17706105066,7.509305856927908,,327.4597604610721,16.140367626166324,78.69844632549504,2.309149056634923,1

I have an ndarray whose items are of type numpy.void; it is actually a numpy.record or structured array (I'm not sure which, I just know that it has field names). When I try to use sklearn's SimpleImputer (or other imputers) on it, it throws an exception:

[( nan, 204.89045547, 20791.31898075, 7.30021187, 368.51644135, 564.30865417, 10.37978308, 86.99097046, 2.96313538, 0.)
 (3.71608008, 129.42292051, 18630.05785797, 6.63524588, nan, 592.88535913, 15.18001312, 56.32907628, 4.50065627, 0.)
 (8.09912419, 224.23625939, 19909.54173229, 9.2758836 , nan, 418.60621306, 16.86863693, 66.42009251, 3.05593375, 0.)
 ...
 (9.41951032, 175.7626463 , 33155.57821831, 7.35023323, nan, 432.04478305, 11.03906969, 69.84540029, 3.2988755 , 1.)
 (5.12676292, 230.60375751, 11983.86937634, 6.30335653, nan, 402.88311312, 11.16894622, 77.4882131 , 4.70865847, 1.)
 (7.87467136, 195.10229859, 17404.17706105, 7.50930586, nan, 327.45976046, 16.14036763, 78.69844633, 2.30914906, 1.)]
(3276,)
<class 'numpy.void'>
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[4], line 9
 7 print(type(data[0]))
 8 imputer = skl.impute.SimpleImputer()
----> 9 imputer.fit_transform(data)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
 314 @wraps(f)
 315 def wrapped(self, X, *args, **kwargs):
--> 316 data_to_wrap = f(self, X, *args, **kwargs)
 317 if isinstance(data_to_wrap, tuple):
 318 # only wrap the first output for cross decomposition
 319 return_tuple = (
 320 _wrap_data_with_container(method, data_to_wrap[0], X, self),
 321 *data_to_wrap[1:],
 322 )
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:894, in TransformerMixin.fit_transform(self, X, y, **fit_params)
 879 warnings.warn(
 880 (
 881 f"This object ({self.__class__.__name__}) has a `transform`"
 (...) 889 UserWarning,
 890 )
 892 if y is None:
 893 # fit method of arity 1 (unsupervised transformation)
--> 894 return self.fit(X, **fit_params).transform(X)
 895 else:
 896 # fit method of arity 2 (supervised transformation)
 897 return self.fit(X, y, **fit_params).transform(X)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:1365, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
 1358 estimator._validate_params()
 1360 with config_context(
 1361 skip_parameter_validation=(
 1362 prefer_skip_nested_validation or global_skip_validation
 1363 )
 1364 ):
-> 1365 return fit_method(estimator, *args, **kwargs)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\impute\_base.py:452, in SimpleImputer.fit(self, X, y)
 434 @_fit_context(prefer_skip_nested_validation=True)
 435 def fit(self, X, y=None):
 436 """Fit the imputer on `X`.
 437 
 438 Parameters
 (...) 450 Fitted estimator.
 451 """
--> 452 X = self._validate_input(X, in_fit=True)
 454 # default fill_value is 0 for numerical input and "missing_value"
 455 # otherwise
 456 if self.fill_value is None:
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\impute\_base.py:360, in SimpleImputer._validate_input(self, X, in_fit)
 357 ensure_all_finite = True
 359 try:
--> 360 X = validate_data(
 361 self,
 362 X,
 363 reset=in_fit,
 364 accept_sparse="csc",
 365 dtype=dtype,
 366 force_writeable=True if not in_fit else None,
 367 ensure_all_finite=ensure_all_finite,
 368 copy=self.copy,
 369 )
 370 except ValueError as ve:
 371 if "could not convert" in str(ve):
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:2954, in validate_data(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)
 2952 out = X, y
 2953 elif not no_val_X and no_val_y:
-> 2954 out = check_array(X, input_name="X", **check_params)
 2955 elif no_val_X and not no_val_y:
 2956 out = _check_y(y, **check_params)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:1053, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
 1051 array = xp.astype(array, dtype, copy=False)
 1052 else:
-> 1053 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
 1054 except ComplexWarning as complex_warning:
 1055 raise ValueError(
 1056 "Complex data not supported\n{}\n".format(array)
 1057 ) from complex_warning
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_array_api.py:757, in _asarray_with_order(array, dtype, order, copy, xp, device)
 755 array = numpy.array(array, order=order, dtype=dtype)
 756 else:
--> 757 array = numpy.asarray(array, order=order, dtype=dtype)
 759 # At this point array is a NumPy ndarray. We convert it to an array
 760 # container that is consistent with the input's namespace.
 761 return xp.asarray(array)
TypeError: Cannot cast array data from dtype([('ph', '<f8'), ('Hardness', '<f8'), ('Solids', '<f8'), ('Chloramines', '<f8'), ('Sulfate', '<f8'), ('Conductivity', '<f8'), ('Organic_carbon', '<f8'), ('Trihalomethanes', '<f8'), ('Turbidity', '<f8'), ('Potability', '<f8')]) to dtype('float64') according to the rule 'unsafe'

After testing, I found out that data.shape is (3276,), which indicates that data is interpreted as a 1D array instead of a 2D one. So my question is: is there any way to use imputation on the data while preserving the field names, or do I have to convert the numpy.void records to another compatible type, say a plain numpy.ndarray?

Question 2

Maybe you should convert the data to a normal 2D array, or read the CSV using the standard csv module or pandas.read_csv.

Question 3

You need to load or convert this data as a 2D array of floats. numpy.lib.recfunctions.structured_to_unstructured can be used to convert this structured array. Read about structured arrays at numpy.org/doc/stable/user/basics.rec.html

Question 4

Alternatively, load with the default float dtype and skip the header row containing the names (e.g. pass skip_header=1 instead of names=True to np.genfromtxt). Preserving the field names can be convenient in some cases, but generally gets in the way of 2D array calculations.

CollectivesTM on Stack Overflow

How to use sklearn imputation methods on numpy.void (record or structured array, I'm not sure) ndarray

0

Know someone who can answer? Share a link to this question via email, Twitter, or Facebook.

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions