Code:
import numpy as np
import sklearn as skl
# Load the CSV, taking the field names from the header row (names=True).
# As the printed output below shows, the result is a 1-D structured array of
# shape (3276,) whose elements are numpy.void records with ten '<f8' fields,
# not a 2-D float array; empty CSV cells show up as nan.
data = np.genfromtxt("water_potability.csv", delimiter = ",", names = True)
# Diagnostics: the records, the (1-D) shape, and the per-row element type.
print(data)
print(data.shape)
print(type(data[0]))
# Imputer constructed with no arguments, so the library defaults apply.
imputer = skl.impute.SimpleImputer()
# NOTE: this call raises the TypeError shown in the traceback below: input
# validation tries to cast the structured dtype to float64 and cannot.
# NOTE(review): converting first with
# numpy.lib.recfunctions.structured_to_unstructured(data) presumably yields the
# plain 2-D float array the imputer needs -- confirm against the NumPy docs.
imputer.fit_transform(data)
(part of) water_potability.csv:
ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
,204.8904554713363,20791.318980747026,7.300211873184757,368.51644134980336,564.3086541722439,10.3797830780847,86.9909704615088,2.9631353806316407,0
3.71608007538699,129.42292051494425,18630.057857970347,6.635245883862,,592.8853591348523,15.180013116357259,56.32907628451764,4.500656274942408,0
8.099124189298397,224.23625939355776,19909.541732292393,9.275883602694089,,418.6062130644815,16.868636929550973,66.42009251176368,3.0559337496641685,0
...
9.41951031641321,175.76264629629543,33155.578218312294,7.350233233214412,,432.04478304536786,11.039069688154314,69.84540029205144,3.298875498646556,1
5.1267629233515315,230.60375750846123,11983.869376336363,6.303356534249105,,402.883113121781,11.168946221056501,77.48821310275477,4.708658467526655,1
7.874671357791283,195.10229858610904,17404.17706105066,7.509305856927908,,327.4597604610721,16.140367626166324,78.69844632549504,2.309149056634923,1
I have an ndarray whose item type is numpy.void, which is actually a numpy.record or a structured array (I'm not sure which; I just know that it has field names).
When I try to use sklearn's SimpleImputer (or other imputers) on it, it throws an exception:
[( nan, 204.89045547, 20791.31898075, 7.30021187, 368.51644135, 564.30865417, 10.37978308, 86.99097046, 2.96313538, 0.)
(3.71608008, 129.42292051, 18630.05785797, 6.63524588, nan, 592.88535913, 15.18001312, 56.32907628, 4.50065627, 0.)
(8.09912419, 224.23625939, 19909.54173229, 9.2758836 , nan, 418.60621306, 16.86863693, 66.42009251, 3.05593375, 0.)
...
(9.41951032, 175.7626463 , 33155.57821831, 7.35023323, nan, 432.04478305, 11.03906969, 69.84540029, 3.2988755 , 1.)
(5.12676292, 230.60375751, 11983.86937634, 6.30335653, nan, 402.88311312, 11.16894622, 77.4882131 , 4.70865847, 1.)
(7.87467136, 195.10229859, 17404.17706105, 7.50930586, nan, 327.45976046, 16.14036763, 78.69844633, 2.30914906, 1.)]
(3276,)
<class 'numpy.void'>
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[4], line 9
7 print(type(data[0]))
8 imputer = skl.impute.SimpleImputer()
----> 9 imputer.fit_transform(data)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_set_output.py:316, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
314 @wraps(f)
315 def wrapped(self, X, *args, **kwargs):
--> 316 data_to_wrap = f(self, X, *args, **kwargs)
317 if isinstance(data_to_wrap, tuple):
318 # only wrap the first output for cross decomposition
319 return_tuple = (
320 _wrap_data_with_container(method, data_to_wrap[0], X, self),
321 *data_to_wrap[1:],
322 )
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:894, in TransformerMixin.fit_transform(self, X, y, **fit_params)
879 warnings.warn(
880 (
881 f"This object ({self.__class__.__name__}) has a `transform`"
(...) 889 UserWarning,
890 )
892 if y is None:
893 # fit method of arity 1 (unsupervised transformation)
--> 894 return self.fit(X, **fit_params).transform(X)
895 else:
896 # fit method of arity 2 (supervised transformation)
897 return self.fit(X, y, **fit_params).transform(X)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:1365, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1358 estimator._validate_params()
1360 with config_context(
1361 skip_parameter_validation=(
1362 prefer_skip_nested_validation or global_skip_validation
1363 )
1364 ):
-> 1365 return fit_method(estimator, *args, **kwargs)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\impute\_base.py:452, in SimpleImputer.fit(self, X, y)
434 @_fit_context(prefer_skip_nested_validation=True)
435 def fit(self, X, y=None):
436 """Fit the imputer on `X`.
437
438 Parameters
(...) 450 Fitted estimator.
451 """
--> 452 X = self._validate_input(X, in_fit=True)
454 # default fill_value is 0 for numerical input and "missing_value"
455 # otherwise
456 if self.fill_value is None:
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\impute\_base.py:360, in SimpleImputer._validate_input(self, X, in_fit)
357 ensure_all_finite = True
359 try:
--> 360 X = validate_data(
361 self,
362 X,
363 reset=in_fit,
364 accept_sparse="csc",
365 dtype=dtype,
366 force_writeable=True if not in_fit else None,
367 ensure_all_finite=ensure_all_finite,
368 copy=self.copy,
369 )
370 except ValueError as ve:
371 if "could not convert" in str(ve):
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:2954, in validate_data(_estimator, X, y, reset, validate_separately, skip_check_array, **check_params)
2952 out = X, y
2953 elif not no_val_X and no_val_y:
-> 2954 out = check_array(X, input_name="X", **check_params)
2955 elif no_val_X and not no_val_y:
2956 out = _check_y(y, **check_params)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py:1053, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
1051 array = xp.astype(array, dtype, copy=False)
1052 else:
-> 1053 array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
1054 except ComplexWarning as complex_warning:
1055 raise ValueError(
1056 "Complex data not supported\n{}\n".format(array)
1057 ) from complex_warning
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\_array_api.py:757, in _asarray_with_order(array, dtype, order, copy, xp, device)
755 array = numpy.array(array, order=order, dtype=dtype)
756 else:
--> 757 array = numpy.asarray(array, order=order, dtype=dtype)
759 # At this point array is a NumPy ndarray. We convert it to an array
760 # container that is consistent with the input's namespace.
761 return xp.asarray(array)
TypeError: Cannot cast array data from dtype([('ph', '<f8'), ('Hardness', '<f8'), ('Solids', '<f8'), ('Chloramines', '<f8'), ('Sulfate', '<f8'), ('Conductivity', '<f8'), ('Organic_carbon', '<f8'), ('Trihalomethanes', '<f8'), ('Turbidity', '<f8'), ('Potability', '<f8')]) to dtype('float64') according to the rule 'unsafe'
After testing, I found out that data.shape is (3276,), which indicates that data is interpreted as a 1-D array instead of a 2-D one. So my question is: is there any way to run imputation on the data while preserving the field names, or do I have to convert the numpy.void records to another compatible type, say a plain numpy.ndarray?
Consider reading the csv differently, or using pandas.read_csv. Alternatively, numpy.lib.recfunctions.structured_to_unstructured can be used to convert this structured array. Read about structured arrays at numpy.org/doc/stable/user/basics.rec.html