Patch summary
  * doc/source/whatsnew/v2.3.3.rst: add a bug-fix entry for numpy ufuncs on
    Python-backed string arrays (issue 40800).
  * pandas/core/arrays/numpy_.py: in NumpyExtensionArray.__array_ufunc__, when
    the array's dtype.type is str, re-box a single-output ufunc result with
    dtype=self.dtype; if construction raises ValueError, return the raw
    result instead.
  * pandas/tests/arrays/string_/test_string.py: add test_numpy_array_ufunc.

NOTE(review): this patch was recovered from a whitespace-collapsed copy. Line
breaks, blank context lines and indentation were reconstructed from the hunk
line counts and Python block structure -- confirm against upstream before
applying.

diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst
index 323963a4ec4d2..48e08a11e517a 100644
--- a/doc/source/whatsnew/v2.3.3.rst
+++ b/doc/source/whatsnew/v2.3.3.rst
@@ -48,6 +48,7 @@ Bug fixes
   with a compiled regex and custom flags (:issue:`62240`)
 - Fix :meth:`Series.str.match` and :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`)
 - Fix comparing a :class:`StringDtype` Series with mixed objects raising an error (:issue:`60228`)
+- Fix error being raised when using a numpy ufunc with a Python-backed string array (:issue:`40800`)
 
 Improvements and fixes for Copy-on-Write
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index ef64bda3dc504..fab51ffa56919 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -5,6 +5,7 @@
     Any,
     Literal,
     Self,
+    cast,
 )
 
 import numpy as np
@@ -48,6 +49,7 @@
     )
 
     from pandas import Index
+    from pandas.arrays import StringArray
 
 
 class NumpyExtensionArray(
@@ -234,6 +236,16 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
             # e.g. test_np_max_nested_tuples
             return result
         else:
+            if self.dtype.type is str:  # type: ignore[comparison-overlap]
+                # StringDtype
+                self = cast("StringArray", self)
+                try:
+                    # specify dtype to preserve storage/na_value
+                    return type(self)(result, dtype=self.dtype)
+                except ValueError:
+                    # if validation of input fails (no strings)
+                    # -> fallback to returning raw numpy array
+                    return result
             # one return value; re-box array-like results
             return type(self)(result)
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 1a71f6c41c4f1..d3effb7c33457 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -840,3 +840,30 @@ def test_string_array_view_type_error():
     arr = pd.array(["a", "b", "c"], dtype="string")
     with pytest.raises(TypeError, match="Cannot change data-type for string array."):
         arr.view("i8")
+
+
+@pytest.mark.parametrize("box", [pd.Series, pd.array])
+def test_numpy_array_ufunc(dtype, box):
+    arr = box(["a", "bb", "ccc"], dtype=dtype)
+
+    # custom ufunc that works with string (object) input -> returning numeric
+    str_len_ufunc = np.frompyfunc(lambda x: len(x), 1, 1)
+    result = str_len_ufunc(arr)
+    expected_cls = pd.Series if box is pd.Series else np.array
+    # TODO we should infer int64 dtype here?
+    expected = expected_cls([1, 2, 3], dtype=object)
+    tm.assert_equal(result, expected)
+
+    # custom ufunc returning strings
+    str_multiply_ufunc = np.frompyfunc(lambda x: x * 2, 1, 1)
+    result = str_multiply_ufunc(arr)
+    expected = box(["aa", "bbbb", "cccccc"], dtype=dtype)
+    if dtype.storage == "pyarrow":
+        # TODO ArrowStringArray should also preserve the class / dtype
+        if box is pd.array:
+            expected = np.array(["aa", "bbbb", "cccccc"], dtype=object)
+        else:
+            # not specifying the dtype because the exact dtype is not yet preserved
+            expected = pd.Series(["aa", "bbbb", "cccccc"])
+
+    tm.assert_equal(result, expected)

AltStyle によって変換されたページ (->オリジナル) /