diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7ec50137c3039..5ac89bdc9ad1d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -163,6 +163,7 @@ Other enhancements - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`) - Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`) +- Pivoting or unstacking boolean columns (e.g., with :meth:`DataFrame.pivot_table` or :meth:`DataFrame.unstack`) now returns them as nullable :class:`BooleanDtype`, with missing values represented as ``pd.NA``, instead of upcasting to float or object, for improved memory usage and correctness (:issue:`62244`) - :class:`pandas.api.typing.NoDefault` is available for typing ``no_default`` - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`) @@ -292,6 +293,7 @@ These improvements also fixed certain bugs in groupby: - :meth:`.DataFrameGroupBy.nunique` would fail when there are multiple groupings, unobserved groups, and ``as_index=False`` (:issue:`52848`) - :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`) - :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`) +- Fixed boolean columns being upcast to float or object in :meth:`DataFrame.pivot_table` and :meth:`DataFrame.unstack`; the result now uses nullable :class:`BooleanDtype` with missing values as ``pd.NA`` (:issue:`62244`) ..
_whatsnew_300.notable_bug_fixes.notable_bug_fix2: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 8b7ca9f437268..1d3631a16d202 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( + is_bool_dtype, is_list_like, is_nested_list_like, is_scalar, @@ -23,6 +24,7 @@ ABCSeries, ) +from pandas.core.arrays.boolean import BooleanDtype import pandas.core.common as com from pandas.core.groupby import Grouper from pandas.core.indexes.api import ( @@ -409,6 +411,17 @@ def __internal_pivot_table( if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how="all", axis=1) + # GH#62244: Preserve boolean dtype instead of upcasting to float + if isinstance(table, ABCDataFrame): + for col in table.columns: + val = table[col] + if isinstance(val, ABCSeries): + # if the column is bool or was coerced to object with booleans + if is_bool_dtype(val.dtype) or ( + val.dtype == object and val.dropna().isin([True, False]).all() + ): + table[col] = val.astype(BooleanDtype()) + return table diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index dd22f900be926..4b6de3631b380 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -24,6 +24,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, + is_bool_dtype, is_integer, needs_i8_conversion, ) @@ -241,13 +242,38 @@ def get_result(self, obj, value_columns, fill_value) -> DataFrame: if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError("must pass column labels for multi-column data") - new_values, _ = self.get_new_values(values, fill_value) + new_values, new_mask = self.get_new_values(values, fill_value) columns = self.get_new_columns(value_columns) index = self.new_index - result = self.constructor( - new_values, index=index, columns=columns, 
dtype=new_values.dtype, copy=False - ) + # If original values were numpy-bool, we need to respect the missing mask + # and produce a nullable boolean column (BooleanDtype). For other dtypes + # fall back to the fast construction path. + from pandas.core.dtypes.common import is_bool_dtype + + if is_bool_dtype(values.dtype): + # Build an object array from new_values so we can insert pd.NA where masked, + # then construct DataFrame and cast to nullable boolean dtype. + import pandas as pd + + # Ensure we have an object array to insert pd.NA + tmp = new_values.astype(object, copy=True) + # new_mask is True where a value exists; missing positions are ~new_mask + tmp[~new_mask] = pd.NA + + # Construct DataFrame from the tmp array, then convert to boolean dtype. + result = self.constructor(tmp, index=index, columns=columns, copy=False) + # Convert the relevant columns to nullable boolean + result = result.astype("boolean") + else: + result = self.constructor( + new_values, + index=index, + columns=columns, + dtype=new_values.dtype, + copy=False, + ) + if isinstance(values, np.ndarray): base, new_base = values.base, new_values.base elif isinstance(values, NDArrayBackedExtensionArray): @@ -297,6 +323,25 @@ def get_new_values(self, values, fill_value=None): if not mask_all: new_values[:] = fill_value else: + # GH#62244: special-case for bool to avoid upcasting to object + if is_bool_dtype(dtype): + data = np.empty(result_shape, dtype="bool") + new_mask = np.zeros(result_shape, dtype=bool) + + libreshape.unstack( + sorted_values.astype("bool", copy=False), + mask.view("u1"), + stride, + length, + width, + data, + new_mask.view("u1"), + ) + + # Return the raw numpy data + mask — pandas internals will wrap it + return data, new_mask + + # default path for non-bool dtypes if not mask_all: dtype, fill_value = maybe_promote(dtype, fill_value) new_values = np.empty(result_shape, dtype=dtype) diff --git a/pandas/tests/frame/test_stack_unstack.py 
b/pandas/tests/frame/test_stack_unstack.py index 756d454ebd11f..25387af37ce90 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -559,10 +559,10 @@ def test_unstack_bool(self): ) rs = df.unstack() xp = DataFrame( - np.array([[False, np.nan], [np.nan, False]], dtype=object), + [[False, pd.NA], [pd.NA, False]], index=["a", "b"], columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]), - ) + ).astype("boolean") tm.assert_frame_equal(rs, xp) @pytest.mark.filterwarnings( @@ -2734,3 +2734,40 @@ def test_stack_preserves_na(dtype, na_value, test_multiindex): ) expected = Series(1, index=expected_index) tm.assert_series_equal(result, expected) + + +class TestUnstackBool: + """Regression tests for GH#62244 (unstack bool dtype upcasting).""" + + def test_unstack_bool_dataframe_preserves_boolean_dtype(self): + df = DataFrame( + {"level_0": ["foo", "toto"], "level_1": ["A", "B"], "val": [True, False]} + ).set_index(["level_0", "level_1"]) + + result = df.unstack("level_0") + + assert all(str(dtype) in {"bool", "boolean"} for dtype in result.dtypes) + + assert result.loc["A", ("val", "foo")] + assert pd.isna(result.loc["A", ("val", "toto")]) + assert not result.loc["B", ("val", "toto")] + + def test_unstack_bool_series_preserves_boolean_dtype(self): + s = Series([True, False], index=MultiIndex.from_product([["x", "y"], ["A"]])) + result = s.unstack(0) + + assert all(str(dtype) in {"bool", "boolean"} for dtype in result.dtypes) + + def test_unstack_bool_memory_usage_smaller_than_object(self): + df = DataFrame({"a": ["x", "y"], "b": [True, False]}).set_index("a") + + obj_unstack = df.astype("object").unstack("a") + bool_unstack = df.astype("boolean").unstack("a") + + obj_mem = obj_unstack.memory_usage(deep=True) + bool_mem = bool_unstack.memory_usage(deep=True) + + obj_total = obj_mem.sum() if hasattr(obj_mem, "sum") else int(obj_mem) + bool_total = bool_mem.sum() if hasattr(bool_mem, "sum") else int(bool_mem) + + assert 
bool_total < obj_total diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index a41f3014bc23f..f2589af147a1b 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2618,6 +2618,25 @@ def test_pivot_table_margins_include_nan_groups(self): expected.columns.name = "g2" tm.assert_frame_equal(result, expected, check_dtype=False) + def test_pivot_table_bool_preserves_boolean_dtype(self): + # GH#62244 + df = DataFrame( + { + "A": ["foo", "foo", "bar"], + "B": ["x", "y", "x"], + "val": [True, False, True], + } + ) + + result = pivot_table(df, values="val", index="A", columns="B", aggfunc="any") + + assert all(str(dtype) == "boolean" for dtype in result.dtypes) + + assert result.loc["foo", "x"] + assert not result.loc["foo", "y"] + assert result.loc["bar", "x"] + assert pd.isna(result.loc["bar", "y"]) + class TestPivot: def test_pivot(self):