Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

ENH: Preserve nullable boolean dtype in pivot_table (GH#62244) #62256

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ Other enhancements
- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
- Pivoting or unstacking boolean columns (e.g., with :meth:`DataFrame.pivot_table` or :meth:`DataFrame.unstack`) now returns them as nullable :class:`BooleanDtype` columns, with missing values represented as ``pd.NA`` instead of being upcast to ``float64`` or ``object``, for improved memory usage and correctness (:issue:`62244`)
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
Expand Down Expand Up @@ -292,6 +293,7 @@ These improvements also fixed certain bugs in groupby:
- :meth:`.DataFrameGroupBy.nunique` would fail when there are multiple groupings, unobserved groups, and ``as_index=False`` (:issue:`52848`)
- :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`)
- :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`)
- Fixed boolean columns being upcast to float or object in :meth:`DataFrame.pivot_table` and :meth:`DataFrame.unstack`; they are now returned as nullable :class:`BooleanDtype` columns with missing values represented as ``pd.NA`` (:issue:`62244`)

.. _whatsnew_300.notable_bug_fixes.notable_bug_fix2:

Expand Down
13 changes: 13 additions & 0 deletions pandas/core/reshape/pivot.py
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
is_bool_dtype,
is_list_like,
is_nested_list_like,
is_scalar,
Expand All @@ -23,6 +24,7 @@
ABCSeries,
)

from pandas.core.arrays.boolean import BooleanDtype
import pandas.core.common as com
from pandas.core.groupby import Grouper
from pandas.core.indexes.api import (
Expand Down Expand Up @@ -409,6 +411,17 @@ def __internal_pivot_table(
if isinstance(table, ABCDataFrame) and dropna:
table = table.dropna(how="all", axis=1)

# GH#62244: Preserve boolean dtype instead of upcasting to float
if isinstance(table, ABCDataFrame):
for col in table.columns:
val = table[col]
if isinstance(val, ABCSeries):
# if the column is bool or was coerced to object with booleans
if is_bool_dtype(val.dtype) or (
val.dtype == object and val.dropna().isin([True, False]).all()
):
table[col] = val.astype(BooleanDtype())

return table


Expand Down
53 changes: 49 additions & 4 deletions pandas/core/reshape/reshape.py
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
is_bool_dtype,
is_integer,
needs_i8_conversion,
)
Expand Down Expand Up @@ -241,13 +242,38 @@ def get_result(self, obj, value_columns, fill_value) -> DataFrame:
if value_columns is None and values.shape[1] != 1: # pragma: no cover
raise ValueError("must pass column labels for multi-column data")

new_values, _ = self.get_new_values(values, fill_value)
new_values, new_mask = self.get_new_values(values, fill_value)
columns = self.get_new_columns(value_columns)
index = self.new_index

result = self.constructor(
new_values, index=index, columns=columns, dtype=new_values.dtype, copy=False
)
# If original values were numpy-bool, we need to respect the missing mask
# and produce a nullable boolean column (BooleanDtype). For other dtypes
# fall back to the fast construction path.
from pandas.core.dtypes.common import is_bool_dtype

if is_bool_dtype(values.dtype):
# Build an object array from new_values so we can insert pd.NA where masked,
# then construct DataFrame and cast to nullable boolean dtype.
import pandas as pd

# Ensure we have an object array to insert pd.NA
tmp = new_values.astype(object, copy=True)
# new_mask is True where a value exists; missing positions are ~new_mask
tmp[~new_mask] = pd.NA

# Construct DataFrame from the tmp array, then convert to boolean dtype.
result = self.constructor(tmp, index=index, columns=columns, copy=False)
# Convert the relevant columns to nullable boolean
result = result.astype("boolean")
else:
result = self.constructor(
new_values,
index=index,
columns=columns,
dtype=new_values.dtype,
copy=False,
)

if isinstance(values, np.ndarray):
base, new_base = values.base, new_values.base
elif isinstance(values, NDArrayBackedExtensionArray):
Expand Down Expand Up @@ -297,6 +323,25 @@ def get_new_values(self, values, fill_value=None):
if not mask_all:
new_values[:] = fill_value
else:
# GH#62244: special-case for bool to avoid upcasting to object
if is_bool_dtype(dtype):
data = np.empty(result_shape, dtype="bool")
new_mask = np.zeros(result_shape, dtype=bool)

libreshape.unstack(
sorted_values.astype("bool", copy=False),
mask.view("u1"),
stride,
length,
width,
data,
new_mask.view("u1"),
)

# Return the raw numpy data + mask — pandas internals will wrap it
return data, new_mask

# default path for non-bool dtypes
if not mask_all:
dtype, fill_value = maybe_promote(dtype, fill_value)
new_values = np.empty(result_shape, dtype=dtype)
Expand Down
41 changes: 39 additions & 2 deletions pandas/tests/frame/test_stack_unstack.py
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -559,10 +559,10 @@ def test_unstack_bool(self):
)
rs = df.unstack()
xp = DataFrame(
np.array([[False, np.nan], [np.nan, False]], dtype=object),
[[False, pd.NA], [pd.NA, False]],
index=["a", "b"],
columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
)
).astype("boolean")
tm.assert_frame_equal(rs, xp)

@pytest.mark.filterwarnings(
Expand Down Expand Up @@ -2734,3 +2734,40 @@ def test_stack_preserves_na(dtype, na_value, test_multiindex):
)
expected = Series(1, index=expected_index)
tm.assert_series_equal(result, expected)


class TestUnstackBool:
    """Regression tests for GH#62244 (unstack bool dtype upcasting)."""

    def test_unstack_bool_dataframe_preserves_boolean_dtype(self):
        """Unstacking a sparse numpy-bool frame yields nullable boolean columns."""
        # Only (foo, A) and (toto, B) are observed, so unstacking "level_0"
        # leaves two unobserved cells that have to be filled with a missing value.
        df = DataFrame(
            {"level_0": ["foo", "toto"], "level_1": ["A", "B"], "val": [True, False]}
        ).set_index(["level_0", "level_1"])

        result = df.unstack("level_0")

        # Compare against an explicit frame (same construction as
        # test_unstack_bool) so labels, axis names, dtype and the pd.NA
        # sentinel are all pinned, not just the truthiness of single cells.
        expected = (
            DataFrame(
                [[True, pd.NA], [pd.NA, False]],
                index=["A", "B"],
                columns=MultiIndex.from_arrays(
                    [["val", "val"], ["foo", "toto"]], names=[None, "level_0"]
                ),
            )
            .rename_axis(index="level_1")
            .astype("boolean")
        )
        tm.assert_frame_equal(result, expected)

    def test_unstack_bool_series_preserves_boolean_dtype(self):
        """Unstacking a fully-observed bool Series keeps a boolean-like dtype."""
        ser = Series([True, False], index=MultiIndex.from_product([["x", "y"], ["A"]]))
        result = ser.unstack(0)

        # Every cell is observed here, so either numpy bool or the nullable
        # boolean dtype is an acceptable outcome; object/float is not.
        assert all(str(dtype) in {"bool", "boolean"} for dtype in result.dtypes)

        # The reshape must not scramble the values either.
        assert bool(result.loc["A", "x"]) is True
        assert bool(result.loc["A", "y"]) is False

    def test_unstack_bool_memory_usage_smaller_than_object(self):
        """A bool unstack with holes uses less memory than its object equivalent."""
        # A two-level index with unobserved combinations is required: with a
        # single-level index ``unstack`` just returns a Series and no cells
        # are ever missing, so the upcasting path would not be exercised.
        df = DataFrame(
            {"a": ["x", "y"], "c": ["A", "B"], "b": [True, False]}
        ).set_index(["a", "c"])

        # Unstack the numpy-bool frame directly (no pre-cast to "boolean") so
        # the comparison reflects what GH#62244 changed.
        obj_unstack = df.astype("object").unstack("a")
        bool_unstack = df.unstack("a")

        obj_total = obj_unstack.memory_usage(deep=True).sum()
        bool_total = bool_unstack.memory_usage(deep=True).sum()

        assert bool_total < obj_total
19 changes: 19 additions & 0 deletions pandas/tests/reshape/test_pivot.py
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -2618,6 +2618,25 @@ def test_pivot_table_margins_include_nan_groups(self):
expected.columns.name = "g2"
tm.assert_frame_equal(result, expected, check_dtype=False)

def test_pivot_table_bool_preserves_boolean_dtype(self):
    """pivot_table on a bool column returns nullable boolean columns."""
    # GH#62244
    df = DataFrame(
        {
            "A": ["foo", "foo", "bar"],
            "B": ["x", "y", "x"],
            "val": [True, False, True],
        }
    )

    result = pivot_table(df, values="val", index="A", columns="B", aggfunc="any")

    # ("bar", "y") is never observed, so that cell must come back as pd.NA
    # while the remaining cells keep their aggregated boolean value.
    # Comparing whole frames also pins the sorted labels and the axis names,
    # which per-cell truthiness checks would silently ignore.
    expected = (
        DataFrame(
            {"x": [True, True], "y": [pd.NA, False]},
            index=["bar", "foo"],
        )
        .rename_axis(index="A", columns="B")
        .astype("boolean")
    )
    tm.assert_frame_equal(result, expected)


class TestPivot:
def test_pivot(self):
Expand Down
Loading

AltStyle によって変換されたページ (->オリジナル) /