ENH: Preserve nullable boolean dtype in pivot_table (GH#62244) #62256

Original file line number	Diff line number	Diff line change
Expand Up		@@ -163,6 +163,7 @@ Other enhancements
		- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
		- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
		- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
	- Pivoting or unstacking boolean columns (e.g., with :meth:`DataFrame.pivot_table` or :meth:`DataFrame.unstack`) now returns nullable :class:`BooleanDtype` columns, with missing entries represented as ``pd.NA`` instead of being upcast to ``object``; this reduces memory usage and keeps the values boolean (:issue:`62244`)
		- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
		- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
		- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
Expand Down Expand Up		@@ -292,6 +293,7 @@ These improvements also fixed certain bugs in groupby:
		- :meth:`.DataFrameGroupBy.nunique` would fail when there are multiple groupings, unobserved groups, and ``as_index=False`` (:issue:`52848`)
		- :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`)
		- :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`)
	- :meth:`DataFrame.pivot_table` and :meth:`DataFrame.unstack` would upcast boolean columns to float or object when the result contained missing entries; such columns are now returned as nullable :class:`BooleanDtype` with missing values as ``pd.NA`` (:issue:`62244`)

		.. _whatsnew_300.notable_bug_fixes.notable_bug_fix2:

Expand Down

13 changes: 13 additions & 0 deletions pandas/core/reshape/pivot.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -13,6 +13,7 @@

		from pandas.core.dtypes.cast import maybe_downcast_to_dtype
		from pandas.core.dtypes.common import (
	is_bool_dtype,
		is_list_like,
		is_nested_list_like,
		is_scalar,
Expand All		@@ -23,6 +24,7 @@
		ABCSeries,
		)

	from pandas.core.arrays.boolean import BooleanDtype
		import pandas.core.common as com
		from pandas.core.groupby import Grouper
		from pandas.core.indexes.api import (
Expand Down Expand Up		@@ -409,6 +411,17 @@ def __internal_pivot_table(
		if isinstance(table, ABCDataFrame) and dropna:
		table = table.dropna(how="all", axis=1)

	# GH#62244: Preserve boolean dtype instead of upcasting to float
	if isinstance(table, ABCDataFrame):
	for col in table.columns:
	val = table[col]
	if isinstance(val, ABCSeries):
	# if the column is bool or was coerced to object with booleans
	if is_bool_dtype(val.dtype) or (
	val.dtype == object and val.dropna().isin([True, False]).all()
	):
	table[col] = val.astype(BooleanDtype())

		return table


Expand Down

53 changes: 49 additions & 4 deletions pandas/core/reshape/reshape.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -24,6 +24,7 @@
		from pandas.core.dtypes.common import (
		ensure_platform_int,
		is_1d_only_ea_dtype,
	is_bool_dtype,
		is_integer,
		needs_i8_conversion,
		)
Expand Down Expand Up		@@ -241,13 +242,38 @@ def get_result(self, obj, value_columns, fill_value) -> DataFrame:
		if value_columns is None and values.shape[1] != 1: # pragma: no cover
		raise ValueError("must pass column labels for multi-column data")

	new_values, _ = self.get_new_values(values, fill_value)
	new_values, new_mask = self.get_new_values(values, fill_value)
		columns = self.get_new_columns(value_columns)
		index = self.new_index

	result = self.constructor(
	new_values, index=index, columns=columns, dtype=new_values.dtype, copy=False
	)
	# If original values were numpy-bool, we need to respect the missing mask
	# and produce a nullable boolean column (BooleanDtype). For other dtypes
	# fall back to the fast construction path.
	from pandas.core.dtypes.common import is_bool_dtype

	if is_bool_dtype(values.dtype):
	# Build an object array from new_values so we can insert pd.NA where masked,
	# then construct DataFrame and cast to nullable boolean dtype.
	import pandas as pd

	# Ensure we have an object array to insert pd.NA
	tmp = new_values.astype(object, copy=True)
	# new_mask is True where a value exists; missing positions are ~new_mask
	tmp[~new_mask] = pd.NA

	# Construct DataFrame from the tmp array, then convert to boolean dtype.
	result = self.constructor(tmp, index=index, columns=columns, copy=False)
	# Convert the relevant columns to nullable boolean
	result = result.astype("boolean")
	else:
	result = self.constructor(
	new_values,
	index=index,
	columns=columns,
	dtype=new_values.dtype,
	copy=False,
	)

		if isinstance(values, np.ndarray):
		base, new_base = values.base, new_values.base
		elif isinstance(values, NDArrayBackedExtensionArray):
Expand Down Expand Up		@@ -297,6 +323,25 @@ def get_new_values(self, values, fill_value=None):
		if not mask_all:
		new_values[:] = fill_value
		else:
	# GH#62244: special-case for bool to avoid upcasting to object
	if is_bool_dtype(dtype):
	data = np.empty(result_shape, dtype="bool")
	new_mask = np.zeros(result_shape, dtype=bool)

	libreshape.unstack(
	sorted_values.astype("bool", copy=False),
	mask.view("u1"),
	stride,
	length,
	width,
	data,
	new_mask.view("u1"),
	)

	# Return the raw numpy data + mask — pandas internals will wrap it
	return data, new_mask

	# default path for non-bool dtypes
		if not mask_all:
		dtype, fill_value = maybe_promote(dtype, fill_value)
		new_values = np.empty(result_shape, dtype=dtype)
Expand Down

41 changes: 39 additions & 2 deletions pandas/tests/frame/test_stack_unstack.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -559,10 +559,10 @@ def test_unstack_bool(self):
		)
		rs = df.unstack()
		xp = DataFrame(
	np.array([[False, np.nan], [np.nan, False]], dtype=object),
	[[False, pd.NA], [pd.NA, False]],
		index=["a", "b"],
		columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
	)
	).astype("boolean")
		tm.assert_frame_equal(rs, xp)

		@pytest.mark.filterwarnings(
Expand Down Expand Up		@@ -2734,3 +2734,40 @@ def test_stack_preserves_na(dtype, na_value, test_multiindex):
		)
		expected = Series(1, index=expected_index)
		tm.assert_series_equal(result, expected)


	class TestUnstackBool:
	"""Regression tests for GH#62244 (unstack bool dtype upcasting)."""

	def test_unstack_bool_dataframe_preserves_boolean_dtype(self):
	df = DataFrame(
	{"level_0": ["foo", "toto"], "level_1": ["A", "B"], "val": [True, False]}
	).set_index(["level_0", "level_1"])

	result = df.unstack("level_0")

	assert all(str(dtype) in {"bool", "boolean"} for dtype in result.dtypes)

	assert result.loc["A", ("val", "foo")]
	assert pd.isna(result.loc["A", ("val", "toto")])
	assert not result.loc["B", ("val", "toto")]

	def test_unstack_bool_series_preserves_boolean_dtype(self):
	s = Series([True, False], index=MultiIndex.from_product([["x", "y"], ["A"]]))
	result = s.unstack(0)

	assert all(str(dtype) in {"bool", "boolean"} for dtype in result.dtypes)

	def test_unstack_bool_memory_usage_smaller_than_object(self):
	df = DataFrame({"a": ["x", "y"], "b": [True, False]}).set_index("a")

	obj_unstack = df.astype("object").unstack("a")
	bool_unstack = df.astype("boolean").unstack("a")

	obj_mem = obj_unstack.memory_usage(deep=True)
	bool_mem = bool_unstack.memory_usage(deep=True)

	obj_total = obj_mem.sum() if hasattr(obj_mem, "sum") else int(obj_mem)
	bool_total = bool_mem.sum() if hasattr(bool_mem, "sum") else int(bool_mem)

	assert bool_total < obj_total

19 changes: 19 additions & 0 deletions pandas/tests/reshape/test_pivot.py

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -2618,6 +2618,25 @@ def test_pivot_table_margins_include_nan_groups(self):
		expected.columns.name = "g2"
		tm.assert_frame_equal(result, expected, check_dtype=False)

	def test_pivot_table_bool_preserves_boolean_dtype(self):
	    # GH#62244: pivoting a boolean column must yield nullable boolean
	    # columns, with the unobserved ("bar", "y") cell reported as missing.
	    records = {
	        "A": ["foo", "foo", "bar"],
	        "B": ["x", "y", "x"],
	        "val": [True, False, True],
	    }
	    frame = DataFrame(records)

	    result = pivot_table(frame, values="val", index="A", columns="B", aggfunc="any")

	    # Every output column has to carry the nullable boolean dtype.
	    dtype_names = [str(dtype) for dtype in result.dtypes]
	    assert all(name == "boolean" for name in dtype_names)

	    # Observed cells keep the aggregated value.
	    foo_x = result.loc["foo", "x"]
	    assert foo_x
	    foo_y = result.loc["foo", "y"]
	    assert not foo_y
	    bar_x = result.loc["bar", "x"]
	    assert bar_x

	    # The combination that never occurs in the input is missing.
	    bar_y = result.loc["bar", "y"]
	    assert pd.isna(bar_y)


		class TestPivot:
		def test_pivot(self):
Expand Down

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

ENH: Preserve nullable boolean dtype in pivot_table (GH#62244) #62256

Uh oh!

ENH: Preserve nullable boolean dtype in pivot_table (GH#62244) #62256

Filter by extension

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!