diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 45bf83c3c030d..d48adefe5a10d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1094,6 +1094,7 @@ Reshaping - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) - Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`) +- Bug in :meth:`DataFrame.merge` where the result of a merge does not contain any metadata or flag information from the inputs to the merge. (:issue:`28283`) - Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) - Bug in :meth:`DataFrame.merge` with :class:`CategoricalDtype` columns incorrectly raising ``RecursionError`` (:issue:`56376`) - Bug in :meth:`DataFrame.merge` with a ``float32`` index incorrectly casting the index to ``float64`` (:issue:`41626`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7b91ca3d564a2..a1c3bbde3a2c4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6096,10 +6096,16 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: """ Propagate metadata from other to self. + This is the default implementation. Subclasses may override this method to + implement their own metadata handling. + Parameters ---------- other : the object from which to get the attributes that we are going - to propagate + to propagate. 
If ``other`` has an ``input_objs`` attribute, then + this attribute must contain an iterable of objects, each with an + ``attrs`` attribute, in which case, each such ``attrs`` instance + must be a dictionary that is equal to all of the others. method : str, optional A passed method name providing context on where ``__finalize__`` was called. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f35b0ef197288..324a2bb810981 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1129,12 +1129,17 @@ def _reindex_and_concat( return result def get_result(self) -> DataFrame: + """ + Execute the merge. + """ if self.indicator: self.left, self.right = self._indicator_pre_merge(self.left, self.right) join_index, left_indexer, right_indexer = self._get_join_info() result = self._reindex_and_concat(join_index, left_indexer, right_indexer) + + # Is this call to __finalize__ really necessary? result = result.__finalize__( types.SimpleNamespace(input_objs=[self.left, self.right]), method=self._merge_type, @@ -1147,6 +1152,8 @@ def get_result(self) -> DataFrame: self._maybe_restore_index_levels(result) + # __finalize__ is responsible for copying the metadata from the inputs to merge + # to the result. return result.__finalize__( types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge" ) @@ -1167,6 +1174,14 @@ def _indicator_name(self) -> str | None: def _indicator_pre_merge( self, left: DataFrame, right: DataFrame ) -> tuple[DataFrame, DataFrame]: + """ + Add one indicator column to each of the left and right inputs to a + merge operation. + + These columns are used to produce another column in the output of the + merge, indicating for each row of the output whether it was produced + using the left, right or both inputs. 
+ """ columns = left.columns.union(right.columns) for i in ["_left_indicator", "_right_indicator"]: @@ -1193,6 +1208,12 @@ def _indicator_pre_merge( @final def _indicator_post_merge(self, result: DataFrame) -> DataFrame: + """ + Add an indicator column to the merge result. + + This column indicates for each row of the output whether it was produced using + the left, right or both inputs. + """ result["_left_indicator"] = result["_left_indicator"].fillna(0) result["_right_indicator"] = result["_right_indicator"].fillna(0) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 4b841b54c488b..f4a1646c13806 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -1,6 +1,4 @@ -""" -An exhaustive list of pandas methods exercising NDFrame.__finalize__. -""" +"""An exhaustive list of pandas methods exercising NDFrame.__finalize__.""" import operator import re @@ -8,6 +6,8 @@ import numpy as np import pytest +from pandas._typing import MergeHow + import pandas as pd # TODO: @@ -148,14 +148,6 @@ operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]), ), (pd.DataFrame, frame_data, operator.methodcaller("map", lambda x: x)), - pytest.param( - ( - pd.DataFrame, - frame_data, - operator.methodcaller("merge", pd.DataFrame({"A": [1]})), - ), - marks=not_implemented_mark, - ), (pd.DataFrame, frame_data, operator.methodcaller("round", 2)), (pd.DataFrame, frame_data, operator.methodcaller("corr")), pytest.param( @@ -371,8 +363,7 @@ def idfn(x): m = xpr.search(str(x)) if m: return m.group(1) - else: - return str(x) + return str(x) @pytest.mark.parametrize("ndframe_method", _all_methods, ids=lambda x: idfn(x[-1])) @@ -586,7 +577,8 @@ def test_datetime_property(attr): @pytest.mark.parametrize( - "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"] + "attr", + ["days", "seconds", "microseconds", "nanoseconds", "components"], ) def test_timedelta_property(attr): s = 
pd.Series(pd.timedelta_range("2000", periods=4)) @@ -630,7 +622,8 @@ def test_categorical_accessor(method): @pytest.mark.parametrize( - "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] + "obj", + [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", @@ -649,7 +642,8 @@ def test_groupby_finalize(obj, method): @pytest.mark.parametrize( - "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})] + "obj", + [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})], ) @pytest.mark.parametrize( "method", @@ -675,3 +669,154 @@ def test_finalize_frame_series_name(): df = pd.DataFrame({"name": [1, 2]}) result = pd.Series([1, 2]).__finalize__(df) assert result.name is None + + +# ---------------------------------------------------------------------------- +# Tests for merge + + +@pytest.mark.parametrize( + ["allow_on_left", "allow_on_right"], + [(False, False), (False, True), (True, False), (True, True)], +) +@pytest.mark.parametrize( + "how", + [ + "left", + "right", + "inner", + "outer", + "left_anti", + "right_anti", + "cross", + ], +) +def test_merge_sets_duplication_allowance_flag( + how: MergeHow, + allow_on_left: bool, + allow_on_right: bool, +): + """Check that DataFrame.merge correctly sets the allow_duplicate_labels flag + on its result. + + The flag on the result should be set to true if and only if both arguments + to merge have their flags set to True. 
+ """ + # Arrange + left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left) + right = pd.DataFrame({"test": [1]}).set_flags( + allows_duplicate_labels=allow_on_right, + ) + + # Act + if not how == "cross": + result = left.merge(right, how=how, on="test") + else: + result = left.merge(right, how=how) + + # Assert + expected_duplication_allowance = allow_on_left and allow_on_right + assert result.flags.allows_duplicate_labels == expected_duplication_allowance + + +@pytest.mark.parametrize( + ["allow_on_left", "allow_on_right"], + [(False, False), (False, True), (True, False), (True, True)], +) +def test_merge_asof_sets_duplication_allowance_flag( + allow_on_left: bool, + allow_on_right: bool, +): + """Check that pandas.merge_asof correctly sets the allows_duplicate_labels flag + on its result. + + The flag on the result should be set to True if and only if both arguments + to merge_asof have their flags set to True. + """ + # Arrange + left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left) + right = pd.DataFrame({"test": [1]}).set_flags( + allows_duplicate_labels=allow_on_right, + ) + + # Act + result = pd.merge_asof(left, right) + + # Assert + expected_duplication_allowance = allow_on_left and allow_on_right + assert result.flags.allows_duplicate_labels == expected_duplication_allowance + + +def test_merge_propagates_metadata_from_equal_input_metadata(): + """Check that pandas.merge sets the metadata of its result to a deep copy of + the metadata from its left input, if the metadata from both inputs are equal. 
+ """ + # Arrange + metadata = {"a": 2} + left = pd.DataFrame({"test": [1]}) + left.attrs = metadata + right = pd.DataFrame({"test": [1]}) + right.attrs = metadata.copy() + + # Act + result = left.merge(right, how="inner", on="test") + + # Assert + assert result.attrs == metadata + left.attrs = {"b": 3} + assert result.attrs == metadata + + +def test_merge_does_not_propagate_metadata_from_unequal_input_metadata(): + """Check that the metadata for the result of pandas.merge is empty if the + metadata of the two inputs to pandas.merge is not equal. + """ + # Arrange + left = pd.DataFrame({"test": [1]}) + left.attrs = {"a": 2} + right = pd.DataFrame({"test": [1]}) + right.attrs = {"b": 3} + + # Act + result = left.merge(right, how="inner", on="test") + + # Assert + assert result.attrs == {} + + +no_metadata = pd.DataFrame({"test": [1]}) + +has_metadata = pd.DataFrame({"test": [1]}) +has_metadata.attrs = {"a": 2} + + +@pytest.mark.parametrize( + ["left", "right", "expected"], + [ + (no_metadata, has_metadata, {}), + (has_metadata, no_metadata, {}), + (no_metadata, no_metadata, {}), + ], + ids=["left-empty", "right-empty", "both-empty"], +) +def test_merge_does_not_propagate_metadata_if_one_input_has_no_metadata( + left: pd.DataFrame, + right: pd.DataFrame, + expected: dict, +): + """Check that if the metadata for at least one input to pandas.merge is empty, + the metadata of the merge result is empty as well. 
+ + (empty) (A) (A) (empty) (empty) (empty) + | | | | | | + --> merge <-- --> merge <-- --> merge <-- + | | | + (empty) (empty) (empty) + """ + # Arrange + + # Act + result = left.merge(right, how="inner", on="test") + + # Assert + assert result.attrs == expected diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index e927c17eceb76..c2d24cceeab0c 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -1,5 +1,6 @@ from copy import deepcopy from operator import methodcaller +from typing import Literal import numpy as np import pytest @@ -77,7 +78,12 @@ def test_metadata_propagation_indiv(self, monkeypatch): # merging with override # GH 6923 - def finalize(self, other, method=None, **kwargs): + def finalize( + self: DataFrame, + other: DataFrame, + method: Literal["merge", "concat"] | None = None, + **kwargs, + ): for name in self._metadata: if method == "merge": left, right = other.input_objs

AltStyle によって変換されたページ (->オリジナル) /