Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

BUG: Modifies pandas.merge to propagate flags and metadata from its inputs to its output. #62266

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
aijams wants to merge 7 commits into pandas-dev:main
base: main
Choose a base branch
Loading
from aijams:aijams-dataframe-metadata
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -1088,6 +1088,7 @@ Reshaping
- Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`)
- Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`)
- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`)
- Bug in :meth:`DataFrame.merge` where the result did not propagate metadata (``attrs``) or flags from the inputs to the merge (:issue:`28283`)
- Bug in :meth:`DataFrame.merge` with :class:`CategoricalDtype` columns incorrectly raising ``RecursionError`` (:issue:`56376`)
-

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/generic.py
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -6096,7 +6096,10 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
Parameters
----------
other : the object from which to get the attributes that we are going
to propagate
to propagate. If ``other`` has an ``input_objs`` attribute, that attribute
must be an iterable of objects that each have an ``attrs`` attribute, and
each of those ``attrs`` must be a dictionary that is equal to all of the
others.
method : str, optional
A passed method name providing context on where ``__finalize__``
was called.
Expand Down
18 changes: 17 additions & 1 deletion pandas/core/reshape/merge.py
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -1129,6 +1129,9 @@ def _reindex_and_concat(
return result

def get_result(self) -> DataFrame:
"""
Execute the merge.
"""
if self.indicator:
self.left, self.right = self._indicator_pre_merge(self.left, self.right)

Expand All @@ -1148,7 +1151,8 @@ def get_result(self) -> DataFrame:
self._maybe_restore_index_levels(result)

return result.__finalize__(
types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge"
types.SimpleNamespace(input_objs=[self.left, self.right]),
method="merge"
)

@final
Expand All @@ -1167,6 +1171,12 @@ def _indicator_name(self) -> str | None:
def _indicator_pre_merge(
self, left: DataFrame, right: DataFrame
) -> tuple[DataFrame, DataFrame]:
"""
Add one indicator column to each of the left and right inputs to a merge operation.

These columns are used to produce another column in the output of the merge, indicating
for each row of the output whether it was produced using the left, right or both inputs.
"""
columns = left.columns.union(right.columns)

for i in ["_left_indicator", "_right_indicator"]:
Expand All @@ -1193,6 +1203,12 @@ def _indicator_pre_merge(

@final
def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
"""
Add an indicator column to the merge result.

This column indicates for each row of the output whether it was produced using the left,
right or both inputs.
"""
result["_left_indicator"] = result["_left_indicator"].fillna(0)
result["_right_indicator"] = result["_right_indicator"].fillna(0)

Expand Down
141 changes: 133 additions & 8 deletions pandas/tests/generic/test_finalize.py
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -148,14 +148,6 @@
operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]),
),
(pd.DataFrame, frame_data, operator.methodcaller("map", lambda x: x)),
pytest.param(
(
pd.DataFrame,
frame_data,
operator.methodcaller("merge", pd.DataFrame({"A": [1]})),
),
marks=not_implemented_mark,
),
(pd.DataFrame, frame_data, operator.methodcaller("round", 2)),
(pd.DataFrame, frame_data, operator.methodcaller("corr")),
pytest.param(
Expand Down Expand Up @@ -675,3 +667,136 @@ def test_finalize_frame_series_name():
df = pd.DataFrame({"name": [1, 2]})
result = pd.Series([1, 2]).__finalize__(df)
assert result.name is None

# ----------------------------------------------------------------------------
# Tests for merge

@pytest.mark.parametrize(
    ["allow_duplication_on_left", "allow_duplication_on_right"],
    [
        (False, False),
        (False, True),
        (True, False),
        (True, True),
    ],
)
@pytest.mark.parametrize(
    "how",
    ["left", "right", "inner", "outer", "left_anti", "right_anti", "cross"],
)
def test_merge_sets_duplication_allowance_flag(
    how, allow_duplication_on_left, allow_duplication_on_right
):
    """
    Check that DataFrame.merge sets ``allows_duplicate_labels`` on its result.

    The flag on the result should be True if and only if both inputs to the
    merge have the flag set to True.
    """
    # Arrange
    left = pd.DataFrame({"test": [1]}).set_flags(
        allows_duplicate_labels=allow_duplication_on_left
    )
    right = pd.DataFrame({"test": [1]}).set_flags(
        allows_duplicate_labels=allow_duplication_on_right
    )

    # Act
    # A cross merge is called without join keys; every other ``how`` joins
    # on the shared "test" column.
    if how == "cross":
        result = left.merge(right, how=how)
    else:
        result = left.merge(right, how=how, on="test")

    # Assert
    expected = allow_duplication_on_left and allow_duplication_on_right
    assert result.flags.allows_duplicate_labels == expected

@pytest.mark.parametrize(
    ["allow_duplication_on_left", "allow_duplication_on_right"],
    [
        (False, False),
        (False, True),
        (True, False),
        (True, True),
    ],
)
def test_merge_asof_sets_duplication_allowance_flag(
    allow_duplication_on_left, allow_duplication_on_right
):
    """
    Check that pandas.merge_asof sets ``allows_duplicate_labels`` on its result.

    The flag on the result should be True if and only if both inputs to
    merge_asof have the flag set to True.
    """
    # Arrange
    left = pd.DataFrame({"test": [1]}).set_flags(
        allows_duplicate_labels=allow_duplication_on_left
    )
    right = pd.DataFrame({"test": [1]}).set_flags(
        allows_duplicate_labels=allow_duplication_on_right
    )

    # Act
    result = pd.merge_asof(left, right)

    # Assert
    expected = allow_duplication_on_left and allow_duplication_on_right
    assert result.flags.allows_duplicate_labels == expected

def test_merge_propagates_metadata_from_equal_input_metadata():
    """
    Check that merge propagates metadata when both inputs have equal metadata.

    The result must carry its own copy of that metadata: changing the left
    input's metadata after the merge must not change the result's metadata.
    """
    # Arrange
    # ``expected`` is never handed to a frame directly, so it cannot be
    # changed by the in-place mutation performed in the Assert step.
    expected = {"a": 2}
    left = pd.DataFrame({"test": [1]})
    left.attrs = dict(expected)
    right = pd.DataFrame({"test": [1]})
    right.attrs = dict(expected)

    # Act
    result = left.merge(right, how="inner", on="test")

    # Assert
    assert result.attrs == expected
    # Mutate in place rather than rebinding ``left.attrs``: assigning a new
    # dictionary to the input can never affect the result, so it would not
    # detect a result that shares its dictionary with the input.
    left.attrs["b"] = 3
    assert result.attrs == expected

def test_merge_does_not_propagate_metadata_from_unequal_input_metadata():
    """
    Merging two inputs whose metadata differ must give a result with empty
    metadata.
    """
    # Arrange: two single-row frames that share the join column but carry
    # different metadata.
    left_frame = pd.DataFrame({"test": [1]})
    right_frame = pd.DataFrame({"test": [1]})
    left_frame.attrs = {"a": 2}
    right_frame.attrs = {"b": 3}

    # Act
    merged = left_frame.merge(right_frame, how="inner", on="test")

    # Assert
    assert merged.attrs == {}

# Inputs shared by the parametrized cases below: one frame without metadata
# and one frame carrying ``metadata``.
no_metadata = pd.DataFrame({"test": [1]})

metadata = {"a": 2}
has_metadata = pd.DataFrame({"test": [1]})
has_metadata.attrs = metadata


@pytest.mark.parametrize(
    ["left", "right", "expected"],
    [
        pytest.param(no_metadata, has_metadata, metadata, id="left-empty"),
        pytest.param(has_metadata, no_metadata, metadata, id="right-empty"),
        pytest.param(no_metadata, no_metadata, {}, id="both-empty"),
    ],
)
def test_merge_propagates_metadata_if_one_input_has_no_metadata(
    left: pd.DataFrame, right: pd.DataFrame, expected: dict
):
    """
    Check that if the metadata for one input to pandas.merge is empty, the
    result of merge has the same metadata as the other input.

    (empty)       (A)        (A)       (empty)    (empty)     (empty)
       |           |          |           |          |           |
       --> merge <--          --> merge <--          --> merge <--
             |                      |                      |
            (A)                    (A)                  (empty)
    """
    # Act
    merged = left.merge(right, how="inner", on="test")

    # Assert
    assert merged.attrs == expected
3 changes: 2 additions & 1 deletion pandas/tests/generic/test_frame.py
View file Open in desktop
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from copy import deepcopy
from operator import methodcaller
from typing import Literal

import numpy as np
import pytest
Expand Down Expand Up @@ -77,7 +78,7 @@ def test_metadata_propagation_indiv(self, monkeypatch):
# merging with override
# GH 6923

def finalize(self, other, method=None, **kwargs):
def finalize(self: DataFrame, other: DataFrame, method: Literal["merge", "concat"] | None = None, **kwargs):
for name in self._metadata:
if method == "merge":
left, right = other.input_objs
Expand Down
Loading

AltStyle によって変換されたページ (->オリジナル) /