From 7815e73fbfe01bbacda0156755bcebfc599580d9 Mon Sep 17 00:00:00 2001 From: Tim Hoffmann <2836374+timhoffm@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:24:46 +0100 Subject: [PATCH 1/5] BUG: Copy attrs on pd.merge() This uses the same logic as `pd.concat()`: Copy `attrs` only if all input `attrs` are identical. I've refactored the handling in __finalize__ from special-casing based on the method name (previously only "concat") to handling "other" parameters that have an `input_objs` attribute. This is a more scalable architecture compared to hard-coding method names in __finalize__. Tests added for `concat()` and `merge()`. Closes #60351. --- pandas/core/generic.py | 4 +-- pandas/core/reshape/concat.py | 8 +++--- pandas/core/reshape/merge.py | 10 +++++-- pandas/tests/frame/test_api.py | 26 ++++++++++++++++++- pandas/tests/generic/test_duplicate_labels.py | 3 --- pandas/tests/generic/test_frame.py | 8 ++++-- pandas/tests/generic/test_series.py | 2 +- 7 files changed, 47 insertions(+), 14 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 039bdf9c36ee7..2e08d075fcb45 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6053,8 +6053,8 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) - if method == "concat": - objs = other.objs + elif hasattr(other, "input_objs"): + objs = other.input_objs # propagate attrs only if all concat arguments have the same attrs if all(bool(obj.attrs) for obj in objs): # all concatenate arguments have non-empty attrs diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index e7cb7069bbc26..79f103345d2dd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -545,7 +545,7 @@ def _get_result( result = sample._constructor_from_mgr(mgr, axes=mgr.axes) result._name = name return result.__finalize__( - 
types.SimpleNamespace(objs=objs), method="concat" + types.SimpleNamespace(input_objs=objs), method="concat" ) # combine as columns in a frame @@ -566,7 +566,9 @@ def _get_result( ) df = cons(data, index=index, copy=False) df.columns = columns - return df.__finalize__(types.SimpleNamespace(objs=objs), method="concat") + return df.__finalize__( + types.SimpleNamespace(input_objs=objs), method="concat" + ) # combine block managers else: @@ -605,7 +607,7 @@ def _get_result( ) out = sample._constructor_from_mgr(new_data, axes=new_data.axes) - return out.__finalize__(types.SimpleNamespace(objs=objs), method="concat") + return out.__finalize__(types.SimpleNamespace(input_objs=objs), method="concat") def new_axes( diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6f9bb8cb24f43..2255ff32d6fd5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -10,6 +10,7 @@ ) import datetime from functools import partial +import types from typing import ( TYPE_CHECKING, Literal, @@ -1106,7 +1107,10 @@ def get_result(self) -> DataFrame: join_index, left_indexer, right_indexer = self._get_join_info() result = self._reindex_and_concat(join_index, left_indexer, right_indexer) - result = result.__finalize__(self, method=self._merge_type) + result = result.__finalize__( + types.SimpleNamespace(input_objs=[self.left, self.right]), + method=self._merge_type, + ) if self.indicator: result = self._indicator_post_merge(result) @@ -1115,7 +1119,9 @@ def get_result(self) -> DataFrame: self._maybe_restore_index_levels(result) - return result.__finalize__(self, method="merge") + return result.__finalize__( + types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge" + ) @final @cache_readonly diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2b0bf1b0576f9..af61cd75c4540 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -315,7 +315,7 @@ def test_attrs(self): result = 
df.rename(columns=str) assert result.attrs == {"version": 1} - def test_attrs_deepcopy(self): + def test_attrs_is_deepcopy(self): df = DataFrame({"A": [2, 3]}) assert df.attrs == {} df.attrs["tags"] = {"spam", "ham"} @@ -324,6 +324,30 @@ def test_attrs_deepcopy(self): assert result.attrs == df.attrs assert result.attrs["tags"] is not df.attrs["tags"] + def test_attrs_concat(self): + # concat propagates attrs if all input attrs are equal + df1 = DataFrame({"A": [2, 3]}) + df1.attrs = {"a": 1, "b": 2} + df2 = DataFrame({"A": [4, 5]}) + df2.attrs = df1.attrs.copy() + df3 = DataFrame({"A": [6, 7]}) + df3.attrs = df1.attrs.copy() + assert pd.concat([df1, df2, df3]).attrs == df1.attrs + # concat does not propagate attrs if input attrs are different + df2.attrs = {"c": 3} + assert pd.concat([df1, df2, df3]).attrs == {} + + def test_attrs_merge(self): + # merge propagates attrs if all input attrs are equal + df1 = DataFrame({"key": ["a", "b"], "val1": [1, 2]}) + df1.attrs = {"a": 1, "b": 2} + df2 = DataFrame({"key": ["a", "b"], "val2": [3, 4]}) + df2.attrs = df1.attrs.copy() + assert pd.merge(df1, df2).attrs == df1.attrs + # merge does not propagate attrs if input attrs are different + df2.attrs = {"c": 3} + assert pd.merge(df1, df2).attrs == {} + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags( self, diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py index cfa3cabbc1747..d0d1087dd8a08 100644 --- a/pandas/tests/generic/test_duplicate_labels.py +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -164,7 +164,6 @@ def test_concat(self, objs, kwargs): allows_duplicate_labels=False ), False, - marks=not_implemented, ), # false true false pytest.param( @@ -173,7 +172,6 @@ def test_concat(self, objs, kwargs): ), pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), False, - marks=not_implemented, ), # true true true ( @@ -296,7 +294,6 @@ def test_concat_raises(self): with 
pytest.raises(pd.errors.DuplicateLabelError, match=msg): pd.concat(objs, axis=1) - @not_implemented def test_merge_raises(self): a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags( allows_duplicate_labels=False diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 1d0f491529b56..e927c17eceb76 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -80,12 +80,16 @@ def test_metadata_propagation_indiv(self, monkeypatch): def finalize(self, other, method=None, **kwargs): for name in self._metadata: if method == "merge": - left, right = other.left, other.right + left, right = other.input_objs value = getattr(left, name, "") + "|" + getattr(right, name, "") object.__setattr__(self, name, value) elif method == "concat": value = "+".join( - [getattr(o, name) for o in other.objs if getattr(o, name, None)] + [ + getattr(o, name) + for o in other.input_objs + if getattr(o, name, None) + ] ) object.__setattr__(self, name, value) else: diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index 7dcdcd96cce51..2ea4b8be5cf91 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -97,7 +97,7 @@ def finalize(self, other, method=None, **kwargs): value = "+".join( [ getattr(obj, name) - for obj in other.objs + for obj in other.input_objs if getattr(obj, name, None) ] ) From bd4438a1deeef0b10f52105378dd480371f8ad49 Mon Sep 17 00:00:00 2001 From: Tim Hoffmann <2836374+timhoffm@users.noreply.github.com> Date: Wed, 4 Jun 2025 22:59:13 +0200 Subject: [PATCH 2/5] Update docs --- pandas/core/generic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 31fbca634dce8..66188d9e91232 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -330,8 +330,8 @@ def attrs(self) -> dict[Hashable, Any]: ----- Many operations that create new datasets will 
copy ``attrs``. Copies are always deep so that changing ``attrs`` will only affect the - present dataset. ``pandas.concat`` copies ``attrs`` only if all input - datasets have the same ``attrs``. + present dataset. :func:`pandas.concat` and :func:`pandas.merge` will + only copy ``attrs`` if all input datasets have the same ``attrs``. Examples -------- @@ -6092,9 +6092,9 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: elif hasattr(other, "input_objs"): objs = other.input_objs - # propagate attrs only if all concat arguments have the same attrs + # propagate attrs only if all inputs have the same attrs if all(bool(obj.attrs) for obj in objs): - # all concatenate arguments have non-empty attrs + # all inputs have non-empty attrs attrs = objs[0].attrs have_same_attrs = all(obj.attrs == attrs for obj in objs[1:]) if have_same_attrs: From 829714b1f3dc4d3e5435d6987d09cc422eacc23b Mon Sep 17 00:00:00 2001 From: Tim Hoffmann <2836374+timhoffm@users.noreply.github.com> Date: Wed, 4 Jun 2025 23:02:20 +0200 Subject: [PATCH 3/5] Add release note --- doc/source/whatsnew/v2.3.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index a7f4c53b42f58..1073c72e5a17a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,6 +32,9 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :func:`pandas.merge` propagates the ``attrs`` attribute to the result if all + inputs have identical ``attrs``, as has so far already been the case for + :func:`pandas.copy`. - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. 
called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been From d671f171931f5faf4129ce3c414bd354349a827a Mon Sep 17 00:00:00 2001 From: Tim Hoffmann <2836374+timhoffm@users.noreply.github.com> Date: Sat, 7 Jun 2025 09:55:31 +0200 Subject: [PATCH 4/5] Fix wrong link Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 1073c72e5a17a..f4251c1e656e8 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -34,7 +34,7 @@ Other enhancements - :func:`pandas.merge` propagates the ``attrs`` attribute to the result if all inputs have identical ``attrs``, as has so far already been the case for - :func:`pandas.copy`. + :func:`pandas.concat`. - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. 
called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been From ed40c8de3fee0faeb69817a07d24e557c5beca8e Mon Sep 17 00:00:00 2001 From: Tim Hoffmann <2836374+timhoffm@users.noreply.github.com> Date: Sat, 7 Jun 2025 09:57:53 +0200 Subject: [PATCH 5/5] Move release note to v3.0.0 --- doc/source/whatsnew/v2.3.0.rst | 3 --- doc/source/whatsnew/v3.0.0.rst | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index f4251c1e656e8..a7f4c53b42f58 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,9 +32,6 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ -- :func:`pandas.merge` propagates the ``attrs`` attribute to the result if all - inputs have identical ``attrs``, as has so far already been the case for - :func:`pandas.concat`. - :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`) - The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9100ce0cdc990..d424be767a696 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -28,6 +28,9 @@ Enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :func:`pandas.merge` propagates the ``attrs`` attribute to the result if all + inputs have identical ``attrs``, as has so far already been the case for + :func:`pandas.concat`. 
- :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`) - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)