diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 7a177344a42c7..9a524db89de85 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -74,12 +74,12 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ +- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`). - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) - Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`) -- .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 738f4cbe6bc43..2e6c8919eff38 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2,6 +2,7 @@ from __future__ import annotations import collections +from copy import deepcopy import datetime as dt from functools import partial import gc @@ -368,6 +369,13 @@ def attrs(self) -> dict[Hashable, Any]: -------- DataFrame.flags : Global flags applying to this object. + Notes + ----- + Many operations that create new datasets will copy ``attrs``. Copies + are always deep so that changing ``attrs`` will only affect the + present dataset. ``pandas.concat`` copies ``attrs`` only if all input + datasets have the same ``attrs``. + Examples -------- For Series: @@ -6191,8 +6199,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: stable across pandas releases. """ if isinstance(other, NDFrame): - for name in other.attrs: - self.attrs[name] = other.attrs[name] + if other.attrs: + # We want attrs propagation to have minimal performance + # impact if attrs are not used; i.e. attrs is an empty dict. + # One could make the deepcopy unconditionally, but a deepcopy + # of an empty dict is 50x more expensive than the empty check. + self.attrs = deepcopy(other.attrs) self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. @@ -6201,11 +6213,13 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": - attrs = other.objs[0].attrs - check_attrs = all(objs.attrs == attrs for objs in other.objs[1:]) - if check_attrs: - for name in attrs: - self.attrs[name] = attrs[name] + # propagate attrs only if all concat arguments have the same attrs + if all(bool(obj.attrs) for obj in other.objs): + # all concatenate arguments have non-empty attrs + attrs = other.objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + if have_same_attrs: + self.attrs = deepcopy(attrs) allows_duplicate_labels = all( x.flags.allows_duplicate_labels for x in other.objs diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8fc78629beb0a..06bf169bf4dbc 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -315,6 +315,15 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + def test_attrs_deepcopy(self): + df = DataFrame({"A": [2, 3]}) + assert df.attrs == {} + df.attrs["tags"] = {"spam", "ham"} + + result = df.rename(columns=str) + assert result.attrs == df.attrs + assert result.attrs["tags"] is not df.attrs["tags"] + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags( self, allows_duplicate_labels, frame_or_series, using_copy_on_write