Skip to content

Commit f3f0795

Browse files
timhoffmTim Hoffmann
and
Tim Hoffmann
authored
ENH: propagating attrs always uses deepcopy (#55314)
Always using a deepcopy prevents shared state and thus unintentional modification of the attrs of other objects. IMHO this safety has a higher priority than the slight performance cost of the deepcopy. The implementation now skips the copying if *attrs* are not used (i.e. an empty dict). This check takes only ~20ns. Thus, the attrs propagation mechanism has no performance impact if attrs are not used. Closes #54134. Co-authored-by: Tim Hoffmann <tim.hoffmann@zeiss.com>
1 parent 06cdcc0 commit f3f0795

File tree

3 files changed

+31
-8
lines changed

3 files changed

+31
-8
lines changed

doc/source/whatsnew/v2.2.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,12 @@ enhancement2
7474
Other enhancements
7575
^^^^^^^^^^^^^^^^^^
7676

77+
- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now use a deepcopy for propagating ``attrs`` (:issue:`54134`).
7778
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
7879
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
7980
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
8081
- :meth:`DataFrame.apply` now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
8182
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
82-
-
8383

8484
.. ---------------------------------------------------------------------------
8585
.. _whatsnew_220.notable_bug_fixes:

pandas/core/generic.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from __future__ import annotations
33

44
import collections
5+
from copy import deepcopy
56
import datetime as dt
67
from functools import partial
78
import gc
@@ -368,6 +369,13 @@ def attrs(self) -> dict[Hashable, Any]:
368369
--------
369370
DataFrame.flags : Global flags applying to this object.
370371
372+
Notes
373+
-----
374+
Many operations that create new datasets will copy ``attrs``. Copies
375+
are always deep so that changing ``attrs`` will only affect the
376+
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
377+
datasets have the same ``attrs``.
378+
371379
Examples
372380
--------
373381
For Series:
@@ -6191,8 +6199,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
61916199
stable across pandas releases.
61926200
"""
61936201
if isinstance(other, NDFrame):
6194-
for name in other.attrs:
6195-
self.attrs[name] = other.attrs[name]
6202+
if other.attrs:
6203+
# We want attrs propagation to have minimal performance
6204+
# impact if attrs are not used; i.e. attrs is an empty dict.
6205+
# One could make the deepcopy unconditionally, but a deepcopy
6206+
# of an empty dict is 50x more expensive than the empty check.
6207+
self.attrs = deepcopy(other.attrs)
61966208

61976209
self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
61986210
# For subclasses using _metadata.
@@ -6201,11 +6213,13 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
62016213
object.__setattr__(self, name, getattr(other, name, None))
62026214

62036215
if method == "concat":
6204-
attrs = other.objs[0].attrs
6205-
check_attrs = all(objs.attrs == attrs for objs in other.objs[1:])
6206-
if check_attrs:
6207-
for name in attrs:
6208-
self.attrs[name] = attrs[name]
6216+
# propagate attrs only if all concat arguments have the same attrs
6217+
if all(bool(obj.attrs) for obj in other.objs):
6218+
# all concatenate arguments have non-empty attrs
6219+
attrs = other.objs[0].attrs
6220+
have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:])
6221+
if have_same_attrs:
6222+
self.attrs = deepcopy(attrs)
62096223

62106224
allows_duplicate_labels = all(
62116225
x.flags.allows_duplicate_labels for x in other.objs

pandas/tests/frame/test_api.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,15 @@ def test_attrs(self):
315315
result = df.rename(columns=str)
316316
assert result.attrs == {"version": 1}
317317

318+
def test_attrs_deepcopy(self):
319+
df = DataFrame({"A": [2, 3]})
320+
assert df.attrs == {}
321+
df.attrs["tags"] = {"spam", "ham"}
322+
323+
result = df.rename(columns=str)
324+
assert result.attrs == df.attrs
325+
assert result.attrs["tags"] is not df.attrs["tags"]
326+
318327
@pytest.mark.parametrize("allows_duplicate_labels", [True, False, None])
319328
def test_set_flags(
320329
self, allows_duplicate_labels, frame_or_series, using_copy_on_write

0 commit comments

Comments
 (0)