Skip to content

Commit cb34da5

Browse files
author
Tim Hoffmann
committed
ENH: propagating attrs always uses deepcopy
Always using a deepcopy prevents shared state and thus unintentional modification of the attrs of other objects. IMHO this safety has a higher priority than the slight performance cost of the deepcopy. The implementation now skips the copying if *attrs* are not used (i.e. an empty dict). This check takes only ~20ns. Thus, the attrs propagation mechanism has no performance impact if attrs are not used. Closes #54134.
1 parent 824a273 commit cb34da5

File tree

3 files changed

+29
-8
lines changed

3 files changed

+29
-8
lines changed

doc/source/whatsnew/v2.2.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ Other enhancements
7777
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
7878
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
7979
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
80-
-
80+
- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now use a deepcopy for propagation to derived datasets (:issue:`54134`)
8181

8282
.. ---------------------------------------------------------------------------
8383
.. _whatsnew_220.notable_bug_fixes:

pandas/core/generic.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from __future__ import annotations
33

44
import collections
5+
import copy
56
import datetime as dt
67
from functools import partial
78
import gc
@@ -364,6 +365,11 @@ def attrs(self) -> dict[Hashable, Any]:
364365
365366
attrs is experimental and may change without warning.
366367
368+
Many operations that create new datasets will copy *attrs*. Copies are
369+
always deep so that changing *attrs* will only affect the present
370+
dataset. `pandas.concat` copies *attrs* only if all input
371+
datasets have the same *attrs*.
372+
367373
See Also
368374
--------
369375
DataFrame.flags : Global flags applying to this object.
@@ -6191,8 +6197,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
61916197
stable across pandas releases.
61926198
"""
61936199
if isinstance(other, NDFrame):
6194-
for name in other.attrs:
6195-
self.attrs[name] = other.attrs[name]
6200+
if other.attrs:
6201+
# We want attrs propagation to have minimal performance
6202+
# impact if attrs are not used; i.e. attrs is an empty dict.
6203+
# One could make the deepcopy unconditionally, but a deepcopy
6204+
# of an empty dict is 50x more expensive than the empty check.
6205+
self.attrs = copy.deepcopy(other.attrs)
61966206

61976207
self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
61986208
# For subclasses using _metadata.
@@ -6201,11 +6211,13 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
62016211
object.__setattr__(self, name, getattr(other, name, None))
62026212

62036213
if method == "concat":
6204-
attrs = other.objs[0].attrs
6205-
check_attrs = all(objs.attrs == attrs for objs in other.objs[1:])
6206-
if check_attrs:
6207-
for name in attrs:
6208-
self.attrs[name] = attrs[name]
6214+
# propagate attrs only if all concat arguments have the same attrs
6215+
if all(bool(obj.attrs) for obj in other.objs):
6216+
# all concatenate arguments have non-empty attrs
6217+
attrs = other.objs[0].attrs
6218+
have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:])
6219+
if have_same_attrs:
6220+
self.attrs = copy.deepcopy(attrs)
62096221

62106222
allows_duplicate_labels = all(
62116223
x.flags.allows_duplicate_labels for x in other.objs

pandas/tests/frame/test_api.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,15 @@ def test_attrs(self):
315315
result = df.rename(columns=str)
316316
assert result.attrs == {"version": 1}
317317

318+
def test_attrs_deepcopy(self):
319+
df = DataFrame({"A": [2, 3]})
320+
assert df.attrs == {}
321+
df.attrs["tags"] = {'spam', 'ham'}
322+
323+
result = df.rename(columns=str)
324+
assert result.attrs == df.attrs
325+
assert result.attrs["tags"] is not df.attrs["tags"]
326+
318327
@pytest.mark.parametrize("allows_duplicate_labels", [True, False, None])
319328
def test_set_flags(
320329
self, allows_duplicate_labels, frame_or_series, using_copy_on_write

0 commit comments

Comments
 (0)