From eae4787701478aaceb62d136f88326f02e0cf9ad Mon Sep 17 00:00:00 2001 From: Tim Hoffmann Date: Thu, 28 Sep 2023 12:01:40 +0200 Subject: [PATCH] ENH: propagating attrs always uses deepcopy Always using a deepcopy prevents shared state and thus unintentional modification of the attrs of other objects. IMHO this safety has a higher priority than the slight performance cost of the deepcopy. The implementation now skips the copying if *attrs* are not used (i.e. an empty dict). This check takes only ~20ns. Thus, the attrs propagation mechanism has no performance impact if attrs are not used. Closes #54134. --- doc/source/whatsnew/v2.2.0.rst | 2 +- pandas/core/generic.py | 26 +++++++++++++++++++------- pandas/tests/frame/test_api.py | 9 +++++++++ 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 445b93705cde53..ffb4208ddfab89 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -77,7 +77,7 @@ Other enhancements - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`) - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`) - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`) -- +- :property:`Series.attrs` / :property:`DataFrame.attrs` now uses a deepcopy for propagation to derieved datasets. .. --------------------------------------------------------------------------- .. _whatsnew_220.notable_bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 738f4cbe6bc430..f46242bc37258f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2,6 +2,7 @@ from __future__ import annotations import collections +import copy import datetime as dt from functools import partial import gc @@ -364,6 +365,11 @@ def attrs(self) -> dict[Hashable, Any]: attrs is experimental and may change without warning. + Many operations that create new datasets will copy *attrs*. Copies are + always deep so that changing *attrs* will only affect the present + dataset. `pandas.concatenate` copies *attrs* only if all input + datasets have the same *attrs*. + See Also -------- DataFrame.flags : Global flags applying to this object. @@ -6191,8 +6197,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: stable across pandas releases. """ if isinstance(other, NDFrame): - for name in other.attrs: - self.attrs[name] = other.attrs[name] + if other.attrs: + # One could make the deepcopy unconditionally, however a deepcopy + # of an empty dict is 50x more expensive than the empty check. + # However, we want attrs propagation to have minimal performance + # impact it attrs are not used; i.e. attrs is an empty dict. + self.attrs = copy.deepcopy(other.attrs) self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. @@ -6201,11 +6211,13 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self: object.__setattr__(self, name, getattr(other, name, None)) if method == "concat": - attrs = other.objs[0].attrs - check_attrs = all(objs.attrs == attrs for objs in other.objs[1:]) - if check_attrs: - for name in attrs: - self.attrs[name] = attrs[name] + # propagate attrs only if all concat arguments have the same attrs + if all(bool(obj.attrs) for obj in other.objs): + # all concatenate arguments have non-empty attrs + attrs = other.objs[0].attrs + have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:]) + if have_same_attrs: + self.attrs = copy.deepcopy(attrs) allows_duplicate_labels = all( x.flags.allows_duplicate_labels for x in other.objs diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8fc78629beb0af..d9333792d0aeb2 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -315,6 +315,15 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + def test_attrs_deepcopy(self): + df = DataFrame({"A": [2, 3]}) + assert df.attrs == {} + df.attrs["tags"] = {'spam', 'ham'} + + result = df.rename(columns=str) + assert result.attrs == df.attrs + assert result.attrs["tags"] is not df.attrs["tags"] + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) def test_set_flags( self, allows_duplicate_labels, frame_or_series, using_copy_on_write