Skip to content

Commit

Permalink
ENH: propagating attrs always uses deepcopy
Browse files Browse the repository at this point in the history
Always using a deepcopy prevents shared state and thus unintentional
modification of the attrs of other objects. IMHO this safety has a
higher priority than the slight performance cost of the deepcopy.
The implementation now skips the copying if *attrs* are not used (i.e.
an empty dict). This check takes only ~20ns. Thus, the attrs propagation
mechanism has no performance impact if attrs are not used.

Closes pandas-dev#54134.
  • Loading branch information
Tim Hoffmann committed Sep 28, 2023
1 parent 824a273 commit eae4787
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 8 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ Other enhancements
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
-
- :property:`Series.attrs` / :property:`DataFrame.attrs` now uses a deepcopy for propagation to derieved datasets.

.. ---------------------------------------------------------------------------
.. _whatsnew_220.notable_bug_fixes:
Expand Down
26 changes: 19 additions & 7 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import collections
import copy
import datetime as dt
from functools import partial
import gc
Expand Down Expand Up @@ -364,6 +365,11 @@ def attrs(self) -> dict[Hashable, Any]:
attrs is experimental and may change without warning.
Many operations that create new datasets will copy *attrs*. Copies are
always deep so that changing *attrs* will only affect the present
dataset. `pandas.concatenate` copies *attrs* only if all input
datasets have the same *attrs*.
See Also
--------
DataFrame.flags : Global flags applying to this object.
Expand Down Expand Up @@ -6191,8 +6197,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
stable across pandas releases.
"""
if isinstance(other, NDFrame):
for name in other.attrs:
self.attrs[name] = other.attrs[name]
if other.attrs:
# One could make the deepcopy unconditionally, however a deepcopy
# of an empty dict is 50x more expensive than the empty check.
# However, we want attrs propagation to have minimal performance
# impact it attrs are not used; i.e. attrs is an empty dict.
self.attrs = copy.deepcopy(other.attrs)

self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
# For subclasses using _metadata.
Expand All @@ -6201,11 +6211,13 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
object.__setattr__(self, name, getattr(other, name, None))

if method == "concat":
attrs = other.objs[0].attrs
check_attrs = all(objs.attrs == attrs for objs in other.objs[1:])
if check_attrs:
for name in attrs:
self.attrs[name] = attrs[name]
# propagate attrs only if all concat arguments have the same attrs
if all(bool(obj.attrs) for obj in other.objs):
# all concatenate arguments have non-empty attrs
attrs = other.objs[0].attrs
have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:])
if have_same_attrs:
self.attrs = copy.deepcopy(attrs)

allows_duplicate_labels = all(
x.flags.allows_duplicate_labels for x in other.objs
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/frame/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,15 @@ def test_attrs(self):
result = df.rename(columns=str)
assert result.attrs == {"version": 1}

def test_attrs_deepcopy(self):
df = DataFrame({"A": [2, 3]})
assert df.attrs == {}
df.attrs["tags"] = {'spam', 'ham'}

result = df.rename(columns=str)
assert result.attrs == df.attrs
assert result.attrs["tags"] is not df.attrs["tags"]

@pytest.mark.parametrize("allows_duplicate_labels", [True, False, None])
def test_set_flags(
self, allows_duplicate_labels, frame_or_series, using_copy_on_write
Expand Down

0 comments on commit eae4787

Please sign in to comment.