Skip to content

Commit

Permalink
ENH: propagating attrs always uses deepcopy
Browse files Browse the repository at this point in the history
Always using a deepcopy prevents shared state and thus unintentional
modification of the attrs of other objects. IMHO this safety has a
higher priority than the slight performance cost of the deepcopy.
The implementation now skips the copying if *attrs* are not used (i.e.
an empty dict). This check takes only ~20ns. Thus, the attrs propagation
mechanism has no performance impact if attrs are not used.

Closes pandas-dev#54134.
  • Loading branch information
Tim Hoffmann committed Oct 2, 2023
1 parent 6f0cd8d commit 554a764
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 8 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,11 @@ enhancement2
Other enhancements
^^^^^^^^^^^^^^^^^^

- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_220.notable_bug_fixes:
Expand Down
28 changes: 21 additions & 7 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
Expand Down Expand Up @@ -368,6 +369,13 @@ def attrs(self) -> dict[Hashable, Any]:
--------
DataFrame.flags : Global flags applying to this object.
Notes
-----
Many operations that create new datasets will copy ``attrs``. Copies
are always deep so that changing ``attrs`` will only affect the
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
datasets have the same ``attrs``.
Examples
--------
For Series:
Expand Down Expand Up @@ -6191,8 +6199,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
stable across pandas releases.
"""
if isinstance(other, NDFrame):
for name in other.attrs:
self.attrs[name] = other.attrs[name]
if other.attrs:
# We want attrs propagation to have minimal performance
# impact if attrs are not used; i.e. attrs is an empty dict.
# One could make the deepcopy unconditionally, but a deepcopy
# of an empty dict is 50x more expensive than the empty check.
self.attrs = deepcopy(other.attrs)

self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
# For subclasses using _metadata.
Expand All @@ -6201,11 +6213,13 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
object.__setattr__(self, name, getattr(other, name, None))

if method == "concat":
attrs = other.objs[0].attrs
check_attrs = all(objs.attrs == attrs for objs in other.objs[1:])
if check_attrs:
for name in attrs:
self.attrs[name] = attrs[name]
# propagate attrs only if all concat arguments have the same attrs
if all(bool(obj.attrs) for obj in other.objs):
# all concatenate arguments have non-empty attrs
attrs = other.objs[0].attrs
have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:])
if have_same_attrs:
self.attrs = deepcopy(attrs)

allows_duplicate_labels = all(
x.flags.allows_duplicate_labels for x in other.objs
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/frame/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,15 @@ def test_attrs(self):
result = df.rename(columns=str)
assert result.attrs == {"version": 1}

def test_attrs_deepcopy(self):
df = DataFrame({"A": [2, 3]})
assert df.attrs == {}
df.attrs["tags"] = {'spam', 'ham'}

result = df.rename(columns=str)
assert result.attrs == df.attrs
assert result.attrs["tags"] is not df.attrs["tags"]

@pytest.mark.parametrize("allows_duplicate_labels", [True, False, None])
def test_set_flags(
self, allows_duplicate_labels, frame_or_series, using_copy_on_write
Expand Down

0 comments on commit 554a764

Please sign in to comment.