Skip to content

Commit

Permalink
BUG: Fix copy semantics in __array__ (pandas-dev#60046)
Browse files Browse the repository at this point in the history
Co-authored-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
seberg and jorisvandenbossche authored Nov 4, 2024
1 parent cbf6e42 commit eacf032
Show file tree
Hide file tree
Showing 20 changed files with 255 additions and 30 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ enhancement1
Other enhancements
^^^^^^^^^^^^^^^^^^

- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
updated to work correctly with NumPy >= 2 (:issue:`57739`)
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
-

Expand Down
11 changes: 10 additions & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,16 @@ def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
return self.to_numpy(dtype=dtype)
if copy is False:
# TODO: By using `zero_copy_only` it may be possible to implement this
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
elif copy is None:
# `to_numpy(copy=False)` has the meaning of NumPy `copy=None`.
copy = False

return self.to_numpy(dtype=dtype, copy=copy)

def __invert__(self) -> Self:
# This is a bit wise op for integer types
Expand Down
24 changes: 15 additions & 9 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,11 +579,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
raise ValueError("Cannot convert float NaN to integer")

elif len(self.codes) == 0 or len(self.categories) == 0:
result = np.array(
self,
dtype=dtype,
copy=copy,
)
# For NumPy 1.x compatibility we cannot use copy=None. And
# `copy=False` has the meaning of `copy=None` here:
if not copy:
result = np.asarray(self, dtype=dtype)
else:
result = np.array(self, dtype=dtype)

else:
# GH8628 (PERF): astype category codes instead of astyping array
Expand Down Expand Up @@ -1663,7 +1664,7 @@ def __array__(
Specifies the the dtype for the array.
copy : bool or None, optional
Unused.
See :func:`numpy.asarray`.
Returns
-------
Expand All @@ -1686,13 +1687,18 @@ def __array__(
>>> np.asarray(cat)
array(['a', 'b'], dtype=object)
"""
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

ret = take_nd(self.categories._values, self._codes)
if dtype and np.dtype(dtype) != self.categories.dtype:
return np.asarray(ret, dtype)
# When we're a Categorical[ExtensionArray], like Interval,
# we need to ensure __array__ gets all the way to an
# ndarray.
return np.asarray(ret)

# `take_nd` should already make a copy, so don't force again.
return np.asarray(ret, dtype=dtype)

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# for binary ops, use our custom dunder methods
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,14 @@ def __array__(
) -> np.ndarray:
# used for Timedelta/DatetimeArray, overwritten by PeriodArray
if is_object_dtype(dtype):
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
return np.array(list(self), dtype=object)

if copy is True:
return np.array(self._ndarray, dtype=dtype)
return self._ndarray

@overload
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1622,6 +1622,11 @@ def __array__(
Return the IntervalArray's data as a numpy array of Interval
objects (with dtype='object')
"""
if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

left = self._left
right = self._right
mask = self.isna()
Expand Down
12 changes: 11 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,17 @@ def __array__(
the array interface, return my values
We return an object array here to preserve our scalar values
"""
return self.to_numpy(dtype=dtype)
if copy is False:
if not self._hasna:
# special case, here we can simply return the underlying data
return np.array(self._data, dtype=dtype, copy=copy)
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

if copy is None:
copy = False # The NumPy copy=False meaning is different here.
return self.to_numpy(dtype=dtype, copy=copy)

_HANDLED_TYPES: tuple[type, ...]

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype:
def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
if copy is not None:
# Note: branch avoids `copy=None` for NumPy 1.x support
return np.array(self._ndarray, dtype=dtype, copy=copy)
return np.asarray(self._ndarray, dtype=dtype)

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
Expand Down
15 changes: 13 additions & 2 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,8 +390,19 @@ def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
if dtype == "i8":
return self.asi8
elif dtype == bool:
# For NumPy 1.x compatibility we cannot use copy=None. And
# `copy=False` has the meaning of `copy=None` here:
if not copy:
return np.asarray(self.asi8, dtype=dtype)
else:
return np.array(self.asi8, dtype=dtype)

if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

if dtype == bool:
return ~self._isnan

# This will raise TypeError for non-object dtypes
Expand Down
15 changes: 12 additions & 3 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,11 +547,20 @@ def from_spmatrix(cls, data: spmatrix) -> Self:
def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
fill_value = self.fill_value

if self.sp_index.ngaps == 0:
# Compat for na dtype and int values.
return self.sp_values
if copy is True:
return np.array(self.sp_values)
else:
return self.sp_values

if copy is False:
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)

fill_value = self.fill_value

if dtype is None:
# Can NumPy represent this type?
# If not, `np.result_type` will raise. We catch that
Expand Down
13 changes: 11 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -2015,8 +2015,17 @@ def __array__(
self, dtype: npt.DTypeLike | None = None, copy: bool | None = None
) -> np.ndarray:
values = self._values
arr = np.asarray(values, dtype=dtype)
if astype_is_view(values.dtype, arr.dtype) and self._mgr.is_single_block:
if copy is None:
# Note: branch avoids `copy=None` for NumPy 1.x support
arr = np.asarray(values, dtype=dtype)
else:
arr = np.array(values, dtype=dtype, copy=copy)

if (
copy is not True
and astype_is_view(values.dtype, arr.dtype)
and self._mgr.is_single_block
):
# Check if both conversions can be done without a copy
if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
values.dtype, arr.dtype
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -908,7 +908,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray:
"""
The array interface, return my values.
"""
return np.asarray(self._data, dtype=dtype)
if copy is None:
# Note, that the if branch exists for NumPy 1.x support
return np.asarray(self._data, dtype=dtype)

return np.array(self._data, dtype=dtype, copy=copy)

def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs):
if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs):
Expand Down
9 changes: 9 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1391,6 +1391,15 @@ def copy( # type: ignore[override]

def __array__(self, dtype=None, copy=None) -> np.ndarray:
"""the array interface, return my values"""
if copy is False:
# self.values is always a newly construct array, so raise.
raise ValueError(
"Unable to avoid copy while creating an array as requested."
)
if copy is True:
# explicit np.array call to ensure a copy is made and unique objects
# are returned, because self.values is cached
return np.array(self.values, dtype=dtype)
return self.values

def view(self, cls=None) -> Self:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def ndarray_to_mgr(
# and a subsequent `astype` will not already result in a copy
values = np.array(values, copy=True, order="F")
else:
values = np.array(values, copy=False)
values = np.asarray(values)
values = _ensure_2d(values)

else:
Expand Down
13 changes: 10 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@ def __array__(
the dtype is inferred from the data.
copy : bool or None, optional
Unused.
See :func:`numpy.asarray`.
Returns
-------
Expand Down Expand Up @@ -879,8 +879,15 @@ def __array__(
dtype='datetime64[ns]')
"""
values = self._values
arr = np.asarray(values, dtype=dtype)
if astype_is_view(values.dtype, arr.dtype):
if copy is None:
# Note: branch avoids `copy=None` for NumPy 1.x support
arr = np.asarray(values, dtype=dtype)
else:
arr = np.array(values, dtype=dtype, copy=copy)

if copy is True:
return arr
if copy is False or astype_is_view(values.dtype, arr.dtype):
arr = arr.view()
arr.flags.writeable = False
return arr
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from pandas._libs.sparse import IntIndex
from pandas.compat.numpy import np_version_gt2

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -480,3 +481,33 @@ def test_zero_sparse_column():

expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2])
tm.assert_frame_equal(result, expected)


def test_array_interface(arr_data, arr):
# https://github.com/pandas-dev/pandas/pull/60046
result = np.asarray(arr)
tm.assert_numpy_array_equal(result, arr_data)

# it always gives a copy by default
result_copy1 = np.asarray(arr)
result_copy2 = np.asarray(arr)
assert not np.may_share_memory(result_copy1, result_copy2)

# or with explicit copy=True
result_copy1 = np.array(arr, copy=True)
result_copy2 = np.array(arr, copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

if not np_version_gt2:
# copy=False semantics are only supported in NumPy>=2.
return

# for sparse arrays, copy=False is never allowed
with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
np.array(arr, copy=False)

# except when there are actually no sparse filled values
arr2 = SparseArray(np.array([1, 2, 3]))
result_nocopy1 = np.array(arr2, copy=False)
result_nocopy2 = np.array(arr2, copy=False)
assert np.may_share_memory(result_nocopy1, result_nocopy2)
8 changes: 8 additions & 0 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1152,9 +1152,17 @@ def test_array_interface(self, arr1d):
result = np.asarray(arr, dtype=object)
tm.assert_numpy_array_equal(result, expected)

# to int64 gives the underlying representation
result = np.asarray(arr, dtype="int64")
tm.assert_numpy_array_equal(result, arr.asi8)

result2 = np.asarray(arr, dtype="int64")
assert np.may_share_memory(result, result2)

result_copy1 = np.array(arr, dtype="int64", copy=True)
result_copy2 = np.array(arr, dtype="int64", copy=True)
assert not np.may_share_memory(result_copy1, result_copy2)

# to other dtypes
msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'"
with pytest.raises(TypeError, match=msg):
Expand Down
Loading

0 comments on commit eacf032

Please sign in to comment.