diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index e3581876e59..5b00f1baba4 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -151,6 +151,21 @@ Bug fixes
 - Ensure :py:meth:`DataArray.unstack` works when wrapping array API-compliant
   classes. (:issue:`8666`, :pull:`8668`) By `Tom Nicholas `_.
+- Avoid coercing to numpy arrays inside :py:func:`~xarray.core.duck_array_ops.as_shared_dtype`. (:pull:`8714`)
+  By `Tom Nicholas `_.
+- Preserve chunks when writing time-like variables to zarr by enabling lazy CF
+  encoding of time-like variables (:issue:`7132`, :issue:`8230`, :issue:`8432`,
+  :pull:`8575`). By `Spencer Clark `_ and
+  `Mattia Almansi `_.
+- Preserve chunks when writing time-like variables to zarr by enabling their
+  lazy encoding (:issue:`7132`, :issue:`8230`, :issue:`8432`, :pull:`8253`,
+  :pull:`8575`; see also discussion in :pull:`8253`). By `Spencer Clark
+  `_ and `Mattia Almansi
+  `_.
+- Raise an informative error if dtype encoding of time-like variables would
+  lead to integer overflow or unsafe conversion from floating point to integer
+  values (:issue:`8542`, :pull:`8575`). By `Spencer Clark
+  `_.
 - Fix negative slicing of Zarr arrays without dask installed. (:issue:`8252`)
   By `Deepak Cherian `_.
 - Preserve chunks when writing time-like variables to zarr by enabling lazy CF encoding of time-like
diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py
index ef497e78ebf..692e74efa23 100644
--- a/xarray/core/duck_array_ops.py
+++ b/xarray/core/duck_array_ops.py
@@ -35,10 +35,10 @@

 from xarray.core import dask_array_ops, dtypes, nputils
 from xarray.core.options import OPTIONS
-from xarray.core.utils import is_duck_array, is_duck_dask_array, module_available
+from xarray.core.utils import is_duck_array, module_available
 from xarray.namedarray import pycompat
-from xarray.namedarray.parallelcompat import get_chunked_array_type
-from xarray.namedarray.pycompat import array_type, is_chunked_array
+from xarray.namedarray.parallelcompat import get_chunked_array_type, is_chunked_array
+from xarray.namedarray.pycompat import is_duck_dask_array, to_lazy_duck_array

 # remove once numpy 2.0 is the oldest supported version
 if module_available("numpy", minversion="2.0.0.dev0"):
@@ -220,22 +220,10 @@ def asarray(data, xp=np):


 def as_shared_dtype(scalars_or_arrays, xp=np):
-    """Cast a arrays to a shared dtype using xarray's type promotion rules."""
-    array_type_cupy = array_type("cupy")
-    if array_type_cupy and any(
-        isinstance(x, array_type_cupy) for x in scalars_or_arrays
-    ):
-        import cupy as cp
-
-        arrays = [asarray(x, xp=cp) for x in scalars_or_arrays]
-    else:
-        arrays = [asarray(x, xp=xp) for x in scalars_or_arrays]
-    # Pass arrays directly instead of dtypes to result_type so scalars
-    # get handled properly.
-    # Note that result_type() safely gets the dtype from dask arrays without
-    # evaluating them.
-    out_type = dtypes.result_type(*arrays)
-    return [astype(x, out_type, copy=False) for x in arrays]
+    """Cast arrays to a shared dtype using xarray's type promotion rules."""
+    duckarrays = [to_lazy_duck_array(obj, xp=xp) for obj in scalars_or_arrays]
+    out_type = dtypes.result_type(*duckarrays)
+    return [astype(x, out_type, copy=False) for x in duckarrays]


 def broadcast_to(array, shape):
diff --git a/xarray/namedarray/pycompat.py b/xarray/namedarray/pycompat.py
index 3ce33d4d8ea..418e80bd180 100644
--- a/xarray/namedarray/pycompat.py
+++ b/xarray/namedarray/pycompat.py
@@ -7,7 +7,7 @@
 import numpy as np
 from packaging.version import Version

-from xarray.core.utils import is_scalar
+from xarray.core.utils import is_scalar, module_available
 from xarray.namedarray.utils import is_duck_array, is_duck_dask_array

 integer_types = (int, np.integer)
@@ -88,6 +88,14 @@ def mod_version(mod: ModType) -> Version:
     return _get_cached_duck_array_module(mod).version


+def is_dask_collection(x) -> bool:
+    if module_available("dask"):
+        from dask.base import is_dask_collection
+
+        return is_dask_collection(x)
+    return False
+
+
 def is_chunked_array(x: duckarray[Any, Any]) -> bool:
     return is_duck_dask_array(x) or (is_duck_array(x) and hasattr(x, "chunks"))

@@ -121,8 +129,9 @@ def to_numpy(
     return data


-def to_duck_array(data: Any, **kwargs: dict[str, Any]) -> duckarray[_ShapeType, _DType]:
-    from xarray.core.indexing import ExplicitlyIndexed
+def to_duck_array(
+    data: Any, xp=np, **kwargs: dict[str, Any]
+) -> duckarray[_ShapeType, _DType]:
     from xarray.namedarray.parallelcompat import get_chunked_array_type

     if is_chunked_array(data):
@@ -130,9 +139,26 @@ def to_duck_array(data: Any, **kwargs: dict[str, Any]) -> duckarray[_ShapeType,
         loaded_data, *_ = chunkmanager.compute(data, **kwargs)  # type: ignore[var-annotated]
         return loaded_data

+    return to_lazy_duck_array(data)
+
+
+def to_lazy_duck_array(
+    data: Any, xp=np, **kwargs: dict[str, Any]
+) -> duckarray[_ShapeType, _DType]:
+    """Doesn't compute chunked data."""
+    from xarray.core.indexing import ExplicitlyIndexed
+
     if isinstance(data, ExplicitlyIndexed):
         return data.get_duck_array()  # type: ignore[no-untyped-call, no-any-return]
     elif is_duck_array(data):
         return data
     else:
-        return np.asarray(data)  # type: ignore[return-value]
+        from xarray.core.duck_array_ops import asarray
+
+        array_type_cupy = array_type("cupy")
+        if array_type_cupy and isinstance(data, array_type_cupy):
+            import cupy as cp
+
+            return asarray(data, xp=cp)
+        else:
+            return asarray(data, xp=xp)
diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index 5007db9eeb2..81beba80cf1 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -4,7 +4,9 @@
 import platform
 import string
 import warnings
+from collections.abc import Iterable
 from contextlib import contextmanager, nullcontext
+from typing import Any, Callable
 from unittest import mock  # noqa: F401

 import numpy as np
@@ -208,6 +210,105 @@ def __getitem__(self, key):
         raise UnexpectedDataAccess("Tried accessing data.")


+HANDLED_ARRAY_FUNCTIONS: dict[str, Callable] = {}
+
+
+def implements(numpy_function):
+    """Register an __array_function__ implementation for ConcatenatableArray objects."""
+
+    def decorator(func):
+        HANDLED_ARRAY_FUNCTIONS[numpy_function] = func
+        return func
+
+    return decorator
+
+
+@implements(np.concatenate)
+def concatenate(
+    arrays: Iterable[ConcatenatableArray], /, *, axis=0
+) -> ConcatenatableArray:
+    if any(not isinstance(arr, ConcatenatableArray) for arr in arrays):
+        raise TypeError
+
+    result = np.concatenate([arr.array for arr in arrays], axis=axis)
+    return ConcatenatableArray(result)
+
+
+@implements(np.stack)
+def stack(arrays: Iterable[ConcatenatableArray], /, *, axis=0) -> ConcatenatableArray:
+    if any(not isinstance(arr, ConcatenatableArray) for arr in arrays):
+        raise TypeError
+
+    result = np.stack([arr.array for arr in arrays], axis=axis)
+    return ConcatenatableArray(result)
+
+
+@implements(np.result_type)
+def result_type(*arrays_and_dtypes) -> np.dtype:
+    """Called by xarray to ensure all arguments to concat have the same dtype."""
+    first_dtype, *other_dtypes = (np.dtype(obj) for obj in arrays_and_dtypes)
+    for other_dtype in other_dtypes:
+        if other_dtype != first_dtype:
+            raise ValueError("dtypes not all consistent")
+    return first_dtype
+
+
+@implements(np.broadcast_to)
+def broadcast_to(
+    x: ConcatenatableArray, /, shape: tuple[int, ...]
+) -> ConcatenatableArray:
+    """
+    Broadcasts an array to a specified shape by broadcasting the wrapped numpy array.
+    """
+    if not isinstance(x, ConcatenatableArray):
+        raise TypeError
+
+    result = np.broadcast_to(x.array, shape=shape)
+    return ConcatenatableArray(result)
+
+
+class ConcatenatableArray(utils.NDArrayMixin):
+    """Disallows loading or coercing to an index but does support concatenation / stacking."""
+
+    # TODO only reason this is different from InaccessibleArray is to avoid it being a subclass of ExplicitlyIndexed
+
+    HANDLED_ARRAY_FUNCTIONS = [concatenate, stack, result_type]
+
+    def __init__(self, array):
+        self.array = array
+
+    def get_duck_array(self):
+        raise UnexpectedDataAccess("Tried accessing data")
+
+    def __array__(self, dtype: np.typing.DTypeLike = None):
+        raise UnexpectedDataAccess("Tried accessing data")
+
+    def __getitem__(self, key):
+        raise UnexpectedDataAccess("Tried accessing data.")
+
+    def __array_function__(self, func, types, args, kwargs) -> Any:
+        if func not in HANDLED_ARRAY_FUNCTIONS:
+            return NotImplemented
+
+        # Note: this allows subclasses that don't override
+        # __array_function__ to handle ConcatenatableArray objects
+        if not all(issubclass(t, ConcatenatableArray) for t in types):
+            return NotImplemented
+
+        return HANDLED_ARRAY_FUNCTIONS[func](*args, **kwargs)
+
+    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any:
+        """We have to define this in order to convince xarray that this class is a duckarray, even though we will never support ufuncs."""
+        return NotImplemented
+
+    def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> ConcatenatableArray:
+        """Needed because xarray will call this even when it's a no-op"""
+        if dtype != self.dtype:
+            raise NotImplementedError()
+        else:
+            return self
+
+
 class FirstElementAccessibleArray(InaccessibleArray):
     def __getitem__(self, key):
         tuple_idxr = key.tuple
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
index d9289aa6674..42c014364dd 100644
--- a/xarray/tests/test_variable.py
+++ b/xarray/tests/test_variable.py
@@ -8,6 +8,7 @@
 from typing import Generic

 import numpy as np
+import numpy.testing as npt
 import pandas as pd
 import pytest
 import pytz
@@ -31,6 +32,7 @@
 from xarray.core.variable import as_compatible_data, as_variable
 from xarray.namedarray.pycompat import array_type
 from xarray.tests import (
+    ConcatenatableArray,
     assert_allclose,
     assert_array_equal,
     assert_equal,
@@ -551,6 +553,16 @@ def test_concat_mixed_dtypes(self):
         assert_identical(expected, actual)
         assert actual.dtype == object

+    def test_concat_without_access(self):
+        a = self.cls("x", ConcatenatableArray(np.array([0, 1])))
+        b = self.cls("x", ConcatenatableArray(np.array([2, 3])))
+        actual = Variable.concat([a, b], dim="x")
+        expected_arr = np.array([0, 1, 2, 3])
+        expected = Variable("x", ConcatenatableArray(expected_arr))
+        assert isinstance(actual.data, ConcatenatableArray)
+        assert actual.dims == expected.dims == ("x",)
+        npt.assert_equal(actual.data.array, expected_arr)
+
     @pytest.mark.parametrize("deep", [True, False])
     @pytest.mark.parametrize("astype", [float, int, str])
     def test_copy(self, deep: bool, astype: type[object]) -> None:
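A minimal usage sketch of the behaviour this patch enables, mirroring the new test_concat_without_access above and assuming xarray with these changes applied: Variable.concat can combine variables backed by non-numpy duck arrays without as_shared_dtype coercing them to numpy, because casting now goes through to_lazy_duck_array and dispatches via __array_function__. ConcatenatableArray is the test-only wrapper added in xarray/tests/__init__.py by this diff.

    import numpy as np

    from xarray import Variable
    from xarray.tests import ConcatenatableArray  # test helper added above

    a = Variable("x", ConcatenatableArray(np.array([0, 1])))
    b = Variable("x", ConcatenatableArray(np.array([2, 3])))

    # Concatenation dispatches through np.result_type / np.concatenate via
    # __array_function__ instead of materialising the wrapped data as numpy.
    combined = Variable.concat([a, b], dim="x")
    assert isinstance(combined.data, ConcatenatableArray)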