diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 91c63528741..84114056312 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -123,7 +123,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v5.0.7 + uses: codecov/codecov-action@v5.1.1 with: file: mypy_report/cobertura.xml flags: mypy @@ -174,7 +174,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v5.0.7 + uses: codecov/codecov-action@v5.1.1 with: file: mypy_report/cobertura.xml flags: mypy-min @@ -230,7 +230,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v5.0.7 + uses: codecov/codecov-action@v5.1.1 with: file: pyright_report/cobertura.xml flags: pyright @@ -286,7 +286,7 @@ jobs: python -m pyright xarray/ - name: Upload pyright coverage to Codecov - uses: codecov/codecov-action@v5.0.7 + uses: codecov/codecov-action@v5.1.1 with: file: pyright_report/cobertura.xml flags: pyright39 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b0996acf6fe..ad710e36247 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -159,7 +159,9 @@ jobs: path: pytest.xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v5.0.7 + uses: codecov/codecov-action@v5.1.1 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} with: file: ./coverage.xml flags: unittests diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 30047673187..6a8b8d777c4 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -140,7 +140,7 @@ jobs: run: | python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v5.0.7 + uses: codecov/codecov-action@v5.1.1 with: file: mypy_report/cobertura.xml flags: mypy diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d543a36edd3..8d2b2ee809e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
- rev: v0.8.0 + rev: v0.8.1 hooks: - id: ruff-format - id: ruff @@ -37,7 +37,7 @@ repos: exclude: "generate_aggregations.py" additional_dependencies: ["black==24.8.0"] - repo: https://github.com/rbubley/mirrors-prettier - rev: v3.3.3 + rev: v3.4.1 hooks: - id: prettier args: [--cache-location=.prettier_cache/cache] @@ -63,3 +63,13 @@ repos: rev: ebf0b5e44d67f8beaa1cd13a0d0393ea04c6058d hooks: - id: validate-cff + - repo: https://github.com/ComPWA/taplo-pre-commit + rev: v0.9.3 + hooks: + - id: taplo-format + args: ["--option", "array_auto_collapse=false"] + - repo: https://github.com/abravalheri/validate-pyproject + rev: v0.23 + hooks: + - id: validate-pyproject + additional_dependencies: ["validate-pyproject-schema-store[all]"] diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py index 697fcb58494..4a9613ce026 100644 --- a/asv_bench/benchmarks/__init__.py +++ b/asv_bench/benchmarks/__init__.py @@ -30,13 +30,13 @@ def requires_sparse(): def randn(shape, frac_nan=None, chunks=None, seed=0): - rng = np.random.RandomState(seed) + rng = np.random.default_rng(seed) if chunks is None: x = rng.standard_normal(shape) else: import dask.array as da - rng = da.random.RandomState(seed) + rng = da.random.default_rng(seed) x = rng.standard_normal(shape, chunks=chunks) if frac_nan is not None: @@ -47,7 +47,7 @@ def randn(shape, frac_nan=None, chunks=None, seed=0): def randint(low, high=None, size=None, frac_minus=None, seed=0): - rng = np.random.RandomState(seed) - x = rng.randint(low, high, size) + rng = np.random.default_rng(seed) + x = rng.integers(low, high, size) if frac_minus is not None: inds = rng.choice(range(x.size), int(x.size * frac_minus)) diff --git a/asv_bench/benchmarks/reindexing.py b/asv_bench/benchmarks/reindexing.py index 9d0767fc3b3..61e6b2213f3 100644 --- a/asv_bench/benchmarks/reindexing.py +++ b/asv_bench/benchmarks/reindexing.py @@ -11,7 +11,7 @@ class Reindex: def setup(self): - data = np.random.RandomState(0).randn(ntime, nx, ny) + data = np.random.default_rng(0).random((ntime, nx, ny)) self.ds = xr.Dataset( {"temperature": (("time", "x", "y"), data)}, coords={"time": np.arange(ntime), "x": np.arange(nx), "y": np.arange(ny)}, diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py index dc8bc3307c3..b3af5eac19c 100644 --- a/asv_bench/benchmarks/unstacking.py +++ b/asv_bench/benchmarks/unstacking.py @@ -8,7 +8,7 @@ class Unstacking: def setup(self): - data = np.random.RandomState(0).randn(250, 500) + data = np.random.default_rng(0).random((250, 500)) self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...]) self.da_missing = self.da_full[:-1] self.df_missing = self.da_missing.to_pandas() diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml index 3bf9640ec39..fc1234b787a 100644 --- a/ci/requirements/doc.yml +++ b/ci/requirements/doc.yml @@ -20,6 +20,7 @@ dependencies: - jupyter_client - matplotlib-base - nbsphinx + - ncdata - netcdf4>=1.5 - numba - numpy>=2 diff --git a/doc/getting-started-guide/faq.rst b/doc/getting-started-guide/faq.rst index b7ffd89b74a..af3b55a4086 100644 --- a/doc/getting-started-guide/faq.rst +++ b/doc/getting-started-guide/faq.rst @@ -173,9 +173,9 @@ integration with Cartopy_. We think the design decisions we have made for xarray (namely, basing it on pandas) make it a faster and more flexible data analysis tool. That said, Iris -has some great domain specific functionality, and xarray includes -methods for converting back and forth between xarray and Iris.
See -:py:meth:`~xarray.DataArray.to_iris` for more details. +has some great domain specific functionality, and there are dedicated methods for +converting back and forth between xarray and Iris. See +:ref:`Reading and Writing Iris data <io.iris>` for more details. What other projects leverage xarray? ------------------------------------ diff --git a/doc/user-guide/computation.rst b/doc/user-guide/computation.rst index ff12902cf56..5d7002484c2 100644 --- a/doc/user-guide/computation.rst +++ b/doc/user-guide/computation.rst @@ -30,7 +30,8 @@ numpy) over all array values: .. ipython:: python arr = xr.DataArray( - np.random.RandomState(0).randn(2, 3), [("x", ["a", "b"]), ("y", [10, 20, 30])] + np.random.default_rng(0).random((2, 3)), + [("x", ["a", "b"]), ("y", [10, 20, 30])], ) arr - 3 abs(arr) diff --git a/doc/user-guide/dask.rst b/doc/user-guide/dask.rst index 3ad84133d0b..cadb7962f1c 100644 --- a/doc/user-guide/dask.rst +++ b/doc/user-guide/dask.rst @@ -292,7 +292,7 @@ work as a streaming operation, when run on arrays loaded from disk: .. ipython:: :verbatim: - In [56]: rs = np.random.RandomState(0) + In [56]: rs = np.random.default_rng(0) - In [57]: array1 = xr.DataArray(rs.randn(1000, 100000), dims=["place", "time"]) # 800MB + In [57]: array1 = xr.DataArray(rs.standard_normal((1000, 100000)), dims=["place", "time"]) # 800MB diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 6f0be112024..8561d37ed40 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -13,6 +13,8 @@ format (recommended). import os + import iris + import ncdata.iris_xarray import numpy as np import pandas as pd import xarray as xr @@ -1072,8 +1074,11 @@ Iris The Iris_ tool allows easy reading of common meteorological and climate model formats (including GRIB and UK MetOffice PP files) into ``Cube`` objects which are in many ways very -similar to ``DataArray`` objects, while enforcing a CF-compliant data model. If iris is -installed, xarray can convert a ``DataArray`` into a ``Cube`` using +similar to ``DataArray`` objects, while enforcing a CF-compliant data model. + +DataArray ``to_iris`` and ``from_iris`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If iris is installed, xarray can convert a ``DataArray`` into a ``Cube`` using :py:meth:`DataArray.to_iris`: .. ipython:: python @@ -1095,9 +1100,36 @@ Conversely, we can create a new ``DataArray`` object from a ``Cube`` using da_cube = xr.DataArray.from_iris(cube) da_cube +Ncdata +~~~~~~ +Ncdata_ provides more sophisticated means of transferring data, including entire +datasets. It uses the file saving and loading functions in both projects to provide a +more "correct" translation between them, yet with very low overhead and without +using actual disk files. -.. _Iris: https://scitools.org.uk/iris +For example: + +.. ipython:: python + :okwarning: + + ds = xr.tutorial.open_dataset("air_temperature_gradient") + cubes = ncdata.iris_xarray.cubes_from_xarray(ds) + print(cubes) + print(cubes[1]) + +.. ipython:: python + :okwarning: + + ds = ncdata.iris_xarray.cubes_to_xarray(cubes) + print(ds) + +Ncdata can also adjust file data within load and save operations, to fix data loading +problems or to control exact save formatting, without needing to modify files on disk. +See, for example, the `ncdata usage examples`_. + +.. _Iris: https://scitools.org.uk/iris +.. _Ncdata: https://ncdata.readthedocs.io/en/latest/index.html +..
_ncdata usage examples: https://github.com/pp-mo/ncdata/tree/v0.1.2?tab=readme-ov-file#correct-a-miscoded-attribute-in-iris-input OPeNDAP ------- diff --git a/doc/user-guide/pandas.rst b/doc/user-guide/pandas.rst index 5fe5e15fa63..3d4a6b42bc8 100644 --- a/doc/user-guide/pandas.rst +++ b/doc/user-guide/pandas.rst @@ -202,7 +202,7 @@ Let's take a look: .. ipython:: python - data = np.random.RandomState(0).rand(2, 3, 4) + data = np.random.default_rng(0).random((2, 3, 4)) items = list("ab") major_axis = list("mno") minor_axis = pd.date_range(start="2000", periods=4, name="date") diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 191a55a0285..f1909a67d8b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,9 +14,9 @@ What's New np.random.seed(123456) -.. _whats-new.2024.11.1: +.. _whats-new.2024.12.0: -v.2024.11.1 (unreleased) +v.2024.12.0 (unreleased) ------------------------ New Features @@ -29,7 +29,12 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ - +- Methods including ``dropna``, ``rank``, ``idxmax``, and ``idxmin`` now require + non-dimension arguments to be passed as keyword arguments. The previous + behavior, which allowed ``.idxmax('foo', 'all')``, was too easily confused with + ``'all'`` being a dimension. The updated equivalent is ``.idxmax('foo', + how='all')``. The previous behavior was deprecated in v2023.10.0. + By `Maximilian Roos <https://github.com/max-sixty>`_. Deprecations ~~~~~~~~~~~~ @@ -50,8 +55,8 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ - - +- Move non-CF related ``ensure_dtype_not_object`` from conventions to backends (:pull:`9828`). + By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_. .. _whats-new.2024.11.0: diff --git a/pyproject.toml b/pyproject.toml index 3ac1a024195..e6c63c4d010 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] authors = [ - {name = "xarray Developers", email = "xarray@googlegroups.com"}, + { name = "xarray Developers", email = "xarray@googlegroups.com" }, ] classifiers = [ "Development Status :: 5 - Production/Stable", @@ -16,7 +16,7 @@ classifiers = [ ] description = "N-D labeled arrays and datasets in Python" dynamic = ["version"] -license = {text = "Apache-2.0"} +license = { text = "Apache-2.0" } name = "xarray" readme = "README.md" requires-python = ">=3.10" @@ -50,7 +50,16 @@ dev = [ "sphinx_autosummary_accessors", "xarray[complete]", ] -io = ["netCDF4", "h5netcdf", "scipy", 'pydap; python_version<"3.10"', "zarr", "fsspec", "cftime", "pooch"] +io = [ + "netCDF4", + "h5netcdf", + "scipy", + 'pydap; python_version<"3.10"', + "zarr", + "fsspec", + "cftime", + "pooch", +] etc = ["sparse"] parallel = ["dask[complete]"] viz = ["cartopy", "matplotlib", "nc-time-axis", "seaborn"] @@ -249,7 +258,7 @@ extend-select = [ "RUF", ] extend-safe-fixes = [ - "TID252", # absolute imports + "TID252", # absolute imports ] ignore = [ "E402", # module level import not at top of file @@ -327,7 +336,9 @@ filterwarnings = [ "default:the `pandas.MultiIndex` object:FutureWarning:xarray.tests.test_variable", "default:Using a non-tuple sequence for multidimensional indexing is deprecated:FutureWarning", "default:Duplicate dimension names present:UserWarning:xarray.namedarray.core", - "default:::xarray.tests.test_strategies", # TODO: remove once we know how to deal with a changed signature in protocols + + # TODO: remove once we know how to deal with a changed signature in protocols + "default:::xarray.tests.test_strategies", ] log_cli_level = "INFO" diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 3756de90b60..58a98598a5b 100644 --- a/xarray/backends/common.py
+++ b/xarray/backends/common.py @@ -4,29 +4,36 @@ import os import time import traceback -from collections.abc import Iterable, Mapping, Sequence +from collections.abc import Hashable, Iterable, Mapping, Sequence from glob import glob -from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, overload +from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, Union, overload import numpy as np +import pandas as pd +from xarray.coding import strings, variables +from xarray.coding.variables import SerializationWarning from xarray.conventions import cf_encoder from xarray.core import indexing -from xarray.core.datatree import DataTree +from xarray.core.datatree import DataTree, Variable from xarray.core.types import ReadBuffer from xarray.core.utils import ( FrozenDict, NdimSizeLenMixin, attempt_import, + emit_user_level_warning, is_remote_uri, ) from xarray.namedarray.parallelcompat import get_chunked_array_type from xarray.namedarray.pycompat import is_chunked_array +from xarray.namedarray.utils import is_duck_dask_array if TYPE_CHECKING: from xarray.core.dataset import Dataset from xarray.core.types import NestedSequence + T_Name = Union[Hashable, None] + # Create a logger object, but don't add any handlers. Leave that to user code. logger = logging.getLogger(__name__) @@ -527,6 +534,101 @@ def set_dimensions(self, variables, unlimited_dims=None): self.set_dimension(dim, length, is_unlimited) +def _infer_dtype(array, name=None): + """Given an object array with no missing values, infer its dtype from all elements.""" + if array.dtype.kind != "O": + raise TypeError("_infer_dtype must be called on a dtype=object array") + + if array.size == 0: + return np.dtype(float) + + native_dtypes = set(np.vectorize(type, otypes=[object])(array.ravel())) + if len(native_dtypes) > 1 and native_dtypes != {bytes, str}: + raise ValueError( + "unable to infer dtype on variable {!r}; object array " + "contains mixed native types: {}".format( + name, ", ".join(x.__name__ for x in native_dtypes) + ) + ) + + element = array[(0,) * array.ndim] + # We use the base types to avoid subclasses of bytes and str (which might + # not play nice with e.g. hdf5 datatypes), such as those from numpy + if isinstance(element, bytes): + return strings.create_vlen_dtype(bytes) + elif isinstance(element, str): + return strings.create_vlen_dtype(str) + + dtype = np.array(element).dtype + if dtype.kind != "O": + return dtype + + raise ValueError( + f"unable to infer dtype on variable {name!r}; xarray " + "cannot serialize arbitrary Python objects" + ) + + +def _copy_with_dtype(data, dtype: np.typing.DTypeLike): + """Create a copy of an array with the given dtype. + + We use this instead of np.array() to ensure that custom object dtypes end + up on the resulting array. + """ + result = np.empty(data.shape, dtype) + result[...] = data + return result + + +def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable: + if var.dtype.kind == "O": + dims, data, attrs, encoding = variables.unpack_for_encoding(var) + + # leave vlen dtypes unchanged + if strings.check_vlen_dtype(data.dtype) is not None: + return var + + if is_duck_dask_array(data): + emit_user_level_warning( + f"variable {name} has data in the form of a dask array with " + "dtype=object, which means it is being loaded into memory " + "to determine a data type that can be safely stored on disk.
" "To avoid this, coerce this variable to a fixed-size dtype + "with astype() before saving it.", + category=SerializationWarning, + ) + data = data.compute() + + missing = pd.isnull(data) + if missing.any(): + # nb. this will fail for dask.array data + non_missing_values = data[~missing] + inferred_dtype = _infer_dtype(non_missing_values, name) + + # There is no safe bit-pattern for NA in typical binary string + # formats, so we can't set a fill_value. Unfortunately, this means + # we can't distinguish between missing values and empty strings. + fill_value: bytes | str + if strings.is_bytes_dtype(inferred_dtype): + fill_value = b"" + elif strings.is_unicode_dtype(inferred_dtype): + fill_value = "" + else: + # insist on using float for numeric values + if not np.issubdtype(inferred_dtype, np.floating): + inferred_dtype = np.dtype(float) + fill_value = inferred_dtype.type(np.nan) + + data = _copy_with_dtype(data, dtype=inferred_dtype) + data[missing] = fill_value + else: + data = _copy_with_dtype(data, dtype=_infer_dtype(data, name)) + + assert data.dtype.kind != "O" or data.dtype.metadata + var = Variable(dims, data, attrs, encoding, fastpath=True) + return var + + class WritableCFDataStore(AbstractWritableDataStore): __slots__ = () @@ -534,6 +636,9 @@ def encode(self, variables, attributes): # All NetCDF files get CF encoded by default, without this attempting # to write times, for example, would fail. variables, attributes = cf_encoder(variables, attributes) + variables = { + k: ensure_dtype_not_object(v, name=k) for k, v in variables.items() + } variables = {k: self.encode_variable(v) for k, v in variables.items()} attributes = {k: self.encode_attribute(v) for k, v in attributes.items()} return variables, attributes diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 1acc0a502e6..cb3ab375c31 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -19,6 +19,7 @@ _encode_variable_name, _normalize_path, datatree_from_dict_with_io_cleanup, + ensure_dtype_not_object, ) from xarray.backends.store import StoreBackendEntrypoint from xarray.core import indexing @@ -507,6 +508,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None): """ var = conventions.encode_cf_variable(var, name=name) + var = ensure_dtype_not_object(var, name=name) # zarr allows unicode, but not variable-length strings, so it's both # simpler and more compact to always encode as UTF-8 explicitly.
@@ -970,6 +972,7 @@ def store( if _zarr_v3(): # https://github.com/zarr-developers/zarr-python/pull/2113#issuecomment-2386718323 kwargs["path"] = self.zarr_group.name.lstrip("/") + kwargs["zarr_format"] = self.zarr_group.metadata.zarr_format zarr.consolidate_metadata(self.zarr_group.store, **kwargs) def sync(self): diff --git a/xarray/conventions.py b/xarray/conventions.py index b11f3d6289f..b2683e16691 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union import numpy as np -import pandas as pd from xarray.coding import strings, times, variables from xarray.coding.variables import SerializationWarning, pop_to @@ -50,41 +49,6 @@ T_DatasetOrAbstractstore = Union[Dataset, AbstractDataStore] -def _infer_dtype(array, name=None): - """Given an object array with no missing values, infer its dtype from all elements.""" - if array.dtype.kind != "O": - raise TypeError("infer_type must be called on a dtype=object array") - - if array.size == 0: - return np.dtype(float) - - native_dtypes = set(np.vectorize(type, otypes=[object])(array.ravel())) - if len(native_dtypes) > 1 and native_dtypes != {bytes, str}: - raise ValueError( - "unable to infer dtype on variable {!r}; object array " - "contains mixed native types: {}".format( - name, ", ".join(x.__name__ for x in native_dtypes) - ) - ) - - element = array[(0,) * array.ndim] - # We use the base types to avoid subclasses of bytes and str (which might - # not play nice with e.g. hdf5 datatypes), such as those from numpy - if isinstance(element, bytes): - return strings.create_vlen_dtype(bytes) - elif isinstance(element, str): - return strings.create_vlen_dtype(str) - - dtype = np.array(element).dtype - if dtype.kind != "O": - return dtype - - raise ValueError( - f"unable to infer dtype on variable {name!r}; xarray " - "cannot serialize arbitrary Python objects" - ) - - def ensure_not_multiindex(var: Variable, name: T_Name = None) -> None: # only the pandas multi-index dimension coordinate cannot be serialized (tuple values) if isinstance(var._data, indexing.PandasMultiIndexingAdapter): @@ -99,67 +63,6 @@ def ensure_not_multiindex(var: Variable, name: T_Name = None) -> None: ) -def _copy_with_dtype(data, dtype: np.typing.DTypeLike): - """Create a copy of an array with the given dtype. - - We use this instead of np.array() to ensure that custom object dtypes end - up on the resulting array. - """ - result = np.empty(data.shape, dtype) - result[...] = data - return result - - -def ensure_dtype_not_object(var: Variable, name: T_Name = None) -> Variable: - # TODO: move this from conventions to backends? (it's not CF related) - if var.dtype.kind == "O": - dims, data, attrs, encoding = variables.unpack_for_encoding(var) - - # leave vlen dtypes unchanged - if strings.check_vlen_dtype(data.dtype) is not None: - return var - - if is_duck_dask_array(data): - emit_user_level_warning( - f"variable {name} has data in the form of a dask array with " - "dtype=object, which means it is being loaded into memory " - "to determine a data type that can be safely stored on disk. " - "To avoid this, coerce this variable to a fixed-size dtype " - "with astype() before saving it.", - category=SerializationWarning, - ) - data = data.compute() - - missing = pd.isnull(data) - if missing.any(): - # nb. 
this will fail for dask.array data - non_missing_values = data[~missing] - inferred_dtype = _infer_dtype(non_missing_values, name) - - # There is no safe bit-pattern for NA in typical binary string - # formats, we so can't set a fill_value. Unfortunately, this means - # we can't distinguish between missing values and empty strings. - fill_value: bytes | str - if strings.is_bytes_dtype(inferred_dtype): - fill_value = b"" - elif strings.is_unicode_dtype(inferred_dtype): - fill_value = "" - else: - # insist on using float for numeric values - if not np.issubdtype(inferred_dtype, np.floating): - inferred_dtype = np.dtype(float) - fill_value = inferred_dtype.type(np.nan) - - data = _copy_with_dtype(data, dtype=inferred_dtype) - data[missing] = fill_value - else: - data = _copy_with_dtype(data, dtype=_infer_dtype(data, name)) - - assert data.dtype.kind != "O" or data.dtype.metadata - var = Variable(dims, data, attrs, encoding, fastpath=True) - return var - - def encode_cf_variable( var: Variable, needs_copy: bool = True, name: T_Name = None ) -> Variable: @@ -196,9 +99,6 @@ def encode_cf_variable( ]: var = coder.encode(var, name=name) - # TODO(kmuehlbauer): check if ensure_dtype_not_object can be moved to backends: - var = ensure_dtype_not_object(var, name=name) - for attr_name in CF_RELATED_DATA: pop_to(var.encoding, var.attrs, attr_name) return var diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 8bf9c68b727..2dca38538e1 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -1,6 +1,7 @@ from __future__ import annotations import math +from functools import partial from xarray.core import dtypes, nputils @@ -75,6 +76,47 @@ def least_squares(lhs, rhs, rcond=None, skipna=False): return coeffs, residuals +def _fill_with_last_one(a, b): + import numpy as np + + # cumreduction applies the push func over all the blocks first, so + # the only missing part is filling the missing values using the + # last data of the previous chunk + return np.where(np.isnan(b), a, b) + + +def _dtype_push(a, axis, dtype=None): + from xarray.core.duck_array_ops import _push + + # Not sure why the blelloch algorithm forces us to receive a dtype + return _push(a, axis=axis) + + +def _reset_cumsum(a, axis, dtype=None): + import numpy as np + + cumsum = np.cumsum(a, axis=axis) + reset_points = np.maximum.accumulate(np.where(a == 0, cumsum, 0), axis=axis) + return cumsum - reset_points + + +def _last_reset_cumsum(a, axis, keepdims=None): + import numpy as np + + # Take the last cumulative sum taking into account the reset + # This is useful for the blelloch method + return np.take(_reset_cumsum(a, axis=axis), axis=axis, indices=[-1]) + + +def _combine_reset_cumsum(a, b, axis): + import numpy as np + + # It is going to sum the previous result until the first + # non-NaN value + bitmask = np.cumprod(b != 0, axis=axis) + return np.where(bitmask, b + a, b) + + def push(array, n, axis, method="blelloch"): """ Dask-aware bottleneck.push @@ -91,16 +133,6 @@ def push(array, n, axis, method="blelloch"): # TODO: Replace all this function # once https://github.com/pydata/xarray/issues/9229 being implemented - def _fill_with_last_one(a, b): - # cumreduction apply the push func over all the blocks first so, - # the only missing part is filling the missing values using the - # last data of the previous chunk - return np.where(np.isnan(b), a, b) - - def _dtype_push(a, axis, dtype=None): - # Not sure why the blelloch algorithm force to receive a dtype - return _push(a, axis=axis) -
pushed_array = da.reductions.cumreduction( func=_dtype_push, binop=_fill_with_last_one, ident=np.nan, @@ -113,26 +145,9 @@ def _dtype_push(a, axis, dtype=None): ) if n is not None and 0 < n < array.shape[axis] - 1: - - def _reset_cumsum(a, axis, dtype=None): - cumsum = np.cumsum(a, axis=axis) - reset_points = np.maximum.accumulate(np.where(a == 0, cumsum, 0), axis=axis) - return cumsum - reset_points - - def _last_reset_cumsum(a, axis, keepdims=None): - # Take the last cumulative sum taking into account the reset - # This is useful for blelloch method - return np.take(_reset_cumsum(a, axis=axis), axis=axis, indices=[-1]) - - def _combine_reset_cumsum(a, b): - # It is going to sum the previous result until the first - # non nan value - bitmask = np.cumprod(b != 0, axis=axis) - return np.where(bitmask, b + a, b) - valid_positions = da.reductions.cumreduction( func=_reset_cumsum, - binop=_combine_reset_cumsum, + binop=partial(_combine_reset_cumsum, axis=axis), ident=0, x=da.isnan(array, dtype=int), axis=axis, diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index d743b059386..cd0428e73ca 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1026,7 +1026,6 @@ def reset_coords( drop: Literal[True], ) -> Self: ... - @_deprecate_positional_args("v2023.10.0") def reset_coords( self, names: Dims = None, @@ -1364,7 +1363,6 @@ def chunksizes(self) -> Mapping[Any, tuple[int, ...]]: all_variables = [self.variable] + [c.variable for c in self.coords.values()] return get_chunksizes(all_variables) - @_deprecate_positional_args("v2023.10.0") def chunk( self, chunks: T_ChunksFreq = {}, # noqa: B006 # {} even though it's technically unsafe, is being used intentionally here (#4667) @@ -1504,8 +1502,8 @@ def isel( See Also -------- - Dataset.isel - DataArray.sel + :func:`Dataset.isel <xarray.Dataset.isel>` + :func:`DataArray.sel <xarray.DataArray.sel>` :doc:`xarray-tutorial:intermediate/indexing/indexing` Tutorial material on indexing with Xarray objects @@ -1642,8 +1640,8 @@ def sel( See Also -------- - Dataset.sel - DataArray.isel + :func:`Dataset.sel <xarray.Dataset.sel>` + :func:`DataArray.isel <xarray.DataArray.isel>` :doc:`xarray-tutorial:intermediate/indexing/indexing` Tutorial material on indexing with Xarray objects @@ -1835,7 +1833,6 @@ def thin( ds = self._to_temp_dataset().thin(indexers, **indexers_kwargs) return self._from_temp_dataset(ds) - @_deprecate_positional_args("v2023.10.0") def broadcast_like( self, other: T_DataArrayOrSet, @@ -1948,7 +1945,6 @@ def _reindex_callback( return da - @_deprecate_positional_args("v2023.10.0") def reindex_like( self, other: T_DataArrayOrSet, @@ -2135,7 +2131,6 @@ def reindex_like( fill_value=fill_value, ) - @_deprecate_positional_args("v2023.10.0") def reindex( self, indexers: Mapping[Any, Any] | None = None, @@ -2960,7 +2955,6 @@ def stack( ) return self._from_temp_dataset(ds) - @_deprecate_positional_args("v2023.10.0") def unstack( self, dim: Dims = None, @@ -3385,7 +3379,6 @@ def drop_isel( dataset = dataset.drop_isel(indexers=indexers, **indexers_kwargs) return self._from_temp_dataset(dataset) - @_deprecate_positional_args("v2023.10.0") def dropna( self, dim: Hashable, @@ -4889,7 +4882,6 @@ def _title_for_slice(self, truncate: int = 50) -> str: return title - @_deprecate_positional_args("v2023.10.0") def diff( self, dim: Hashable, @@ -5198,7 +5190,6 @@ def sortby( ds = self._to_temp_dataset().sortby(variables, ascending=ascending) return self._from_temp_dataset(ds) - @_deprecate_positional_args("v2023.10.0") def quantile( self, q: ArrayLike, @@ -5318,7 +5309,6 @@ def quantile( ) return self._from_temp_dataset(ds) -
@_deprecate_positional_args("v2023.10.0") def rank( self, dim: Hashable, @@ -5612,8 +5602,9 @@ def map_blocks( See Also -------- - dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks - xarray.DataArray.map_blocks + :func:`dask.array.map_blocks <dask.array.map_blocks>` + :func:`xarray.apply_ufunc <xarray.apply_ufunc>` + :func:`xarray.Dataset.map_blocks <xarray.Dataset.map_blocks>` :doc:`xarray-tutorial:advanced/map_blocks/map_blocks` Advanced Tutorial on map_blocks with dask @@ -5896,7 +5887,6 @@ def pad( ) return self._from_temp_dataset(ds) - @_deprecate_positional_args("v2023.10.0") def idxmin( self, dim: Hashable | None = None, @@ -5994,7 +5984,6 @@ def idxmin( keep_attrs=keep_attrs, ) - @_deprecate_positional_args("v2023.10.0") def idxmax( self, dim: Hashable = None, @@ -6092,7 +6081,6 @@ def idxmax( keep_attrs=keep_attrs, ) - @_deprecate_positional_args("v2023.10.0") def argmin( self, dim: Dims = None, @@ -6194,7 +6182,6 @@ def argmin( else: return self._replace_maybe_drop_dims(result) - @_deprecate_positional_args("v2023.10.0") def argmax( self, dim: Dims = None, @@ -6543,7 +6530,6 @@ def curvefit( kwargs=kwargs, ) - @_deprecate_positional_args("v2023.10.0") def drop_duplicates( self, dim: Hashable | Iterable[Hashable], @@ -6905,13 +6891,13 @@ def groupby( :doc:`xarray-tutorial:fundamentals/03.2_groupby_with_xarray` Tutorial on :py:func:`~xarray.DataArray.Groupby` demonstrating reductions, transformation and comparison with :py:func:`~xarray.DataArray.resample` - DataArray.groupby_bins - Dataset.groupby - core.groupby.DataArrayGroupBy - DataArray.coarsen - pandas.DataFrame.groupby - Dataset.resample - DataArray.resample + :external:py:meth:`pandas.DataFrame.groupby <pandas.DataFrame.groupby>` + :func:`DataArray.groupby_bins <xarray.DataArray.groupby_bins>` + :func:`Dataset.groupby <xarray.Dataset.groupby>` + :func:`core.groupby.DataArrayGroupBy <xarray.core.groupby.DataArrayGroupBy>` + :func:`DataArray.coarsen <xarray.DataArray.coarsen>` + :func:`Dataset.resample <xarray.Dataset.resample>` + :func:`DataArray.resample <xarray.DataArray.resample>` """ from xarray.core.groupby import ( DataArrayGroupBy, @@ -7048,7 +7034,7 @@ def weighted(self, weights: DataArray) -> DataArrayWeighted: See Also -------- - Dataset.weighted + :func:`Dataset.weighted <xarray.Dataset.weighted>` :ref:`comput.weighted` User guide on weighted array reduction using :py:func:`~xarray.DataArray.weighted` @@ -7332,8 +7318,8 @@ def coarsen( See Also -------- - core.rolling.DataArrayCoarsen - Dataset.coarsen + :class:`core.rolling.DataArrayCoarsen <xarray.core.rolling.DataArrayCoarsen>` + :func:`Dataset.coarsen <xarray.Dataset.coarsen>` :ref:`reshape.coarsen` User guide describing :py:func:`~xarray.DataArray.coarsen` diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ae9ce105794..3077b4eadb9 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3082,8 +3082,8 @@ def isel( See Also -------- - Dataset.sel - DataArray.isel + :func:`Dataset.sel <xarray.Dataset.sel>` + :func:`DataArray.isel <xarray.DataArray.isel>` :doc:`xarray-tutorial:intermediate/indexing/indexing` Tutorial material on indexing with Xarray objects @@ -3236,8 +3236,8 @@ def sel( See Also -------- - Dataset.isel - DataArray.sel + :func:`Dataset.isel <xarray.Dataset.isel>` + :func:`DataArray.sel <xarray.DataArray.sel>` :doc:`xarray-tutorial:intermediate/indexing/indexing` Tutorial material on indexing with Xarray objects @@ -3276,9 +3276,11 @@ def _shuffle(self, dim, *, indices: GroupIndices, chunks: T_Chunks) -> Self: subset = self[[name for name in self._variables if name not in is_chunked]] no_slices: list[list[int]] = [ - list(range(*idx.indices(self.sizes[dim]))) - if isinstance(idx, slice) - else idx + ( + list(range(*idx.indices(self.sizes[dim]))) + if isinstance(idx, slice) + else idx + ) for idx in indices ] no_slices = [idx for idx in no_slices if idx] @@ -5102,7 +5104,6 @@ def set_index( variables, coord_names=coord_names,
indexes=indexes_ ) - @_deprecate_positional_args("v2023.10.0") def reset_index( self, dims_or_levels: Hashable | Sequence[Hashable], @@ -5740,7 +5741,6 @@ def _unstack_full_reindex( variables, coord_names=coord_names, indexes=indexes ) - @_deprecate_positional_args("v2023.10.0") def unstack( self, dim: Dims = None, @@ -6502,7 +6502,6 @@ def transpose( ds._variables[name] = var.transpose(*var_dims) return ds - @_deprecate_positional_args("v2023.10.0") def dropna( self, dim: Hashable, @@ -7976,7 +7975,6 @@ def _copy_attrs_from(self, other): if v in self.variables: self.variables[v].attrs = other.variables[v].attrs - @_deprecate_positional_args("v2023.10.0") def diff( self, dim: Hashable, @@ -8324,7 +8322,6 @@ def sortby( indices[key] = order if ascending else order[::-1] return aligned_self.isel(indices) - @_deprecate_positional_args("v2023.10.0") def quantile( self, q: ArrayLike, @@ -8505,7 +8502,6 @@ def quantile( ) return new.assign_coords(quantile=q) - @_deprecate_positional_args("v2023.10.0") def rank( self, dim: Hashable, @@ -9020,8 +9016,9 @@ def map_blocks( See Also -------- - dask.array.map_blocks, xarray.apply_ufunc, xarray.Dataset.map_blocks - xarray.DataArray.map_blocks + :func:`dask.array.map_blocks <dask.array.map_blocks>` + :func:`xarray.apply_ufunc <xarray.apply_ufunc>` + :func:`xarray.DataArray.map_blocks <xarray.DataArray.map_blocks>` :doc:`xarray-tutorial:advanced/map_blocks/map_blocks` Advanced Tutorial on map_blocks with dask @@ -9475,7 +9472,6 @@ def pad( attrs = self._attrs if keep_attrs else None return self._replace_with_new_dims(variables, indexes=indexes, attrs=attrs) - @_deprecate_positional_args("v2023.10.0") def idxmin( self, dim: Hashable | None = None, @@ -9574,7 +9570,6 @@ def idxmin( ) ) - @_deprecate_positional_args("v2023.10.0") def idxmax( self, dim: Hashable | None = None, @@ -10257,7 +10252,6 @@ def _wrapper(Y, *args, **kwargs): return result - @_deprecate_positional_args("v2023.10.0") def drop_duplicates( self, dim: Hashable | Iterable[Hashable], @@ -10551,13 +10545,13 @@ def groupby( :doc:`xarray-tutorial:fundamentals/03.2_groupby_with_xarray` Tutorial on :py:func:`~xarray.Dataset.Groupby` demonstrating reductions, transformation and comparison with :py:func:`~xarray.Dataset.resample`.
- Dataset.groupby_bins - DataArray.groupby - core.groupby.DatasetGroupBy - pandas.DataFrame.groupby - Dataset.coarsen - Dataset.resample - DataArray.resample + :external:py:meth:`pandas.DataFrame.groupby <pandas.DataFrame.groupby>` + :func:`Dataset.groupby_bins <xarray.Dataset.groupby_bins>` + :func:`DataArray.groupby <xarray.DataArray.groupby>` + :class:`core.groupby.DatasetGroupBy` + :func:`Dataset.coarsen <xarray.Dataset.coarsen>` + :func:`Dataset.resample <xarray.Dataset.resample>` + :func:`DataArray.resample <xarray.DataArray.resample>` """ from xarray.core.groupby import ( DatasetGroupBy, @@ -10695,7 +10689,7 @@ def weighted(self, weights: DataArray) -> DatasetWeighted: See Also -------- - DataArray.weighted + :func:`DataArray.weighted <xarray.DataArray.weighted>` :ref:`comput.weighted` User guide on weighted array reduction using :py:func:`~xarray.Dataset.weighted` @@ -10824,8 +10818,8 @@ def coarsen( See Also -------- - core.rolling.DatasetCoarsen - DataArray.coarsen + :class:`core.rolling.DatasetCoarsen` + :func:`DataArray.coarsen <xarray.DataArray.coarsen>` :ref:`reshape.coarsen` User guide describing :py:func:`~xarray.Dataset.coarsen` diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 9596d19e735..ceae79031f8 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -50,7 +50,6 @@ ) from xarray.core.variable import IndexVariable, Variable from xarray.namedarray.pycompat import is_chunked_array -from xarray.util.deprecation_helpers import _deprecate_positional_args if TYPE_CHECKING: from numpy.typing import ArrayLike @@ -1183,7 +1182,6 @@ def fillna(self, value: Any) -> T_Xarray: """ return ops.fillna(self, value) - @_deprecate_positional_args("v2023.10.0") def quantile( self, q: ArrayLike, diff --git a/xarray/core/weighted.py b/xarray/core/weighted.py index 2c6e7d4282a..269cb49a2c1 100644 --- a/xarray/core/weighted.py +++ b/xarray/core/weighted.py @@ -11,7 +11,6 @@ from xarray.core.computation import apply_ufunc, dot from xarray.core.types import Dims, T_DataArray, T_Xarray from xarray.namedarray.utils import is_duck_dask_array -from xarray.util.deprecation_helpers import _deprecate_positional_args # Weighted quantile methods are a subset of the numpy supported quantile methods.
QUANTILE_METHODS = Literal[ @@ -454,7 +453,6 @@ def _weighted_quantile_1d( def _implementation(self, func, dim, **kwargs): raise NotImplementedError("Use `Dataset.weighted` or `DataArray.weighted`") - @_deprecate_positional_args("v2023.10.0") def sum_of_weights( self, dim: Dims = None, @@ -465,7 +463,6 @@ def sum_of_weights( self._sum_of_weights, dim=dim, keep_attrs=keep_attrs ) - @_deprecate_positional_args("v2023.10.0") def sum_of_squares( self, dim: Dims = None, @@ -477,7 +474,6 @@ def sum_of_squares( self._sum_of_squares, dim=dim, skipna=skipna, keep_attrs=keep_attrs ) - @_deprecate_positional_args("v2023.10.0") def sum( self, dim: Dims = None, @@ -489,7 +485,6 @@ def sum( self._weighted_sum, dim=dim, skipna=skipna, keep_attrs=keep_attrs ) - @_deprecate_positional_args("v2023.10.0") def mean( self, dim: Dims = None, @@ -501,7 +496,6 @@ def mean( self._weighted_mean, dim=dim, skipna=skipna, keep_attrs=keep_attrs ) - @_deprecate_positional_args("v2023.10.0") def var( self, dim: Dims = None, @@ -513,7 +507,6 @@ def var( self._weighted_var, dim=dim, skipna=skipna, keep_attrs=keep_attrs ) - @_deprecate_positional_args("v2023.10.0") def std( self, dim: Dims = None, diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 2b6f854d8b5..48a5e8c4b66 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -298,12 +298,12 @@ def assert_allclose(a, b, check_default_indexes=True, **kwargs): def create_test_data( - seed: int | None = None, + seed: int = 12345, add_attrs: bool = True, dim_sizes: tuple[int, int, int] = _DEFAULT_TEST_DIM_SIZES, use_extension_array: bool = False, ) -> Dataset: - rs = np.random.RandomState(seed) + rs = np.random.default_rng(seed) _vars = { "var1": ["dim1", "dim2"], "var2": ["dim1", "dim2"], @@ -336,7 +336,7 @@ def create_test_data( "dim1", pd.Categorical( rs.choice( - list(string.ascii_lowercase[: rs.randint(1, 5)]), + list(string.ascii_lowercase[: rs.integers(1, 5)]), size=dim_sizes[0], ) ), @@ -344,7 +344,7 @@ def create_test_data( if dim_sizes == _DEFAULT_TEST_DIM_SIZES: numbers_values = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64") else: - numbers_values = rs.randint(0, 3, _dims["dim3"], dtype="int64") + numbers_values = rs.integers(0, 3, _dims["dim3"], dtype="int64") obj.coords["numbers"] = ("dim3", numbers_values) obj.encoding = {"foo": "bar"} assert_writeable(obj) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f66f7c97121..7d5636930cb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1408,6 +1408,22 @@ def test_multiindex_not_implemented(self) -> None: with self.roundtrip(ds_reset) as actual: assert_identical(actual, ds_reset) + @requires_dask + def test_string_object_warning(self) -> None: + original = Dataset( + { + "x": ( + [ + "y", + ], + np.array(["foo", "bar"], dtype=object), + ) + } + ).chunk() + with pytest.warns(SerializationWarning, match="dask array with dtype=object"): + with self.roundtrip(original) as actual: + assert_identical(original, actual) + class NetCDFBase(CFEncodedBase): """Tests for all netCDF3 and netCDF4 backends.""" @@ -2859,8 +2875,11 @@ def test_append_with_new_variable(self) -> None: # check append mode for new variable with self.create_zarr_target() as store_target: - xr.concat([ds, ds_to_append], dim="time").to_zarr( - store_target, mode="w", **self.version_kwargs + combined = xr.concat([ds, ds_to_append], dim="time") + combined.to_zarr(store_target, mode="w", **self.version_kwargs) + assert_identical( + combined, + 
xr.open_dataset(store_target, engine="zarr", **self.version_kwargs), ) ds_with_new_var.to_zarr(store_target, mode="a", **self.version_kwargs) combined = xr.concat([ds, ds_to_append], dim="time") @@ -6495,7 +6514,7 @@ def test_zarr_safe_chunk_region(tmp_path): arr.isel(a=slice(5, -1)).chunk(a=5).to_zarr(store, region="auto", mode="r+") # Test if the code is detecting the last chunk correctly - data = np.random.RandomState(0).randn(2920, 25, 53) + data = np.random.default_rng(0).random((2920, 25, 53)) ds = xr.Dataset({"temperature": (("time", "lat", "lon"), data)}) chunks = {"time": 1000, "lat": 25, "lon": 53} ds.chunk(chunks).to_zarr(store, compute=False, mode="w") diff --git a/xarray/tests/test_backends_common.py b/xarray/tests/test_backends_common.py index c7dba36ea58..dc89ecefbfe 100644 --- a/xarray/tests/test_backends_common.py +++ b/xarray/tests/test_backends_common.py @@ -1,8 +1,9 @@ from __future__ import annotations +import numpy as np import pytest -from xarray.backends.common import robust_getitem +from xarray.backends.common import _infer_dtype, robust_getitem class DummyFailure(Exception): @@ -30,3 +31,15 @@ def test_robust_getitem() -> None: array = DummyArray(failures=3) with pytest.raises(DummyFailure): robust_getitem(array, ..., catch=DummyFailure, initial_delay=1, max_retries=2) + + +@pytest.mark.parametrize( + "data", + [ + np.array([["ab", "cdef", b"X"], [1, 2, "c"]], dtype=object), + np.array([["x", 1], ["y", 2]], dtype="object"), + ], +) +def test_infer_dtype_error_on_mixed_types(data): + with pytest.raises(ValueError, match="unable to infer dtype on variable"): + _infer_dtype(data, "test") diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 4610aa62f64..fd9f6ef41ea 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -1293,9 +1293,9 @@ def covariance(x, y): (x - x.mean(axis=-1, keepdims=True)) * (y - y.mean(axis=-1, keepdims=True)) ).mean(axis=-1) - rs = np.random.RandomState(42) - array1 = da.from_array(rs.randn(4, 4), chunks=(2, 4)) - array2 = da.from_array(rs.randn(4, 4), chunks=(2, 4)) + rs = np.random.default_rng(42) + array1 = da.from_array(rs.random((4, 4)), chunks=(2, 4)) + array2 = da.from_array(rs.random((4, 4)), chunks=(2, 4)) data_array_1 = xr.DataArray(array1, dims=("x", "z")) data_array_2 = xr.DataArray(array2, dims=("y", "z")) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 5de370e23d2..e76e9a1d346 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -249,13 +249,6 @@ def test_emit_coordinates_attribute_in_encoding(self) -> None: assert enc["b"].attrs.get("coordinates") == "t" assert "coordinates" not in enc["b"].encoding - @requires_dask - def test_string_object_warning(self) -> None: - original = Variable(("x",), np.array(["foo", "bar"], dtype=object)).chunk() - with pytest.warns(SerializationWarning, match="dask array with dtype=object"): - encoded = conventions.encode_cf_variable(original) - assert_identical(original, encoded) - @requires_cftime class TestDecodeCF: @@ -604,18 +597,6 @@ def test_encoding_kwarg_fixed_width_string(self) -> None: pass -@pytest.mark.parametrize( - "data", - [ - np.array([["ab", "cdef", b"X"], [1, 2, "c"]], dtype=object), - np.array([["x", 1], ["y", 2]], dtype="object"), - ], -) -def test_infer_dtype_error_on_mixed_types(data): - with pytest.raises(ValueError, match="unable to infer dtype on variable"): - conventions._infer_dtype(data, "test") - - class 
TestDecodeCFVariableWithArrayUnits: def test_decode_cf_variable_with_array_units(self) -> None: v = Variable(["t"], [1, 2, 3], {"units": np.array(["foobar"], dtype=object)}) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 54ae80a1d9d..068f57ed42d 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -38,7 +38,7 @@ def test_raise_if_dask_computes(): - data = da.from_array(np.random.RandomState(0).randn(4, 6), chunks=(2, 2)) + data = da.from_array(np.random.default_rng(0).random((4, 6)), chunks=(2, 2)) with pytest.raises(RuntimeError, match=r"Too many computes"): with raise_if_dask_computes(): data.compute() @@ -77,7 +77,7 @@ def assertLazyAndAllClose(self, expected, actual): @pytest.fixture(autouse=True) def setUp(self): - self.values = np.random.RandomState(0).randn(4, 6) + self.values = np.random.default_rng(0).random((4, 6)) self.data = da.from_array(self.values, chunks=(2, 2)) self.eager_var = Variable(("x", "y"), self.values) @@ -791,6 +791,7 @@ def test_tokenize_duck_dask_array(self): class TestToDaskDataFrame: + @pytest.mark.xfail(reason="https://github.com/dask/dask/issues/11584") def test_to_dask_dataframe(self): # Test conversion of Datasets to dask DataFrames x = np.random.randn(10) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 73d224e9e5b..a118212c981 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2284,7 +2284,7 @@ class NdArraySubclass(np.ndarray): assert isinstance(converted_subok.data, NdArraySubclass) def test_is_null(self) -> None: - x = np.random.RandomState(42).randn(5, 6) + x = np.random.default_rng(42).standard_normal((5, 6)) x[x < 0] = np.nan original = DataArray(x, [-np.arange(5), np.arange(6)], ["x", "y"]) expected = DataArray(pd.isnull(x), [-np.arange(5), np.arange(6)], ["x", "y"]) @@ -3528,7 +3528,7 @@ def test_from_multiindex_series_sparse(self) -> None: idx = pd.MultiIndex.from_product([np.arange(3), np.arange(5)], names=["a", "b"]) series: pd.Series = pd.Series( - np.random.RandomState(0).random(len(idx)), index=idx + np.random.default_rng(0).random(len(idx)), index=idx ).sample(n=5, random_state=3) dense = DataArray.from_series(series, sparse=False) @@ -3703,8 +3703,8 @@ def test_to_dict_with_numpy_attrs(self) -> None: assert expected_attrs == actual["attrs"] def test_to_masked_array(self) -> None: - rs = np.random.RandomState(44) - x = rs.random_sample(size=(10, 20)) + rs = np.random.default_rng(44) + x = rs.random(size=(10, 20)) x_masked = np.ma.masked_where(x < 0.5, x) da = DataArray(x_masked) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 7f169f6f533..1731120a5df 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -99,7 +99,7 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: - rs = np.random.RandomState(seed) + rs = np.random.default_rng(seed) lat = [2, 1, 0] lon = [0, 1, 2] @@ -127,7 +127,7 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: ds = xr.Dataset( data_vars={ "da": xr.DataArray( - rs.rand(3, 3, nt1), + rs.random((3, 3, nt1)), coords=[lat, lon, time1], dims=["lat", "lon", "time"], ), @@ -142,7 +142,7 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: ds_to_append = xr.Dataset( data_vars={ "da": xr.DataArray( - rs.rand(3, 3, nt2), + rs.random((3, 3, nt2)), coords=[lat, lon, time2], dims=["lat", "lon", "time"], ), @@ -157,7 +157,7 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset,
Dataset]: ds_with_new_var = xr.Dataset( data_vars={ "new_var": xr.DataArray( - rs.rand(3, 3, nt1 + nt2), + rs.random((3, 3, nt1 + nt2)), coords=[lat, lon, time1.append(time2)], dims=["lat", "lon", "time"], ) @@ -294,9 +294,9 @@ def test_repr(self) -> None: numbers (dim3) int64 80B 0 1 2 0 0 1 1 2 2 3 Dimensions without coordinates: dim1 Data variables: - var1 (dim1, dim2) float64 576B -1.086 0.9973 0.283 ... 0.4684 -0.8312 - var2 (dim1, dim2) float64 576B 1.162 -1.097 -2.123 ... 1.267 0.3328 - var3 (dim3, dim1) float64 640B 0.5565 -0.2121 0.4563 ... -0.2452 -0.3616 + var1 (dim1, dim2) float64 576B -0.9891 -0.3678 1.288 ... -0.2116 0.364 + var2 (dim1, dim2) float64 576B 0.953 1.52 1.704 ... 0.1347 -0.6423 + var3 (dim3, dim1) float64 640B 0.4107 0.9941 0.1665 ... 0.716 1.555 Attributes: foo: bar""".format( data["dim3"].dtype, @@ -304,7 +304,7 @@ def test_repr(self) -> None: ) ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) - print(actual) + assert expected == actual with set_options(display_width=100): @@ -7180,13 +7180,13 @@ def test_raise_no_warning_assert_close(ds) -> None: @pytest.mark.parametrize("dask", [True, False]) @pytest.mark.parametrize("edge_order", [1, 2]) def test_differentiate(dask, edge_order) -> None: - rs = np.random.RandomState(42) + rs = np.random.default_rng(42) coord = [0.2, 0.35, 0.4, 0.6, 0.7, 0.75, 0.76, 0.8] da = xr.DataArray( - rs.randn(8, 6), + rs.random((8, 6)), dims=["x", "y"], - coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.randn(8, 6))}, + coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.random((8, 6)))}, ) if dask and has_dask: da = da.chunk({"x": 4}) @@ -7229,7 +7229,7 @@ def test_differentiate(dask, edge_order) -> None: @pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize("dask", [True, False]) def test_differentiate_datetime(dask) -> None: - rs = np.random.RandomState(42) + rs = np.random.default_rng(42) coord = np.array( [ "2004-07-13", @@ -7245,9 +7245,9 @@ def test_differentiate_datetime(dask) -> None: ) da = xr.DataArray( - rs.randn(8, 6), + rs.random((8, 6)), dims=["x", "y"], - coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.randn(8, 6))}, + coords={"x": coord, "z": 3, "x2d": (("x", "y"), rs.random((8, 6)))}, ) if dask and has_dask: da = da.chunk({"x": 4}) @@ -7279,12 +7279,12 @@ def test_differentiate_datetime(dask) -> None: @requires_cftime @pytest.mark.parametrize("dask", [True, False]) def test_differentiate_cftime(dask) -> None: - rs = np.random.RandomState(42) + rs = np.random.default_rng(42) coord = xr.cftime_range("2000", periods=8, freq="2ME") da = xr.DataArray( - rs.randn(8, 6), - coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.randn(8, 6))}, + rs.random((8, 6)), + coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.random((8, 6)))}, dims=["time", "y"], ) @@ -7308,17 +7308,17 @@ def test_differentiate_cftime(dask) -> None: @pytest.mark.parametrize("dask", [True, False]) def test_integrate(dask) -> None: - rs = np.random.RandomState(42) + rs = np.random.default_rng(42) coord = [0.2, 0.35, 0.4, 0.6, 0.7, 0.75, 0.76, 0.8] da = xr.DataArray( - rs.randn(8, 6), + rs.random((8, 6)), dims=["x", "y"], coords={ "x": coord, - "x2": (("x",), rs.randn(8)), + "x2": (("x",), rs.random(8)), "z": 3, - "x2d": (("x", "y"), rs.randn(8, 6)), + "x2d": (("x", "y"), rs.random((8, 6))), }, ) if dask and has_dask: @@ -7362,17 +7362,17 @@ def test_integrate(dask) -> None: @requires_scipy @pytest.mark.parametrize("dask", [True, False]) def test_cumulative_integrate(dask) -> None: - rs = 
np.random.RandomState(43) + rs = np.random.default_rng(43) coord = [0.2, 0.35, 0.4, 0.6, 0.7, 0.75, 0.76, 0.8] da = xr.DataArray( - rs.randn(8, 6), + rs.random((8, 6)), dims=["x", "y"], coords={ "x": coord, - "x2": (("x",), rs.randn(8)), + "x2": (("x",), rs.random(8)), "z": 3, - "x2d": (("x", "y"), rs.randn(8, 6)), + "x2d": (("x", "y"), rs.random((8, 6))), }, ) if dask and has_dask: @@ -7425,7 +7425,7 @@ def test_cumulative_integrate(dask) -> None: @pytest.mark.parametrize("dask", [True, False]) @pytest.mark.parametrize("which_datetime", ["np", "cftime"]) def test_trapezoid_datetime(dask, which_datetime) -> None: - rs = np.random.RandomState(42) + rs = np.random.default_rng(42) coord: ArrayLike if which_datetime == "np": coord = np.array( @@ -7447,8 +7447,8 @@ def test_trapezoid_datetime(dask, which_datetime) -> None: coord = xr.cftime_range("2000", periods=8, freq="2D") da = xr.DataArray( - rs.randn(8, 6), - coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.randn(8, 6))}, + rs.random((8, 6)), + coords={"time": coord, "z": 3, "t2d": (("time", "y"), rs.random((8, 6)))}, dims=["time", "y"], ) diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index a2f5631ce1b..e1306964757 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -341,16 +341,16 @@ def test_types(self, val1, val2, val3, null): def construct_dataarray(dim_num, dtype, contains_nan, dask): # dimnum <= 3 - rng = np.random.RandomState(0) + rng = np.random.default_rng(0) shapes = [16, 8, 4][:dim_num] dims = ("x", "y", "z")[:dim_num] if np.issubdtype(dtype, np.floating): - array = rng.randn(*shapes).astype(dtype) + array = rng.random(shapes).astype(dtype) elif np.issubdtype(dtype, np.integer): - array = rng.randint(0, 10, size=shapes).astype(dtype) + array = rng.integers(0, 10, size=shapes).astype(dtype) elif np.issubdtype(dtype, np.bool_): - array = rng.randint(0, 1, size=shapes).astype(dtype) + array = rng.integers(0, 1, size=shapes).astype(dtype) elif dtype is str: array = rng.choice(["a", "b", "c", "d"], size=shapes) else: diff --git a/xarray/tests/test_formatting_html.py b/xarray/tests/test_formatting_html.py index b518e7e95d9..7c9cdbeaaf5 100644 --- a/xarray/tests/test_formatting_html.py +++ b/xarray/tests/test_formatting_html.py @@ -11,7 +11,7 @@ @pytest.fixture def dataarray() -> xr.DataArray: - return xr.DataArray(np.random.RandomState(0).randn(4, 6)) + return xr.DataArray(np.random.default_rng(0).random((4, 6))) @pytest.fixture diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 1af707145d0..8c006c19dd9 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -2880,7 +2880,7 @@ def test_multiple_groupers(use_flox: bool, shuffle: bool) -> None: ) b = xr.DataArray( - np.random.RandomState(0).randn(2, 3, 4), + np.random.default_rng(0).random((2, 3, 4)), coords={"xy": (("x", "y"), [["a", "b", "c"], ["b", "c", "c"]], {"foo": "bar"})}, dims=["x", "y", "z"], ) diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index 58d8a9dcf5d..eb21cca0861 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -62,7 +62,7 @@ def ds(): def make_interpolate_example_data(shape, frac_nan, seed=12345, non_uniform=False): - rs = np.random.RandomState(seed) + rs = np.random.default_rng(seed) vals = rs.normal(size=shape) if frac_nan == 1: vals[:] = np.nan diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index c0ea96d0025..5a0edd8b972 100644 --- 
a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2382,6 +2382,7 @@ def test_dask_rolling(self, dim, window, center): assert actual.shape == expected.shape assert_equal(actual, expected) + @pytest.mark.xfail(reason="https://github.com/dask/dask/issues/11585") def test_multiindex(self): super().test_multiindex()
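
For reference, the bulk of the churn above is a mechanical migration from the legacy np.random.RandomState API to np.random.default_rng Generators. A minimal sketch of the method mapping (illustrative only, not part of the patch):

    import numpy as np

    # Legacy interface, removed throughout this patch:
    rs = np.random.RandomState(0)
    a = rs.randn(2, 3)             # standard normal
    b = rs.rand(2, 3)              # uniform on [0, 1)
    c = rs.randint(0, 10, size=5)  # integers, high end exclusive

    # Generator interface; shapes are passed as tuples:
    rng = np.random.default_rng(0)
    a = rng.standard_normal((2, 3))  # replaces .randn
    b = rng.random((2, 3))           # replaces .rand / .random_sample
    c = rng.integers(0, 10, size=5)  # replaces .randint

The two interfaces draw different streams for the same seed, which is why the expected repr values in test_dataset.py change. Several call sites also translate randn to random, swapping a normal distribution for a uniform one; that is harmless where only shapes matter, but it matters wherever the sign of the data is used (hence standard_normal in test_is_null).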
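The ensure_dtype_not_object helper moved into xarray/backends/common.py coerces object-dtype variables into something storable at save time. A rough illustration of its effect, assuming the new import location introduced by this patch (a sketch, not a test):

    import numpy as np
    import xarray as xr
    from xarray.backends.common import ensure_dtype_not_object

    # An object array of strings with one missing value...
    var = xr.Variable(("x",), np.array(["foo", None, "bar"], dtype=object))
    encoded = ensure_dtype_not_object(var, name="x")

    # ...comes back tagged with a variable-length str dtype, and the
    # missing entry filled with the empty string (there is no safe NA
    # bit-pattern for strings, as the comment in the code notes):
    print(encoded.values)  # ['foo' '' 'bar']

This is also why the dask-backed case emits SerializationWarning and computes the array: the target dtype cannot be inferred without looking at the values, as the new test_string_object_warning test exercises.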
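The keyword-only change called out in whats-new.rst is easy to check interactively; a hypothetical session (names illustrative):

    import numpy as np
    import xarray as xr

    da = xr.DataArray([1.0, np.nan, 3.0], dims="x", coords={"x": [10, 20, 30]})

    da.dropna("x", how="all")  # fine: non-dimension argument passed by keyword
    da.dropna("x", "all")      # now a TypeError rather than a deprecation warning

With the transitional _deprecate_positional_args decorator removed, the bare * already present in the method signatures enforces this directly.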