From ae41d82127ffd64d46495e45383b5fb2a00980d1 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 18 Oct 2023 04:39:30 -0600 Subject: [PATCH] Enable numbagg for reductions (#8316) Co-authored-by: Anderson Banihirwe <13301940+andersy005@users.noreply.github.com> --- xarray/core/nputils.py | 57 +++++++++++++++++++++++++++++++----------- xarray/core/options.py | 7 ++++++ 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index c49a06dfc9c..316a77ead6a 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd from numpy.core.multiarray import normalize_axis_index # type: ignore[attr-defined] +from packaging.version import Version # remove once numpy 2.0 is the oldest supported version try: @@ -18,11 +19,20 @@ try: import bottleneck as bn - _USE_BOTTLENECK = True + _BOTTLENECK_AVAILABLE = True except ImportError: # use numpy methods instead bn = np - _USE_BOTTLENECK = False + _BOTTLENECK_AVAILABLE = False + +try: + import numbagg + + _HAS_NUMBAGG = Version(numbagg.__version__) >= Version("0.5.0") +except ImportError: + # use numpy methods instead + numbagg = np + _HAS_NUMBAGG = False def _select_along_axis(values, idx, axis): @@ -161,13 +171,30 @@ def __setitem__(self, key, value): self._array[key] = np.moveaxis(value, vindex_positions, mixed_positions) -def _create_bottleneck_method(name, npmodule=np): +def _create_method(name, npmodule=np): def f(values, axis=None, **kwargs): dtype = kwargs.get("dtype", None) bn_func = getattr(bn, name, None) + nba_func = getattr(numbagg, name, None) if ( - _USE_BOTTLENECK + _HAS_NUMBAGG + and OPTIONS["use_numbagg"] + and isinstance(values, np.ndarray) + and nba_func is not None + # numbagg uses ddof=1 only, but numpy uses ddof=0 by default + and (("var" in name or "std" in name) and kwargs.get("ddof", 0) == 1) + # TODO: bool? 
+ and values.dtype.kind in "uifc" + # and values.dtype.isnative + and (dtype is None or np.dtype(dtype) == values.dtype) + ): + # numbagg does not accept the dtype or ddof arguments, so drop them + kwargs.pop("dtype", None) + kwargs.pop("ddof", None) + result = nba_func(values, axis=axis, **kwargs) + elif ( + _BOTTLENECK_AVAILABLE and OPTIONS["use_bottleneck"] and isinstance(values, np.ndarray) and bn_func is not None @@ -233,14 +260,14 @@ def least_squares(lhs, rhs, rcond=None, skipna=False): return coeffs, residuals -nanmin = _create_bottleneck_method("nanmin") -nanmax = _create_bottleneck_method("nanmax") -nanmean = _create_bottleneck_method("nanmean") -nanmedian = _create_bottleneck_method("nanmedian") -nanvar = _create_bottleneck_method("nanvar") -nanstd = _create_bottleneck_method("nanstd") -nanprod = _create_bottleneck_method("nanprod") -nancumsum = _create_bottleneck_method("nancumsum") -nancumprod = _create_bottleneck_method("nancumprod") -nanargmin = _create_bottleneck_method("nanargmin") -nanargmax = _create_bottleneck_method("nanargmax") +nanmin = _create_method("nanmin") +nanmax = _create_method("nanmax") +nanmean = _create_method("nanmean") +nanmedian = _create_method("nanmedian") +nanvar = _create_method("nanvar") +nanstd = _create_method("nanstd") +nanprod = _create_method("nanprod") +nancumsum = _create_method("nancumsum") +nancumprod = _create_method("nancumprod") +nanargmin = _create_method("nanargmin") +nanargmax = _create_method("nanargmax") diff --git a/xarray/core/options.py b/xarray/core/options.py index a197cb4da10..118a67559ad 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -27,6 +27,7 @@ "keep_attrs", "warn_for_unclosed_files", "use_bottleneck", + "use_numbagg", "use_flox", ] @@ -50,6 +51,7 @@ class T_Options(TypedDict): warn_for_unclosed_files: bool use_bottleneck: bool use_flox: bool + use_numbagg: bool OPTIONS: T_Options = { @@ -72,6 +74,7 @@ class T_Options(TypedDict): "warn_for_unclosed_files": False, "use_bottleneck": True, "use_flox": 
True, + "use_numbagg": True, } _JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"]) @@ -98,6 +101,7 @@ def _positive_integer(value: int) -> bool: "file_cache_maxsize": _positive_integer, "keep_attrs": lambda choice: choice in [True, False, "default"], "use_bottleneck": lambda value: isinstance(value, bool), + "use_numbagg": lambda value: isinstance(value, bool), "use_flox": lambda value: isinstance(value, bool), "warn_for_unclosed_files": lambda value: isinstance(value, bool), } @@ -230,6 +234,9 @@ class set_options: use_flox : bool, default: True Whether to use ``numpy_groupies`` and `flox`` to accelerate groupby and resampling reductions. + use_numbagg : bool, default: True + Whether to use ``numbagg`` to accelerate reductions. + Takes precedence over ``use_bottleneck`` when both are True. warn_for_unclosed_files : bool, default: False Whether or not to issue a warning when unclosed files are deallocated. This is mostly useful for debugging.