Merge branch 'master' into fix-agg-sum

Ouranosinc · Feb 16, 2024 · 801895f · 801895f
2 parents 96bf746 + e096bbf
commit 801895f
Show file tree

Hide file tree

Showing 9 changed files with 108 additions and 12 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -22,6 +22,8 @@ New features and enhancements
 * New ``xclim.core.calendar.stack_periods`` and ``unstack_periods`` for performing ``rolling(time=...).construct(..., stride=...)`` but with non-uniform temporal periods like years or months. They replace ``xclim.sdba.processing.construct_moving_yearly_window`` and ``unpack_moving_yearly_window`` which are deprecated and will be removed in a future release.
 * New ``as_dataset`` options for ``xclim.set_options``. When True, indicators will output Datasets instead of DataArrays. (:issue:`1257`, :pull:`1625`).
 * Added new option for UTCI calculation to cap low wind velocities to a minimum of 0.5 m/s following Bröde (2012) guidelines. (:issue:`1634`, :pull:`1635`).
+* Added option ``never_reached`` to ``degree_days_exceedance_date`` to assign a custom value when the sum threshold is never reached. (:issue:`1459`, :pull:`1647`).
+* Added option ``min_members`` to ensemble statistics to mask elements when the number of valid members is under a threshold. (:issue:`1459`, :pull:`1647`).
 
 Breaking changes
 ^^^^^^^^^^^^^^^^

diff --git a/pyproject.toml b/pyproject.toml
@@ -124,7 +124,7 @@ target-version = [
 ]
 
 [tool.bumpversion]
-current_version = "0.47.5-dev.17"
+current_version = "0.47.5-dev.18"
 commit = true
 commit_args = "--no-verify"
 tag = false

diff --git a/tests/test_ensembles.py b/tests/test_ensembles.py
@@ -290,6 +290,34 @@ def test_calc_mean_std_min_max(self, ensemble_dataset_objects, open_dataset):
             out1.tg_mean_min[0, 5, 5], out2.tg_mean_min[0, 5, 5]
         )
 
+    @pytest.mark.parametrize(
+        "aggfunc", [ensembles.ensemble_percentiles, ensembles.ensemble_mean_std_max_min]
+    )
+    def test_stats_min_members(self, ensemble_dataset_objects, open_dataset, aggfunc):
+        ds_all = [open_dataset(n) for n in ensemble_dataset_objects["nc_files_simple"]]
+        ens = ensembles.create_ensemble(ds_all).isel(lat=0, lon=0)
+        ens = ens.where(ens.realization > 0)
+        ens = xr.where((ens.realization == 1) & (ens.time.dt.year == 1950), np.NaN, ens)
+
+        def first(ds):
+            return ds[list(ds.data_vars.keys())[0]]
+
+        # Default, no masking
+        out = first(aggfunc(ens))
+        assert not out.isnull().any()
+
+        # A number
+        out = first(aggfunc(ens, min_members=3))
+        # Only 1950 is null
+        np.testing.assert_array_equal(
+            out.isnull(), [True] + [False] * (ens.time.size - 1)
+        )
+
+        # Special value
+        out = first(aggfunc(ens, min_members=None))
+        # All null
+        assert out.isnull().all()
+
 
 @pytest.mark.slow
 class TestEnsembleReduction:

diff --git a/tests/test_temperature.py b/tests/test_temperature.py
@@ -1289,6 +1289,27 @@ def test_degree_days_exceedance_date(open_dataset):
         np.testing.assert_array_equal(out, np.array([[np.nan, 280, 241, 244]]).T)
 
 
+@pytest.mark.parametrize(
+    "never_reached,exp", [(None, np.NaN), (300, 300), ("12-01", 335)]
+)
+def test_degree_days_exceedance_date_never_reached(open_dataset, never_reached, exp):
+    tas = open_dataset("FWI/GFWED_sample_2017.nc").tas
+    tas.attrs.update(
+        cell_methods="time: mean within days", standard_name="air_temperature"
+    )
+    # Default -> NaN
+    out = atmos.degree_days_exceedance_date(
+        tas=tas,
+        thresh="4 degC",
+        op=">",
+        sum_thresh="1000 K days",
+        after_date="07-01",
+        never_reached=never_reached,
+        freq="YS",
+    ).squeeze("time")
+    np.testing.assert_array_equal(out, np.array([exp, 242, 222, 223]))
+
+
 class TestWarmSpellDurationIndex:
     def test_warm_spell_duration_index(self, open_dataset):
         tasmax = open_dataset("ERA5/daily_surface_cancities_1990-1993.nc").tasmax

diff --git a/xclim/__init__.py b/xclim/__init__.py
@@ -16,7 +16,7 @@
 
 __author__ = """Travis Logan"""
 __email__ = "[email protected]"
-__version__ = "0.47.5-dev.17"
+__version__ = "0.47.5-dev.18"
 
 
 _module_data = _files("xclim.data")

diff --git a/xclim/core/calendar.py b/xclim/core/calendar.py
@@ -39,6 +39,7 @@
     "datetime_to_decimal_year",
     "days_in_year",
     "days_since_to_doy",
+    "doy_from_string",
     "doy_to_days_since",
     "ensure_cftime_array",
     "get_calendar",
@@ -88,6 +89,12 @@ def days_in_year(year: int, calendar: str = "default") -> int:
     )
 
 
+def doy_from_string(doy: DayOfYearStr, year: int, calendar: str) -> int:
+    """Return the day-of-year corresponding to a "MM-DD" string for a given year and calendar."""
+    MM, DD = doy.split("-")
+    return datetime_classes[calendar](year, int(MM), int(DD)).timetuple().tm_yday
+
+
 def date_range(
     *args, calendar: str = "default", **kwargs
 ) -> pd.DatetimeIndex | CFTimeIndex:

diff --git a/xclim/core/utils.py b/xclim/core/utils.py
@@ -139,9 +139,9 @@ def decorator(func):
         def wrapper(*args, **kwargs):
             msg = (
                 f"`{func.__name__}` is deprecated"
-                f"{' from version {}'.format(from_version) if from_version else ''} "
+                f"{f' from version {from_version}' if from_version else ''} "
                 "and will be removed in a future version of xclim"
-                f"{'. Use `{}` instead'.format(suggested) if suggested else ''}. "
+                f"{f'. Use `{suggested}` instead' if suggested else ''}. "
                 "Please update your scripts accordingly."
             )
             warnings.warn(
@@ -683,6 +683,9 @@ def infer_kind_from_parameter(param) -> InputKind:
     if annot == {"Quantified"}:
         return InputKind.QUANTIFIED
 
+    if "DayOfYearStr" in annot:
+        return InputKind.DAY_OF_YEAR
+
     if annot.issubset({"int", "float"}):
         return InputKind.NUMBER
 
@@ -692,9 +695,6 @@ def infer_kind_from_parameter(param) -> InputKind:
     if annot == {"str"}:
         return InputKind.STRING
 
-    if annot == {"DayOfYearStr"}:
-        return InputKind.DAY_OF_YEAR
-
     if annot == {"DateStr"}:
         return InputKind.DATE
 

diff --git a/xclim/ensembles/_base.py b/xclim/ensembles/_base.py
@@ -127,7 +127,7 @@ def create_ensemble(
 
 
 def ensemble_mean_std_max_min(
-    ens: xr.Dataset, weights: xr.DataArray | None = None
+    ens: xr.Dataset, min_members: int | None = 1, weights: xr.DataArray | None = None
 ) -> xr.Dataset:
     """Calculate ensemble statistics between a results from an ensemble of climate simulations.
 
@@ -138,6 +138,10 @@ def ensemble_mean_std_max_min(
     ----------
     ens : xr.Dataset
         Ensemble dataset (see xclim.ensembles.create_ensemble).
+    min_members : int, optional
+        The minimum number of valid ensemble members for a statistic to be valid.
+        Passing None is equivalent to setting min_members to the size of the realization dimension.
+        The default (1) essentially skips this check.
     weights : xr.DataArray, optional
         Weights to apply along the 'realization' dimension. This array cannot contain missing values.
 
@@ -158,6 +162,8 @@ def ensemble_mean_std_max_min(
         # Calculate ensemble statistics:
         ens_mean_std = ensemble_mean_std_max_min(ens)
     """
+    if min_members is None:
+        min_members = ens.realization.size
     ds_out = xr.Dataset(attrs=ens.attrs)
     for v in ens.data_vars:
         if weights is None:
@@ -170,9 +176,14 @@ def ensemble_mean_std_max_min(
         ds_out[f"{v}_max"] = ens[v].max(dim="realization")
         ds_out[f"{v}_min"] = ens[v].min(dim="realization")
 
+        if min_members != 1:
+            enough = ens[v].notnull().sum("realization") >= min_members
+
         # Re-add attributes
         for stat in ["mean", "stdev", "max", "min"]:
             vv = f"{v}_{stat}"
+            if min_members != 1:
+                ds_out[vv] = ds_out[vv].where(enough)
             ds_out[vv].attrs = ens[v].attrs
             if "description" in ds_out[vv].attrs.keys():
                 vv.split()
@@ -182,6 +193,7 @@ def ensemble_mean_std_max_min(
                     + vv.split("_")[-1]
                     + " of ensemble"
                 )
+
     ds_out.attrs["history"] = update_history(
         f"Computation of statistics on {ens.realization.size} ensemble members.", ds_out
     )
@@ -192,6 +204,7 @@ def ensemble_percentiles(
     ens: xr.Dataset | xr.DataArray,
     values: Sequence[int] | None = None,
     keep_chunk_size: bool | None = None,
+    min_members: int | None = 1,
     weights: xr.DataArray | None = None,
     split: bool = True,
 ) -> xr.DataArray | xr.Dataset:
@@ -211,6 +224,10 @@ def ensemble_percentiles(
         so that the chunks keep the same size (approximately).
         If False, no shrinking is performed, resulting in much larger chunks.
         If not defined, the function decides which is best.
+    min_members : int, optional
+        The minimum number of valid ensemble members for a statistic to be valid.
+        Passing None is equivalent to setting min_members to the size of the realization dimension.
+        The default (1) essentially skips this check.
     weights : xr.DataArray, optional
         Weights to apply along the 'realization' dimension. This array cannot contain missing values.
         When given, the function uses xarray's quantile method which is slower than xclim's NaN-optimized algorithm.
@@ -244,6 +261,8 @@ def ensemble_percentiles(
     """
     if values is None:
         values = [10, 50, 90]
+    if min_members is None:
+        min_members = ens.realization.size
 
     if isinstance(ens, xr.Dataset):
         out = xr.merge(
@@ -253,6 +272,7 @@ def ensemble_percentiles(
                     values,
                     keep_chunk_size=keep_chunk_size,
                     split=split,
+                    min_members=min_members,
                     weights=weights,
                 )
                 for da in ens.data_vars.values()
@@ -313,6 +333,8 @@ def ensemble_percentiles(
                 .rename({"quantile": "percentiles"})
             )
 
+    if min_members != 1:
+        out = out.where(ens.notnull().sum("realization") >= min_members)
     out = out.assign_coords(
         percentiles=xr.DataArray(list(values), dims=("percentiles",))
     )

diff --git a/xclim/indices/_threshold.py b/xclim/indices/_threshold.py
@@ -6,7 +6,7 @@
 import numpy as np
 import xarray
 
-from xclim.core.calendar import get_calendar, select_time
+from xclim.core.calendar import doy_from_string, get_calendar, select_time
 from xclim.core.missing import at_least_n_valid
 from xclim.core.units import (
     convert_units_to,
@@ -2966,6 +2966,7 @@ def degree_days_exceedance_date(
     sum_thresh: Quantified = "25 K days",
     op: str = ">",
     after_date: DayOfYearStr | None = None,
+    never_reached: DayOfYearStr | int | None = None,
     freq: str = "YS",
 ) -> xarray.DataArray:
     r"""Degree-days exceedance date.
@@ -2986,7 +2987,12 @@ def degree_days_exceedance_date(
         equivalent to '<', they are computed as `thresh - tas`.
     after_date: str, optional
         Date at which to start the cumulative sum.
-        In "mm-dd" format, defaults to the start of the sampling period.
+        In "MM-DD" format, defaults to the start of the sampling period.
+    never_reached: int, str, optional
+        What to do when `sum_thresh` is never exceeded.
+        If an int, the value to assign as a day-of-year.
+        If a string, must be in "MM-DD" format, the day-of-year of that date is assigned.
+        Default (None) assigns "NaN".
     freq : str
         Resampling frequency. If `after_date` is given, `freq` should be annual.
 
@@ -3030,12 +3036,22 @@ def _exceedance_date(grp):
             strt_idx.size == 0
         ):  # The date is not within the group. Happens at boundaries.
             return xarray.full_like(grp.isel(time=0), np.nan, float).drop_vars("time")  # type: ignore
+        cumsum = grp.where(grp.time >= grp.time[strt_idx][0]).cumsum("time")
 
-        return rl.first_run_after_date(
-            grp.where(grp.time >= grp.time[strt_idx][0]).cumsum("time") > sum_thresh,
+        out = rl.first_run_after_date(
+            cumsum > sum_thresh,
             window=1,
             date=None,
         )
+        if never_reached is None:
+            # This is slightly faster in numpy and generates fewer tasks in dask
+            return out
+        never_reached_val = (
+            doy_from_string(never_reached, grp.time.dt.year[0], grp.time.dt.calendar)
+            if isinstance(never_reached, str)
+            else never_reached
+        )
+        return xarray.where((cumsum <= sum_thresh).all("time"), never_reached_val, out)
 
     out = c.clip(0).resample(time=freq).map(_exceedance_date)
     out.attrs.update(units="", is_dayofyear=np.int32(1), calendar=get_calendar(tas))