Add test - missing - doc

Ouranosinc · Oct 2, 2023 · ce99daa · ce99daa
1 parent 57c2622
commit ce99daa
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 18 deletions.
diff --git a/docs/goodtoknow.rst b/docs/goodtoknow.rst
@@ -37,12 +37,12 @@ There are many ways to open data in xscen workflows. The list below tries to mak
 Which function to use when resampling data
 ------------------------------------------
 
-:extract_dataset: :py:func:`~xscen.extract.extract_dataset` has resampling capabilities to provide daily data from finer sources.
+:extract_dataset: :py:func:`~xscen.extract.extract_dataset`'s resampling capabilities are meant to provide daily data from finer sources.
 
-:xclim indicators: Through :py:func:`~xscen.indicators.compute_indicators`, xscen workflows can easily use `xclim indicators <https://xclim.readthedocs.io/en/stable/indicators.html>`_
-    to go from daily data to coarser (monthly, seasonal, annual).
+:resample: :py:func`xscen.extract.resample` extends xarray's `resample` methods with support for weighted resampling when starting from data coarser than daily and for handling of missing timesteps or values.
 
-What is currently not covered by either `xscen` or `xclim` is a method to resample from data coarser than daily, where the base period is non-uniform (ex: resampling from monthly to annual data, taking into account the number of days per month).
+:xclim indicators: Through :py:func:`~xscen.indicators.compute_indicators`, xscen workflows can easily use `xclim indicators <https://xclim.readthedocs.io/en/stable/indicators.html>`_
+    to go from daily data to coarser (monthly, seasonal, annual), with missing values handling. This option will add more metadata than the two firsts.
 
 
 Metadata translation

diff --git a/tests/test_extract.py b/tests/test_extract.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import xarray as xr
 from conftest import notebooks
 from xclim.testing.helpers import test_timeseries as timeseries
 
@@ -413,3 +414,77 @@ def test_outofrange(self):
 
     def test_none(self):
         assert xs.subset_warming_level(TestSubsetWarmingLevel.ds, wl=20) is None
+
+
+class TestResample:
+    @pytest.mark.parametrize(
+        "infrq,meth,outfrq,exp",
+        [
+            ["MS", "mean", "QS-DEC", [0.47457627, 3, 6.01086957, 9, 11.96666667]],
+            [
+                "QS-DEC",
+                "mean",
+                "YS",
+                [1.49041096, 5.49041096, 9.49453552, 13.49041096, 17.49041096],
+            ],
+            ["MS", "std", "2YS", [6.92009239, 6.91557206]],
+        ],
+    )
+    def test_mean_from_monthly(self, infrq, meth, outfrq, exp):
+        da = timeseries(
+            np.arange(48),
+            variable="tas",
+            start="2001-01-01",
+            freq=infrq,
+        )
+        out = xs.extract.resample(da, outfrq, method=meth)
+        np.testing.assert_allclose(out.isel(time=slice(0, 5)), exp)
+
+    def test_wind_from_monthly(self):
+        uas = timeseries(
+            np.arange(48),
+            variable="uas",
+            start="2001-01-01",
+            freq="MS",
+        )
+        vas = timeseries(
+            np.arange(48),
+            variable="vas",
+            start="2001-01-01",
+            freq="MS",
+        )
+        ds = xr.merge([uas, vas])
+        out = xs.extract.resample(ds.uas, "YS", method="wind_direction", ds=ds)
+        np.testing.assert_allclose(out, [5.5260274, 17.5260274, 29.5260274, 41.5136612])
+
+    def test_missing(self):
+        da = timeseries(
+            np.arange(48),
+            variable="tas",
+            start="2001-01-01",
+            freq="MS",
+        )
+        out = xs.extract.resample(da, "QS-DEC", method="mean", missing="drop")
+        assert out.size == 15
+
+        out = xs.extract.resample(da, "QS-DEC", method="mean", missing="mask")
+        assert out.isel(time=0).isnull().all()
+
+    def test_missing_xclim(self):
+        arr = np.arange(48).astype(float)
+        arr[0] = np.nan
+        arr[40:] = np.nan
+        da = timeseries(
+            arr,
+            variable="tas",
+            start="2001-01-01",
+            freq="MS",
+        )
+        out = xs.extract.resample(da, "YS", method="mean", missing={"method": "any"})
+        assert out.isel(time=0).isnull().all()
+
+        out = xs.extract.resample(
+            da, "YS", method="mean", missing={"method": "pct", "tolerance": 0.6}
+        )
+        assert out.isel(time=0).notnull().all()
+        assert out.isel(time=-1).isnull().all()
diff --git a/xscen/extract.py b/xscen/extract.py
@@ -368,36 +368,46 @@ def resample(
     *,
     ds: Optional[xr.Dataset] = None,
     method: Optional[str] = None,
+    missing: Union[str, dict] = None,
 ) -> xr.DataArray:
     """Aggregate variable to the target frequency.
 
+    If the input frequency is greater than a week, the resampling operation is weighted by
+    the number of days in each sampling period.
+
     Parameters
     ----------
     da  : xr.DataArray
         DataArray of the variable to resample, must have a "time" dimension and be of a
-        finer temporal resolution than "target_timestep".
+        finer temporal resolution than "target_frequency".
     target_frequency : str
-        The target frequency/freq str, must be one of the frequency supported by pandas.
+        The target frequency/freq str, must be one of the frequency supported by xarray.
     ds : xr.Dataset, optional
         The "wind_direction" resampling method needs extra variables, which can be given here.
     method : {'mean', 'min', 'max', 'sum', 'wind_direction'}, optional
         The resampling method. If None (default), it is guessed from the variable name and frequency,
         using the mapping in CVs/resampling_methods.json. If the variable is not found there,
         "mean" is used by default.
+    missing: {'mask', 'drop'} or dict, optional
+        If 'mask' or 'drop', target periods with fewer steps than expected are masked or dropped.
+        For example, for daily data beginning in January with a `target_frequency` of "QS-DEC". the first season is missing one month.
+        If a dict, it points to a xclim check missing method which will mask periods according to their number of NaN values.
+        The dict must contain a "method" field corresponding to the xclim method name and may contain
+        any other args to pass. Options are documented in :py:mod:`xclim.core.missing`.
 
     Returns
     -------
     xr.DataArray
         Resampled variable
-
     """
     var_name = da.name
 
     initial_frequency = xr.infer_freq(da.time.dt.round("T")) or "undetected"
     initial_frequency_td = pd.Timedelta(
         CV.xrfreq_to_timedelta(xr.infer_freq(da.time.dt.round("T")), None)
     )
-    days_per_step = None
+
+    weights = None
     if initial_frequency != "undetected" and initial_frequency_td > pd.Timedelta(
         7, "days"
     ):
@@ -422,6 +432,7 @@ def resample(
                 time=days_per_step.time
             )  # Not sure why we need this, but time coord is from the resample even after sel
         )
+        weights = days_per_step / days_per_period
 
     if method is None:
         if (
@@ -471,11 +482,7 @@ def resample(
         # Resample first to find the average wind speed and components
         if days_per_step is not None:
             with xr.set_options(keep_attrs=True):
-                ds = (
-                    (ds * (days_per_step / days_per_period))
-                    .reample(time=target_frequency)
-                    .sum(time="time")
-                )
+                ds = (ds * weights).resample(time=target_frequency).sum(dim="time")
         else:
             ds = ds.resample(time=target_frequency).mean(dim="time", keep_attrs=True)
 
@@ -503,13 +510,14 @@ def resample(
             # Avoiding resample().map() is much more performant
             with xr.set_options(keep_attrs=True):
                 out = (
-                    (da * (days_per_step / days_per_period))
-                    .reample(time=target_frequency)
-                    .sum(time="time")
+                    (da * weights)
+                    .resample(time=target_frequency)
+                    .sum(dim="time")
+                    .rename(da.name)
                 )
         elif hasattr(xr.core.weighted.DataArrayWeighted, method):
-            ds = xr.merge([da, days_per_step.rename("weights")])
-            da = ds.resample(time="QS-DEC").map(
+            ds = xr.merge([da, weights.rename("weights")])
+            out = ds.resample(time=target_frequency).map(
                 lambda grp: getattr(
                     grp.drop_vars("weights").weighted(grp.weights), method
                 )(dim="time")
@@ -523,6 +531,31 @@ def resample(
             dim="time", keep_attrs=True
         )
 
+    if missing in ["mask", "drop"]:
+        steps_per_period = (
+            xr.ones_like(da.time, dtype="int").resample(time=target_frequency).sum()
+        )
+        t = xr.date_range(
+            steps_per_period.indexes["time"][0],
+            periods=steps_per_period.time.size + 1,
+            freq=target_frequency,
+        )
+        expected = (
+            xr.DataArray(t, dims=("time",), coords={"time": t}).diff(
+                "time", label="lower"
+            )
+            / initial_frequency_td
+        )
+        complete = (steps_per_period / expected) > 0.95
+    elif isinstance(missing, dict):
+        missmeth = missing.pop("method")
+        complete = ~xc.core.missing.MISSING_METHODS[missmeth](
+            da, target_frequency, initial_frequency
+        )(**missing)
+        missing = "mask"
+    if missing in {"mask", "drop"}:
+        out = out.where(complete, drop=(missing == "drop"))
+
     new_history = (
         f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {method} "
         f"resample from {initial_frequency} to {target_frequency} - xarray v{xr.__version__}"