Skip to content

Commit

Permalink
Add test - missing - doc
Browse files Browse the repository at this point in the history
  • Loading branch information
aulemahal committed Oct 2, 2023
1 parent 57c2622 commit ce99daa
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 18 deletions.
8 changes: 4 additions & 4 deletions docs/goodtoknow.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ There are many ways to open data in xscen workflows. The list below tries to mak
Which function to use when resampling data
------------------------------------------

:extract_dataset: :py:func:`~xscen.extract.extract_dataset` has resampling capabilities to provide daily data from finer sources.
:extract_dataset: :py:func:`~xscen.extract.extract_dataset`'s resampling capabilities are meant to provide daily data from finer sources.

:xclim indicators: Through :py:func:`~xscen.indicators.compute_indicators`, xscen workflows can easily use `xclim indicators <https://xclim.readthedocs.io/en/stable/indicators.html>`_
to go from daily data to coarser (monthly, seasonal, annual).
:resample: :py:func`xscen.extract.resample` extends xarray's `resample` methods with support for weighted resampling when starting from data coarser than daily and for handling of missing timesteps or values.

What is currently not covered by either `xscen` or `xclim` is a method to resample from data coarser than daily, where the base period is non-uniform (ex: resampling from monthly to annual data, taking into account the number of days per month).
:xclim indicators: Through :py:func:`~xscen.indicators.compute_indicators`, xscen workflows can easily use `xclim indicators <https://xclim.readthedocs.io/en/stable/indicators.html>`_
to go from daily data to coarser (monthly, seasonal, annual), with missing values handling. This option will add more metadata than the two firsts.


Metadata translation
Expand Down
75 changes: 75 additions & 0 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pandas as pd
import pytest
import xarray as xr
from conftest import notebooks
from xclim.testing.helpers import test_timeseries as timeseries

Expand Down Expand Up @@ -413,3 +414,77 @@ def test_outofrange(self):

def test_none(self):
assert xs.subset_warming_level(TestSubsetWarmingLevel.ds, wl=20) is None


class TestResample:
@pytest.mark.parametrize(
"infrq,meth,outfrq,exp",
[
["MS", "mean", "QS-DEC", [0.47457627, 3, 6.01086957, 9, 11.96666667]],
[
"QS-DEC",
"mean",
"YS",
[1.49041096, 5.49041096, 9.49453552, 13.49041096, 17.49041096],
],
["MS", "std", "2YS", [6.92009239, 6.91557206]],
],
)
def test_mean_from_monthly(self, infrq, meth, outfrq, exp):
da = timeseries(
np.arange(48),
variable="tas",
start="2001-01-01",
freq=infrq,
)
out = xs.extract.resample(da, outfrq, method=meth)
np.testing.assert_allclose(out.isel(time=slice(0, 5)), exp)

def test_wind_from_monthly(self):
uas = timeseries(
np.arange(48),
variable="uas",
start="2001-01-01",
freq="MS",
)
vas = timeseries(
np.arange(48),
variable="vas",
start="2001-01-01",
freq="MS",
)
ds = xr.merge([uas, vas])
out = xs.extract.resample(ds.uas, "YS", method="wind_direction", ds=ds)
np.testing.assert_allclose(out, [5.5260274, 17.5260274, 29.5260274, 41.5136612])

def test_missing(self):
da = timeseries(
np.arange(48),
variable="tas",
start="2001-01-01",
freq="MS",
)
out = xs.extract.resample(da, "QS-DEC", method="mean", missing="drop")
assert out.size == 15

out = xs.extract.resample(da, "QS-DEC", method="mean", missing="mask")
assert out.isel(time=0).isnull().all()

def test_missing_xclim(self):
arr = np.arange(48).astype(float)
arr[0] = np.nan
arr[40:] = np.nan
da = timeseries(
arr,
variable="tas",
start="2001-01-01",
freq="MS",
)
out = xs.extract.resample(da, "YS", method="mean", missing={"method": "any"})
assert out.isel(time=0).isnull().all()

out = xs.extract.resample(
da, "YS", method="mean", missing={"method": "pct", "tolerance": 0.6}
)
assert out.isel(time=0).notnull().all()
assert out.isel(time=-1).isnull().all()
61 changes: 47 additions & 14 deletions xscen/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,36 +368,46 @@ def resample(
*,
ds: Optional[xr.Dataset] = None,
method: Optional[str] = None,
missing: Union[str, dict] = None,
) -> xr.DataArray:
"""Aggregate variable to the target frequency.
If the input frequency is greater than a week, the resampling operation is weighted by
the number of days in each sampling period.
Parameters
----------
da : xr.DataArray
DataArray of the variable to resample, must have a "time" dimension and be of a
finer temporal resolution than "target_timestep".
finer temporal resolution than "target_frequency".
target_frequency : str
The target frequency/freq str, must be one of the frequency supported by pandas.
The target frequency/freq str, must be one of the frequency supported by xarray.
ds : xr.Dataset, optional
The "wind_direction" resampling method needs extra variables, which can be given here.
method : {'mean', 'min', 'max', 'sum', 'wind_direction'}, optional
The resampling method. If None (default), it is guessed from the variable name and frequency,
using the mapping in CVs/resampling_methods.json. If the variable is not found there,
"mean" is used by default.
missing: {'mask', 'drop'} or dict, optional
If 'mask' or 'drop', target periods with fewer steps than expected are masked or dropped.
For example, for daily data beginning in January with a `target_frequency` of "QS-DEC". the first season is missing one month.
If a dict, it points to a xclim check missing method which will mask periods according to their number of NaN values.
The dict must contain a "method" field corresponding to the xclim method name and may contain
any other args to pass. Options are documented in :py:mod:`xclim.core.missing`.
Returns
-------
xr.DataArray
Resampled variable
"""
var_name = da.name

initial_frequency = xr.infer_freq(da.time.dt.round("T")) or "undetected"
initial_frequency_td = pd.Timedelta(
CV.xrfreq_to_timedelta(xr.infer_freq(da.time.dt.round("T")), None)
)
days_per_step = None

weights = None
if initial_frequency != "undetected" and initial_frequency_td > pd.Timedelta(
7, "days"
):
Expand All @@ -422,6 +432,7 @@ def resample(
time=days_per_step.time
) # Not sure why we need this, but time coord is from the resample even after sel
)
weights = days_per_step / days_per_period

if method is None:
if (
Expand Down Expand Up @@ -471,11 +482,7 @@ def resample(
# Resample first to find the average wind speed and components
if days_per_step is not None:
with xr.set_options(keep_attrs=True):
ds = (
(ds * (days_per_step / days_per_period))
.reample(time=target_frequency)
.sum(time="time")
)
ds = (ds * weights).resample(time=target_frequency).sum(dim="time")
else:
ds = ds.resample(time=target_frequency).mean(dim="time", keep_attrs=True)

Expand Down Expand Up @@ -503,13 +510,14 @@ def resample(
# Avoiding resample().map() is much more performant
with xr.set_options(keep_attrs=True):
out = (
(da * (days_per_step / days_per_period))
.reample(time=target_frequency)
.sum(time="time")
(da * weights)
.resample(time=target_frequency)
.sum(dim="time")
.rename(da.name)
)
elif hasattr(xr.core.weighted.DataArrayWeighted, method):
ds = xr.merge([da, days_per_step.rename("weights")])
da = ds.resample(time="QS-DEC").map(
ds = xr.merge([da, weights.rename("weights")])
out = ds.resample(time=target_frequency).map(
lambda grp: getattr(
grp.drop_vars("weights").weighted(grp.weights), method
)(dim="time")
Expand All @@ -523,6 +531,31 @@ def resample(
dim="time", keep_attrs=True
)

if missing in ["mask", "drop"]:
steps_per_period = (
xr.ones_like(da.time, dtype="int").resample(time=target_frequency).sum()
)
t = xr.date_range(
steps_per_period.indexes["time"][0],
periods=steps_per_period.time.size + 1,
freq=target_frequency,
)
expected = (
xr.DataArray(t, dims=("time",), coords={"time": t}).diff(
"time", label="lower"
)
/ initial_frequency_td
)
complete = (steps_per_period / expected) > 0.95
elif isinstance(missing, dict):
missmeth = missing.pop("method")
complete = ~xc.core.missing.MISSING_METHODS[missmeth](
da, target_frequency, initial_frequency
)(**missing)
missing = "mask"
if missing in {"mask", "drop"}:
out = out.where(complete, drop=(missing == "drop"))

new_history = (
f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {method} "
f"resample from {initial_frequency} to {target_frequency} - xarray v{xr.__version__}"
Expand Down

0 comments on commit ce99daa

Please sign in to comment.