diff --git a/.cruft.json b/.cruft.json
index 0cdc4aa8..19851606 100644
--- a/.cruft.json
+++ b/.cruft.json
@@ -11,7 +11,7 @@
     "project_slug": "xscen",
     "project_short_description": "A climate change scenario-building analysis framework, built with xclim/xarray.",
     "pypi_username": "RondeauG",
-    "version": "0.7.12-beta",
+    "version": "0.7.13-beta",
     "use_pytest": "y",
     "use_black": "y",
     "add_pyup_badge": "n",
diff --git a/HISTORY.rst b/HISTORY.rst
index 1a1b2e65..12551a20 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -17,7 +17,8 @@ New features and enhancements
 * ``xs.spatial_mean`` with ``method='xESMF'`` will also automatically segmentize polygons (down to a 1° resolution) to ensure a correct average (:pull:`260`).
 * Added documentation for `require_all_on` in `search_data_catalogs`. (:pull:`263`).
 * ``xs.save_to_table`` and ``xs.io.to_table`` to transform datasets and arrays to DataFrames, but with support for multi-columns, multi-sheets and localized table of content generation.
-* Added annual global tas timeseries for CMIP6's models CMCC-ESM2 (ssp245, ssp370, ssp585), EC-Earth3-CC (ssp245, ssp585), KACE-1-0-G (ssp245, ssp370, ssp585) and TaiESM1 (ssp245, ssp370). (:issue:`268`, :pull:`270`).
+* Improved ``xs.extract.resample``: support for weighted resampling operations when starting from frequencies coarser than daily, as well as handling of missing timesteps or values. (:issue:`80`, :issue:`93`, :pull:`265`).
+* Added annual global tas timeseries for the CMIP6 models CMCC-ESM2 (ssp245, ssp370, ssp585), EC-Earth3-CC (ssp245, ssp585), KACE-1-0-G (ssp245, ssp370, ssp585) and TaiESM1 (ssp245, ssp370). Moved the global tas database to a netCDF file. (:issue:`268`, :pull:`270`).

 Breaking changes
 ^^^^^^^^^^^^^^^^
diff --git a/docs/goodtoknow.rst b/docs/goodtoknow.rst
index 1b79028f..6314b133 100644
--- a/docs/goodtoknow.rst
+++ b/docs/goodtoknow.rst
@@ -37,12 +37,12 @@ There are many ways to open data in xscen workflows. The list below tries to mak
 Which function to use when resampling data
 ------------------------------------------

-:extract_dataset: :py:func:`~xscen.extract.extract_dataset` has resampling capabilities to provide daily data from finer sources.
+:extract_dataset: :py:func:`~xscen.extract.extract_dataset`'s resampling capabilities are meant to provide daily data from finer sources.

-:xclim indicators: Through :py:func:`~xscen.indicators.compute_indicators`, xscen workflows can easily use `xclim indicators `_
-    to go from daily data to coarser (monthly, seasonal, annual).
+:resample: :py:func:`~xscen.extract.resample` extends xarray's `resample` methods with support for weighted resampling when starting from data coarser than daily, and for handling of missing timesteps or values.

-What is currently not covered by either `xscen` or `xclim` is a method to resample from data coarser than daily, where the base period is non-uniform (ex: resampling from monthly to annual data, taking into account the number of days per month).
+:xclim indicators: Through :py:func:`~xscen.indicators.compute_indicators`, xscen workflows can easily use `xclim indicators `_
+    to go from daily data to coarser (monthly, seasonal, annual), with handling of missing values. This option will add more metadata than the first two.

 Metadata translation
diff --git a/setup.cfg b/setup.cfg
index c7973609..585a0776 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.7.12-beta
+current_version = 0.7.13-beta
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+).(?P<patch>\d+)(\-(?P<release>[a-z]+))?
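A minimal usage sketch of the weighted resampling and missing-data handling described in the HISTORY and documentation entries above. It only uses calls exercised by the tests further down in this patch (``xs.extract.resample`` and the ``timeseries`` helper from xclim); the behaviour described in the comments is inferred from those tests, not asserted here.

    import numpy as np
    import xscen as xs
    from xclim.testing.helpers import test_timeseries as timeseries

    # Monthly means cover 28 to 31 days each, so a plain .resample().mean()
    # over-weights short months; xs.extract.resample() weights each month
    # by its number of days when the input is coarser than weekly.
    da = timeseries(np.arange(48), variable="tas", start="2001-01-01", freq="MS")
    seasonal = xs.extract.resample(da, "QS-DEC", method="mean")

    # The first DJF season here only contains January and February, so it can
    # be masked (set to NaN) or dropped instead of being silently incomplete.
    seasonal = xs.extract.resample(da, "QS-DEC", method="mean", missing="drop")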
diff --git a/setup.py b/setup.py
index 145a0812..61d18c4a 100644
--- a/setup.py
+++ b/setup.py
@@ -102,6 +102,6 @@ def run(self):
     test_suite="tests",
     extras_require={"dev": dev_requirements},
     url="https://github.com/Ouranosinc/xscen",
-    version="0.7.12-beta",
+    version="0.7.13-beta",
     zip_safe=False,
 )
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 983f6454..cd36f34a 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import xarray as xr
 from conftest import notebooks
 from xclim.testing.helpers import test_timeseries as timeseries
@@ -413,3 +414,83 @@ def test_outofrange(self):

     def test_none(self):
         assert xs.subset_warming_level(TestSubsetWarmingLevel.ds, wl=20) is None
+
+
+class TestResample:
+    @pytest.mark.parametrize(
+        "infrq,meth,outfrq,exp",
+        [
+            ["MS", "mean", "QS-DEC", [0.47457627, 3, 6.01086957, 9, 11.96666667]],
+            [
+                "QS-DEC",
+                "mean",
+                "YS",
+                [1.49041096, 5.49041096, 9.49453552, 13.49041096, 17.49041096],
+            ],
+            ["MS", "std", "2YS", [6.92009239, 6.91557206]],
+            [
+                "QS",
+                "median",
+                "YS",
+                [1.516437, 5.516437, 9.516437, 13.51092864, 17.516437],
+            ],
+        ],
+    )
+    def test_weighted(self, infrq, meth, outfrq, exp):
+        da = timeseries(
+            np.arange(48),
+            variable="tas",
+            start="2001-01-01",
+            freq=infrq,
+        )
+        out = xs.extract.resample(da, outfrq, method=meth)
+        np.testing.assert_allclose(out.isel(time=slice(0, 5)), exp)
+
+    def test_weighted_wind(self):
+        uas = timeseries(
+            np.arange(48),
+            variable="uas",
+            start="2001-01-01",
+            freq="MS",
+        )
+        vas = timeseries(
+            np.arange(48),
+            variable="vas",
+            start="2001-01-01",
+            freq="MS",
+        )
+        ds = xr.merge([uas, vas])
+        out = xs.extract.resample(ds.uas, "YS", method="wind_direction", ds=ds)
+        np.testing.assert_allclose(out, [5.5260274, 17.5260274, 29.5260274, 41.5136612])
+
+    def test_missing(self):
+        da = timeseries(
+            np.arange(48),
+            variable="tas",
+            start="2001-01-01",
+            freq="MS",
+        )
+        out = xs.extract.resample(da, "QS-DEC", method="mean", missing="drop")
+        assert out.size == 15
+
+        out = xs.extract.resample(da, "QS-DEC", method="mean", missing="mask")
+        assert out.isel(time=0).isnull().all()
+
+    def test_missing_xclim(self):
+        arr = np.arange(48).astype(float)
+        arr[0] = np.nan
+        arr[40:] = np.nan
+        da = timeseries(
+            arr,
+            variable="tas",
+            start="2001-01-01",
+            freq="MS",
+        )
+        out = xs.extract.resample(da, "YS", method="mean", missing={"method": "any"})
+        assert out.isel(time=0).isnull().all()
+
+        out = xs.extract.resample(
+            da, "YS", method="mean", missing={"method": "pct", "tolerance": 0.6}
+        )
+        assert out.isel(time=0).notnull().all()
+        assert out.isel(time=-1).isnull().all()
diff --git a/tests/test_xscen.py b/tests/test_xscen.py
index 9a3b5caf..4b35f02a 100644
--- a/tests/test_xscen.py
+++ b/tests/test_xscen.py
@@ -28,4 +28,4 @@ def test_package_metadata(self):
         contents = f.read()
         assert """Gabriel Rondeau-Genesse""" in contents
         assert '__email__ = "rondeau-genesse.gabriel@ouranos.ca"' in contents
-        assert '__version__ = "0.7.12-beta"' in contents
+        assert '__version__ = "0.7.13-beta"' in contents
diff --git a/xscen/__init__.py b/xscen/__init__.py
index d6476feb..896a42db 100644
--- a/xscen/__init__.py
+++ b/xscen/__init__.py
@@ -52,7 +52,7 @@

 __author__ = """Gabriel Rondeau-Genesse"""
 __email__ = "rondeau-genesse.gabriel@ouranos.ca"
-__version__ = "0.7.12-beta"
+__version__ = "0.7.13-beta"

 # monkeypatch so that warnings.warn() doesn't mention itself
diff --git a/xscen/extract.py b/xscen/extract.py
index cc844f69..76649120 100644
--- a/xscen/extract.py
+++ b/xscen/extract.py
@@ -14,6 +14,7 @@
 import xarray as xr
 import xclim as xc
 from intake_esm.derived import DerivedVariableRegistry
+from xclim.core.calendar import compare_offsets

 from .catalog import DataCatalog  # noqa
 from .catalog import ID_COLUMNS, concat_data_catalogs, generate_id, subset_file_coverage
@@ -23,7 +24,7 @@
 from .spatial import subset
 from .utils import CV
 from .utils import ensure_correct_time as _ensure_correct_time
-from .utils import get_cat_attrs, natural_sort, standardize_periods
+from .utils import get_cat_attrs, natural_sort, standardize_periods, xrfreq_to_timedelta

 logger = logging.getLogger(__name__)
@@ -368,44 +369,44 @@ def resample(
     *,
     ds: Optional[xr.Dataset] = None,
     method: Optional[str] = None,
+    missing: Optional[Union[str, dict]] = None,
 ) -> xr.DataArray:
     """Aggregate variable to the target frequency.

+    If the sampling period of the input is longer than a week, the resampling
+    operation is weighted by the number of days in each sampling period.
+
     Parameters
     ----------
     da : xr.DataArray
         DataArray of the variable to resample, must have a "time" dimension and be of a
-        finer temporal resolution than "target_timestep".
+        finer temporal resolution than "target_frequency".
     target_frequency : str
-        The target frequency/freq str, must be one of the frequency supported by pandas.
+        The target frequency/freq str, must be one of the frequencies supported by xarray.
     ds : xr.Dataset, optional
         The "wind_direction" resampling method needs extra variables, which can be given here.
     method : {'mean', 'min', 'max', 'sum', 'wind_direction'}, optional
         The resampling method. If None (default), it is guessed from the variable name and frequency,
         using the mapping in CVs/resampling_methods.json. If the variable is not found there, "mean" is used by default.
+    missing : {'mask', 'drop'} or dict, optional
+        If 'mask' or 'drop', target periods that would have been computed from fewer timesteps
+        than expected are masked or dropped, using a threshold of 5% of missing data.
+        For example, the first season of a ``target_frequency`` of "QS-DEC" will be masked or dropped if the data only starts in January.
+        If a dict, it points to an xclim missing-values check method, which will mask periods according to their number of NaN values.
+        The dict must contain a "method" field corresponding to the xclim method name and may contain
+        any other arguments to pass. Options are documented in :py:mod:`xclim.core.missing`.

     Returns
     -------
     xr.DataArray
         Resampled variable
-
     """
     var_name = da.name

     initial_frequency = xr.infer_freq(da.time.dt.round("T")) or "undetected"

-    initial_frequency_td = pd.Timedelta(
-        CV.xrfreq_to_timedelta(xr.infer_freq(da.time.dt.round("T")), None)
-    )
-    if initial_frequency_td == pd.Timedelta("1D"):
-        logger.warning(
-            "You appear to be resampling daily data using extract_dataset. "
-            "It is advised to use compute_indicators instead, as it is far more robust."
-        )
-    elif initial_frequency_td > pd.Timedelta("1D"):
-        logger.warning(
-            "You appear to be resampling data that is coarser than daily. "
-            "Be aware that this is not currently explicitely supported by xscen and might result in erroneous manipulations."
+    if initial_frequency == "undetected":
+        warnings.warn(
+            "Could not infer the frequency of the dataset. Be aware that this might result in erroneous manipulations."
         )

     if method is None:
@@ -428,6 +429,35 @@ def resample(
         method = "mean"
         logger.info(f"Resampling method for {var_name} defaulted to: 'mean'.")

+    weights = None
+    if (
+        initial_frequency != "undetected"
+        and compare_offsets(initial_frequency, ">", "W")
+        and method in ["mean", "median", "std", "var", "wind_direction"]
+    ):
+        # More than a week -> non-uniform sampling length!
+        t = xr.date_range(
+            da.indexes["time"][0],
+            periods=da.time.size + 1,
+            freq=initial_frequency,
+            calendar=da.time.dt.calendar,
+        )
+        # This is the number of days in each sampling period
+        days_per_step = (
+            xr.DataArray(t, dims=("time",), coords={"time": t})
+            .diff("time", label="lower")
+            .dt.days
+        )
+        days_per_period = (
+            days_per_step.resample(time=target_frequency)
+            .sum()  # Total number of days per period
+            .sel(time=days_per_step.time, method="ffill")  # Upsample to initial freq
+            .assign_coords(
+                time=days_per_step.time
+            )  # ``sel`` returns the resampled time labels, so restore the original time coordinate
+        )
+        weights = days_per_step / days_per_period
+
     # TODO : Support non-surface wind?
     if method == "wind_direction":
         ds[var_name] = da
@@ -454,7 +484,11 @@ def resample(
         )

         # Resample first to find the average wind speed and components
-        ds = ds.resample(time=target_frequency).mean(dim="time", keep_attrs=True)
+        if weights is not None:
+            with xr.set_options(keep_attrs=True):
+                ds = (ds * weights).resample(time=target_frequency).sum(dim="time")
+        else:
+            ds = ds.resample(time=target_frequency).mean(dim="time", keep_attrs=True)

         # Based on Vector Magnitude and Direction equations
         # For example: https://www.khanacademy.org/math/precalculus/x9e81a4f98389efdf:vectors/x9e81a4f98389efdf:component-form/a/vector-magnitude-and-direction-review
@@ -475,14 +509,69 @@ def resample(
         else:
             out = ds[var_name]

+    elif weights is not None:
+        if method == "mean":
+            # Avoiding resample().map() is much more performant
+            with xr.set_options(keep_attrs=True):
+                out = (
+                    (da * weights)
+                    .resample(time=target_frequency)
+                    .sum(dim="time")
+                    .rename(da.name)
+                )
+        else:
+            kws = {"q": 0.5} if method == "median" else {}
+            ds = xr.merge([da, weights.rename("weights")])
+            out = ds.resample(time=target_frequency).map(
+                lambda grp: getattr(
+                    grp.drop_vars("weights").weighted(grp.weights),
+                    method if method != "median" else "quantile",
+                )(dim="time", **kws)
+            )[da.name]
     else:
         out = getattr(da.resample(time=target_frequency), method)(
             dim="time", keep_attrs=True
         )

+    missing_note = " "
+    initial_td = xrfreq_to_timedelta(initial_frequency)
+    if missing in ["mask", "drop"] and not pd.isnull(initial_td):
+        steps_per_period = (
+            xr.ones_like(da.time, dtype="int").resample(time=target_frequency).sum()
+        )
+        t = xr.date_range(
+            steps_per_period.indexes["time"][0],
+            periods=steps_per_period.time.size + 1,
+            freq=target_frequency,
+        )
+        expected = (
+            xr.DataArray(t, dims=("time",), coords={"time": t}).diff(
+                "time", label="lower"
+            )
+            / initial_td
+        )
+        complete = (steps_per_period / expected) > 0.95
+        action = "masking" if missing == "mask" else "dropping"
+        missing_note = f", {action} incomplete periods "
+    elif isinstance(missing, dict):
+        missmeth = missing.pop("method")
+        complete = ~xc.core.missing.MISSING_METHODS[missmeth](
+            da, target_frequency, initial_frequency
+        )(**missing)
+        funcstr = xc.core.formatting.gen_call_string(
+            f"xclim.core.missing_{missmeth}", **missing
+        )
+        missing = "mask"
+        missing_note = f", masking incomplete periods according to {funcstr} "
+    if missing in {"mask", "drop"}:
+        out = out.where(complete, drop=(missing == "drop"))
+
     new_history = (
-        f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {method} "
-        f"resample from {initial_frequency} to {target_frequency} - xarray v{xr.__version__}"
+        f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] "
+        f"{'weighted' if weights is not None else ''} {method} "
+        f"resample from {initial_frequency} to {target_frequency}"
+        f"{missing_note}- xarray v{xr.__version__}"
     )
     history = (
         new_history + " \n " + out.attrs["history"]
@@ -831,9 +920,9 @@ def get_warming_level(
     window: int = 20,
     tas_baseline_period: list = None,
     ignore_member: bool = False,
-    tas_src: Union[str, Path] = Path(__file__).parent
-    / "data"
-    / "IPCC_annual_global_tas.nc",
+    tas_src: Union[str, Path] = (
+        Path(__file__).parent / "data" / "IPCC_annual_global_tas.nc"
+    ),
     return_horizon: bool = True,
 ):
     """Use the IPCC Atlas method to return the window of time over which the requested level of global warming is first reached.
@@ -857,11 +946,12 @@ def get_warming_level(
         [start, end] of the base period. The warming is calculated with respect to it. The default is ["1850", "1900"].
     ignore_member : bool
-        Decides whether to ignore the member when searching for the model run in tas_csv.
-    tas_csv : str
-        Path to a csv of annual global mean temperature with a row for each year and a column for each dataset.
-        If None, it will default to data/IPCC_annual_global_tas.csv which was built from
+        Decides whether to ignore the member when searching for the model run in tas_src.
+    tas_src : str
+        Path to a netCDF of annual global mean temperature (tas) with an annual "time" dimension
+        and a "simulation" dimension with the following coordinates: "mip_era", "source", "experiment" and "member".
+        If None, it will default to data/IPCC_annual_global_tas.nc, which was built from
         the IPCC atlas data from Iturbide et al., 2020 (https://doi.org/10.5194/essd-12-2959-2020)
-        and extra data from pilot models of MRCC5 and ClimEx.
+        and extra data for missing CMIP6 models and pilot models of MRCC5 and ClimEx.
     return_horizon: bool
         If True, the output will be a list following the format ['start_yr', 'end_yr']
         If False, the output will be a string representing the middle of the period.
@@ -917,21 +1007,16 @@ def get_warming_level(
         )
         info_models.append(info)

-    # open csv, split column names for easier usage
-    tas = xr.open_dataset(tas_src).tas  # pd.read_csv(tas_csv, index_col="year")
-    # models = pd.DataFrame.from_records(
-    #     [c.split("_") for c in annual_tas.columns],
-    #     index=annual_tas.columns,
-    #     columns=FIELDS,
-    # )
+    # open the netCDF of annual global tas
+    tas = xr.open_dataset(tas_src).tas

     out = {}
     for model in info_models:
-        # choose colum based in ds cat attrs
+        # choose the column based on the ds cat attrs; '$' is appended to ensure a full match (end-of-string)
         mip = tas.mip_era.str.match(model["mip_era"] + "$")
         src = tas.source.str.match(model["source"] + "$")
         if not src.any():
-            # Maybe it's an RCM, then source may contain the institute
+            # Maybe it's an RCM; then the requested source may contain the institute
             src = xr.apply_ufunc(model["source"].endswith, tas.source, vectorize=True)
         exp = tas.experiment.str.match(model["experiment"] + "$")
         mem = tas.member.str.match(model["member"] + "$")
@@ -969,7 +1054,8 @@ def get_warming_level(
         yrs = rolling_diff.where(rolling_diff >= wl, drop=True)
         if yrs.size == 0:
             logger.info(
-                f"Global warming level of +{wl}C is not reached by the last year of the provided 'tas_csv' file for {selected}."
+                f"Global warming level of +{wl}C is not reached by the last year "
+                f"({tas.time[-1].dt.year}) of the provided 'tas_src' file for {selected}."
             )
             out[name] = [None, None] if return_horizon else None
         else:
@@ -1021,17 +1107,8 @@ def subset_warming_level(
         Use "{wl}", "{period0}" and "{period1}" in the string to dynamically include
         `wl`, 'tas_baseline_period[0]' and 'tas_baseline_period[1]'.
         If None, no new dimensions will be added.
-
-    kwargs
-        Instructions on how to search for warming levels.
-        The keyword arguments are passed to `get_warming_level()`
-
-        Valid keyword aguments are:
-        window : int
-        tas_baseline_period : list
-        ignore_member : bool
-        tas_csv : str
-        return_horizon: bool
+    kwargs :
+        Instructions on how to search for warming levels, passed to :py:func:`get_warming_level`.

     Returns
     -------
@@ -1340,41 +1417,28 @@ def _restrict_multimembers(catalogs: dict, id_columns: list, restrictions: dict)

 def _restrict_wl(df, restrictions: dict):
-    """Update the results from search_data_catalogs by removing simulations that are not available in the warming level csv."""
-    tas_csv = restrictions["tas_csv"]
-    if tas_csv is None:
-        tas_csv = Path(__file__).parent / "data/IPCC_annual_global_tas.csv"
+    """Update the results from search_data_catalogs by removing simulations that are not available in the warming level netCDF."""
+    tas_src = restrictions["tas_src"] or (
+        Path(__file__).parent / "data" / "IPCC_annual_global_tas.nc"
+    )

-    # open csv
-    annual_tas = pd.read_csv(tas_csv, index_col="year")
+    # open the netCDF
+    annual_tas = xr.open_dataset(tas_src).tas
+    fields = ("mip_era", "source", "experiment", "member")

     if restrictions["ignore_member"] and "wl" not in restrictions:
-        df["csv_name"] = df["mip_era"].str.cat(
-            [df["source"], df["experiment"]], sep="_"
-        )
-        csv_source = ["_".join(x.split("_")[:-1]) for x in annual_tas.columns[1:]]
-    else:
-        df["csv_name"] = df["mip_era"].str.cat(
-            [df["source"], df["experiment"], df["member"]], sep="_"
-        )
-        csv_source = list(annual_tas.columns[1:])
+        fields = fields[:-1]
+    df["wl_name"] = df.apply(lambda r: "_".join([r[field] for field in fields]), axis=1)
+    wl_source = [
+        "_".join(parts) for parts in zip(*[annual_tas[field].values for field in fields])
+    ]

     if "wl" in restrictions:
-        to_keep = pd.Series(
-            [
-                get_warming_level(x, **restrictions)[0] is not None
-                for x in df["csv_name"]
-            ]
-        )
+        to_keep = df["wl_name"].apply(
+            get_warming_level, return_horizon=False, **restrictions
+        ).notnull()
     else:
-        to_keep = df["csv_name"].isin(csv_source)
+        to_keep = df["wl_name"].isin(wl_source)

     removed = pd.unique(df[~to_keep]["id"])
     df = df[to_keep]
     logger.info(
         f"Removing the following datasets because of the restriction for warming levels: {list(removed)}"
     )
-    df = df.drop(columns=["csv_name"])
-
+    df = df.drop(columns=["wl_name"])
     return df
diff --git a/xscen/utils.py b/xscen/utils.py
index 0ad5a790..c7d931a8 100644
--- a/xscen/utils.py
+++ b/xscen/utils.py
@@ -1274,3 +1274,9 @@ def season_sort_key(idx: pd.Index, name: str = None):
         # TypeError if season element was not a string.
         pass
     return idx
+
+
+def xrfreq_to_timedelta(freq):
+    """Approximate the length of a period based on its frequency offset."""
+    N, B, _, _ = parse_offset(freq)
+    return N * pd.Timedelta(CV.xrfreq_to_timedelta(B, "NaT"))
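A sketch of the dict-based ``missing`` argument (which delegates the completeness check to ``xclim.core.missing``) and of the ``xrfreq_to_timedelta`` helper added above. The NaN layout mirrors ``test_missing_xclim`` in this patch; the exact timedelta returned by the helper depends on the mapping in the ``CVs`` and is printed rather than asserted.

    import numpy as np
    import xscen as xs
    from xclim.testing.helpers import test_timeseries as timeseries
    from xscen.utils import xrfreq_to_timedelta

    # A dict delegates the check to xclim.core.missing: here a year is masked
    # when more than 60% of its monthly values are NaN.
    arr = np.arange(48).astype(float)
    arr[40:] = np.nan  # most of the last year is missing
    da = timeseries(arr, variable="tas", start="2001-01-01", freq="MS")
    out = xs.extract.resample(
        da, "YS", method="mean", missing={"method": "pct", "tolerance": 0.6}
    )
    assert out.isel(time=-1).isnull().all()

    # The helper approximates one period of a frequency as a fixed timedelta,
    # which resample() uses for the 5% incomplete-period threshold.
    print(xrfreq_to_timedelta("MS"))  # roughly one month, e.g. 30 days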