From 7fd233ad21e4c72c21cfc1f7add578ca7f5ac3e4 Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 10:17:36 -0400 Subject: [PATCH 01/22] better typing in aggregate --- xscen/aggregate.py | 66 ++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/xscen/aggregate.py b/xscen/aggregate.py index 552531bb..50ac3e0f 100644 --- a/xscen/aggregate.py +++ b/xscen/aggregate.py @@ -1,12 +1,13 @@ -# noqa: D100 +"""Functions to aggregate data over time and space.""" import datetime import logging +import os import warnings from collections.abc import Sequence from copy import deepcopy from pathlib import Path from types import ModuleType -from typing import Union +from typing import Optional, Union import geopandas as gpd import numpy as np @@ -16,7 +17,6 @@ import xclim as xc import xclim.core.calendar import xesmf as xe -from shapely.geometry import Polygon from xclim.core.indicator import Indicator from .config import parse_config @@ -44,11 +44,11 @@ def _(s): def climatological_mean( ds: xr.Dataset, *, - window: int = None, - min_periods: int = None, + window: Optional[int] = None, + min_periods: Optional[int] = None, interval: int = 1, - periods: list = None, - to_level: str = "climatology", + periods: Optional[Union[list[str], list[list[str]]]] = None, + to_level: Optional[str] = "climatology", ) -> xr.Dataset: """Compute the mean over 'year' for given time periods, respecting the temporal resolution of ds. @@ -56,17 +56,17 @@ def climatological_mean( ---------- ds : xr.Dataset Dataset to use for the computation. - window : int + window : int, optional Number of years to use for the time periods. If left at None and periods is given, window will be the size of the first period. If left at None and periods is not given, the window will be the size of the input dataset. 
- min_periods : int + min_periods : int, optional For the rolling operation, minimum number of years required for a value to be computed. If left at None and the xrfreq is either QS or AS and doesn't start in January, min_periods will be one less than window. If left at None, it will be deemed the same as 'window'. interval : int Interval (in years) at which to provide an output. - periods : list + periods : list of str or list of lists of str, optional Either [start, end] or list of [start, end] of continuous periods to be considered. This is needed when the time axis of ds contains some jumps in time. If None, the dataset will be considered continuous. to_level : str, optional @@ -208,7 +208,7 @@ def compute_deltas( *, kind: Union[str, dict] = "+", rename_variables: bool = True, - to_level: str = "deltas", + to_level: Optional[str] = "deltas", ) -> xr.Dataset: """Compute deltas in comparison to a reference time period, respecting the temporal resolution of ds. @@ -218,7 +218,7 @@ def compute_deltas( Dataset to use for the computation. reference_horizon : str or xr.Dataset Either a YYYY-YYYY string corresponding to the 'horizon' coordinate of the reference period, or a xr.Dataset containing the climatological mean. - kind : str + kind : str or dict ['+', '/', '%'] Whether to provide absolute, relative, or percentage deltas. Can also be a dictionary separated per variable name. 
rename_variables : bool @@ -372,13 +372,13 @@ def spatial_mean( ds: xr.Dataset, method: str, *, - spatial_subset: bool = None, + spatial_subset: Optional[bool] = None, call_clisops: bool = False, - region: Union[dict, str] = None, - kwargs: dict = None, - simplify_tolerance: float = None, - to_domain: str = None, - to_level: str = None, + region: Optional[Union[dict, str]] = None, + kwargs: Optional[dict] = None, + simplify_tolerance: Optional[float] = None, + to_domain: Optional[str] = None, + to_level: Optional[str] = None, ) -> xr.Dataset: """Compute the spatial mean using a variety of available methods. @@ -391,18 +391,18 @@ def spatial_mean( 'interp_centroid' will find the region's centroid (if coordinates are not fed through kwargs), then perform a .interp() over the spatial dimensions of the Dataset. The coordinate can also be directly fed to .interp() through the 'kwargs' argument below. 'xesmf' will make use of xESMF's SpatialAverager. This will typically be more precise, especially for irregular regions, but can be much slower than other methods. - spatial_subset : bool + spatial_subset : bool, optional If True, xscen.spatial.subset will be called prior to the other operations. This requires the 'region' argument. If None, this will automatically become True if 'region' is provided and the subsetting method is either 'cos-lat' or 'mean'. - region : dict or str + region : dict or str, optional Description of the region and the subsetting method (required fields listed in the Notes). If method=='interp_centroid', this is used to find the region's centroid. If method=='xesmf', the bounding box or shapefile is given to SpatialAverager. Can also be "global", for global averages. This is simply a shortcut for `{'name': 'global', 'method': 'bbox', 'lon_bnds' [-180, 180], 'lat_bnds': [-90, 90]}`. - kwargs : dict + kwargs : dict, optional Arguments to send to either mean(), interp() or SpatialAverager(). 
For SpatialAverager, one can give `skipna` or `out_chunks` here, to be passed to the averager call itself. - simplify_tolerance : float + simplify_tolerance : float, optional Precision (in degree) used to simplify a shapefile before sending it to SpatialAverager(). The simpler the polygons, the faster the averaging, but it will lose some precision. to_domain : str, optional @@ -696,14 +696,18 @@ def spatial_mean( def produce_horizon( ds: xr.Dataset, indicators: Union[ - str, Path, Sequence[Indicator], Sequence[tuple[str, Indicator]], ModuleType + str, + os.PathLike, + Sequence[Indicator], + Sequence[tuple[str, Indicator]], + ModuleType, ], *, - periods: list = None, - warminglevels: dict = None, - to_level: str = "horizons", + periods: Optional[Union[list[str], list[list[str]]]] = None, + warminglevels: Optional[dict] = None, + to_level: Optional[str] = "horizons", period: list = None, -): +) -> xr.Dataset: """Compute indicators, then the climatological mean, and finally unstack dates in order to have a single dataset with all indicators of different frequencies. Once this is done, the function drops 'time' in favor of 'horizon'. @@ -714,16 +718,16 @@ def produce_horizon( ---------- ds: xr.Dataset Input dataset with a time dimension. - indicators: Union[str, Path, Sequence[Indicator], Sequence[Tuple[str, Indicator]]] + indicators: Union[str, os.PathLike, Sequence[Indicator], Sequence[Tuple[str, Indicator]], ModuleType] Indicators to compute. It will be passed to the `indicators` argument of `xs.compute_indicators`. - periods: list + periods: list of str or list of lists of str, optional Either [start, end] or list of [start_year, end_year] for the period(s) to be evaluated. If both periods and warminglevels are None, the full time series will be used. - warminglevels: dict + warminglevels: dict, optional Dictionary of arguments to pass to `py:func:xscen.subset_warming_level`. 
If 'wl' is a list, the function will be called for each value and produce multiple horizons. If both periods and warminglevels are None, the full time series will be used. - to_level: + to_level: str, optional The processing level to assign to the output. If there is only one horizon, you can use "{wl}", "{period0}" and "{period1}" in the string to dynamically include that information in the processing level. From d55987669bc9998c3549fb95f974fb7b1303fd0a Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 10:18:31 -0400 Subject: [PATCH 02/22] better typing in biasadjust --- xscen/biasadjust.py | 48 ++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/xscen/biasadjust.py b/xscen/biasadjust.py index 6664bdcb..b098733c 100644 --- a/xscen/biasadjust.py +++ b/xscen/biasadjust.py @@ -1,6 +1,5 @@ -# noqa: D100 +"""Functions to train and adjust a bias-adjustment algorithm.""" import logging -import warnings from copy import deepcopy from typing import Optional, Union @@ -14,9 +13,6 @@ from .config import parse_config from .utils import minimum_calendar, standardize_periods -# TODO: Change all paths to PosixPath objects, including in the catalog? -# TODO: Compute sometimes fails randomly (in debug, pretty much always). Also (detrend?) fails with pr. Investigate why. 
- logger = logging.getLogger(__name__) @@ -60,12 +56,12 @@ def _add_preprocessing_attr(scen, train_kwargs): def train( dref: xr.Dataset, dhist: xr.Dataset, - var: list, - period: list, + var: Union[str, list], + period: list[str], *, method: str = "DetrendedQuantileMapping", - group: Union[sdba.Grouper, str, dict] = {"group": "time.dayofyear", "window": 31}, - xclim_train_args: dict = None, + group: Optional[Union[sdba.Grouper, str, dict]] = None, + xclim_train_args: Optional[dict] = None, maximal_calendar: str = "noleap", adapt_freq: Optional[dict] = None, jitter_under: Optional[dict] = None, @@ -81,14 +77,15 @@ def train( The target timeseries, on the reference period. dhist : xr.Dataset The timeseries to adjust, on the reference period. - var : str - Variable on which to do the adjustment - period : list + var : str or list of str + Variable on which to do the adjustment. Currently only supports one variable. + period : list of str [start, end] of the reference period method : str Name of the `sdba.TrainAdjust` method of xclim. - group : str or sdba.Grouper - Grouping information + group : str or sdba.Grouper or dict, optional + Grouping information. If a string, it is interpreted as a grouper on the time dimension. If a dict, it is passed to `sdba.Grouper.from_kwargs`. + Defaults to {"group": "time.dayofyear", "window": 31}. xclim_train_args : dict Dict of arguments to pass to the `.train` of the adjustment object. maximal_calendar: str @@ -101,7 +98,7 @@ def train( jitter_over: dict, optional If given, a dictionary of args to pass to `jitter_over_thresh`. align_on: str, optional - `align_on` argument for the fonction `xclim.core.calendar.convert_calendar`. + `align_on` argument for the function `xclim.core.calendar.convert_calendar`. 
Returns ------- @@ -114,6 +111,8 @@ def train( """ # TODO: To be adequately fixed later when we add multivariate + if isinstance(var, str): + var = [var] if len(var) != 1: raise ValueError( "biasadjust currently does not support entries with multiple variables." @@ -122,6 +121,7 @@ def train( ref = dref[var[0]] hist = dhist[var[0]] + group = group or {"group": "time.dayofyear", "window": 31} xclim_train_args = xclim_train_args or {} if method == "DetrendedQuantileMapping": xclim_train_args.setdefault("nquantiles", 15) @@ -189,15 +189,15 @@ def train( def adjust( dtrain: xr.Dataset, dsim: xr.Dataset, - periods: list, - xclim_adjust_args: dict, + periods: Union[list[str], list[list[str]]], *, + xclim_adjust_args: Optional[dict] = None, to_level: str = "biasadjusted", - bias_adjust_institution: str = None, - bias_adjust_project: str = None, + bias_adjust_institution: Optional[str] = None, + bias_adjust_project: Optional[str] = None, moving_yearly_window: Optional[dict] = None, align_on: Optional[str] = "year", -): +) -> xr.Dataset: """ Adjust a simulation. @@ -207,11 +207,11 @@ def adjust( A trained algorithm's dataset, as returned by `train`. dsim : xr.Dataset Simulated timeseries, projected period. - periods : list + periods : list of str or list of lists of str Either [start, end] or list of [start, end] of the simulation periods to be adjusted (one at a time). - xclim_adjust_args : dict + xclim_adjust_args : dict, optional Dict of arguments to pass to the `.adjust` of the adjustment object. - to_level : str, optional + to_level : str The processing level to assign to the output. 
Defaults to 'biasadjusted' bias_adjust_institution : str, optional @@ -240,6 +240,7 @@ def adjust( # TODO: To be adequately fixed later xclim_adjust_args = deepcopy(xclim_adjust_args) + xclim_adjust_args = xclim_adjust_args or {} if moving_yearly_window: dsim = construct_moving_yearly_window(dsim, **moving_yearly_window) @@ -267,7 +268,6 @@ def adjust( if simcal != mincal: sim = convert_calendar(sim, mincal, align_on=align_on) - xclim_adjust_args = xclim_adjust_args or {} # do the adjustment for all the simulation_period lists periods = standardize_periods(periods) slices = [] From 9cd1f07212aa8eef464788a3c89daaea36a49b5b Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 10:28:55 -0400 Subject: [PATCH 03/22] better typing in catalog --- xscen/catalog.py | 138 +++++++++++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 51 deletions(-) diff --git a/xscen/catalog.py b/xscen/catalog.py index 70021d98..777a67c2 100644 --- a/xscen/catalog.py +++ b/xscen/catalog.py @@ -6,6 +6,7 @@ import os import re import warnings +from abc import ABC from collections.abc import Mapping, Sequence from copy import deepcopy from functools import reduce @@ -15,7 +16,6 @@ import fsspec as fs import intake_esm -import numpy as np import pandas as pd import tlz import xarray @@ -41,7 +41,8 @@ ] -# As much as possible, these catalog columns and entries should align with: https://github.com/WCRP-CMIP/CMIP6_CVs and https://github.com/ES-DOC/pyessv-archive +# As much as possible, these catalog columns and entries should align with: +# https://github.com/WCRP-CMIP/CMIP6_CVs and https://github.com/ES-DOC/pyessv-archive # See docs/columns.rst for a description of each entry. 
COLUMNS = [ "id", @@ -135,22 +136,37 @@ def _parse_dates(elem): """Kwargs to pass to `pd.read_csv` when opening an official Ouranos catalog.""" -class DataCatalog(intake_esm.esm_datastore): - """ +class DataCatalog(intake_esm.esm_datastore, ABC): + r""" A read-only intake_esm catalog adapted to xscen's syntax. This class expects the catalog to have the columns listed in :py:data:`xscen.catalog.COLUMNS` and it comes with default arguments for reading the CSV files (:py:data:`xscen.catalog.csv_kwargs`). - For example, all string columns (except `path`) are casted to a categorical dtype and the + For example, all string columns (except `path`) are cast to a categorical dtype and the datetime columns are parsed with a special function that allows dates outside the conventional `datetime64[ns]` bounds by storing the data using :py:class:`pandas.Period` objects. + Parameters + ---------- + \*args : str or os.PathLike or dict + Path to a catalog JSON file. If a dict, it must have two keys: 'esmcat' and 'df'. + 'esmcat' must be a dict representation of the ESM catalog. + 'df' must be a Pandas DataFrame containing content that would otherwise be in the CSV file. + check_valid : bool + If True, will check that all files in the catalog exist on disk and remove those that don't. + drop_duplicates : bool + If True, will drop duplicates in the catalog based on the 'id' and 'path' columns. + \**kwargs : dict + Any other arguments are passed to intake_esm.esm_datastore. 
+ See Also -------- intake_esm.core.esm_datastore """ - def __init__(self, *args, check_valid=False, drop_duplicates=False, **kwargs): + def __init__( + self, *args, check_valid: bool = False, drop_duplicates: bool = False, **kwargs + ): kwargs["read_csv_kwargs"] = recursive_update( csv_kwargs.copy(), kwargs.get("read_csv_kwargs", {}) ) @@ -178,7 +194,7 @@ def from_df( data: Union[pd.DataFrame, os.PathLike, Sequence[os.PathLike]], esmdata: Optional[Union[os.PathLike, dict]] = None, *, - read_csv_kwargs: Mapping[str, Any] = None, + read_csv_kwargs: Optional[Mapping[str, Any]] = None, name: str = "virtual", **intake_kwargs, ): @@ -186,14 +202,14 @@ def from_df( Parameters ---------- - data: DataFrame or path or sequence of path + data: DataFrame or path or sequence of paths A DataFrame or one or more paths to csv files. esmdata: path or dict, optional The "ESM collection data" as a path to a json file or a dict. If None (default), xscen's default :py:data:`esm_col_data` is used. read_csv_kwargs : dict, optional Extra kwargs to pass to `pd.read_csv`, in addition to the ones in :py:data:`csv_kwargs`. - name: str, optional + name: str If `metadata` doesn't contain it, a name to give to the catalog. See Also @@ -240,10 +256,13 @@ def _find_unique(series): else: return data.apply(_find_unique, result_type="reduce").to_dict() - def unique(self, columns: Union[str, list] = None): + def unique(self, columns: Optional[Union[str, Sequence[str]]] = None): """Return a series of unique values in the catalog. - Subsets on a columns list if specified. + Parameters + ---------- + columns : str or list of str, optional + The columns to get unique values from. If None, all columns are used. 
""" if self.df.size == 0: raise ValueError("Catalog is empty.") @@ -283,7 +302,14 @@ def search(self, **columns): ) return cat - def drop_duplicates(self, columns: Optional[list[str]] = None): # noqa: D102 + def drop_duplicates(self, columns: Optional[list[str]] = None): + """Drop duplicates in the catalog based on a subset of columns. + + Parameters + ---------- + columns: list of str, optional + The columns used to identify duplicates. If None, 'id' and 'path' are used. + """ # In case variables are being added in an existing Zarr, append them if columns is None: columns = ["id", "path"] @@ -311,7 +337,13 @@ def drop_duplicates(self, columns: Optional[list[str]] = None): # noqa: D102 subset=columns, keep="last", ignore_index=True, inplace=True ) - def check_valid(self): # noqa: D102 + def check_valid(self): + """Verify that all files in the catalog exist on disk and remove those that don't. + + If a file is a Zarr, it will also check that all variables are present and remove those that aren't. + """ + len_df = len(self.df) # This line is required to avoid a D202 pydocstyle error + # In case files were deleted manually, double-check that files do exist def check_existing(row): path = Path(row.path) @@ -338,14 +370,14 @@ def check_variables(row): exists = row.variable return exists - if len(self.df) > 0: + if len_df > 0: self.esmcat._df = self.df[ self.df.apply(check_existing, axis=1) ].reset_index(drop=True) - if len(self.df) > 0: + if len_df > 0: self.esmcat._df["variable"] = self.df.apply(check_variables, axis=1) - def exists_in_cat(self, **columns): + def exists_in_cat(self, **columns) -> bool: """ Check if there is an entry in the catalogue corresponding to the arguments given. @@ -355,7 +387,8 @@ def exists_in_cat(self, **columns): Returns ------- - Boolean if an entry exist + bool + True if there is an entry in the catalogue corresponding to the arguments given. 
""" exists = bool(len(self.search(**columns))) if exists: @@ -382,13 +415,13 @@ def to_dataset( Parameters ---------- - concat_on : list of strings or str, optional + concat_on : list of str or str, optional A list of catalog columns over which to concat the datasets (in addition to 'time'). Each will become a new dimension with the column values as coordinates. Xarray concatenation rules apply and can be acted upon through `xarray_combine_by_coords_kwargs`. - create_ensemble_on : list of strings or str, optional + create_ensemble_on : list of str or str, optional The given column values will be merged into a new id-like "realization" column, which will be concatenated over. - The given columns are removed from the dataset id, so as to remove them from the groupby_attrs logic. + The given columns are removed from the dataset id, to remove them from the groupby_attrs logic. Xarray concatenation rules apply and can be acted upon through `xarray_combine_by_coords_kwargs`. calendar : str, optional If `create_ensemble_on` is given, all datasets are converted to this calendar before concatenation. @@ -483,7 +516,7 @@ def preprocess(ds): return ds -class ProjectCatalog(DataCatalog): +class ProjectCatalog(DataCatalog, ABC): """A DataCatalog with additional 'write' functionalities that can update and upload itself. See Also @@ -497,7 +530,7 @@ def create( filename: Union[os.PathLike, str], *, project: Optional[dict] = None, - overwrite=False, + overwrite: bool = False, ): r"""Create a new project catalog from some project metadata. @@ -505,9 +538,9 @@ def create( Parameters ---------- - filename : PathLike + filename : os.PathLike or str A path to the json file (with or without suffix). - project : dict-like + project : dict, optional Metadata to create the catalog. If None, `CONFIG['project']` will be used. Valid fields are: @@ -520,7 +553,7 @@ def create( At least one of `id` and `title` must be given, the rest is optional. 
overwrite : bool - If True, will overwrite any existing JSON and CSV file + If True, will overwrite any existing JSON and CSV file. Returns ------- @@ -577,23 +610,23 @@ def __init__( df: Union[str, dict], *args, create: bool = False, - overwrite: bool = None, - project: dict = None, + overwrite: Optional[bool] = None, + project: Optional[dict] = None, **kwargs, ): """Open or create a project catalog. Parameters ---------- - df : str, dict + df : str or dict If str, this must be a path or URL to a catalog JSON file. If dict, this must be a dict representation of an ESM catalog. See the notes below. create : bool If True, and if 'df' is a string, this will create an empty ProjectCatalog if none already exists. - project : dict-like - Metadata to create the catalog, if required. - overwrite : bool + overwrite : bool, optional If this and 'create' are True, this will overwrite any existing JSON and CSV file with an empty catalog. + project : dict, optional + Metadata to create the catalog, if required. Notes ----- @@ -617,7 +650,7 @@ def update( self, df: Optional[ Union[ - "DataCatalog", + DataCatalog, intake_esm.esm_datastore, pd.DataFrame, pd.Series, @@ -639,8 +672,8 @@ def update( Parameters ---------- - df : Union[pd.DataFrame, pd.Series, DataCatalog] - Data to be added to the catalog. + df : Union[DataCatalog, intake_esm.esm_datastore, pd.DataFrame, pd.Series, Sequence[pd.Series]], optional + Data to be added to the catalog. If None, nothing is added, but the catalog is still updated. """ # Append the new DataFrame or Series if isinstance(df, DataCatalog) or isinstance(df, intake_esm.esm_datastore): @@ -705,7 +738,7 @@ def update( def update_from_ds( self, ds: xarray.Dataset, - path: str, + path: Union[os.PathLike, str], info_dict: Optional[dict] = None, **info_kwargs, ): @@ -728,10 +761,10 @@ def update_from_ds( ds : xarray.Dataset Dataset that we want to add to the catalog. 
The columns of the catalog will be filled from the global attributes starting with 'cat:' of the dataset. - info_dict : dict - Optional extra information to fill the catalog. - path : str - Path where ds is stored + info_dict : dict, optional + Extra information to fill in the catalog. + path : os.PathLike or str + Path to the file that contains the dataset. This will be added to the 'path' column of the catalog. """ d = {} @@ -751,7 +784,7 @@ def update_from_ds( ds.isel(time=-1).time.dt.strftime("%4Y-%m-%d %H:%M:%S").values ) - d["path"] = str(path) + d["path"] = str(Path(path)) # variable should be based on the Dataset d["variable"] = tuple(v for v in ds.data_vars if len(ds[v].dims) > 0) @@ -789,7 +822,7 @@ def __repr__(self) -> str: # noqa: D105 def concat_data_catalogs(*dcs): """Concatenate a multiple DataCatalogs. - Output catalog is the union of all rows and all derived variables, with the the "esmcat" + Output catalog is the union of all rows and all derived variables, with the "esmcat" of the first DataCatalog. Duplicate rows are dropped and the index is reset. """ registry = {} @@ -823,16 +856,21 @@ def _build_id(element: pd.Series, columns: list[str]): def generate_id( df: Union[pd.DataFrame, xr.Dataset], id_columns: Optional[list] = None -): # noqa: D401 - """Utility to create an ID from column entries. +) -> pd.Series: + """Create an ID from column entries. Parameters ---------- df: pd.DataFrame, xr.Dataset Data for which to create an ID. - id_columns : list + id_columns : list, optional List of column names on which to base the dataset definition. Empty columns will be skipped. If None (default), uses :py:data:`ID_COLUMNS`. + + Returns + ------- + pd.Series + A series of IDs, one per row of the input DataFrame. 
""" if isinstance(df, xr.Dataset): df = pd.DataFrame.from_dict( @@ -848,15 +886,13 @@ def generate_id( return df.apply(_build_id, axis=1, args=(id_columns,)) -def unstack_id( - df: Union[pd.DataFrame, ProjectCatalog, DataCatalog] -) -> dict: # noqa: D401 - """Utility that reverse-engineers an ID using catalog entries. +def unstack_id(df: Union[pd.DataFrame, ProjectCatalog, DataCatalog]) -> dict: + """Reverse-engineer an ID using catalog entries. Parameters ---------- df : Union[pd.DataFrame, ProjectCatalog, DataCatalog] - Either a Project/DataCatalog or the pandas DataFrame. + Either a Project/DataCatalog or a pandas DataFrame. Returns ------- @@ -892,7 +928,7 @@ def unstack_id( def subset_file_coverage( df: pd.DataFrame, - periods: list, + periods: Union[list[str], list[list[str]]], *, coverage: float = 0.99, duplicates_ok: bool = False, @@ -904,7 +940,7 @@ def subset_file_coverage( df : pd.DataFrame List of files to be evaluated, with at least a date_start and date_end column, which are expected to be `datetime64` objecs. - periods : list + periods : list of str or list of lists of str Either [start, end] or list of [start, end] for the periods to be evaluated. All periods must be covered, otherwise an empty subset is returned. coverage : float From 34e77489e249169224c8c72b2a98cdd220ea9797 Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 11:29:42 -0400 Subject: [PATCH 04/22] better typing in catutils --- xscen/catutils.py | 127 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 88 insertions(+), 39 deletions(-) diff --git a/xscen/catutils.py b/xscen/catutils.py index d96cd7ce..27a5bf04 100644 --- a/xscen/catutils.py +++ b/xscen/catutils.py @@ -51,7 +51,7 @@ """ -def register_parse_type(name, regex=r"([^\_\/\\]*)", group_count=1): +def register_parse_type(name: str, regex: str = r"([^\_\/\\]*)", group_count: int = 1): r"""Register a new parse type to be available in :py:func:`parse_directory` patterns. 
Function decorated by this will be registered in :py:data:`EXTRA_PARSE_TYPES`. @@ -94,7 +94,9 @@ def _parse_level(text: str) -> str: @register_parse_type( "datebounds", regex=r"(([\d]{4,15}(\-[\d]{4,15})?)|fx)", group_count=3 ) -def _parse_datebounds(text: str) -> tuple[str, str]: +def _parse_datebounds( + text: str, +) -> Union[list[str], tuple[None, None], tuple[str, str]]: """Parse helper to translate date bounds, used in the special DATES field.""" if "-" in text: return text.split("-") @@ -104,13 +106,26 @@ def _parse_datebounds(text: str) -> tuple[str, str]: def _find_assets( - root: Union[str, Path], + root: Union[str, os.PathLike], exts: set[str], lengths: set[int], dirglob: Optional[str] = None, ): - """Walk recursively over files in a directory, filtering according to a glob pattern, path depth and extensions.""" - root = str(root) # to be sure + """Walk recursively over files in a directory, filtering according to a glob pattern, path depth and extensions. + + Parameters + ---------- + root: str or os.PathLike + Path of the directory to walk through. + exts: set of strings + Set of file extensions to look for. + lengths: set of ints + Set of path depths to look for. + dirglob: str, optional + A glob pattern. If given, only parent folders matching this pattern are walked through. + This pattern cannot include the asset's basename. + """ + root = str(Path(root)) # to be sure for top, alldirs, files in os.walk(root): # Split zarr subdirectories from next iteration zarrs = [] @@ -162,20 +177,20 @@ def _compile_pattern(pattern: str) -> parse.Parser: def _name_parser( - path: os.PathLike, - root: os.PathLike, + path: Union[os.PathLike, str], + root: Union[os.PathLike, str], patterns: list[Union[str, parse.Parser]], read_from_file: Optional[Union[list[str], dict]] = None, attrs_map: Optional[dict] = None, xr_open_kwargs: Optional[dict] = None, -): +) -> Union[dict, None]: """Extract metadata information from the file path. 
Parameters ---------- - path : str + path : os.PathLike or str Full file path. - root : str + root : os.PathLike or str Root directory. Only the part of the path relative to this directory is checked against the patterns. patterns : list of str or parse.Parser List of patterns to try in `parse.parse`. See :py:func:`parse_directory` for the pattern specification. @@ -200,7 +215,7 @@ def _name_parser( parse_from_ds """ abs_path = Path(path) - path = abs_path.relative_to(root) + path = abs_path.relative_to(Path(root)) xr_open_kwargs = xr_open_kwargs or {} d = {} @@ -238,10 +253,10 @@ def _name_parser( def _parse_dir( - root: os.PathLike, + root: Union[os.PathLike, str], patterns: list[str], dirglob: Optional[str] = None, - checks: list[str] = None, + checks: Optional[list[str]] = None, read_from_file: Optional[Union[list[str], dict]] = None, attrs_map: Optional[dict] = None, xr_open_kwargs: Optional[dict] = None, @@ -251,7 +266,7 @@ def _parse_dir( Parameters ---------- - root: Pathlike + root: os.PathLike or str Path to walk through. patterns: list of strings or compiled parsers Patterns that the files will be checked against. 
@@ -356,7 +371,7 @@ def parse_worker(): # Skip the checks if none are requested (save some overhead) q = q_found if checks else q_checked - for path in _find_assets(root, exts, lengths, dirglob): + for path in _find_assets(Path(root), exts, lengths, dirglob): q.put(path) q_found.join() @@ -418,37 +433,37 @@ def _parse_first_ds( @parse_config def parse_directory( - directories: list, - patterns: list, + directories: list[Union[str, os.PathLike]], + patterns: list[str], *, - id_columns: list = None, + id_columns: Optional[list[str]] = None, read_from_file: Union[ bool, Sequence[str], tuple[Sequence[str], Sequence[str]], Sequence[tuple[Sequence[str], Sequence[str]]], ] = False, - homogenous_info: dict = None, - cvs: Union[str, PosixPath, dict] = None, + homogenous_info: Optional[dict] = None, + cvs: Optional[Union[str, os.PathLike, dict]] = None, dirglob: Optional[str] = None, - xr_open_kwargs: Mapping[str, Any] = None, + xr_open_kwargs: Optional[Mapping[str, Any]] = None, only_official_columns: bool = True, progress: bool = False, parallel_dirs: Union[bool, int] = False, - file_checks: list[str] = None, + file_checks: Optional[list[str]] = None, ) -> pd.DataFrame: r"""Parse files in a directory and return them as a pd.DataFrame. Parameters ---------- - directories : list + directories : list of paths List of directories to parse. The parse is recursive. - patterns : list + patterns : list of str List of possible patterns to be used by :py:func:`parse.parse` to decode the file names. See Notes below. - id_columns : list + id_columns : list of str, optional List of column names on which to base the dataset definition. Empty columns will be skipped. If None (default), it uses :py:data:`ID_COLUMNS`. - read_from_file : boolean or set of strings or tuple of 2 sets of strings. 
+ read_from_file : boolean or set of strings or tuple of 2 sets of strings or list of tuples If True, if some fields were not parsed from their path, files are opened and missing fields are parsed from their metadata, if found. If a sequence of column names, only those fields are parsed from the file, if missing. @@ -462,7 +477,7 @@ def parse_directory( homogenous_info : dict, optional Using the {column_name: description} format, information to apply to all files. These are applied before the `cvs`. - cvs: str or PosixPath or dict, optional + cvs: str or os.PathLike or dict, optional Dictionary with mapping from parsed term to preferred terms (Controlled VocabularieS) for each column. May have an additional "attributes" entry which maps from attribute names in the files to official column names. The attribute translation is done before the rest. @@ -681,7 +696,7 @@ def parse_directory( def parse_from_ds( - obj: Union[os.PathLike, xr.Dataset], + obj: Union[str, os.PathLike, xr.Dataset], names: Sequence[str], attrs_map: Optional[Mapping[str, str]] = None, **xrkwargs, @@ -697,6 +712,17 @@ def parse_from_ds( If the obj is the path to a Zarr dataset and none of "frequency", "xrfreq", "date_start" or "date_end" are requested, :py:func:`parse_from_zarr` is used instead of opening the file. + + Parameters + ---------- + obj: str or os.PathLike or xr.Dataset + Dataset to parse. + names: sequence of str + List of attributes to be parsed from the dataset. + attrs_map: dict, optional + In the case of non-standard names in the file, this can be used to match entries in the files to specific 'names' in the requested list. + xrkwargs: + Arguments to be passed to open_dataset(). 
""" get_time = bool( {"frequency", "xrfreq", "date_start", "date_end"}.intersection(names) @@ -757,7 +783,9 @@ def parse_from_ds( return attrs -def _parse_from_zarr(path: os.PathLike, get_vars=True, get_time=True): +def _parse_from_zarr( + path: Union[os.PathLike, str], get_vars: bool = True, get_time: bool = True +): """Obtain the list of variables, the time coordinate and the list of global attributes from a zarr dataset. Vars and attrs from reading the JSON files directly, time by reading the data with zarr. @@ -766,6 +794,15 @@ def _parse_from_zarr(path: os.PathLike, get_vars=True, get_time=True): - where .zattrs/_ARRAY_DIMENSIONS is not empty - where .zattrs/_ARRAY_DIMENSIONS does not contain the variable name - who do not appear in any "coordinates" attribute. + + Parameters + ---------- + path: os.PathLike or str + Path to the zarr dataset. + get_vars: bool + If True, return the list of variables. + get_time: bool + If True, return the time coordinate. """ path = Path(path) @@ -809,9 +846,21 @@ def _parse_from_zarr(path: os.PathLike, get_vars=True, get_time=True): return ds_attrs, variables, time -def _parse_from_nc(path: os.PathLike, get_vars=True, get_time=True): - """Obtain the list of variables, the time coordinate, and the list of global attributes from a netCDF dataset, using netCDF4.""" - ds = netCDF4.Dataset(str(path)) +def _parse_from_nc( + path: Union[os.PathLike, str], get_vars: bool = True, get_time: bool = True +): + """Obtain the list of variables, the time coordinate, and the list of global attributes from a netCDF dataset, using netCDF4. + + Parameters + ---------- + path: os.PathLike or str + Path to the netCDF dataset. + get_vars: bool + If True, return the list of variables. + get_time: bool + If True, return the time coordinate. 
+ """ + ds = netCDF4.Dataset(str(Path(path))) ds_attrs = {k: ds.getncattr(k) for k in ds.ncattrs()} variables = [] @@ -851,7 +900,7 @@ def _schema_option(option: dict, facets: dict): return answer -def _schema_level(schema: Union[dict, str], facets: dict): +def _schema_level(schema: Union[dict, list[str], str], facets: dict): if isinstance(schema, str): if schema.startswith("(") and schema.endswith(")"): optional = True @@ -881,7 +930,7 @@ def _schema_level(schema: Union[dict, str], facets: dict): raise ValueError(f"Invalid schema : {schema}") -def _schema_dates(facets, optional=False): +def _schema_dates(facets: dict, optional: bool = False): if facets.get("xrfreq") == "fx": return "fx" @@ -979,7 +1028,7 @@ def _read_schemas(schemas): def _build_path( data: Union[dict, xr.Dataset, xr.DataArray, pd.Series], schemas: dict, - root: Path, + root: Union[str, os.PathLike], get_type: bool = False, **extra_facets, ) -> Union[Path, tuple[Path, str]]: @@ -1031,7 +1080,7 @@ def _build_path( out = Path(*_schema_folders(schema["folders"], facets)) out = out / _schema_filename(schema["filename"], facets) if root is not None: - out = root / out + out = Path(root) / out if "format" in facets: # Add extension # Can't use `with_suffix` in case there are dots in the name out = out.parent / f"{out.name}.{facets['format']}" @@ -1045,7 +1094,7 @@ def _build_path( @parse_config def build_path( data: Union[dict, xr.Dataset, xr.DataArray, pd.Series, DataCatalog, pd.DataFrame], - schemas: Optional[Union[str, os.PathLike, list[dict], dict]] = None, + schemas: Optional[Union[str, os.PathLike, dict]] = None, root: Union[str, os.PathLike] = None, **extra_facets, ) -> Union[Path, DataCatalog, pd.DataFrame]: @@ -1053,12 +1102,12 @@ def build_path( Parameters ---------- - data : dict or catalog + data : dict or xr.Dataset or xr.DataArray or pd.Series or DataCatalog or pd.DataFrame Dict of facets. Or xarray object to read the facets from. 
In the latter case, variable and time-dependent facets are read with :py:func:`parse_from_ds` and supplemented with all the object's attribute, giving priority to the "official" xscen attributes (prefixed with `cat:`, see :py:func:`xscen.utils.get_cat_attrs`). Can also be a catalog or a DataFrame, in which a "new_path" column is generated for each item. - schemas : Path or dict or dict of dicts, optional + schemas : Path or dict, optional Path to YAML schematic of database schema. If None, will use a default schema. See the comments in the `xscen/data/file_schema.yml` file for more details on its construction. A dict of dict schemas can be given (same as reading the yaml). From 2b5f1f2f193debf854e0dddc3ba1235297b2776b Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 11:35:04 -0400 Subject: [PATCH 05/22] better typing in config --- xscen/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xscen/config.py b/xscen/config.py index e044e945..3c3106cc 100644 --- a/xscen/config.py +++ b/xscen/config.py @@ -51,7 +51,7 @@ from copy import deepcopy from functools import wraps from pathlib import Path -from typing import Any, Tuple +from typing import Any import xarray as xr import xclim as xc @@ -129,7 +129,7 @@ def args_as_str(*args: tuple[Any, ...]) -> tuple[str, ...]: return tuple(new_args) -def load_config(*elements, reset=False, verbose=False): +def load_config(*elements, reset: bool = False, verbose: bool = False): """Load configuration from given files or key=value pairs. 
Once all elements are loaded, special sections are dispatched to their module, but only if From 7cb7433c80895863a99fe13437bdcf2d58e24d2c Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 12:07:51 -0400 Subject: [PATCH 06/22] simpler optionals and better typing --- xscen/aggregate.py | 22 ++++++------ xscen/biasadjust.py | 18 +++++----- xscen/catalog.py | 38 ++++++++++---------- xscen/catutils.py | 34 +++++++++--------- xscen/diagnostics.py | 83 ++++++++++++++++++++++++-------------------- 5 files changed, 101 insertions(+), 94 deletions(-) diff --git a/xscen/aggregate.py b/xscen/aggregate.py index 50ac3e0f..5f0c596a 100644 --- a/xscen/aggregate.py +++ b/xscen/aggregate.py @@ -44,10 +44,10 @@ def _(s): def climatological_mean( ds: xr.Dataset, *, - window: Optional[int] = None, - min_periods: Optional[int] = None, + window: int = None, + min_periods: int = None, interval: int = 1, - periods: Optional[Union[list[str], list[list[str]]]] = None, + periods: Union[list[str], list[list[str]]] = None, to_level: Optional[str] = "climatology", ) -> xr.Dataset: """Compute the mean over 'year' for given time periods, respecting the temporal resolution of ds. @@ -372,13 +372,13 @@ def spatial_mean( ds: xr.Dataset, method: str, *, - spatial_subset: Optional[bool] = None, + spatial_subset: bool = None, call_clisops: bool = False, - region: Optional[Union[dict, str]] = None, - kwargs: Optional[dict] = None, - simplify_tolerance: Optional[float] = None, - to_domain: Optional[str] = None, - to_level: Optional[str] = None, + region: Union[dict, str] = None, + kwargs: dict = None, + simplify_tolerance: float = None, + to_domain: str = None, + to_level: str = None, ) -> xr.Dataset: """Compute the spatial mean using a variety of available methods. 
@@ -703,8 +703,8 @@ def produce_horizon( ModuleType, ], *, - periods: Optional[Union[list[str], list[list[str]]]] = None, - warminglevels: Optional[dict] = None, + periods: Union[list[str], list[list[str]]] = None, + warminglevels: dict = None, to_level: Optional[str] = "horizons", period: list = None, ) -> xr.Dataset: diff --git a/xscen/biasadjust.py b/xscen/biasadjust.py index b098733c..4c2413e4 100644 --- a/xscen/biasadjust.py +++ b/xscen/biasadjust.py @@ -60,12 +60,12 @@ def train( period: list[str], *, method: str = "DetrendedQuantileMapping", - group: Optional[Union[sdba.Grouper, str, dict]] = None, - xclim_train_args: Optional[dict] = None, + group: Union[sdba.Grouper, str, dict] = None, + xclim_train_args: dict = None, maximal_calendar: str = "noleap", - adapt_freq: Optional[dict] = None, - jitter_under: Optional[dict] = None, - jitter_over: Optional[dict] = None, + adapt_freq: dict = None, + jitter_under: dict = None, + jitter_over: dict = None, align_on: Optional[str] = "year", ) -> xr.Dataset: """ @@ -191,11 +191,11 @@ def adjust( dsim: xr.Dataset, periods: Union[list[str], list[list[str]]], *, - xclim_adjust_args: Optional[dict] = None, + xclim_adjust_args: dict = None, to_level: str = "biasadjusted", - bias_adjust_institution: Optional[str] = None, - bias_adjust_project: Optional[str] = None, - moving_yearly_window: Optional[dict] = None, + bias_adjust_institution: str = None, + bias_adjust_project: str = None, + moving_yearly_window: dict = None, align_on: Optional[str] = "year", ) -> xr.Dataset: """ diff --git a/xscen/catalog.py b/xscen/catalog.py index 777a67c2..b095afe6 100644 --- a/xscen/catalog.py +++ b/xscen/catalog.py @@ -192,9 +192,9 @@ def __init__( def from_df( cls, data: Union[pd.DataFrame, os.PathLike, Sequence[os.PathLike]], - esmdata: Optional[Union[os.PathLike, dict]] = None, + esmdata: Union[os.PathLike, dict] = None, *, - read_csv_kwargs: Optional[Mapping[str, Any]] = None, + read_csv_kwargs: Mapping[str, Any] = None, name: str = 
"virtual", **intake_kwargs, ): @@ -256,7 +256,7 @@ def _find_unique(series): else: return data.apply(_find_unique, result_type="reduce").to_dict() - def unique(self, columns: Optional[Union[str, Sequence[str]]] = None): + def unique(self, columns: Union[str, Sequence[str]] = None): """Return a series of unique values in the catalog. Parameters @@ -302,7 +302,7 @@ def search(self, **columns): ) return cat - def drop_duplicates(self, columns: Optional[list[str]] = None): + def drop_duplicates(self, columns: list[str] = None): """Drop duplicates in the catalog based on a subset of columns. Parameters @@ -397,8 +397,8 @@ def exists_in_cat(self, **columns) -> bool: def to_dataset( self, - concat_on: Optional[Union[list[str], str]] = None, - create_ensemble_on: Optional[Union[list[str], str]] = None, + concat_on: Union[list[str], str] = None, + create_ensemble_on: Union[list[str], str] = None, calendar: Optional[str] = "standard", **kwargs, ) -> xr.Dataset: @@ -529,7 +529,7 @@ def create( cls, filename: Union[os.PathLike, str], *, - project: Optional[dict] = None, + project: dict = None, overwrite: bool = False, ): r"""Create a new project catalog from some project metadata. @@ -610,8 +610,8 @@ def __init__( df: Union[str, dict], *args, create: bool = False, - overwrite: Optional[bool] = None, - project: Optional[dict] = None, + overwrite: bool = None, + project: dict = None, **kwargs, ): """Open or create a project catalog. @@ -648,14 +648,12 @@ def __init__( # TODO: Implement a way to easily destroy part of the catalog to "reset" some steps def update( self, - df: Optional[ - Union[ - DataCatalog, - intake_esm.esm_datastore, - pd.DataFrame, - pd.Series, - Sequence[pd.Series], - ] + df: Union[ + DataCatalog, + intake_esm.esm_datastore, + pd.DataFrame, + pd.Series, + Sequence[pd.Series], ] = None, ): """Update the catalog with new data and writes the new data to the csv file. 
@@ -673,7 +671,7 @@ def update( Parameters ---------- df : Union[DataCatalog, intake_esm.esm_datastore, pd.DataFrame, pd.Series, Sequence[pd.Series]], optional - Data to be added to the catalog. If None, nothing is added, but the catalog is still updated. + Data to be added to the catalog. If None, nothing is added, but the catalog is still updated. """ # Append the new DataFrame or Series if isinstance(df, DataCatalog) or isinstance(df, intake_esm.esm_datastore): @@ -739,7 +737,7 @@ def update_from_ds( self, ds: xarray.Dataset, path: Union[os.PathLike, str], - info_dict: Optional[dict] = None, + info_dict: dict = None, **info_kwargs, ): """Update the catalog with new data and writes the new data to the csv file. @@ -855,7 +853,7 @@ def _build_id(element: pd.Series, columns: list[str]): def generate_id( - df: Union[pd.DataFrame, xr.Dataset], id_columns: Optional[list] = None + df: Union[pd.DataFrame, xr.Dataset], id_columns: list = None ) -> pd.Series: """Create an ID from column entries. diff --git a/xscen/catutils.py b/xscen/catutils.py index 27a5bf04..4ad30f32 100644 --- a/xscen/catutils.py +++ b/xscen/catutils.py @@ -109,7 +109,7 @@ def _find_assets( root: Union[str, os.PathLike], exts: set[str], lengths: set[int], - dirglob: Optional[str] = None, + dirglob: str = None, ): """Walk recursively over files in a directory, filtering according to a glob pattern, path depth and extensions. @@ -180,9 +180,9 @@ def _name_parser( path: Union[os.PathLike, str], root: Union[os.PathLike, str], patterns: list[Union[str, parse.Parser]], - read_from_file: Optional[Union[list[str], dict]] = None, - attrs_map: Optional[dict] = None, - xr_open_kwargs: Optional[dict] = None, + read_from_file: Union[list[str], dict] = None, + attrs_map: dict = None, + xr_open_kwargs: dict = None, ) -> Union[dict, None]: """Extract metadata information from the file path. 
@@ -255,11 +255,11 @@ def _name_parser( def _parse_dir( root: Union[os.PathLike, str], patterns: list[str], - dirglob: Optional[str] = None, - checks: Optional[list[str]] = None, - read_from_file: Optional[Union[list[str], dict]] = None, - attrs_map: Optional[dict] = None, - xr_open_kwargs: Optional[dict] = None, + dirglob: str = None, + checks: list[str] = None, + read_from_file: Union[list[str], dict] = None, + attrs_map: dict = None, + xr_open_kwargs: dict = None, progress: bool = False, ): """Iterate and parses files in a directory, filtering according to basic pattern properties and optional checks. @@ -436,21 +436,21 @@ def parse_directory( directories: list[Union[str, os.PathLike]], patterns: list[str], *, - id_columns: Optional[list[str]] = None, + id_columns: list[str] = None, read_from_file: Union[ bool, Sequence[str], tuple[Sequence[str], Sequence[str]], Sequence[tuple[Sequence[str], Sequence[str]]], ] = False, - homogenous_info: Optional[dict] = None, - cvs: Optional[Union[str, os.PathLike, dict]] = None, - dirglob: Optional[str] = None, - xr_open_kwargs: Optional[Mapping[str, Any]] = None, + homogenous_info: dict = None, + cvs: Union[str, os.PathLike, dict] = None, + dirglob: str = None, + xr_open_kwargs: Mapping[str, Any] = None, only_official_columns: bool = True, progress: bool = False, parallel_dirs: Union[bool, int] = False, - file_checks: Optional[list[str]] = None, + file_checks: list[str] = None, ) -> pd.DataFrame: r"""Parse files in a directory and return them as a pd.DataFrame. @@ -698,7 +698,7 @@ def parse_directory( def parse_from_ds( obj: Union[str, os.PathLike, xr.Dataset], names: Sequence[str], - attrs_map: Optional[Mapping[str, str]] = None, + attrs_map: Mapping[str, str] = None, **xrkwargs, ): """Parse a list of catalog fields from the file/dataset itself. 
@@ -1094,7 +1094,7 @@ def _build_path( @parse_config def build_path( data: Union[dict, xr.Dataset, xr.DataArray, pd.Series, DataCatalog, pd.DataFrame], - schemas: Optional[Union[str, os.PathLike, dict]] = None, + schemas: Union[str, os.PathLike, dict] = None, root: Union[str, os.PathLike] = None, **extra_facets, ) -> Union[Path, DataCatalog, pd.DataFrame]: diff --git a/xscen/diagnostics.py b/xscen/diagnostics.py index 203488a9..1673f4df 100644 --- a/xscen/diagnostics.py +++ b/xscen/diagnostics.py @@ -1,5 +1,6 @@ -# noqa: D100 +"""Functions to perform diagnostics on datasets.""" import logging +import os import warnings from collections.abc import Sequence from copy import deepcopy @@ -15,7 +16,6 @@ from .config import parse_config from .indicators import load_xclim_module -from .io import save_to_zarr from .utils import ( add_attr, change_units, @@ -62,44 +62,44 @@ def health_checks( Parameters ---------- - ds: xr.Dataset | xr.DataArray + ds: xr.Dataset or xr.DataArray Dataset to check. - structure: dict + structure: dict, optional Dictionary with keys "dims" and "coords" containing the expected dimensions and coordinates. This check will fail is extra dimensions or coordinates are found. - calendar: str + calendar: str, optional Expected calendar. Synonyms should be detected correctly (e.g. "standard" and "gregorian"). - start_date: str + start_date: str, optional To check if the dataset starts at least at this date. - end_date: str + end_date: str, optional To check if the dataset ends at least at this date. - variables_and_units: dict + variables_and_units: dict, optional Dictionary containing the expected variables and units. - cfchecks: dict + cfchecks: dict, optional Dictionary where the key is the variable to check and the values are the cfchecks. The cfchecks themselves must be a dictionary with the keys being the cfcheck names and the values being the arguments to pass to the cfcheck. See `xclim.core.cfchecks` for more details. 
- freq: str + freq: str, optional Expected frequency, written as the result of xr.infer_freq(ds.time). - missing: dict | str | list + missing: dict or str or list of str, optional String, list of strings, or dictionary where the key is the method to check for missing data and the values are the arguments to pass to the method. The methods are: "missing_any", "at_least_n_valid", "missing_pct", "missing_wmo". See :py:func:`xclim.core.missing` for more details. - flags: dict + flags: dict, optional Dictionary where the key is the variable to check and the values are the flags. The flags themselves must be a dictionary with the keys being the data_flags names and the values being the arguments to pass to the data_flags. If `None` is passed instead of a dictionary, then xclim's default flags for the given variable are run. See :py:data:`xclim.core.utils.VARIABLES`. See :py:func:`xclim.core.dataflags.data_flags` for the list of possible flags. - flags_kwargs: dict + flags_kwargs: dict, optional Additional keyword arguments to pass to the data_flags ("dims" and "freq"). return_flags: bool Whether to return the Dataset created by data_flags. - raise_on: list + raise_on: list of str, optional Whether to raise an error if a check fails, else there will only be a warning. The possible values are the names of the checks. Use ["all"] to raise on all checks. Returns ------- - xr.Dataset | None + xr.Dataset or None Dataset containing the flags if return_flags is True & raise_on is False for the "flags" check. 
""" if isinstance(ds, xr.DataArray): @@ -292,39 +292,43 @@ def _message(): def properties_and_measures( ds: xr.Dataset, properties: Union[ - str, PosixPath, Sequence[Indicator], Sequence[tuple[str, Indicator]], ModuleType + str, + os.PathLike, + Sequence[Indicator], + Sequence[tuple[str, Indicator]], + ModuleType, ], - period: list = None, + period: list[str] = None, unstack: bool = False, rechunk: dict = None, - dref_for_measure: Optional[xr.Dataset] = None, - change_units_arg: Optional[dict] = None, + dref_for_measure: xr.Dataset = None, + change_units_arg: dict = None, to_level_prop: str = "diag-properties", to_level_meas: str = "diag-measures", -): +) -> tuple[xr.Dataset, xr.Dataset]: """Calculate properties and measures of a dataset. Parameters ---------- ds : xr.Dataset Input dataset. - properties : Union[str, PosixPath, Sequence[Indicator], Sequence[Tuple[str, Indicator]]] + properties : Union[str, os.PathLike, Sequence[Indicator], Sequence[tuple[str, Indicator]], ModuleType] Path to a YAML file that instructs on how to calculate properties. Can be the indicator module directly, or a sequence of indicators or a sequence of tuples (indicator name, indicator) as returned by `iter_indicators()`. - period : list + period : list of str, optional [start, end] of the period to be evaluated. The period will be selected on ds and dref_for_measure if it is given. unstack : bool Whether to unstack ds before computing the properties. - rechunk : dict + rechunk : dict, optional Dictionary of chunks to use for a rechunk before computing the properties. - dref_for_measure : xr.Dataset + dref_for_measure : xr.Dataset, optional Dataset of properties to be used as the ref argument in the computation of the measure. Ideally, this is the first output (prop) of a previous call to this function. Only measures on properties that are provided both in this dataset and in the properties list will be computed. If None, the second output of the function (meas) will be an empty Dataset. 
- change_units_arg : dict + change_units_arg : dict, optional If not None, calls `xscen.utils.change_units` on ds before computing properties using this dictionary for the `variables_and_units` argument. It can be useful to convert units before computing the properties, because it is sometimes @@ -426,7 +430,9 @@ def properties_and_measures( return prop, meas -def measures_heatmap(meas_datasets: Union[list, dict], to_level: str = "diag-heatmap"): +def measures_heatmap( + meas_datasets: Union[list[xr.Dataset], dict], to_level: str = "diag-heatmap" +) -> xr.Dataset: """Create a heatmap to compare the performance of the different datasets. The columns are properties and the rows are datasets. @@ -435,7 +441,7 @@ def measures_heatmap(meas_datasets: Union[list, dict], to_level: str = "diag-hea Parameters ---------- - meas_datasets : list or dict + meas_datasets : list of xr.Dataset or dict List or dictionary of datasets of measures of properties. If it is a dictionary, the keys will be used to name the rows. If it is a list, the rows will be given a number. @@ -444,7 +450,8 @@ def measures_heatmap(meas_datasets: Union[list, dict], to_level: str = "diag-hea Returns ------- - xr.DataArray + xr.Dataset + Dataset containing the heatmap. """ name_of_datasets = None if isinstance(meas_datasets, dict): @@ -504,24 +511,24 @@ def measures_heatmap(meas_datasets: Union[list, dict], to_level: str = "diag-hea def measures_improvement( - meas_datasets: Union[list, dict], to_level: str = "diag-improved" -): + meas_datasets: Union[list[xr.Dataset], dict], to_level: str = "diag-improved" +) -> xr.Dataset: """ - Calculate the fraction of improved grid points for each properties between two datasets of measures. + Calculate the fraction of improved grid points for each property between two datasets of measures. Parameters ---------- - meas_datasets: list|dict - List of 2 datasets: Initial dataset of measures and final (improved) dataset of measures. 
- Both datasets must have the same variables. - It is also possible to pass a dictionary where the values are the datasets and the key are not used. + meas_datasets: list of xr.Dataset or dict + List of 2 datasets: Initial dataset of measures and final (improved) dataset of measures. + Both datasets must have the same variables. + It is also possible to pass a dictionary where the values are the datasets and the key are not used. to_level: str processing_level to assign to the output dataset Returns ------- xr.Dataset - + Dataset containing information on the fraction of improved grid points for each property. """ if isinstance(meas_datasets, dict): meas_datasets = list(meas_datasets.values()) @@ -566,7 +573,9 @@ def measures_improvement( return ds_better -def measures_improvement_2d(dict_input: dict, to_level: str = "diag-improved-2d"): +def measures_improvement_2d( + dict_input: dict, to_level: str = "diag-improved-2d" +) -> xr.Dataset: """ Create a 2D dataset with dimension `realization` showing the fraction of improved grid cell. 
From 7644b071de667bb028f528d8af70ee50c300f385 Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 13:29:00 -0400 Subject: [PATCH 07/22] more typing fixes --- xscen/ensembles.py | 16 +++--- xscen/extract.py | 137 +++++++++++++++++++++++++++++++++------------ 2 files changed, 109 insertions(+), 44 deletions(-) diff --git a/xscen/ensembles.py b/xscen/ensembles.py index d545b415..72c8c6a0 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -1,6 +1,7 @@ -# noqa: D100 +"""Ensemble statistics and weights.""" import inspect import logging +import os import warnings from copy import deepcopy from itertools import chain, groupby @@ -21,7 +22,7 @@ @parse_config def ensemble_stats( - datasets: Any, + datasets: Union[dict, list[Union[str, os.PathLike, xr.Dataset, xr.DataArray]]], statistics: dict, *, create_kwargs: dict = None, @@ -33,17 +34,18 @@ def ensemble_stats( Parameters ---------- - datasets : Any + datasets : dict or list of str, Path, Dataset or DataArray List of file paths or xarray Dataset/DataArray objects to include in the ensemble. + If using a list, all objects must be of the same type. A dictionary can be passed instead of a list, in which case the keys are used as coordinates along the new `realization` axis. Tip: With a project catalog, you can do: `datasets = pcat.search(**search_dict).to_dataset_dict()`. statistics : dict xclim.ensembles statistics to be called. Dictionary in the format {function: arguments}. If a function requires 'ref', the dictionary entry should be the inputs of a .loc[], e.g. {"ref": {"horizon": "1981-2010"}} - create_kwargs : dict + create_kwargs : dict, optional Dictionary of arguments for xclim.ensembles.create_ensemble. - weights : xr.DataArray + weights : xr.DataArray, optional Weights to apply along the 'realization' dimension. This array cannot contain missing values. 
common_attrs_only : bool If True, keeps only the global attributes that are the same for all datasets and generate new id. @@ -165,7 +167,7 @@ def generate_weights( balance_experiments : bool If True, each experiment will be given a total weight of 1 (prior to subsequent weighting made through `attribute_weights`). This option requires the 'cat:experiment' attribute to be present in all datasets. - attribute_weights : dict + attribute_weights : dict, optional Nested dictionaries of weights to apply to each dataset. These weights are applied after the independence weighting. The first level of keys are the attributes for which weights are being given. The second level of keys are unique entries for the attribute, with the value being either an individual weight @@ -176,7 +178,7 @@ def generate_weights( skipna : bool If True, weights will be computed from attributes only. If False, weights will be computed from the number of non-missing values. skipna=False requires either a 'time' or 'horizon' dimension in the datasets. - v_for_skipna : str + v_for_skipna : str, optional Variable to use for skipna=False. If None, the first variable in the first dataset is used. standardize : bool If True, the weights are standardized to sum to 1 (per timestep/horizon, if skipna=False). 
diff --git a/xscen/extract.py b/xscen/extract.py index 96a9f6df..367eaff8 100644 --- a/xscen/extract.py +++ b/xscen/extract.py @@ -1,4 +1,4 @@ -# noqa: D100 +"""Functions to find and extract data from a catalog.""" import datetime import logging import os @@ -83,8 +83,8 @@ def extract_dataset( catalog: DataCatalog, *, variables_and_freqs: dict = None, - periods: list = None, - region: Optional[dict] = None, + periods: Union[list[str], list[list[str]]] = None, + region: dict = None, to_level: str = "extracted", ensure_correct_time: bool = True, xr_open_kwargs: dict = None, @@ -92,7 +92,7 @@ def extract_dataset( preprocess: Callable = None, resample_methods: Optional[dict] = None, mask: Union[bool, xr.Dataset, xr.DataArray] = False, -) -> Union[dict, xr.Dataset]: +) -> dict: """Take one element of the output of `search_data_catalogs` and returns a dataset, performing conversions and resampling as needed. Nothing is written to disk within this function. @@ -101,11 +101,11 @@ def extract_dataset( ---------- catalog : DataCatalog Sub-catalog for a single dataset, one value of the output of `search_data_catalogs`. - variables_and_freqs : dict + variables_and_freqs : dict, optional Variables and freqs, following a 'variable: xrfreq-compatible str' format. A list of strings can also be provided. If None, it will be read from catalog._requested_variables and catalog._requested_variable_freqs (set by `variables_and_freqs` in `search_data_catalogs`) - periods : list + periods : list of str or list of lists of str, optional Either [start, end] or list of [start, end] for the periods to be evaluated. Will be read from catalog._requested_periods if None. Leave both None to extract everything. region : dict, optional @@ -132,7 +132,7 @@ def extract_dataset( If the method is not given for a variable, it is guessed from the variable name and frequency, using the mapping in CVs/resampling_methods.json. If the variable is not found there, "mean" is used by default. 
- mask: xr.Dataset, bool + mask: xr.Dataset or xr.DataArray or bool A mask that is applied to all variables and only keeps data where it is True. Where the mask is False, variable values are replaced by NaNs. The mask should have the same dimensions as the variables extracted. @@ -141,10 +141,9 @@ def extract_dataset( Returns ------- - dict, xr.Dataset + dict Dictionary (keys = xrfreq) with datasets containing all available and computed variables, subsetted to the region, everything resampled to the requested frequency. - If there is a single frequency, a Dataset will be returned instead. Notes ----- @@ -367,8 +366,8 @@ def resample( da: xr.DataArray, target_frequency: str, *, - ds: Optional[xr.Dataset] = None, - method: Optional[str] = None, + ds: xr.Dataset = None, + method: str = None, missing: Union[str, dict] = None, ) -> xr.DataArray: """Aggregate variable to the target frequency. @@ -378,7 +377,7 @@ def resample( Parameters ---------- - da : xr.DataArray + da : xr.DataArray DataArray of the variable to resample, must have a "time" dimension and be of a finer temporal resolution than "target_frequency". 
target_frequency : str @@ -586,16 +585,16 @@ def resample( @parse_config def search_data_catalogs( data_catalogs: Union[ - Union[str, os.PathLike], list[Union[str, os.PathLike]], DataCatalog + str, os.PathLike, DataCatalog, list[Union[str, os.PathLike, DataCatalog]] ], variables_and_freqs: dict, *, - other_search_criteria: Optional[dict] = None, + other_search_criteria: dict = None, exclusions: dict = None, match_hist_and_fut: bool = False, - periods: list = None, + periods: Union[list[str], list[list[str]]] = None, coverage_kwargs: dict = None, - id_columns: Optional[list[str]] = None, + id_columns: list[str] = None, allow_resampling: bool = False, allow_conversion: bool = False, conversion_yaml: str = None, @@ -607,7 +606,7 @@ def search_data_catalogs( Parameters ---------- - data_catalogs : Union[Union[str, os.PathLike], List[Union[str, os.PathLike]], DataCatalog] + data_catalogs : str, os.PathLike, DataCatalog, or a list of those DataCatalog (or multiple, in a list) or paths to JSON/CSV data catalogs. They must use the same columns and aggregation options. variables_and_freqs : dict Variables and freqs to search for, following a 'variable: xr-freq-compatible-str' format. A list of strings can also be provided. @@ -617,11 +616,11 @@ def search_data_catalogs( More details available at https://intake-esm.readthedocs.io/en/stable/how-to/enforce-search-query-criteria-via-require-all-on.html . exclusions : dict, optional Same as other_search_criteria, but for eliminating results. Any result that matches any of the exclusions will be removed. - match_hist_and_fut: bool, optional + match_hist_and_fut: bool If True, historical and future simulations will be combined into the same line, and search results lacking one of them will be rejected. - periods : list + periods : list of str or list of lists of str, optional Either [start, end] or list of [start, end] for the periods to be evaluated. 
- coverage_kwargs : dict + coverage_kwargs : dict, optional Arguments to pass to subset_file_coverage (only used when periods is not None). id_columns : list, optional List of columns used to create a id column. If None is given, the original @@ -631,17 +630,17 @@ def search_data_catalogs( allow_conversion : bool If True (default) and if the requested variable cannot be found, intermediate variables are searched given that there exists a converting function in the "derived variable registry". - conversion_yaml : str + conversion_yaml : str, optional Path to a YAML file that defines the possible conversions (used alongside 'allow_conversion'=True). This file should follow the xclim conventions for building a virtual module. If None, the "derived variable registry" will be defined by the file in "xscen/xclim_modules/conversions.yml" - restrict_resolution : str + restrict_resolution : str, optional Used to restrict the results to the finest/coarsest resolution available for a given simulation. ['finest', 'coarsest']. - restrict_members : dict + restrict_members : dict, optional Used to restrict the results to a given number of members for a given simulation. Currently only supports {"ordered": int} format. - restrict_warming_level : bool, dict + restrict_warming_level : bool or dict, optional Used to restrict the results only to datasets that exist in the csv used to compute warming levels in `subset_warming_level`. If True, this will only keep the datasets that have a mip_era, source, experiment and member combination that exist in the csv. This does not guarantee that a given warming level will be reached, only that the datasets have corresponding columns in the csv. @@ -653,7 +652,7 @@ def search_data_catalogs( Notes ----- - - The "other_search_criteria" argument accepts wildcard (*) and regular expressions. + - The "other_search_criteria" and "exclusions" arguments accept wildcard (*) and regular expressions. 
- Frequency can be wildcarded with 'NA' in the `variables_and_freqs` dict. - Variable names cannot be wildcarded, they must be CMIP6-standard. @@ -907,10 +906,10 @@ def search_data_catalogs( logger.warning("Found no match corresponding to the search criteria.") if restrict_resolution is not None and len(catalogs) > 0: - catalogs = _restrict_by_resolution(catalogs, id_columns, restrict_resolution) + catalogs = _restrict_by_resolution(catalogs, restrict_resolution, id_columns) if restrict_members is not None and len(catalogs) > 0: - catalogs = _restrict_multimembers(catalogs, id_columns, restrict_members) + catalogs = _restrict_multimembers(catalogs, restrict_members, id_columns) return catalogs @@ -921,11 +920,11 @@ def get_warming_level( wl: float, *, window: int = 20, - tas_baseline_period: list = None, + tas_baseline_period: list[str] = None, ignore_member: bool = False, tas_csv: str = None, return_horizon: bool = True, -): +) -> Union[dict, list[str], str]: """Use the IPCC Atlas method to return the window of time over which the requested level of global warming is first reached. Parameters @@ -943,11 +942,11 @@ def get_warming_level( e.g. 2 for a global warming level of +2 degree Celsius above the mean temperature of the `tas_baseline_period`. window : int Size of the rolling window in years over which to compute the warming level. - tas_baseline_period : list + tas_baseline_period : list, optional [start, end] of the base period. The warming is calculated with respect to it. The default is ["1850", "1900"]. ignore_member : bool Decides whether to ignore the member when searching for the model run in tas_csv. - tas_csv : str + tas_csv : str, optional Path to a csv of annual global mean temperature with a row for each year and a column for each dataset. 
If None, it will default to data/IPCC_annual_global_tas.csv which was built from the IPCC atlas data from Iturbide et al., 2020 (https://doi.org/10.5194/essd-12-2959-2020) @@ -959,7 +958,7 @@ def get_warming_level( Returns ------- dict, list or str - If `realization` is a Dataset, a dict or a string, the output will follow the format indicated by `return_period`. + If `realization` is a Dataset, a dict or a string, the output will follow the format indicated by `return_horizon`. If `realization` is a list, the output will be a dictionary where the keys are the selected columns from the csv and the values follow the format indicated by `return_period`. """ tas_baseline_period = standardize_periods( @@ -1181,7 +1180,9 @@ def subset_warming_level( return ds_wl -def _dispatch_historical_to_future(catalog: DataCatalog, id_columns: list): +def _dispatch_historical_to_future( + catalog: DataCatalog, id_columns: list[str] = None +) -> DataCatalog: """Update a DataCatalog by recopying each "historical" entry to its corresponding future experiments. For examples, if an historical entry has corresponding "ssp245" and "ssp585" entries, @@ -1190,6 +1191,19 @@ def _dispatch_historical_to_future(catalog: DataCatalog, id_columns: list): with "experiment='ssp245'" includes the _historical_ assets (with no apparent distinction). "Historical" assets that did not find a match are removed from the output catalog. + + Parameters + ---------- + catalog : DataCatalog + Catalog to be evaluated. + id_columns : list of str, optional + List of columns to be used to identify unique simulations. + If None, defaults to ID_COLUMNS. + + Returns + ------- + DataCatalog + Catalog with the historical entries duplicated and modified to match the future experiments. 
""" expcols = [ "experiment", @@ -1272,9 +1286,26 @@ def _dispatch_historical_to_future(catalog: DataCatalog, id_columns: list): ) -def _restrict_by_resolution(catalogs: dict, id_columns: list, restrictions: str): +def _restrict_by_resolution( + catalogs: dict, restrictions: str, id_columns: list[str] = None +) -> dict: """Update the results from search_data_catalogs by removing simulations with multiple resolutions available. + Parameters + ---------- + catalogs : dict + Dictionary of DataCatalogs to be evaluated. + restrictions : str + Either 'finest' or 'coarsest'. + id_columns : list of str, optional + List of columns to be used to identify unique simulations. + If None, defaults to ID_COLUMNS. + + Returns + ------- + dict + Catalogs with duplicate simulations removed according to the resolution restrictions. + Notes ----- Currently supports: @@ -1395,10 +1426,27 @@ def _restrict_by_resolution(catalogs: dict, id_columns: list, restrictions: str) return catalogs -def _restrict_multimembers(catalogs: dict, id_columns: list, restrictions: dict): +def _restrict_multimembers( + catalogs: dict, restrictions: dict, id_columns: list[str] = None +): """Update the results from search_data_catalogs by removing simulations with multiple members available. Uses regex to try and adequately detect and order the member's identification number, but only tested for 'r-i-p'. + + Parameters + ---------- + catalogs : dict + Dictionary of DataCatalogs to be evaluated. + restrictions : dict + Dictionary of restrictions to be applied. Currently only supports {'ordered': int}. + id_columns : list of str, optional + List of columns to be used to identify unique simulations. + If None, defaults to ID_COLUMNS. + + Returns + ------- + dict + Catalogs where simulations with multiple members have been restricted to the requested maximum number. 
""" df = pd.concat([catalogs[s].df for s in catalogs.keys()]) # remove the member from the group_by @@ -1441,8 +1489,23 @@ def _restrict_multimembers(catalogs: dict, id_columns: list, restrictions: dict) return catalogs -def _restrict_wl(df, restrictions: dict): - """Update the results from search_data_catalogs by removing simulations that are not available in the warming level csv.""" +def _restrict_wl(df: pd.DataFrame, restrictions: dict): + """Update the results from search_data_catalogs according to warming level restrictions. + + Parameters + ---------- + df : pd.DataFrame + DataFrame to be evaluated. + restrictions : dict + Dictionary of restrictions to be applied. Entries are passed to get_warming_level. + If 'wl' is present, the warming level csv will be used to remove simulations that do not reach the requested warming level. + Otherwise, the warming level csv will be used to remove simulations that are not available in it. + + Returns + ------- + df : + Updated DataFrame. + """ tas_csv = restrictions["tas_csv"] if tas_csv is None: tas_csv = Path(__file__).parent / "data/IPCC_annual_global_tas.csv" From 630a5ec6b27cb5372cc4c56801df572303ea6662 Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 13:39:42 -0400 Subject: [PATCH 08/22] better typing --- xscen/catutils.py | 4 ++-- xscen/diagnostics.py | 4 ++-- xscen/ensembles.py | 7 ++++--- xscen/extract.py | 2 +- xscen/indicators.py | 40 +++++++++++++++++++++++++--------------- 5 files changed, 34 insertions(+), 23 deletions(-) diff --git a/xscen/catutils.py b/xscen/catutils.py index 4ad30f32..56d471e5 100644 --- a/xscen/catutils.py +++ b/xscen/catutils.py @@ -12,8 +12,8 @@ from fnmatch import fnmatch from functools import partial, reduce from multiprocessing import Pool -from pathlib import Path, PosixPath -from typing import Any, Optional, Union +from pathlib import Path +from typing import Any, Union import cftime import netCDF4 diff --git 
a/xscen/diagnostics.py b/xscen/diagnostics.py index 1673f4df..53b0bed1 100644 --- a/xscen/diagnostics.py +++ b/xscen/diagnostics.py @@ -4,9 +4,9 @@ import warnings from collections.abc import Sequence from copy import deepcopy -from pathlib import Path, PosixPath +from pathlib import Path from types import ModuleType -from typing import Optional, Union +from typing import Union import numpy as np import xarray as xr diff --git a/xscen/ensembles.py b/xscen/ensembles.py index 72c8c6a0..aadc74dd 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -6,7 +6,7 @@ from copy import deepcopy from itertools import chain, groupby from pathlib import Path -from typing import Any, Union +from typing import Union import numpy as np import xarray as xr @@ -22,7 +22,9 @@ @parse_config def ensemble_stats( - datasets: Union[dict, list[Union[str, os.PathLike, xr.Dataset, xr.DataArray]]], + datasets: Union[ + dict, list[Union[str, os.PathLike]], list[xr.Dataset], list[xr.DataArray] + ], statistics: dict, *, create_kwargs: dict = None, @@ -36,7 +38,6 @@ def ensemble_stats( ---------- datasets : dict or list of str, Path, Dataset or DataArray List of file paths or xarray Dataset/DataArray objects to include in the ensemble. - If using a list, all objects must be of the same type. A dictionary can be passed instead of a list, in which case the keys are used as coordinates along the new `realization` axis. Tip: With a project catalog, you can do: `datasets = pcat.search(**search_dict).to_dataset_dict()`. 
diff --git a/xscen/extract.py b/xscen/extract.py index 367eaff8..1f7ae3b5 100644 --- a/xscen/extract.py +++ b/xscen/extract.py @@ -7,7 +7,7 @@ from collections import defaultdict from copy import deepcopy from pathlib import Path -from typing import Callable, List, Optional, Union +from typing import Callable, Optional, Union import numpy as np import pandas as pd diff --git a/xscen/indicators.py b/xscen/indicators.py index 6f1cbc9d..4a1a5869 100644 --- a/xscen/indicators.py +++ b/xscen/indicators.py @@ -1,10 +1,11 @@ -# noqa: D100 +"""Functions to compute xclim indicators.""" import logging +import os from collections.abc import Sequence from functools import partial -from pathlib import Path, PosixPath +from pathlib import Path from types import ModuleType -from typing import Tuple, Union +from typing import Optional, Union import xarray as xr import xclim as xc @@ -23,19 +24,22 @@ __all__ = ["compute_indicators", "load_xclim_module"] -def load_xclim_module(filename, reload=False) -> ModuleType: +def load_xclim_module( + filename: Union[str, os.PathLike], reload: bool = False +) -> ModuleType: """Return the xclim module described by the yaml file (or group of yaml, jsons and py). Parameters ---------- - filename : pathlike - The filepath to the yaml file of the module or to the stem of yaml, jsons and py files. + filename : str or os.PathLike + The filepath to the yaml file of the module or to the stem of yaml, jsons and py files. reload : bool - If False (default) and the module already exists in `xclim.indicators`, it is not re-build. + If False (default) and the module already exists in `xclim.indicators`, it is not re-build. Returns ------- ModuleType + The xclim module. """ if not reload: # Same code as in xclim to get the module name. 
@@ -62,13 +66,17 @@ def load_xclim_module(filename, reload=False) -> ModuleType: def compute_indicators( ds: xr.Dataset, indicators: Union[ - str, Path, Sequence[Indicator], Sequence[tuple[str, Indicator]], ModuleType + str, + os.PathLike, + Sequence[Indicator], + Sequence[tuple[str, Indicator]], + ModuleType, ], *, - periods: list = None, + periods: Union[list[str], list[list[str]]] = None, restrict_years: bool = True, - to_level: str = "indicators", -) -> Union[dict, xr.Dataset]: + to_level: Optional[str] = "indicators", +) -> dict: """Calculate variables and indicators based on a YAML call to xclim. The function cuts the output to be the same years as the inputs. @@ -79,12 +87,12 @@ def compute_indicators( ---------- ds : xr.Dataset Dataset to use for the indicators. - indicators : Union[str, Path, Sequence[Indicator], Sequence[Tuple[str, Indicator]]] + indicators : Union[str, os.PathLike, Sequence[Indicator], Sequence[tuple[str, Indicator]], ModuleType] Path to a YAML file that instructs on how to calculate missing variables. Can also be only the "stem", if translations and custom indices are implemented. Can be the indicator module directly, or a sequence of indicators or a sequence of tuples (indicator name, indicator) as returned by `iter_indicators()`. - periods : list + periods : list of str or list of lists of str, optional Either [start, end] or list of [start, end] of continuous periods over which to compute the indicators. This is needed when the time axis of ds contains some jumps in time. If None, the dataset will be considered continuous. 
restrict_years: @@ -105,7 +113,7 @@ def compute_indicators( -------- xclim.indicators, xclim.core.indicator.build_indicator_module_from_yaml """ - if isinstance(indicators, (str, Path)): + if isinstance(indicators, (str, os.PathLike)): logger.debug("Loading indicator module.") module = load_xclim_module(indicators) indicators = module.iter_indicators() @@ -232,7 +240,9 @@ def _infer_freq_from_meta(ind): def registry_from_module( - module, registry=None, variable_column="variable" + module: ModuleType, + registry: DerivedVariableRegistry = None, + variable_column: str = "variable", ) -> DerivedVariableRegistry: """Convert a xclim virtual indicators module to an intake_esm Derived Variable Registry. From 21ca2d9eddf44fd80f3f3b2c0fdc22927dcdaa1e Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 14:16:00 -0400 Subject: [PATCH 09/22] typing --- xscen/io.py | 99 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 40 deletions(-) diff --git a/xscen/io.py b/xscen/io.py index b62c482b..c7a2fe8c 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -1,4 +1,4 @@ -# noqa: D100 +"""Input/Output functions for xscen.""" import datetime import logging import os @@ -45,12 +45,12 @@ ] -def get_engine(file: Union[str, Path]) -> str: +def get_engine(file: Union[str, os.PathLike]) -> str: """Use functionality of h5py to determine if a NetCDF file is compatible with h5netcdf. Parameters ---------- - file : str + file : str or os.PathLike Path to the file. Returns @@ -70,7 +70,7 @@ def get_engine(file: Union[str, Path]) -> str: def estimate_chunks( - ds: Union[str, xr.Dataset], + ds: Union[str, os.PathLike, xr.Dataset], dims: list, target_mb: float = 50, chunk_per_variable: bool = False, @@ -83,7 +83,7 @@ def estimate_chunks( Either a xr.Dataset or the path to a NetCDF file. Existing chunks are not taken into account. dims : list Dimension(s) on which to estimate the chunking. 
Not implemented for more than 2 dimensions. - target_mb : float, optional + target_mb : float Roughly the size of chunks (in Mb) to aim for. chunk_per_variable : bool If True, the output will be separated per variable. Otherwise, a common chunking will be found. @@ -91,8 +91,7 @@ def estimate_chunks( Returns ------- dict - dictionary of estimated chunks - + A dictionary mapping dimensions to chunk sizes. """ def _estimate_chunks(ds, target_mb, size_of_slice, rechunk_dims): @@ -154,7 +153,7 @@ def _estimate_chunks(ds, target_mb, size_of_slice, rechunk_dims): out = {} # If ds is the path to a file, use NetCDF4 - if isinstance(ds, str): + if isinstance(ds, (str, os.PathLike)): ds = netCDF4.Dataset(ds, "r") # Loop on variables @@ -231,7 +230,7 @@ def subset_maxsize( Returns ------- list - list of xr.Dataset subsetted alongside 'time' to limit the filesize to the requested maximum. + List of xr.Dataset subsetted alongside 'time' to limit the filesize to the requested maximum. """ # Estimate the size of the dataset size_of_file = 0 @@ -297,7 +296,7 @@ def _coerce_attrs(attrs): attrs[k] = str(attrs[k]) -def _np_bitround(array, keepbits): +def _np_bitround(array: xr.DataArray, keepbits: int): """Bitround for Arrays.""" codec = BitRound(keepbits=keepbits) data = array.copy() # otherwise overwrites the input @@ -306,7 +305,15 @@ def _np_bitround(array, keepbits): def round_bits(da: xr.DataArray, keepbits: int): - """Round floating point variable by keeping a given number of bits in the mantissa, dropping the rest.""" + """Round floating point variable by keeping a given number of bits in the mantissa, dropping the rest. This allows for a much better compression. + + Parameters + ---------- + da : xr.DataArray + Variable to be rounded. + keepbits : int + The number of bits of the mantissa to keep. 
+ """ da = xr.apply_ufunc( _np_bitround, da, keepbits, dask="parallelized", keep_attrs=True ) @@ -321,7 +328,7 @@ def round_bits(da: xr.DataArray, keepbits: int): return da -def _get_keepbits(bitround, varname, vartype): +def _get_keepbits(bitround: Union[bool, int, dict], varname: str, vartype): # Guess the number of bits to keep depending on how bitround was passed, the var dtype and the var name. if not np.issubdtype(vartype, np.floating) or bitround is False: if isinstance(bitround, dict) and varname in bitround: @@ -341,20 +348,20 @@ def _get_keepbits(bitround, varname, vartype): @parse_config def save_to_netcdf( ds: xr.Dataset, - filename: str, + filename: Union[str, os.PathLike], *, - rechunk: Optional[dict] = None, + rechunk: dict = None, bitround: Union[bool, int, dict] = False, compute: bool = True, - netcdf_kwargs: Optional[dict] = None, -) -> None: - """Save a Dataset to NetCDF, rechunking if requested. + netcdf_kwargs: dict = None, +): + """Save a Dataset to NetCDF, rechunking or compressing if requested. Parameters ---------- ds : xr.Dataset Dataset to be saved. - filename : str + filename : str or os.PathLike Name of the NetCDF file to be saved. rechunk : dict, optional This is a mapping from dimension name to new chunks (in any format understood by dask). @@ -362,10 +369,10 @@ def save_to_netcdf( dimension names. Rechunking is only done on *data* variables sharing dimensions with this argument. bitround : bool or int or dict - If not False, float variables are bit-rounded by dropping a certain number of bits from their mantissa, allowing for a much better compression. - If an int, this is the number of bits to keep for all float variables. - If a dict, a mapping from variable name to the number of bits to keep. - If True, the number of bits to keep is guessed based on the variable's name, defaulting to 12, which yields a relative error below 0.013%. 
+ If not False, float variables are bit-rounded by dropping a certain number of bits from their mantissa, allowing for a much better compression. + If an int, this is the number of bits to keep for all float variables. + If a dict, a mapping from variable name to the number of bits to keep. + If True, the number of bits to keep is guessed based on the variable's name, defaulting to 12, which yields a relative error below 0.013%. compute : bool Whether to start the computation or return a delayed object. netcdf_kwargs : dict, optional @@ -404,18 +411,18 @@ def save_to_netcdf( @parse_config def save_to_zarr( ds: xr.Dataset, - filename: str, + filename: Union[str, os.PathLike], *, - rechunk: Optional[dict] = None, - zarr_kwargs: Optional[dict] = None, + rechunk: dict = None, + zarr_kwargs: dict = None, compute: bool = True, encoding: dict = None, bitround: Union[bool, int, dict] = False, mode: str = "f", itervar: bool = False, timeout_cleanup: bool = True, -) -> None: - """Save a Dataset to Zarr format, rechunking if requested. +): + """Save a Dataset to Zarr format, rechunking and compressing if requested. According to mode, removes variables that we don't want to re-compute in ds. @@ -613,9 +620,9 @@ def _to_dataframe( def to_table( ds: Union[xr.Dataset, xr.DataArray], *, - row: Union[None, str, Sequence[str]] = None, - column: Union[None, str, Sequence[str]] = None, - sheet: Union[None, str, Sequence[str]] = None, + row: Union[str, Sequence[str]] = None, + column: Union[str, Sequence[str]] = None, + sheet: Union[str, Sequence[str]] = None, coords: Union[bool, str, Sequence[str]] = True, ) -> Union[pd.DataFrame, dict]: """Convert a dataset to a pandas DataFrame with support for multicolumns and multisheet. @@ -715,6 +722,18 @@ def make_toc(ds: Union[xr.Dataset, xr.DataArray], loc: str = None) -> pd.DataFra This return a simple DataFrame with variable names as index, the long_name as "description" and units. 
Column names and long names are taken from the activated locale if found, otherwise the english version is taken. + + Parameters + ---------- + ds : xr.Dataset or xr.DataArray + Dataset or DataArray from which to extract the relevant metadata. + loc : str, optional + The locale to use. If None, either the first locale in the list of activated xclim locales is used, or "en" if none is activated. + + Returns + ------- + pd.DataFrame + A DataFrame with variables as index, and columns "description" and "units". """ if loc is None: loc = (XC_OPTIONS[METADATA_LOCALES] or ["en"])[0] @@ -745,12 +764,12 @@ def make_toc(ds: Union[xr.Dataset, xr.DataArray], loc: str = None) -> pd.DataFra def save_to_table( ds: Union[xr.Dataset, xr.DataArray], - filename: str, - output_format: Optional[str] = None, + filename: Union[str, os.PathLike], + output_format: str = None, *, - row: Union[None, str, Sequence[str]] = None, + row: Union[str, Sequence[str]] = None, column: Union[None, str, Sequence[str]] = "variable", - sheet: Union[None, str, Sequence[str]] = None, + sheet: Union[str, Sequence[str]] = None, coords: Union[bool, Sequence[str]] = True, col_sep: str = "_", row_sep: str = None, @@ -767,7 +786,7 @@ def save_to_table( Dataset or DataArray to be saved. If a Dataset with more than one variable is given, the dimension "variable" must appear in one of `row`, `column` or `sheet`. - filename : str + filename : str or os.PathLike Name of the file to be saved. output_format: {'csv', 'excel', ...}, optional The output format. If None (default), it is inferred @@ -848,7 +867,7 @@ def save_to_table( getattr(out, f"to_{output_format}")(filename, **kwargs) -def rechunk_for_saving(ds, rechunk): +def rechunk_for_saving(ds: xr.Dataset, rechunk: dict): """Rechunk before saving to .zarr or .nc, generalized as Y/X for different axes lat/lon, rlat/rlon. 
Parameters @@ -891,10 +910,10 @@ def rechunk( path_in: Union[os.PathLike, str, xr.Dataset], path_out: Union[os.PathLike, str], *, - chunks_over_var: Optional[dict] = None, - chunks_over_dim: Optional[dict] = None, + chunks_over_var: dict = None, + chunks_over_dim: dict = None, worker_mem: str, - temp_store: Optional[Union[os.PathLike, str]] = None, + temp_store: Union[os.PathLike, str] = None, overwrite: bool = False, ) -> None: """Rechunk a dataset into a new zarr. @@ -914,7 +933,7 @@ def rechunk( The maximal memory usage of each task. When using a distributed Client, this an approximate memory per thread. Each worker of the client should have access to 10-20% more memory than this times the number of threads. - temp_store : path, str, optional + temp_store : path or str, optional A path to a zarr where to store intermediate results. overwrite : bool If True, it will delete whatever is in path_out before doing the rechunking. From 89de9ca141862c733129340997144cfd32e62b7a Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 16:50:13 -0400 Subject: [PATCH 10/22] more typing fixes --- xscen/io.py | 2 +- xscen/reduce.py | 25 ++++---- xscen/regrid.py | 26 ++++---- xscen/scripting.py | 36 ++++++----- xscen/spatial.py | 8 +-- xscen/testing.py | 15 +++-- xscen/utils.py | 146 ++++++++++++++++++++++++++++----------------- 7 files changed, 156 insertions(+), 102 deletions(-) diff --git a/xscen/io.py b/xscen/io.py index c7a2fe8c..eb2af847 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -7,7 +7,7 @@ from collections.abc import Sequence from inspect import signature from pathlib import Path -from typing import Optional, Union +from typing import Union import h5py import netCDF4 diff --git a/xscen/reduce.py b/xscen/reduce.py index 4e56c24d..d0fa7471 100644 --- a/xscen/reduce.py +++ b/xscen/reduce.py @@ -1,6 +1,6 @@ -# noqa: D100 +"""Functions to reduce an ensemble of simulations.""" import logging -from typing import Union +from typing 
import Optional, Union import numpy as np import xarray as xr @@ -15,7 +15,10 @@ @parse_config def build_reduction_data( - datasets: Union[dict, list], *, xrfreqs: list = None, horizons: list = None + datasets: Union[dict, list[xr.Dataset]], + *, + xrfreqs: list[str] = None, + horizons: list[str] = None, ) -> xr.DataArray: """Construct the input required for ensemble reduction. @@ -25,10 +28,10 @@ def build_reduction_data( ---------- datasets : Union[dict, list] Dictionary of datasets in the format {"id": dataset}, or list of datasets. This can be generated by calling .to_dataset_dict() on a catalog. - xrfreqs : list + xrfreqs : list of str, optional List of unique frequencies across the datasets. If None, the script will attempt to guess the frequencies from the datasets' metadata or with xr.infer_freq(). - horizons : list + horizons : list of str, optional Subset of horizons on which to create the data. Returns @@ -76,8 +79,8 @@ def build_reduction_data( @parse_config -def reduce_ensemble(data: xr.DataArray, method: str, kwargs: dict): # noqa: D401 - """Wrapper for the ensemble reduction methods in xclim.ensembles. +def reduce_ensemble(data: xr.DataArray, method: str, kwargs: dict): + """Reduce an ensemble of simulations using clustering algorithms from xclim.ensembles. 
Parameters ---------- @@ -119,7 +122,7 @@ def reduce_ensemble(data: xr.DataArray, method: str, kwargs: dict): # noqa: D40 return selected, clusters, fig_data -def _concat_criteria(criteria, ens): +def _concat_criteria(criteria: Optional[xr.DataArray], ens: xr.Dataset): """Combine all variables and dimensions excepting 'realization'.""" if criteria is None: i = 0 @@ -130,10 +133,12 @@ def _concat_criteria(criteria, ens): da = ens[vv] da.name = "values" # Stack all dimensions that are not 'realization' - da = da.stack({"criteria": {d for d in da.dims}.difference(["realization"])}) + da = da.stack( + {"criteria": list({d for d in da.dims}.difference(["realization"]))} + ) da = da.assign_coords({"criteria": np.arange(i, i + len(da.criteria))}) if "horizon" in da.coords: - da = da.drop("horizon") + da = da.drop_vars("horizon") if criteria is None: criteria = da diff --git a/xscen/regrid.py b/xscen/regrid.py index 396c56ed..538c9609 100644 --- a/xscen/regrid.py +++ b/xscen/regrid.py @@ -1,10 +1,9 @@ -# noqa: D100 +"""Regrid datasets using xESMF.""" import datetime import operator import os import warnings from copy import deepcopy -from pathlib import PosixPath from typing import Optional, Union import cartopy.crs as ccrs @@ -16,7 +15,6 @@ from .config import parse_config # TODO: Implement logging, warnings, etc. -# TODO: Change all paths to PosixPath objects, including in the catalog? # TODO: Add an option to call xesmf.util.grid_2d or xesmf.util.grid_global # TODO: Implement support for an OBS2SIM kind of interpolation @@ -28,10 +26,10 @@ def regrid_dataset( ds: xr.Dataset, ds_grid: xr.Dataset, - weights_location: Union[str, PosixPath], + weights_location: Union[str, os.PathLike], *, - regridder_kwargs: Optional[dict] = None, - intermediate_grids: Optional[dict] = None, + regridder_kwargs: dict = None, + intermediate_grids: dict = None, to_level: str = "regridded", ) -> xr.Dataset: """Regrid a dataset according to weights and a reference grid. 
@@ -43,15 +41,15 @@ def regrid_dataset( ds : xarray.Dataset Dataset to regrid. The Dataset needs to have lat/lon coordinates. Supports a 'mask' variable compatible with ESMF standards. - weights_location : Union[str, PosixPath] + weights_location : Union[str, os.PathLike] Path to the folder where weight file is saved. ds_grid : xr.Dataset Destination grid. The Dataset needs to have lat/lon coordinates. Supports a 'mask' variable compatible with ESMF standards. - regridder_kwargs : dict + regridder_kwargs : dict, optional Arguments to send xe.Regridder(). If it contains `skipna` or `out_chunks`, those are passed to the regridder call directly. - intermediate_grids : dict + intermediate_grids : dict, optional This argument is used to do a regridding in many steps, regridding to regular grids before regridding to the final ds_grid. This is useful when there is a large jump in resolution between ds and ds grid. @@ -291,9 +289,9 @@ def cmp(arg1, op, arg2): def _regridder( ds_in: xr.Dataset, ds_grid: xr.Dataset, - filename: str, + filename: Union[str, os.PathLike], *, - method: Optional[str] = "bilinear", + method: str = "bilinear", unmapped_to_nan: Optional[bool] = True, **kwargs, ) -> xe.frontend.Regridder: @@ -305,9 +303,9 @@ def _regridder( Incoming grid. The Dataset needs to have lat/lon coordinates. ds_grid : xr.Dataset Destination grid. The Dataset needs to have lat/lon coordinates. - filename : str + filename : str or os.PathLike Path to the NetCDF file with weights information. - method : str, optional + method : str Interpolation method. unmapped_to_nan : bool, optional Arguments to send xe.Regridder(). 
@@ -346,7 +344,7 @@ def _regridder( return regridder -def create_bounds_rotated_pole(ds): +def create_bounds_rotated_pole(ds: xr.Dataset): """Create bounds for rotated pole datasets.""" ds = ds.cf.add_bounds(["rlat", "rlon"]) diff --git a/xscen/scripting.py b/xscen/scripting.py index 0d252911..78dad543 100644 --- a/xscen/scripting.py +++ b/xscen/scripting.py @@ -159,9 +159,9 @@ def err_handler(self, *exc_info): @parse_config def send_mail_on_exit( *, - subject: Optional[str] = None, - msg_ok: Optional[str] = None, - msg_err: Optional[str] = None, + subject: str = None, + msg_ok: str = None, + msg_err: str = None, on_error_only: bool = False, skip_ctrlc: bool = True, **mail_kwargs, @@ -243,7 +243,7 @@ class measure_time: def __init__( self, - name: Optional[str] = None, + name: str = None, cpu: bool = False, logger: logging.Logger = logger, ): @@ -367,7 +367,7 @@ def save_and_update( Dataset to save. pcat: ProjectCatalog Catalog to update after saving the dataset. - path: str | os.pathlike + path: str or os.pathlike, optional Path where to save the dataset. If the string contains variables in curly bracket. They will be filled by catalog attributes. If None, the `catutils.build_path` fonction will be used to create a path. @@ -375,11 +375,11 @@ def save_and_update( Format of the file. If None, look for the following in order: build_path_kwargs['format'], a suffix in path, ds.attrs['cat:format']. If nothing is found, it will default to zarr. - build_path_kwargs: dict + build_path_kwargs: dict, optional Arguments to pass to `build_path`. - save_kwargs: + save_kwargs: dict, optional Arguments to pass to `save_to_netcdf` or `save_to_zarr`. - update_kwargs: dict + update_kwargs: dict, optional Arguments to pass to `update_from_ds`. """ build_path_kwargs = build_path_kwargs or {} @@ -388,8 +388,9 @@ def save_and_update( # try to guess file format if not given. 
if file_format is None: - file_format = build_path_kwargs.get("format", None) - if path is not None and Path(path).suffix: + if "format" in build_path_kwargs: + file_format = build_path_kwargs.get("format") + elif path is not None and Path(path).suffix: file_format = Path(path).suffix.split(".")[-1] else: file_format = ds.attrs.get("cat:format", "zarr") @@ -423,7 +424,12 @@ def save_and_update( logger.info(f"File {path} has saved succesfully and the catalog was updated.") -def move_and_delete(moving, pcat, deleting=None, copy=False): +def move_and_delete( + moving: list[list[Union[str, os.PathLike]]], + pcat: ProjectCatalog, + deleting: list[Union[str, os.PathLike]] = None, + copy: bool = False, +): """ First, move files, then update the catalog with new locations. Finally, delete directories. @@ -431,14 +437,14 @@ def move_and_delete(moving, pcat, deleting=None, copy=False): Parameters ---------- - moving: list - list of lists of path of files to move with format: [[source 1, destination1], [source 2, destination2],...] + moving: list of lists of str or os.PathLike + list of lists of path of files to move, following the format: [[source 1, destination1], [source 2, destination2],...] pcat: ProjectCatalog Catalog to update with new destinations - deleting: list + deleting: list of str or os.PathLike, optional list of directories to be deleted including all contents and recreated empty. E.g. the working directory of a workflow. - copy: bool + copy: bool, optional If True, copy directories instead of moving them. """ diff --git a/xscen/spatial.py b/xscen/spatial.py index e16716b9..ab48dde7 100644 --- a/xscen/spatial.py +++ b/xscen/spatial.py @@ -23,7 +23,7 @@ @parse_config -def creep_weights(mask, n=1, mode="clip"): +def creep_weights(mask: xr.DataArray, n: int = 1, mode: str = "clip") -> xr.DataArray: """Compute weights for the creep fill. The output is a sparse matrix with the same dimensions as `mask`, twice. 
@@ -84,7 +84,7 @@ def creep_weights(mask, n=1, mode="clip"): @parse_config -def creep_fill(da, w): +def creep_fill(da: xr.DataArray, w: xr.DataArray) -> xr.DataArray: """Creep fill using pre-computed weights. Parameters @@ -128,7 +128,7 @@ def subset( region: dict = None, *, name: str = None, - method: str = None, + method: str = None, # FIXME: Once the region argument is removed, this should be made mandatory. tile_buffer: float = 0, **kwargs, ) -> xr.Dataset: @@ -144,7 +144,7 @@ def subset( Dataset to be subsetted. region: dict Deprecated argument that is there for legacy reasons and will be abandoned eventually. - name: str + name: str, optional Used to rename the 'cat:domain' attribute. method : str ['gridpoint', 'bbox', shape','sel'] diff --git a/xscen/testing.py b/xscen/testing.py index fcd894f8..bb478dff 100644 --- a/xscen/testing.py +++ b/xscen/testing.py @@ -1,4 +1,5 @@ """Testing utilities for xscen.""" +from typing import Union import numpy as np import pandas as pd @@ -21,7 +22,7 @@ def datablock_3d( freq: str = "D", units: str = None, as_dataset: bool = False, -): +) -> Union[xr.DataArray, xr.Dataset]: """Create a generic timeseries object based on pre-defined dictionaries of existing variables. Parameters @@ -46,8 +47,8 @@ def datablock_3d( The starting date of the time coordinate. freq : str The frequency of the time coordinate. - units : str - The units of the variable. + units : str, optional + The units of the variable. If None, the units are inferred from the variable name. as_dataset : bool If True, return a Dataset, else a DataArray. """ @@ -165,7 +166,13 @@ def datablock_3d( def fake_data( - nyears, nx, ny, rand_type="random", seed=0, amplitude=1, offset=0 + nyears: int, + nx: int, + ny: int, + rand_type: str = "random", + seed: int = 0, + amplitude: float = 1.0, + offset: float = 0.0, ) -> np.ndarray: """Generate fake data for testing. 
diff --git a/xscen/utils.py b/xscen/utils.py index ff19854c..7603f874 100644 --- a/xscen/utils.py +++ b/xscen/utils.py @@ -67,7 +67,13 @@ ) from err -def update_attr(ds, attr, new, others=None, **fmt): +def update_attr( + ds: Union[xr.Dataset, xr.DataArray], + attr: str, + new: str, + others: Sequence[Union[xr.Dataset, xr.DataArray]] = None, + **fmt, +) -> Union[xr.Dataset, xr.DataArray]: """Format an attribute referencing itself in a translatable way. Parameters @@ -81,7 +87,7 @@ def update_attr(ds, attr, new, others=None, **fmt): of the attribute with the "{attr}" field. others: Sequence of Datasets or DataArrays Other objects from which we can extract the attribute `attr`. - These can be be referenced as "{attrXX}" in `new`, where XX is the based-1 index of the other source in `others`. + These can be referenced as "{attrXX}" in `new`, where XX is the based-1 index of the other source in `others`. If they don't have the `attr` attribute, an empty string is sent to the string formatting. See notes. fmt: @@ -138,7 +144,7 @@ def update_attr(ds, attr, new, others=None, **fmt): ) -def add_attr(ds, attr, new, **fmt): +def add_attr(ds: Union[xr.Dataset, xr.DataArray], attr: str, new: str, **fmt): """Add a formatted translatable attribute to a dataset.""" ds.attrs[attr] = new.format(**fmt) for loc in XC_OPTIONS[METADATA_LOCALES]: @@ -146,9 +152,9 @@ def add_attr(ds, attr, new, **fmt): def date_parser( - date, + date: Union[str, cftime.datetime, pd.Timestamp, datetime, pd.Period], *, - end_of_period: bool = False, + end_of_period: Union[bool, str] = False, out_dtype: str = "datetime", strtime_format: str = "%Y-%m-%d", freq: str = "H", @@ -159,12 +165,12 @@ def date_parser( ---------- date : str, cftime.datetime, pd.Timestamp, datetime.datetime, pd.Period Date to be converted - end_of_period : bool, str, optional + end_of_period : bool or str If 'Y' or 'M', the returned date will be the end of the year or month that contains the received date. 
If True, the period is inferred from the date's precision, but `date` must be a string, otherwise nothing is done. - out_dtype : str, optional + out_dtype : str Choices are 'datetime', 'period' or 'str' - strtime_format : str, optional + strtime_format : str If out_dtype=='str', this sets the strftime format freq : str If out_dtype=='period', this sets the frequency of the period. @@ -243,10 +249,10 @@ def _parse_date(date, fmts): if out_dtype == "str": return date.strftime(strtime_format) - - if out_dtype == "period": + elif out_dtype == "period": return date.to_period(freq) - return date + else: + return date def minimum_calendar(*calendars) -> str: @@ -294,7 +300,7 @@ def stack_drop_nans( mask: xr.DataArray, *, new_dim: str = "loc", - to_file: Optional[str] = None, + to_file: str = None, ) -> xr.Dataset: """Stack dimensions into a single axis and drops indexes where the mask is false. @@ -364,7 +370,10 @@ def stack_drop_nans( @parse_config def unstack_fill_nan( - ds: xr.Dataset, *, dim: str = "loc", coords: Optional[Sequence[str]] = None + ds: xr.Dataset, + *, + dim: str = "loc", + coords: Union[str, os.PathLike, Sequence[Union[str, os.PathLike]], dict] = None, ): """Unstack a Dataset that was stacked by :py:func:`stack_drop_nans`. @@ -467,7 +476,7 @@ def unstack_fill_nan( return out -def natural_sort(_list: list): +def natural_sort(_list: list[str]): """ For strings of numbers. alternative to sorted() that detects a more natural order. @@ -481,18 +490,18 @@ def natural_sort(_list: list): def get_cat_attrs( - ds: Union[xr.Dataset, dict], prefix: str = "cat:", var_as_str=False + ds: Union[xr.Dataset, xr.DataArray, dict], prefix: str = "cat:", var_as_str=False ) -> dict: """Return the catalog-specific attributes from a dataset or dictionary. Parameters ---------- - ds: xr.Dataset - Dataset to be parsed. + ds: xr.Dataset, dict + Dataset to be parsed. If a dictionary, it is assumed to be the attributes of the dataset (ds.attrs). 
prefix: str Prefix automatically generated by intake-esm. With xscen, this should be 'cat:' var_as_str: bool - If True, variable will be returned as a string if there is only one. + If True, 'variable' will be returned as a string if there is only one. Returns ------- @@ -525,8 +534,26 @@ def maybe_unstack( coords: str = None, rechunk: bool = None, stack_drop_nans: bool = False, -): - """If stack_drop_nans is True, unstack and rechunk.""" +) -> xr.Dataset: + """If stack_drop_nans is True, unstack and rechunk. + + Parameters + ---------- + ds : xr.Dataset + Dataset to unstack. + coords : str, optional + Path to a dataset containing the coords to unstack (and only those). + rechunk : bool, optional + If True, rechunk the dataset after unstacking. + stack_drop_nans : bool + If True, unstack the dataset and rechunk it. + If False, do nothing. + + Returns + ------- + xr.Dataset + Unstacked dataset. + """ if stack_drop_nans: ds = unstack_fill_nan(ds, coords=coords) if rechunk is not None: @@ -690,19 +717,19 @@ def change_units(ds: xr.Dataset, variables_and_units: dict) -> xr.Dataset: def clean_up( ds: xr.Dataset, *, - variables_and_units: Optional[dict] = None, - convert_calendar_kwargs: Optional[dict] = None, - missing_by_var: Optional[dict] = None, - maybe_unstack_dict: Optional[dict] = None, - round_var: Optional[dict] = None, - common_attrs_only: Union[dict, list] = None, + variables_and_units: dict = None, + convert_calendar_kwargs: dict = None, + missing_by_var: dict = None, + maybe_unstack_dict: dict = None, + round_var: dict = None, + common_attrs_only: Union[dict, list[Union[xr.Dataset, str, os.PathLike]]] = None, common_attrs_open_kwargs: dict = None, - attrs_to_remove: Optional[dict] = None, - remove_all_attrs_except: Optional[dict] = None, - add_attrs: Optional[dict] = None, - change_attr_prefix: Optional[str] = None, - to_level: Optional[str] = None, -): + attrs_to_remove: dict = None, + remove_all_attrs_except: dict = None, + add_attrs: dict = None, + 
change_attr_prefix: str = None, + to_level: str = None, +) -> xr.Dataset: """Clean up of the dataset. It can: @@ -720,27 +747,27 @@ def clean_up( ---------- ds : xr.Dataset Input dataset to clean up - variables_and_units : dict + variables_and_units : dict, optional Dictionary of variable to convert. eg. {'tasmax': 'degC', 'pr': 'mm d-1'} - convert_calendar_kwargs : dict + convert_calendar_kwargs : dict, optional Dictionary of arguments to feed to xclim.core.calendar.convert_calendar. This will be the same for all variables. If missing_by_vars is given, it will override the 'missing' argument given here. Eg. {target': default, 'align_on': 'random'} - missing_by_var : list + missing_by_var : dict, optional Dictionary where the keys are the variables and the values are the argument to feed the `missing` parameters of the xclim.core.calendar.convert_calendar for the given variable with the `convert_calendar_kwargs`. - If missing_by_var == 'interpolate', the missing will be filled with NaNs, then linearly interpolated over time. - maybe_unstack_dict : dict + When the value of an entry is 'interpolate', the missing values will be filled with NaNs, then linearly interpolated over time. + maybe_unstack_dict : dict, optional Dictionary to pass to xscen.common.maybe_unstack function. The format should be: {'coords': path_to_coord_file, 'rechunk': {'time': -1 }, 'stack_drop_nans': True}. - round_var : dict + round_var : dict, optional Dictionary where the keys are the variables of the dataset and the values are the number of decimal places to round to - common_attrs_only : dict, list + common_attrs_only : dict, list of datasets, or list of paths, optional Dictionnary of datasets or list of datasets, or path to NetCDF or Zarr files. Keeps only the global attributes that are the same for all datasets and generates a new id. - common_attrs_open_kwargs : dict + common_attrs_open_kwargs : dict, optional Dictionary of arguments for xarray.open_dataset(). 
Used with common_attrs_only if given paths. - attrs_to_remove : dict + attrs_to_remove : dict, optional Dictionary where the keys are the variables and the values are a list of the attrs that should be removed. For global attrs, use the key 'global'. The element of the list can be exact matches for the attributes name @@ -748,7 +775,7 @@ def clean_up( - ending with a '*' means checks if the substring is contained in the string - starting with a '^' means check if the string starts with the substring. eg. {'global': ['unnecessary note', 'cell*'], 'tasmax': 'old_name'} - remove_all_attrs_except : dict + remove_all_attrs_except : dict, optional Dictionary where the keys are the variables and the values are a list of the attrs that should NOT be removed, all other attributes will be deleted. If None (default), nothing will be deleted. For global attrs, use the key 'global'. @@ -757,13 +784,13 @@ def clean_up( - ending with a '*' means checks if the substring is contained in the string - starting with a '^' means check if the string starts with the substring. eg. {'global': ['necessary note', '^cat:'], 'tasmax': 'new_name'} - add_attrs : dict + add_attrs : dict, optional Dictionary where the keys are the variables and the values are a another dictionary of attributes. For global attrs, use the key 'global'. eg. {'global': {'title': 'amazing new dataset'}, 'tasmax': {'note': 'important info about tasmax'}} - change_attr_prefix : str + change_attr_prefix : str, optional Replace "cat:" in the catalog global attrs by this new string - to_level : str + to_level : str, optional The processing level to assign to the output. 
Returns @@ -839,7 +866,7 @@ def _search(a, b): common_attrs_only = list(common_attrs_only.values()) for i in range(len(common_attrs_only)): - if isinstance(common_attrs_only[i], (str, Path)): + if isinstance(common_attrs_only[i], (str, os.PathLike)): dataset = xr.open_dataset( common_attrs_only[i], **common_attrs_open_kwargs ) @@ -904,7 +931,7 @@ def _search(a, b): def publish_release_notes( - style: str = "md", file: Optional[Union[os.PathLike, StringIO, TextIO]] = None + style: str = "md", file: Union[os.PathLike, StringIO, TextIO] = None ) -> Optional[str]: """Format release history in Markdown or ReStructuredText. @@ -991,13 +1018,14 @@ def unstack_dates( Only supports monthly or coarser frequencies. The time axis must be complete and regular (`xr.infer_freq(ds.time)` doesn't fail). seasons: dict, optional - A dictionary from month number to a season name. + A dictionary from month number (as int) to a season name. If not given, it is guessed from the time coord's frequency. See notes. new_dim: str The name of the new dimension. winter_starts_year: bool - If True, the winter season (DJF) is associated with the year of January, instead of December. + If True, the year of winter (DJF) is built from the year of January, not December. + i.e. DJF made from [Dec 1980, Jan 1981, and Feb 1981] will be associated with the year 1981, not 1980. Returns ------- @@ -1133,8 +1161,8 @@ def reshape_da(da): def show_versions( - file: Optional[Union[os.PathLike, StringIO, TextIO]] = None, - deps: Optional[list] = None, + file: Union[os.PathLike, StringIO, TextIO] = None, + deps: list = None, ) -> Optional[str]: """Print the versions of xscen and its dependencies. 
@@ -1215,8 +1243,18 @@ def ensure_correct_time(ds: xr.Dataset, xrfreq: str) -> xr.Dataset: return ds -def standardize_periods(periods, multiple=True): - """Reformats 'periods' to a list of strings [['start', 'end'], ['start', 'end']].""" +def standardize_periods( + periods: Optional[Union[list[str], list[list[str]]]], multiple: bool = True +) -> Optional[Union[list[str], list[list[str]]]]: + """Reformats the input to a list of strings, ['start', 'end'], or a list of such lists. + + Parameters + ---------- + periods : list of str or list of lists of str, optional + The period(s) to standardize. If None, return None. + multiple : bool + If True, return a list of periods, otherwise return a single period. + """ if periods is None: return periods @@ -1280,7 +1318,7 @@ def season_sort_key(idx: pd.Index, name: str = None): return idx -def xrfreq_to_timedelta(freq): +def xrfreq_to_timedelta(freq: str): """Approximate the length of a period based on its frequency offset.""" N, B, _, _ = parse_offset(freq) return N * pd.Timedelta(CV.xrfreq_to_timedelta(B, "NaT")) From 1aeae785b5da0254a9804876e12fc7307258bfd7 Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Wed, 1 Nov 2023 17:11:08 -0400 Subject: [PATCH 11/22] upd HISTORY --- HISTORY.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/HISTORY.rst b/HISTORY.rst index 9281407b..b4db3526 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -31,6 +31,7 @@ Bug fixes * Fixed a bug in the documentation build configuration that prevented stable/latest and tagged documentation builds from resolving on ReadTheDocs. (:pull:`256`). * Fixed ``get_warming_level`` to avoid incomplete matches. (:pull:`269`). * `search_data_catalogs` now eliminates anything that matches any entry in `exclusions`. (:issue:`275`, :pull:`280`). +* Fixed a bug in ``xs.scripting.save_and_update`` where ``build_path_kwargs`` was ignored when trying to guess the file format. (:pull:`282`). 
Internal changes ^^^^^^^^^^^^^^^^ @@ -44,6 +45,7 @@ Internal changes * Added a new `xscen.testing` module with the `datablock_3d` function previously located in `/tests/conftest.py`. (:pull:`248`). * New function `xscen.testing.fake_data` to generate fake data for testing. (:pull:`248`). * xESMF 0.8 Regridder and SpatialAverager argument ``out_chunks`` is now accepted by ``xs.regrid_dataset`` and ``xs.spatial_mean``. (:pull:`260`). +* Multiple improvements to the docstrings and type annotations. (:pull:`282`). v0.7.1 (2023-08-23) ------------------- From afc054c4b62131117666b3727f14d13dc2828667 Mon Sep 17 00:00:00 2001 From: RondeauG <38501935+RondeauG@users.noreply.github.com> Date: Fri, 3 Nov 2023 09:26:06 -0400 Subject: [PATCH 12/22] Update xscen/biasadjust.py Co-authored-by: juliettelavoie <juliette.lavoie@hotmail.ca> --- xscen/biasadjust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/biasadjust.py b/xscen/biasadjust.py index 4c2413e4..23e766b1 100644 --- a/xscen/biasadjust.py +++ b/xscen/biasadjust.py @@ -1,4 +1,4 @@ -"""Functions to train and adjust a bias-adjustment algorithm.""" +"""Functions to train and adjust a dataset using a bias-adjustment algorithm.""" import logging from copy import deepcopy from typing import Optional, Union From e1b6af6207a7131345470830a172b4ec45417728 Mon Sep 17 00:00:00 2001 From: RondeauG <38501935+RondeauG@users.noreply.github.com> Date: Fri, 3 Nov 2023 09:26:34 -0400 Subject: [PATCH 13/22] Update xscen/catutils.py Co-authored-by: juliettelavoie <juliette.lavoie@hotmail.ca> --- xscen/catutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/catutils.py b/xscen/catutils.py index 56d471e5..9a85f344 100644 --- a/xscen/catutils.py +++ b/xscen/catutils.py @@ -456,7 +456,7 @@ def parse_directory( Parameters ---------- - directories : list of paths + directories : list of os.PathLike or list of str List of directories to parse. The parse is recursive. 
patterns : list of str List of possible patterns to be used by :py:func:`parse.parse` to decode the file names. See Notes below. From 0120ec67eb0d8b1bf4c6387ac390b5c7f1b82961 Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Mon, 6 Nov 2023 15:28:19 -0500 Subject: [PATCH 14/22] add parse_config --- xscen/diagnostics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xscen/diagnostics.py b/xscen/diagnostics.py index 53b0bed1..72b87de4 100644 --- a/xscen/diagnostics.py +++ b/xscen/diagnostics.py @@ -41,6 +41,7 @@ def _(s): return s +@parse_config def health_checks( ds: Union[xr.Dataset, xr.DataArray], *, From 57dc3ab0e9dde74d5739219dd7c80924affea0b2 Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Mon, 6 Nov 2023 15:28:41 -0500 Subject: [PATCH 15/22] remove ABC --- xscen/catalog.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/xscen/catalog.py b/xscen/catalog.py index b095afe6..5c4a17ed 100644 --- a/xscen/catalog.py +++ b/xscen/catalog.py @@ -6,7 +6,6 @@ import os import re import warnings -from abc import ABC from collections.abc import Mapping, Sequence from copy import deepcopy from functools import reduce @@ -136,7 +135,7 @@ def _parse_dates(elem): """Kwargs to pass to `pd.read_csv` when opening an official Ouranos catalog.""" -class DataCatalog(intake_esm.esm_datastore, ABC): +class DataCatalog(intake_esm.esm_datastore): r""" A read-only intake_esm catalog adapted to xscen's syntax. @@ -516,7 +515,7 @@ def preprocess(ds): return ds -class ProjectCatalog(DataCatalog, ABC): +class ProjectCatalog(DataCatalog): """A DataCatalog with additional 'write' functionalities that can update and upload itself. 
See Also From 3394439c54db4034820d10a7b7d4578a630699f4 Mon Sep 17 00:00:00 2001 From: RondeauG <38501935+RondeauG@users.noreply.github.com> Date: Mon, 6 Nov 2023 15:30:05 -0500 Subject: [PATCH 16/22] Update xscen/biasadjust.py Co-authored-by: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> --- xscen/biasadjust.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/biasadjust.py b/xscen/biasadjust.py index 23e766b1..37b7f60d 100644 --- a/xscen/biasadjust.py +++ b/xscen/biasadjust.py @@ -56,7 +56,7 @@ def _add_preprocessing_attr(scen, train_kwargs): def train( dref: xr.Dataset, dhist: xr.Dataset, - var: Union[str, list], + var: Union[str, list[str]], period: list[str], *, method: str = "DetrendedQuantileMapping", From a5191d31a359c80b65b28f9420dd67936fb40f1b Mon Sep 17 00:00:00 2001 From: RondeauG <38501935+RondeauG@users.noreply.github.com> Date: Mon, 6 Nov 2023 15:33:20 -0500 Subject: [PATCH 17/22] Update xscen/catalog.py Co-authored-by: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> --- xscen/catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/catalog.py b/xscen/catalog.py index b095afe6..377f0279 100644 --- a/xscen/catalog.py +++ b/xscen/catalog.py @@ -261,7 +261,7 @@ def unique(self, columns: Union[str, Sequence[str]] = None): Parameters ---------- - columns : str or list of str, optional + columns : str or sequence of str, optional The columns to get unique values from. If None, all columns are used. 
""" if self.df.size == 0: From 6c9e45ad8c294e7291f5542186f82258e5c04fc8 Mon Sep 17 00:00:00 2001 From: RondeauG <38501935+RondeauG@users.noreply.github.com> Date: Mon, 6 Nov 2023 15:33:41 -0500 Subject: [PATCH 18/22] Update xscen/catalog.py Co-authored-by: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> --- xscen/catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/catalog.py b/xscen/catalog.py index 377f0279..bab8ca0d 100644 --- a/xscen/catalog.py +++ b/xscen/catalog.py @@ -302,7 +302,7 @@ def search(self, **columns): ) return cat - def drop_duplicates(self, columns: list[str] = None): + def drop_duplicates(self, columns: Optional[list[str]] = None): """Drop duplicates in the catalog based on a subset of columns. Parameters From e69a4261351b5e04bf9e36d904cf537cafc2c64c Mon Sep 17 00:00:00 2001 From: RondeauG <38501935+RondeauG@users.noreply.github.com> Date: Mon, 6 Nov 2023 15:33:50 -0500 Subject: [PATCH 19/22] Update xscen/catalog.py Co-authored-by: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> --- xscen/catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/catalog.py b/xscen/catalog.py index bab8ca0d..21d7037c 100644 --- a/xscen/catalog.py +++ b/xscen/catalog.py @@ -398,7 +398,7 @@ def exists_in_cat(self, **columns) -> bool: def to_dataset( self, concat_on: Union[list[str], str] = None, - create_ensemble_on: Union[list[str], str] = None, + create_ensemble_on: Optional[Union[list[str], str]] = None, calendar: Optional[str] = "standard", **kwargs, ) -> xr.Dataset: From 906f44b0f2597f16d17319d2520ff050aefc2af0 Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Mon, 6 Nov 2023 15:55:33 -0500 Subject: [PATCH 20/22] add Optional --- xscen/aggregate.py | 26 ++++++++++++------------ xscen/biasadjust.py | 18 ++++++++--------- xscen/catalog.py | 34 ++++++++++++++++--------------- xscen/catutils.py | 38 +++++++++++++++++------------------ 
xscen/diagnostics.py | 32 ++++++++++++++--------------- xscen/ensembles.py | 10 ++++----- xscen/extract.py | 46 +++++++++++++++++++++--------------------- xscen/indicators.py | 4 ++-- xscen/io.py | 36 +++++++++++++++++---------------- xscen/reduce.py | 4 ++-- xscen/regrid.py | 6 +++--- xscen/scripting.py | 26 +++++++++++++----------- xscen/spatial.py | 9 ++++++--- xscen/testing.py | 4 ++-- xscen/utils.py | 48 ++++++++++++++++++++++++-------------------- 15 files changed, 177 insertions(+), 164 deletions(-) diff --git a/xscen/aggregate.py b/xscen/aggregate.py index 5f0c596a..3d872fd0 100644 --- a/xscen/aggregate.py +++ b/xscen/aggregate.py @@ -44,10 +44,10 @@ def _(s): def climatological_mean( ds: xr.Dataset, *, - window: int = None, - min_periods: int = None, + window: Optional[int] = None, + min_periods: Optional[int] = None, interval: int = 1, - periods: Union[list[str], list[list[str]]] = None, + periods: Optional[Union[list[str], list[list[str]]]] = None, to_level: Optional[str] = "climatology", ) -> xr.Dataset: """Compute the mean over 'year' for given time periods, respecting the temporal resolution of ds. @@ -372,13 +372,13 @@ def spatial_mean( ds: xr.Dataset, method: str, *, - spatial_subset: bool = None, - call_clisops: bool = False, - region: Union[dict, str] = None, - kwargs: dict = None, - simplify_tolerance: float = None, - to_domain: str = None, - to_level: str = None, + spatial_subset: Optional[bool] = None, + call_clisops: Optional[bool] = False, + region: Optional[Union[dict, str]] = None, + kwargs: Optional[dict] = None, + simplify_tolerance: Optional[float] = None, + to_domain: Optional[str] = None, + to_level: Optional[str] = None, ) -> xr.Dataset: """Compute the spatial mean using a variety of available methods. 
@@ -703,10 +703,10 @@ def produce_horizon( ModuleType, ], *, - periods: Union[list[str], list[list[str]]] = None, - warminglevels: dict = None, + periods: Optional[Union[list[str], list[list[str]]]] = None, + warminglevels: Optional[dict] = None, to_level: Optional[str] = "horizons", - period: list = None, + period: Optional[list] = None, ) -> xr.Dataset: """Compute indicators, then the climatological mean, and finally unstack dates in order to have a single dataset with all indicators of different frequencies. diff --git a/xscen/biasadjust.py b/xscen/biasadjust.py index 37b7f60d..77e2eeae 100644 --- a/xscen/biasadjust.py +++ b/xscen/biasadjust.py @@ -60,12 +60,12 @@ def train( period: list[str], *, method: str = "DetrendedQuantileMapping", - group: Union[sdba.Grouper, str, dict] = None, - xclim_train_args: dict = None, + group: Optional[Union[sdba.Grouper, str, dict]] = None, + xclim_train_args: Optional[dict] = None, maximal_calendar: str = "noleap", - adapt_freq: dict = None, - jitter_under: dict = None, - jitter_over: dict = None, + adapt_freq: Optional[dict] = None, + jitter_under: Optional[dict] = None, + jitter_over: Optional[dict] = None, align_on: Optional[str] = "year", ) -> xr.Dataset: """ @@ -191,11 +191,11 @@ def adjust( dsim: xr.Dataset, periods: Union[list[str], list[list[str]]], *, - xclim_adjust_args: dict = None, + xclim_adjust_args: Optional[dict] = None, to_level: str = "biasadjusted", - bias_adjust_institution: str = None, - bias_adjust_project: str = None, - moving_yearly_window: dict = None, + bias_adjust_institution: Optional[str] = None, + bias_adjust_project: Optional[str] = None, + moving_yearly_window: Optional[dict] = None, align_on: Optional[str] = "year", ) -> xr.Dataset: """ diff --git a/xscen/catalog.py b/xscen/catalog.py index 2c81a911..4eb077ad 100644 --- a/xscen/catalog.py +++ b/xscen/catalog.py @@ -191,9 +191,9 @@ def __init__( def from_df( cls, data: Union[pd.DataFrame, os.PathLike, Sequence[os.PathLike]], - esmdata: 
Union[os.PathLike, dict] = None, + esmdata: Optional[Union[os.PathLike, dict]] = None, *, - read_csv_kwargs: Mapping[str, Any] = None, + read_csv_kwargs: Optional[Mapping[str, Any]] = None, name: str = "virtual", **intake_kwargs, ): @@ -255,7 +255,7 @@ def _find_unique(series): else: return data.apply(_find_unique, result_type="reduce").to_dict() - def unique(self, columns: Union[str, Sequence[str]] = None): + def unique(self, columns: Optional[Union[str, Sequence[str]]] = None): """Return a series of unique values in the catalog. Parameters @@ -396,7 +396,7 @@ def exists_in_cat(self, **columns) -> bool: def to_dataset( self, - concat_on: Union[list[str], str] = None, + concat_on: Optional[Union[list[str], str]] = None, create_ensemble_on: Optional[Union[list[str], str]] = None, calendar: Optional[str] = "standard", **kwargs, @@ -528,7 +528,7 @@ def create( cls, filename: Union[os.PathLike, str], *, - project: dict = None, + project: Optional[dict] = None, overwrite: bool = False, ): r"""Create a new project catalog from some project metadata. @@ -609,8 +609,8 @@ def __init__( df: Union[str, dict], *args, create: bool = False, - overwrite: bool = None, - project: dict = None, + overwrite: bool = False, + project: Optional[dict] = None, **kwargs, ): """Open or create a project catalog. @@ -622,7 +622,7 @@ def __init__( If dict, this must be a dict representation of an ESM catalog. See the notes below. create : bool If True, and if 'df' is a string, this will create an empty ProjectCatalog if none already exists. - overwrite : bool, optional + overwrite : bool If this and 'create' are True, this will overwrite any existing JSON and CSV file with an empty catalog. project : dict, optional Metadata to create the catalog, if required. 
@@ -647,12 +647,14 @@ def __init__( # TODO: Implement a way to easily destroy part of the catalog to "reset" some steps def update( self, - df: Union[ - DataCatalog, - intake_esm.esm_datastore, - pd.DataFrame, - pd.Series, - Sequence[pd.Series], + df: Optional[ + Union[ + DataCatalog, + intake_esm.esm_datastore, + pd.DataFrame, + pd.Series, + Sequence[pd.Series], + ] ] = None, ): """Update the catalog with new data and writes the new data to the csv file. @@ -736,7 +738,7 @@ def update_from_ds( self, ds: xarray.Dataset, path: Union[os.PathLike, str], - info_dict: dict = None, + info_dict: Optional[dict] = None, **info_kwargs, ): """Update the catalog with new data and writes the new data to the csv file. @@ -852,7 +854,7 @@ def _build_id(element: pd.Series, columns: list[str]): def generate_id( - df: Union[pd.DataFrame, xr.Dataset], id_columns: list = None + df: Union[pd.DataFrame, xr.Dataset], id_columns: Optional[list] = None ) -> pd.Series: """Create an ID from column entries. diff --git a/xscen/catutils.py b/xscen/catutils.py index 9a85f344..ae04e61e 100644 --- a/xscen/catutils.py +++ b/xscen/catutils.py @@ -13,7 +13,7 @@ from functools import partial, reduce from multiprocessing import Pool from pathlib import Path -from typing import Any, Union +from typing import Any, Optional, Union import cftime import netCDF4 @@ -109,7 +109,7 @@ def _find_assets( root: Union[str, os.PathLike], exts: set[str], lengths: set[int], - dirglob: str = None, + dirglob: Optional[str] = None, ): """Walk recursively over files in a directory, filtering according to a glob pattern, path depth and extensions. 
@@ -180,9 +180,9 @@ def _name_parser( path: Union[os.PathLike, str], root: Union[os.PathLike, str], patterns: list[Union[str, parse.Parser]], - read_from_file: Union[list[str], dict] = None, - attrs_map: dict = None, - xr_open_kwargs: dict = None, + read_from_file: Optional[Union[list[str], dict]] = None, + attrs_map: Optional[dict] = None, + xr_open_kwargs: Optional[dict] = None, ) -> Union[dict, None]: """Extract metadata information from the file path. @@ -255,11 +255,11 @@ def _name_parser( def _parse_dir( root: Union[os.PathLike, str], patterns: list[str], - dirglob: str = None, - checks: list[str] = None, - read_from_file: Union[list[str], dict] = None, - attrs_map: dict = None, - xr_open_kwargs: dict = None, + dirglob: Optional[str] = None, + checks: Optional[list[str]] = None, + read_from_file: Optional[Union[list[str], dict]] = None, + attrs_map: Optional[dict] = None, + xr_open_kwargs: Optional[dict] = None, progress: bool = False, ): """Iterate and parses files in a directory, filtering according to basic pattern properties and optional checks. @@ -436,21 +436,21 @@ def parse_directory( directories: list[Union[str, os.PathLike]], patterns: list[str], *, - id_columns: list[str] = None, + id_columns: Optional[list[str]] = None, read_from_file: Union[ bool, Sequence[str], tuple[Sequence[str], Sequence[str]], Sequence[tuple[Sequence[str], Sequence[str]]], ] = False, - homogenous_info: dict = None, - cvs: Union[str, os.PathLike, dict] = None, - dirglob: str = None, - xr_open_kwargs: Mapping[str, Any] = None, + homogenous_info: Optional[dict] = None, + cvs: Optional[Union[str, os.PathLike, dict]] = None, + dirglob: Optional[str] = None, + xr_open_kwargs: Optional[Mapping[str, Any]] = None, only_official_columns: bool = True, progress: bool = False, parallel_dirs: Union[bool, int] = False, - file_checks: list[str] = None, + file_checks: Optional[list[str]] = None, ) -> pd.DataFrame: r"""Parse files in a directory and return them as a pd.DataFrame. 
@@ -698,7 +698,7 @@ def parse_directory( def parse_from_ds( obj: Union[str, os.PathLike, xr.Dataset], names: Sequence[str], - attrs_map: Mapping[str, str] = None, + attrs_map: Optional[Mapping[str, str]] = None, **xrkwargs, ): """Parse a list of catalog fields from the file/dataset itself. @@ -1094,8 +1094,8 @@ def _build_path( @parse_config def build_path( data: Union[dict, xr.Dataset, xr.DataArray, pd.Series, DataCatalog, pd.DataFrame], - schemas: Union[str, os.PathLike, dict] = None, - root: Union[str, os.PathLike] = None, + schemas: Optional[Union[str, os.PathLike, dict]] = None, + root: Optional[Union[str, os.PathLike]] = None, **extra_facets, ) -> Union[Path, DataCatalog, pd.DataFrame]: r"""Parse the schema from a configuration and construct path using a dictionary of facets. diff --git a/xscen/diagnostics.py b/xscen/diagnostics.py index 72b87de4..b64042f6 100644 --- a/xscen/diagnostics.py +++ b/xscen/diagnostics.py @@ -6,7 +6,7 @@ from copy import deepcopy from pathlib import Path from types import ModuleType -from typing import Union +from typing import Optional, Union import numpy as np import xarray as xr @@ -45,18 +45,18 @@ def _(s): def health_checks( ds: Union[xr.Dataset, xr.DataArray], *, - structure: dict = None, - calendar: str = None, - start_date: str = None, - end_date: str = None, - variables_and_units: dict = None, - cfchecks: dict = None, - freq: str = None, - missing: Union[dict, str, list] = None, - flags: dict = None, - flags_kwargs: dict = None, + structure: Optional[dict] = None, + calendar: Optional[str] = None, + start_date: Optional[str] = None, + end_date: Optional[str] = None, + variables_and_units: Optional[dict] = None, + cfchecks: Optional[dict] = None, + freq: Optional[str] = None, + missing: Optional[Union[dict, str, list]] = None, + flags: Optional[dict] = None, + flags_kwargs: Optional[dict] = None, return_flags: bool = False, - raise_on: list = None, + raise_on: Optional[list] = None, ) -> Union[None, xr.Dataset]: """ Perform 
a series of health checks on the dataset. Be aware that missing data checks and flag checks can be slow. @@ -299,11 +299,11 @@ def properties_and_measures( Sequence[tuple[str, Indicator]], ModuleType, ], - period: list[str] = None, + period: Optional[list[str]] = None, unstack: bool = False, - rechunk: dict = None, - dref_for_measure: xr.Dataset = None, - change_units_arg: dict = None, + rechunk: Optional[dict] = None, + dref_for_measure: Optional[xr.Dataset] = None, + change_units_arg: Optional[dict] = None, to_level_prop: str = "diag-properties", to_level_meas: str = "diag-measures", ) -> tuple[xr.Dataset, xr.Dataset]: diff --git a/xscen/ensembles.py b/xscen/ensembles.py index aadc74dd..d72944c7 100644 --- a/xscen/ensembles.py +++ b/xscen/ensembles.py @@ -6,7 +6,7 @@ from copy import deepcopy from itertools import chain, groupby from pathlib import Path -from typing import Union +from typing import Optional, Union import numpy as np import xarray as xr @@ -27,8 +27,8 @@ def ensemble_stats( ], statistics: dict, *, - create_kwargs: dict = None, - weights: xr.DataArray = None, + create_kwargs: Optional[dict] = None, + weights: Optional[xr.DataArray] = None, common_attrs_only: bool = True, to_level: str = "ensemble", ) -> xr.Dataset: @@ -146,9 +146,9 @@ def generate_weights( *, independence_level: str = "model", balance_experiments: bool = False, - attribute_weights: dict = None, + attribute_weights: Optional[dict] = None, skipna: bool = True, - v_for_skipna: str = None, + v_for_skipna: Optional[str] = None, standardize: bool = False, experiment_weights: bool = False, ) -> xr.DataArray: diff --git a/xscen/extract.py b/xscen/extract.py index 1f7ae3b5..1ef34e62 100644 --- a/xscen/extract.py +++ b/xscen/extract.py @@ -82,14 +82,14 @@ def clisops_subset(ds: xr.Dataset, region: dict) -> xr.Dataset: def extract_dataset( catalog: DataCatalog, *, - variables_and_freqs: dict = None, - periods: Union[list[str], list[list[str]]] = None, - region: dict = None, + 
variables_and_freqs: Optional[dict] = None, + periods: Optional[Union[list[str], list[list[str]]]] = None, + region: Optional[dict] = None, to_level: str = "extracted", ensure_correct_time: bool = True, - xr_open_kwargs: dict = None, - xr_combine_kwargs: dict = None, - preprocess: Callable = None, + xr_open_kwargs: Optional[dict] = None, + xr_combine_kwargs: Optional[dict] = None, + preprocess: Optional[Callable] = None, resample_methods: Optional[dict] = None, mask: Union[bool, xr.Dataset, xr.DataArray] = False, ) -> dict: @@ -366,9 +366,9 @@ def resample( da: xr.DataArray, target_frequency: str, *, - ds: xr.Dataset = None, - method: str = None, - missing: Union[str, dict] = None, + ds: Optional[xr.Dataset] = None, + method: Optional[str] = None, + missing: Optional[Union[str, dict]] = None, ) -> xr.DataArray: """Aggregate variable to the target frequency. @@ -589,18 +589,18 @@ def search_data_catalogs( ], variables_and_freqs: dict, *, - other_search_criteria: dict = None, - exclusions: dict = None, + other_search_criteria: Optional[dict] = None, + exclusions: Optional[dict] = None, match_hist_and_fut: bool = False, - periods: Union[list[str], list[list[str]]] = None, - coverage_kwargs: dict = None, - id_columns: list[str] = None, + periods: Optional[Union[list[str], list[list[str]]]] = None, + coverage_kwargs: Optional[dict] = None, + id_columns: Optional[list[str]] = None, allow_resampling: bool = False, allow_conversion: bool = False, - conversion_yaml: str = None, - restrict_resolution: str = None, - restrict_members: dict = None, - restrict_warming_level: Union[dict, bool] = None, + conversion_yaml: Optional[str] = None, + restrict_resolution: Optional[str] = None, + restrict_members: Optional[dict] = None, + restrict_warming_level: Optional[Union[dict, bool]] = None, ) -> dict: """Search through DataCatalogs. 
@@ -920,9 +920,9 @@ def get_warming_level( wl: float, *, window: int = 20, - tas_baseline_period: list[str] = None, + tas_baseline_period: Optional[list[str]] = None, ignore_member: bool = False, - tas_csv: str = None, + tas_csv: Optional[str] = None, return_horizon: bool = True, ) -> Union[dict, list[str], str]: """Use the IPCC Atlas method to return the window of time over which the requested level of global warming is first reached. @@ -1181,7 +1181,7 @@ def subset_warming_level( def _dispatch_historical_to_future( - catalog: DataCatalog, id_columns: list[str] = None + catalog: DataCatalog, id_columns: Optional[list[str]] = None ) -> DataCatalog: """Update a DataCatalog by recopying each "historical" entry to its corresponding future experiments. @@ -1287,7 +1287,7 @@ def _dispatch_historical_to_future( def _restrict_by_resolution( - catalogs: dict, restrictions: str, id_columns: list[str] = None + catalogs: dict, restrictions: str, id_columns: Optional[list[str]] = None ) -> dict: """Update the results from search_data_catalogs by removing simulations with multiple resolutions available. @@ -1427,7 +1427,7 @@ def _restrict_by_resolution( def _restrict_multimembers( - catalogs: dict, restrictions: dict, id_columns: list[str] = None + catalogs: dict, restrictions: dict, id_columns: Optional[list[str]] = None ): """Update the results from search_data_catalogs by removing simulations with multiple members available. 
diff --git a/xscen/indicators.py b/xscen/indicators.py index 4a1a5869..bab6d28d 100644 --- a/xscen/indicators.py +++ b/xscen/indicators.py @@ -73,7 +73,7 @@ def compute_indicators( ModuleType, ], *, - periods: Union[list[str], list[list[str]]] = None, + periods: Optional[Union[list[str], list[list[str]]]] = None, restrict_years: bool = True, to_level: Optional[str] = "indicators", ) -> dict: @@ -241,7 +241,7 @@ def _infer_freq_from_meta(ind): def registry_from_module( module: ModuleType, - registry: DerivedVariableRegistry = None, + registry: Optional[DerivedVariableRegistry] = None, variable_column: str = "variable", ) -> DerivedVariableRegistry: """Convert a xclim virtual indicators module to an intake_esm Derived Variable Registry. diff --git a/xscen/io.py b/xscen/io.py index eb2af847..32bfa0f8 100644 --- a/xscen/io.py +++ b/xscen/io.py @@ -7,7 +7,7 @@ from collections.abc import Sequence from inspect import signature from pathlib import Path -from typing import Union +from typing import Optional, Union import h5py import netCDF4 @@ -350,10 +350,10 @@ def save_to_netcdf( ds: xr.Dataset, filename: Union[str, os.PathLike], *, - rechunk: dict = None, + rechunk: Optional[dict] = None, bitround: Union[bool, int, dict] = False, compute: bool = True, - netcdf_kwargs: dict = None, + netcdf_kwargs: Optional[dict] = None, ): """Save a Dataset to NetCDF, rechunking or compressing if requested. 
@@ -413,10 +413,10 @@ def save_to_zarr( ds: xr.Dataset, filename: Union[str, os.PathLike], *, - rechunk: dict = None, - zarr_kwargs: dict = None, + rechunk: Optional[dict] = None, + zarr_kwargs: Optional[dict] = None, compute: bool = True, - encoding: dict = None, + encoding: Optional[dict] = None, bitround: Union[bool, int, dict] = False, mode: str = "f", itervar: bool = False, @@ -620,9 +620,9 @@ def _to_dataframe( def to_table( ds: Union[xr.Dataset, xr.DataArray], *, - row: Union[str, Sequence[str]] = None, - column: Union[str, Sequence[str]] = None, - sheet: Union[str, Sequence[str]] = None, + row: Optional[Union[str, Sequence[str]]] = None, + column: Optional[Union[str, Sequence[str]]] = None, + sheet: Optional[Union[str, Sequence[str]]] = None, coords: Union[bool, str, Sequence[str]] = True, ) -> Union[pd.DataFrame, dict]: """Convert a dataset to a pandas DataFrame with support for multicolumns and multisheet. @@ -717,7 +717,9 @@ def _ensure_list(seq): return _to_dataframe(da, **table_kwargs) -def make_toc(ds: Union[xr.Dataset, xr.DataArray], loc: str = None) -> pd.DataFrame: +def make_toc( + ds: Union[xr.Dataset, xr.DataArray], loc: Optional[str] = None +) -> pd.DataFrame: """Make a table of content describing a dataset's variables. This return a simple DataFrame with variable names as index, the long_name as "description" and units. 
@@ -765,14 +767,14 @@ def make_toc(ds: Union[xr.Dataset, xr.DataArray], loc: str = None) -> pd.DataFra def save_to_table( ds: Union[xr.Dataset, xr.DataArray], filename: Union[str, os.PathLike], - output_format: str = None, + output_format: Optional[str] = None, *, - row: Union[str, Sequence[str]] = None, + row: Optional[Union[str, Sequence[str]]] = None, column: Union[None, str, Sequence[str]] = "variable", - sheet: Union[str, Sequence[str]] = None, + sheet: Optional[Union[str, Sequence[str]]] = None, coords: Union[bool, Sequence[str]] = True, col_sep: str = "_", - row_sep: str = None, + row_sep: Optional[str] = None, add_toc: Union[bool, pd.DataFrame] = False, **kwargs, ): @@ -910,10 +912,10 @@ def rechunk( path_in: Union[os.PathLike, str, xr.Dataset], path_out: Union[os.PathLike, str], *, - chunks_over_var: dict = None, - chunks_over_dim: dict = None, + chunks_over_var: Optional[dict] = None, + chunks_over_dim: Optional[dict] = None, worker_mem: str, - temp_store: Union[os.PathLike, str] = None, + temp_store: Optional[Union[os.PathLike, str]] = None, overwrite: bool = False, ) -> None: """Rechunk a dataset into a new zarr. diff --git a/xscen/reduce.py b/xscen/reduce.py index d0fa7471..b45b77d0 100644 --- a/xscen/reduce.py +++ b/xscen/reduce.py @@ -17,8 +17,8 @@ def build_reduction_data( datasets: Union[dict, list[xr.Dataset]], *, - xrfreqs: list[str] = None, - horizons: list[str] = None, + xrfreqs: Optional[list[str]] = None, + horizons: Optional[list[str]] = None, ) -> xr.DataArray: """Construct the input required for ensemble reduction. 
diff --git a/xscen/regrid.py b/xscen/regrid.py index 538c9609..6ed5dd05 100644 --- a/xscen/regrid.py +++ b/xscen/regrid.py @@ -1,4 +1,4 @@ -"""Regrid datasets using xESMF.""" +"""Functions to regrid datasets.""" import datetime import operator import os @@ -28,8 +28,8 @@ def regrid_dataset( ds_grid: xr.Dataset, weights_location: Union[str, os.PathLike], *, - regridder_kwargs: dict = None, - intermediate_grids: dict = None, + regridder_kwargs: Optional[dict] = None, + intermediate_grids: Optional[dict] = None, to_level: str = "regridded", ) -> xr.Dataset: """Regrid a dataset according to weights and a reference grid. diff --git a/xscen/scripting.py b/xscen/scripting.py index 78dad543..6e264ec2 100644 --- a/xscen/scripting.py +++ b/xscen/scripting.py @@ -42,7 +42,7 @@ def send_mail( *, subject: str, msg: str, - to: str = None, + to: Optional[str] = None, server: str = "127.0.0.1", port: int = 25, attachments: Optional[ @@ -159,9 +159,9 @@ def err_handler(self, *exc_info): @parse_config def send_mail_on_exit( *, - subject: str = None, - msg_ok: str = None, - msg_err: str = None, + subject: Optional[str] = None, + msg_ok: Optional[str] = None, + msg_err: Optional[str] = None, on_error_only: bool = False, skip_ctrlc: bool = True, **mail_kwargs, @@ -243,7 +243,7 @@ class measure_time: def __init__( self, - name: str = None, + name: Optional[str] = None, cpu: bool = False, logger: logging.Logger = logger, ): @@ -308,7 +308,9 @@ def _timeout_handler(signum, frame): @contextmanager -def skippable(seconds: int = 2, task: str = "", logger: logging.Logger = None): +def skippable( + seconds: int = 2, task: str = "", logger: Optional[logging.Logger] = None +): """Skippable context manager. 
When CTRL-C (SIGINT, KeyboardInterrupt) is sent within the context, @@ -350,11 +352,11 @@ def skippable(seconds: int = 2, task: str = "", logger: logging.Logger = None): def save_and_update( ds: xr.Dataset, pcat: ProjectCatalog, - path: Union[str, os.PathLike] = None, - file_format: str = None, - build_path_kwargs: dict = None, - save_kwargs: dict = None, - update_kwargs: dict = None, + path: Optional[Union[str, os.PathLike]] = None, + file_format: Optional[str] = None, + build_path_kwargs: Optional[dict] = None, + save_kwargs: Optional[dict] = None, + update_kwargs: Optional[dict] = None, ): """ Construct the path, save and delete. @@ -427,7 +429,7 @@ def save_and_update( def move_and_delete( moving: list[list[Union[str, os.PathLike]]], pcat: ProjectCatalog, - deleting: list[Union[str, os.PathLike]] = None, + deleting: Optional[list[Union[str, os.PathLike]]] = None, copy: bool = False, ): """ diff --git a/xscen/spatial.py b/xscen/spatial.py index ab48dde7..3618c546 100644 --- a/xscen/spatial.py +++ b/xscen/spatial.py @@ -4,6 +4,7 @@ import warnings from copy import deepcopy from pathlib import Path +from typing import Optional import clisops.core.subset import dask @@ -125,10 +126,12 @@ def _dot(arr, wei): def subset( ds: xr.Dataset, - region: dict = None, + region: Optional[dict] = None, *, - name: str = None, - method: str = None, # FIXME: Once the region argument is removed, this should be made mandatory. + name: Optional[str] = None, + method: Optional[ + str + ] = None, # FIXME: Once the region argument is removed, this should be made mandatory. 
tile_buffer: float = 0, **kwargs, ) -> xr.Dataset: diff --git a/xscen/testing.py b/xscen/testing.py index bb478dff..b59479fe 100644 --- a/xscen/testing.py +++ b/xscen/testing.py @@ -1,5 +1,5 @@ """Testing utilities for xscen.""" -from typing import Union +from typing import Optional, Union import numpy as np import pandas as pd @@ -20,7 +20,7 @@ def datablock_3d( y_step: float = 0.1, start: str = "7/1/2000", freq: str = "D", - units: str = None, + units: Optional[str] = None, as_dataset: bool = False, ) -> Union[xr.DataArray, xr.Dataset]: """Create a generic timeseries object based on pre-defined dictionaries of existing variables. diff --git a/xscen/utils.py b/xscen/utils.py index 7603f874..011cef6d 100644 --- a/xscen/utils.py +++ b/xscen/utils.py @@ -71,7 +71,7 @@ def update_attr( ds: Union[xr.Dataset, xr.DataArray], attr: str, new: str, - others: Sequence[Union[xr.Dataset, xr.DataArray]] = None, + others: Optional[Sequence[Union[xr.Dataset, xr.DataArray]]] = None, **fmt, ) -> Union[xr.Dataset, xr.DataArray]: """Format an attribute referencing itself in a translatable way. @@ -300,7 +300,7 @@ def stack_drop_nans( mask: xr.DataArray, *, new_dim: str = "loc", - to_file: str = None, + to_file: Optional[str] = None, ) -> xr.Dataset: """Stack dimensions into a single axis and drops indexes where the mask is false. @@ -373,7 +373,9 @@ def unstack_fill_nan( ds: xr.Dataset, *, dim: str = "loc", - coords: Union[str, os.PathLike, Sequence[Union[str, os.PathLike]], dict] = None, + coords: Optional[ + Union[str, os.PathLike, Sequence[Union[str, os.PathLike]], dict] + ] = None, ): """Unstack a Dataset that was stacked by :py:func:`stack_drop_nans`. @@ -531,8 +533,8 @@ def get_cat_attrs( @parse_config def maybe_unstack( ds: xr.Dataset, - coords: str = None, - rechunk: bool = None, + coords: Optional[str] = None, + rechunk: Optional[bool] = None, stack_drop_nans: bool = False, ) -> xr.Dataset: """If stack_drop_nans is True, unstack and rechunk. 
@@ -717,18 +719,20 @@ def change_units(ds: xr.Dataset, variables_and_units: dict) -> xr.Dataset: def clean_up( ds: xr.Dataset, *, - variables_and_units: dict = None, - convert_calendar_kwargs: dict = None, - missing_by_var: dict = None, - maybe_unstack_dict: dict = None, - round_var: dict = None, - common_attrs_only: Union[dict, list[Union[xr.Dataset, str, os.PathLike]]] = None, - common_attrs_open_kwargs: dict = None, - attrs_to_remove: dict = None, - remove_all_attrs_except: dict = None, - add_attrs: dict = None, - change_attr_prefix: str = None, - to_level: str = None, + variables_and_units: Optional[dict] = None, + convert_calendar_kwargs: Optional[dict] = None, + missing_by_var: Optional[dict] = None, + maybe_unstack_dict: Optional[dict] = None, + round_var: Optional[dict] = None, + common_attrs_only: Optional[ + Union[dict, list[Union[xr.Dataset, str, os.PathLike]]] + ] = None, + common_attrs_open_kwargs: Optional[dict] = None, + attrs_to_remove: Optional[dict] = None, + remove_all_attrs_except: Optional[dict] = None, + add_attrs: Optional[dict] = None, + change_attr_prefix: Optional[str] = None, + to_level: Optional[str] = None, ) -> xr.Dataset: """Clean up of the dataset. @@ -931,7 +935,7 @@ def _search(a, b): def publish_release_notes( - style: str = "md", file: Union[os.PathLike, StringIO, TextIO] = None + style: str = "md", file: Optional[Union[os.PathLike, StringIO, TextIO]] = None ) -> Optional[str]: """Format release history in Markdown or ReStructuredText. 
@@ -1005,7 +1009,7 @@ def publish_release_notes( def unstack_dates( ds: xr.Dataset, - seasons: dict[int, str] = None, + seasons: Optional[dict[int, str]] = None, new_dim: str = "season", winter_starts_year: bool = False, ): @@ -1161,8 +1165,8 @@ def reshape_da(da): def show_versions( - file: Union[os.PathLike, StringIO, TextIO] = None, - deps: list = None, + file: Optional[Union[os.PathLike, StringIO, TextIO]] = None, + deps: Optional[list] = None, ) -> Optional[str]: """Print the versions of xscen and its dependencies. @@ -1282,7 +1286,7 @@ def standardize_periods( return periods[0] -def season_sort_key(idx: pd.Index, name: str = None): +def season_sort_key(idx: pd.Index, name: Optional[str] = None): """Get a proper sort key for a "season" or "month" index to avoid alphabetical sorting. If any of the values in the index is not recognized as a 3-letter From fa7a4c60360cc87aa14487c9f6cc219ad36cc80c Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Mon, 6 Nov 2023 16:40:24 -0500 Subject: [PATCH 21/22] fixes from suggestions --- xscen/catutils.py | 2 +- xscen/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xscen/catutils.py b/xscen/catutils.py index ae04e61e..c3908262 100644 --- a/xscen/catutils.py +++ b/xscen/catutils.py @@ -183,7 +183,7 @@ def _name_parser( read_from_file: Optional[Union[list[str], dict]] = None, attrs_map: Optional[dict] = None, xr_open_kwargs: Optional[dict] = None, -) -> Union[dict, None]: +) -> Optional[dict]: """Extract metadata information from the file path. Parameters diff --git a/xscen/utils.py b/xscen/utils.py index 011cef6d..a926001e 100644 --- a/xscen/utils.py +++ b/xscen/utils.py @@ -534,7 +534,7 @@ def get_cat_attrs( def maybe_unstack( ds: xr.Dataset, coords: Optional[str] = None, - rechunk: Optional[bool] = None, + rechunk: Optional[dict] = None, stack_drop_nans: bool = False, ) -> xr.Dataset: """If stack_drop_nans is True, unstack and rechunk. 
@@ -545,7 +545,7 @@ def maybe_unstack( Dataset to unstack. coords : str, optional Path to a dataset containing the coords to unstack (and only those). - rechunk : bool, optional + rechunk : dict, optional If True, rechunk the dataset after unstacking. stack_drop_nans : bool If True, unstack the dataset and rechunk it. From 31cd3e259f5a0fc9a2ecb15b24ecaaafb734f5c7 Mon Sep 17 00:00:00 2001 From: RondeauG <rondeau-genesse.gabriel@ouranos.ca> Date: Mon, 6 Nov 2023 16:43:15 -0500 Subject: [PATCH 22/22] fix docstrings --- xscen/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xscen/utils.py b/xscen/utils.py index a926001e..ece38dac 100644 --- a/xscen/utils.py +++ b/xscen/utils.py @@ -546,7 +546,7 @@ def maybe_unstack( coords : str, optional Path to a dataset containing the coords to unstack (and only those). rechunk : dict, optional - If True, rechunk the dataset after unstacking. + If not None, rechunk the dataset after unstacking. stack_drop_nans : bool If True, unstack the dataset and rechunk it. If False, do nothing.