diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 3209d695..290ca20b 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -2,6 +2,35 @@ Changelog
 =========
 
+v0.11.0 (unreleased)
+--------------------
+Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`).
+
+New features and enhancements
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+* N/A
+
+Breaking changes
+^^^^^^^^^^^^^^^^
+* ``xs.utils.publish_release_notes`` and ``xs.utils.show_versions`` have been moved to ``xs.testing``. (:pull:`492`).
+
+Bug fixes
+^^^^^^^^^
+* Added a missing library (``openpyxl``) to the requirements. (:pull:`492`).
+* Fixed a bug in ``xs.io.subset_maxsize`` where the function would drop the last year. (:pull:`492`).
+* Fixed a bug in ``xs.io.clean_incomplete`` where the ``.zmetadata`` file was not updated. (:pull:`492`).
+* Fixed a bug in the saving of datasets where encoding was sometimes not applied, which could result, for example, in rechunking not being respected. (:pull:`492`).
+* Fixed multiple bugs in ``xs.io.save_to_zarr`` with ``mode='a'``. (:pull:`492`).
+* Fixed a few minor bugs in ``xs.io.save_to_table``. (:pull:`492`).
+
+Internal changes
+^^^^^^^^^^^^^^^^
+* Added a new parameter ``latest`` to ``xs.testing.publish_release_notes`` to print only the latest release notes. (:pull:`492`).
+* The estimation method in ``xs.io.estimate_chunks`` has been improved to better distribute chunks across dimensions of unequal length. (:pull:`492`).
+* A new parameter ``incomplete`` has been added to ``xs.io.clean_incomplete`` to remove incomplete variables. (:pull:`492`).
+* Continued work on adding tests. (:pull:`492`).
+
+
 v0.10.1 (2024-11-04)
 --------------------
 Contributors to this version: Gabriel Rondeau-Genesse (:user:`RondeauG`), Pascal Bourgault (:user:`aulemahal`), Éric Dupuis (:user:`coxipi`).
diff --git a/docs/notebooks/4_ensembles.ipynb b/docs/notebooks/4_ensembles.ipynb
index dad0407f..fa76d1ac 100644
--- a/docs/notebooks/4_ensembles.ipynb
+++ b/docs/notebooks/4_ensembles.ipynb
@@ -40,10 +40,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from xclim.testing import open_dataset\n",
+    "import pooch\n",
+    "import xarray as xr\n",
+    "from xclim.testing.utils import nimbus\n",
     "\n",
     "import xscen as xs\n",
     "\n",
+    "downloader = pooch.HTTPDownloader(headers={\"User-Agent\": f\"xscen-{xs.__version__}\"})\n",
+    "\n",
     "datasets = {\n",
     "    \"ACCESS\": \"EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc\",\n",
     "    \"BNU-ESM\": \"EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc\",\n",
@@ -53,7 +57,8 @@
     "}\n",
     "\n",
     "for d in datasets:\n",
-    "    ds = open_dataset(datasets[d]).isel(lon=slice(0, 4), lat=slice(0, 4))\n",
+    "    file = nimbus().fetch(datasets[d], downloader=downloader)\n",
+    "    ds = xr.open_dataset(file).isel(lon=slice(0, 4), lat=slice(0, 4))\n",
     "    ds = xs.climatological_op(\n",
     "        ds,\n",
     "        op=\"mean\",\n",
diff --git a/environment-dev.yml b/environment-dev.yml
index 5b06e841..56345653 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -10,8 +10,8 @@ dependencies:
   - cftime
   - cf_xarray >=0.7.6
   - clisops >=0.10
-  - dask
-  - flox
+  - dask >=2024.8.1,<2024.11 # FIXME: https://github.com/Ouranosinc/xclim/issues/1992
+  - flox !=0.9.14 # FIXME: 0.9.14 is a broken version. This pin could be removed eventually.
- fsspec - geopandas - h5netcdf @@ -21,6 +21,7 @@ dependencies: - netCDF4 - numcodecs - numpy >=1.24 + - openpyxl - pandas >=2.2 - parse - pyyaml @@ -31,7 +32,7 @@ dependencies: - toolz - xarray >=2023.11.0, !=2024.6.0 - xclim >=0.53.2, <0.54 - - xesmf >=0.7 + - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 # Opt - nc-time-axis >=1.3.1 diff --git a/environment.yml b/environment.yml index 3e4232f3..126cb271 100644 --- a/environment.yml +++ b/environment.yml @@ -10,8 +10,8 @@ dependencies: - cftime - cf_xarray >=0.7.6 - clisops >=0.10 - - dask - - flox + - dask >=2024.8.1,<2024.11 # FIXME: https://github.com/Ouranosinc/xclim/issues/1992 + - flox !=0.9.14 # FIXME: 0.9.14 is a broken version. This pin could be removed eventually. - fsspec - geopandas - h5netcdf @@ -21,6 +21,7 @@ dependencies: - netCDF4 - numcodecs - numpy >=1.24 + - openpyxl - pandas >=2.2 - parse - pyyaml @@ -31,7 +32,7 @@ dependencies: - toolz - xarray >=2023.11.0, !=2024.6.0 - xclim >=0.53.2, <0.54 - - xesmf >=0.7 + - xesmf >=0.7, <0.8.8 # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. - zarr >=2.13 # To install from source - setuptools >=65.0.0 diff --git a/pyproject.toml b/pyproject.toml index fc5f6214..8b3a62f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,8 +43,8 @@ dependencies = [ "cftime", "cf_xarray >=0.7.6", "clisops >=0.10", - "dask", - "flox", + "dask >=2024.8.1,<2024.11", # FIXME: https://github.com/Ouranosinc/xclim/issues/1992 + "flox !=0.9.14", # FIXME: 0.9.14 is a broken version. This pin could be removed eventually. "fsspec", "geopandas", "h5netcdf", @@ -54,6 +54,7 @@ dependencies = [ "netCDF4", "numcodecs", "numpy >=1.24", + "openpyxl", "pandas >=2.2", "parse", # Used when opening catalogs. @@ -111,7 +112,7 @@ docs = [ "sphinxcontrib-napoleon" ] extra = [ - "xesmf>=0.7" + "xesmf>=0.7, <0.8.8" # FIXME: 0.8.8 currently creates segfaults on ReadTheDocs. ] all = ["xscen[dev]", "xscen[docs]", "xscen[extra]"] diff --git a/src/xscen/data/fr/LC_MESSAGES/xscen.mo b/src/xscen/data/fr/LC_MESSAGES/xscen.mo index 3821b177..51b5812a 100644 Binary files a/src/xscen/data/fr/LC_MESSAGES/xscen.mo and b/src/xscen/data/fr/LC_MESSAGES/xscen.mo differ diff --git a/src/xscen/data/fr/LC_MESSAGES/xscen.po b/src/xscen/data/fr/LC_MESSAGES/xscen.po index 269ebed3..be7efb40 100644 --- a/src/xscen/data/fr/LC_MESSAGES/xscen.po +++ b/src/xscen/data/fr/LC_MESSAGES/xscen.po @@ -19,8 +19,8 @@ msgstr "Description" msgid "Units" msgstr "Unités" -msgid "Content" -msgstr "Contenu" - msgid "Global attributes" msgstr "Attributs globaux" + +msgid "Content" +msgstr "Contenu" diff --git a/src/xscen/io.py b/src/xscen/io.py index f0c2e149..ed7df22c 100644 --- a/src/xscen/io.py +++ b/src/xscen/io.py @@ -1,6 +1,7 @@ """Input/Output functions for xscen.""" import datetime +import json import logging import os import shutil as sh @@ -99,58 +100,30 @@ def estimate_chunks( # noqa: C901 A dictionary mapping dimensions to chunk sizes. 
""" - def _estimate_chunks(ds, target_mb, size_of_slice, rechunk_dims): - # Approximate size of the chunks (equal across dims) + def _estimate_chunks(da, target_mb, size_of_slice, rechunk_dims): + # Divide the dimensions by the smallest dimension + min_dim = np.min([da[d].shape[0] for d in rechunk_dims]) + ratio = {d: da[d].shape[0] / min_dim for d in rechunk_dims} + + # Get the approximate number of chunks, supposing the chunks are cubes approx_chunks = np.power(target_mb / size_of_slice, 1 / len(rechunk_dims)) - chunks_per_dim = dict() - if len(rechunk_dims) == 1: - rounding = ( - 1 - if ds[rechunk_dims[0]].shape[0] <= 15 - else 5 if ds[rechunk_dims[0]].shape[0] <= 250 else 10 - ) - chunks_per_dim[rechunk_dims[0]] = np.max( - [ - np.min( - [ - int(rounding * np.round(approx_chunks / rounding)), - ds[rechunk_dims[0]].shape[0], - ] - ), - 1, - ] - ) - elif len(rechunk_dims) == 2: - # Adjust approx_chunks based on the ratio of the rectangle sizes - for d in rechunk_dims: - rounding = ( - 1 if ds[d].shape[0] <= 15 else 5 if ds[d].shape[0] <= 250 else 10 - ) - adjusted_chunk = int( - rounding - * np.round( - approx_chunks - * ( - ds[d].shape[0] - / np.prod( - [ - ds[dd].shape[0] - for dd in rechunk_dims - if dd not in [d] - ] - ) - ) - / rounding - ) - ) - chunks_per_dim[d] = np.max( - [np.min([adjusted_chunk, ds[d].shape[0]]), 1] - ) - else: - raise NotImplementedError( - "estimating chunks on more than 2 dimensions is not implemented yet." - ) + # Redistribute the chunks based on the ratio of the dimensions + x = (approx_chunks ** len(rechunk_dims) / np.prod(list(ratio.values()))) ** ( + 1 / len(rechunk_dims) + ) + rounding_per_dim = { + d: 1 if da[d].shape[0] <= 15 else 5 if da[d].shape[0] <= 250 else 10 + for d in rechunk_dims + } + chunks_per_dim = { + d: int(rounding_per_dim[d] * np.round(x * ratio[d] / rounding_per_dim[d])) + for d in rechunk_dims + } + chunks_per_dim = { + d: np.max([np.min([chunks_per_dim[d], da[d].shape[0]]), 1]) + for d in rechunk_dims + } return chunks_per_dim @@ -163,7 +136,7 @@ def _estimate_chunks(ds, target_mb, size_of_slice, rechunk_dims): for v in ds.variables: # Find dimensions to chunk rechunk_dims = list(set(dims).intersection(ds.variables[v].dimensions)) - if not rechunk_dims: + if not rechunk_dims or v in ds.dimensions: continue dtype_size = ds.variables[v].datatype.itemsize @@ -219,7 +192,7 @@ def _estimate_chunks(ds, target_mb, size_of_slice, rechunk_dims): def subset_maxsize( ds: xr.Dataset, maxsize_gb: float, -) -> list: +) -> list[xr.Dataset]: """Estimate a dataset's size and, if higher than the given limit, subset it alongside the 'time' dimension. Parameters @@ -232,7 +205,7 @@ def subset_maxsize( Returns ------- - list + list of xr.Dataset List of xr.Dataset subsetted alongside 'time' to limit the filesize to the requested maximum. 
""" # Estimate the size of the dataset @@ -247,11 +220,11 @@ def subset_maxsize( logger.info(msg) return [ds] - elif "time" in ds: + elif "time" in ds.dims: years = np.unique(ds.time.dt.year) - ratio = int(len(years) / (size_of_file / maxsize_gb)) + ratio = np.max([int(len(years) / (size_of_file / maxsize_gb)), 1]) ds_sub = [] - for y in range(years[0], years[-1], ratio): + for y in range(years[0], years[-1] + 1, ratio): ds_sub.extend([ds.sel({"time": slice(str(y), str(y + ratio - 1))})]) return ds_sub @@ -261,7 +234,11 @@ def subset_maxsize( ) -def clean_incomplete(path: str | os.PathLike, complete: Sequence[str]) -> None: +def clean_incomplete( + path: str | os.PathLike, + complete: Sequence[str] | None = None, + incomplete: Sequence[str] | None = None, +) -> None: """Delete un-catalogued variables from a zarr folder. The goal of this function is to clean up an incomplete calculation. @@ -272,22 +249,58 @@ def clean_incomplete(path: str | os.PathLike, complete: Sequence[str]) -> None: ---------- path : str, Path A path to a zarr folder. - complete : sequence of strings - Name of variables that were completed. + complete : sequence of strings, optional + Name of variables that were completed. All other variables (except coordinates) will be removed. + Use either `complete` or `incomplete`, not both. + incomplete : sequence of strings, optional + Name of variables that should be removed. Coordinates and dimensions will never be removed through this function. + Use either `complete` or `incomplete`, not both. Returns ------- None """ path = Path(path) - with xr.open_zarr(path) as ds: - complete = set(complete).union(ds.coords.keys()) - for fold in filter(lambda p: p.is_dir(), path.iterdir()): - if fold.name not in complete: - msg = f"Removing {fold} from disk" - logger.warning(msg) - sh.rmtree(fold) + if complete is not None and incomplete is not None: + raise ValueError("Use either `complete` or `incomplete`, not both.") + v_to_rm = [] + + if complete is not None: + with xr.open_zarr(path) as ds: + complete = set(complete).union(ds.coords.keys()) + + for fold in filter(lambda p: p.is_dir(), path.iterdir()): + if fold.name not in complete: + v_to_rm.append(fold.name) + msg = f"Removing {fold} from disk" + logger.warning(msg) + sh.rmtree(fold) + + elif incomplete is not None: + with xr.open_zarr(path) as ds: + incomplete = [ + v for v in incomplete if (v not in ds.coords) and (v not in ds.dims) + ] + + for fold in filter(lambda p: p.is_dir(), path.iterdir()): + if fold.name in incomplete: + v_to_rm.append(fold.name) + msg = f"Removing {fold} from disk" + logger.warning(msg) + sh.rmtree(fold) + + # Update the .zmetadata file + with (path / ".zmetadata").open("r") as f: + metadata = json.load(f) + for v in v_to_rm: + [ + metadata["metadata"].pop(k) + for k in list(metadata["metadata"].keys()) + if k.startswith(f"{v}/.") + ] + with (path / ".zmetadata").open("w") as f: + json.dump(metadata, f, indent=2) def _coerce_attrs(attrs): @@ -319,9 +332,11 @@ def round_bits(da: xr.DataArray, keepbits: int): keepbits : int The number of bits of the mantissa to keep. """ + encoding = da.encoding da = xr.apply_ufunc( _np_bitround, da, keepbits, dask="parallelized", keep_attrs=True ) + da.encoding = encoding da.attrs["_QuantizeBitRoundNumberOfSignificantDigits"] = keepbits new_history = f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Data compressed with BitRound by keeping {keepbits} bits." history = ( @@ -456,7 +471,8 @@ def save_to_zarr( # noqa: C901 if 'o', removes the existing variables. 
        if 'a', skips existing variables and writes the others.
     encoding : dict, optional
-        If given, skipped variables are popped in place.
+        If given here instead of 'zarr_kwargs', encoding will only be applied to the variables that are being written,
+        skipping those that are already in the zarr.
     bitround : bool or int or dict
         If not False, float variables are bit-rounded by dropping a certain number of bits from their mantissa,
         allowing for a much better compression.
@@ -510,15 +526,25 @@ def _skip(var):

         if mode == "o":
             if exists:
-                var_path = path / var
-                msg = f"Removing {var_path} to overwrite."
-                logger.warning(msg)
-                sh.rmtree(var_path)
+                clean_incomplete(path, incomplete=[var])
                 return False

         if mode == "a":
+            # In all cases, we need to skip the encoding of existing variables.
+            if exists:
+                if encoding:
+                    encoding.pop(var, None)
+
+            # If we are not appending, we need to skip the writing of existing variables.
             if "append_dim" not in zarr_kwargs:
                 return exists
+
+            # If we are appending, we need to raise an error if there are new variables.
+            elif exists is False:
+                raise ValueError(
+                    "When 'append_dim' is set in zarr_kwargs, all variables must already exist in the dataset."
+                )
+
             return False

     for var in list(ds.data_vars.keys()):
         if _skip(var):
             msg = f"Skipping {var} in {path}."
             logger.info(msg)
             ds = ds.drop_vars(var)
-            if encoding:
-                encoding.pop(var)
+            continue
         if keepbits := _get_keepbits(bitround, var, ds[var].dtype):
             ds = ds.assign({var: round_bits(ds[var], keepbits)})
         # Remove original_shape from encoding, since it can cause issues with some engines.
@@ -562,9 +587,7 @@ def save_to_zarr(  # noqa: C901
                 )
             except TimeoutException:
                 if timeout_cleanup:
-                    msg = f"Removing incomplete {name}."
-                    logger.info(msg)
-                    sh.rmtree(path / name)
+                    clean_incomplete(path, incomplete=[name])
                 raise

     else:
@@ -576,10 +599,7 @@ def save_to_zarr(  # noqa: C901
             )
         except TimeoutException:
             if timeout_cleanup:
-                msg = f"Removing incomplete {list(ds.data_vars.keys())} for {filename}."
-                logger.info(msg)
-                for name in ds.data_vars:
-                    sh.rmtree(path / name)
+                clean_incomplete(path, incomplete=list(ds.data_vars.keys()))
             raise

@@ -783,7 +803,6 @@ def make_toc(ds: xr.Dataset | xr.DataArray, loc: str | None = None) -> pd.DataFr
             for vv, da in ds.data_vars.items()
         ],
     ).set_index(_("Variable"))
-    toc.attrs["name"] = _("Content")

     # Add global attributes by using a fake variable and description
     if len(ds.attrs) > 0:
@@ -803,6 +822,7 @@ def make_toc(ds: xr.Dataset | xr.DataArray, loc: str | None = None) -> pd.DataFr
         toc = pd.concat([toc, pd.DataFrame(index=[""])])
         toc = pd.concat([toc, pd.DataFrame(index=[_("Global attributes")])])
         toc = pd.concat([toc, globattr])
+    toc.attrs["name"] = _("Content")

     return toc

@@ -810,13 +830,13 @@ def make_toc(ds: xr.Dataset | xr.DataArray, loc: str | None = None) -> pd.DataFr
 TABLE_FORMATS = {".csv": "csv", ".xls": "excel", ".xlsx": "excel"}


-def save_to_table(
+def save_to_table(  # noqa: C901
     ds: xr.Dataset | xr.DataArray,
     filename: str | os.PathLike,
     output_format: str | None = None,
     *,
     row: str | Sequence[str] | None = None,
-    column: None | str | Sequence[str] = "variable",
+    column: None | str | Sequence[str] = None,
     sheet: str | Sequence[str] | None = None,
     coords: bool | Sequence[str] = True,
     col_sep: str = "_",
@@ -824,7 +844,7 @@ def save_to_table(
     add_toc: bool | pd.DataFrame = False,
     **kwargs,
 ):
-    """Save the dataset to a tabular file (csv, excel, ...).
+    r"""Save the dataset to a tabular file (csv, excel, ...).

     This function will trigger a computation of the dataset.
@@ -845,7 +865,8 @@ def save_to_table( Default is all data dimensions. column : str or sequence of str, optional Name of the dimension(s) to use as columns. - Default is "variable", i.e. the name of the variable(s). + When using a Dataset with more than 1 variable, default is "variable", i.e. the name of the variable(s). + When using a DataArray, default is None. sheet : str or sequence of str, optional Name of the dimension(s) to use as sheet names. Only valid if the output format is excel. @@ -861,7 +882,7 @@ def save_to_table( A table of content to add as the first sheet. Only valid if the output format is excel. If True, :py:func:`make_toc` is used to generate the toc. The sheet name of the toc can be given through the "name" attribute of the DataFrame, otherwise "Content" is used. - kwargs: + \*\*kwargs: Other arguments passed to the pandas function. If the output format is excel, kwargs to :py:class:`pandas.ExcelWriter` can be given here as well. """ @@ -874,6 +895,9 @@ def save_to_table( f"Output format could not be inferred from filename {filename.name}. Please pass `output_format`." ) + if column is None and isinstance(ds, xr.Dataset) and len(ds.data_vars) > 1: + column = "variable" + if sheet is not None and output_format != "excel": raise ValueError( f"Argument `sheet` is only valid with excel as the output format. Got {output_format}." @@ -892,15 +916,22 @@ def save_to_table( add_toc = make_toc(ds) out = {(add_toc.attrs.get("name", "Content"),): add_toc, **out} - if sheet or (add_toc is not False): + # Get engine_kwargs + if output_format == "excel": engine_kwargs = {} # Extract engine kwargs for arg in signature(pd.ExcelWriter).parameters: if arg in kwargs: engine_kwargs[arg] = kwargs.pop(arg) + else: + engine_kwargs = {} + if sheet or (add_toc is not False): with pd.ExcelWriter(filename, **engine_kwargs) as writer: for sheet_name, df in out.items(): df.to_excel(writer, sheet_name=col_sep.join(sheet_name), **kwargs) + elif len(engine_kwargs) > 0: + with pd.ExcelWriter(filename, **engine_kwargs) as writer: + out.to_excel(writer, **kwargs) else: if output_format != "excel" and isinstance(out.columns, pd.MultiIndex): out.columns = out.columns.map(lambda lvls: col_sep.join(map(str, lvls))) @@ -952,7 +983,7 @@ def rechunk_for_saving(ds: xr.Dataset, rechunk: dict): for d in ds[rechunk_var].dims ) ds[rechunk_var].encoding.pop("chunks", None) - ds[rechunk_var].encoding.pop("preferred_chunks", None) + ds[rechunk_var].encoding["preferred_chunks"] = rechunk_dims return ds diff --git a/src/xscen/testing.py b/src/xscen/testing.py index 232437e1..c68e5264 100644 --- a/src/xscen/testing.py +++ b/src/xscen/testing.py @@ -1,12 +1,20 @@ """Testing utilities for xscen.""" +import importlib.metadata +import os +import re +from io import StringIO +from pathlib import Path +from typing import TextIO + import cartopy.crs as ccrs import numpy as np import pandas as pd import xarray as xr from xclim.testing.helpers import test_timeseries as timeseries +from xclim.testing.utils import show_versions as _show_versions -__all__ = ["datablock_3d", "fake_data"] +__all__ = ["datablock_3d", "fake_data", "publish_release_notes", "show_versions"] def datablock_3d( @@ -242,3 +250,131 @@ def fake_data( data = data + offset - (np.random.random() * amplitude - amplitude / 2) return data + + +def publish_release_notes( + style: str = "md", + file: os.PathLike | StringIO | TextIO | None = None, + changes: str | os.PathLike | None = None, + latest: bool = True, +) -> str | None: + """Format release history in Markdown 
or ReStructuredText.
+
+    Parameters
+    ----------
+    style : {"rst", "md"}
+        Use ReStructuredText (`rst`) or Markdown (`md`) formatting. Default: Markdown.
+    file : {os.PathLike, StringIO, TextIO, None}
+        If provided, prints to the given file-like object. Otherwise, returns a string.
+    changes : {str, os.PathLike}, optional
+        If provided, manually points to the file where the changelog can be found.
+        Assumes a relative path otherwise.
+    latest : bool
+        Whether to return only the release notes of the latest version, or the entire changelog.
+
+    Returns
+    -------
+    str, optional
+
+    Notes
+    -----
+    This function exists solely for development purposes. Adapted from xclim.testing.utils.publish_release_notes.
+    """
+    if isinstance(changes, str | Path):
+        changes_file = Path(changes).absolute()
+    else:
+        changes_file = Path(__file__).absolute().parents[2].joinpath("CHANGELOG.rst")
+
+    if not changes_file.exists():
+        raise FileNotFoundError("Changes file not found in xscen file tree.")
+
+    with Path(changes_file).open(encoding="utf-8") as f:
+        changes = f.read()
+
+    if style == "rst":
+        hyperlink_replacements = {
+            r":issue:`([0-9]+)`": r"`GH/\1 <https://github.com/Ouranosinc/xscen/issues/\1>`_",
+            r":pull:`([0-9]+)`": r"`PR/\1 <https://github.com/Ouranosinc/xscen/pull/\1>`_",
+            r":user:`([a-zA-Z0-9_.-]+)`": r"`@\1 <https://github.com/\1>`_",
+        }
+    elif style == "md":
+        hyperlink_replacements = {
+            r":issue:`([0-9]+)`": r"[GH/\1](https://github.com/Ouranosinc/xscen/issues/\1)",
+            r":pull:`([0-9]+)`": r"[PR/\1](https://github.com/Ouranosinc/xscen/pull/\1)",
+            r":user:`([a-zA-Z0-9_.-]+)`": r"[@\1](https://github.com/\1)",
+        }
+    else:
+        raise NotImplementedError()
+
+    for search, replacement in hyperlink_replacements.items():
+        changes = re.sub(search, replacement, changes)
+
+    if latest:
+        changes_split = changes.split("\n\nv0.")
+        changes = changes_split[0] + "\n\nv0." + changes_split[1]
+
+    if style == "md":
+        changes = changes.replace("=========\nChangelog\n=========", "# Changelog")
+
+        titles = {r"\n(.*?)\n([\-]{1,})": "-", r"\n(.*?)\n([\^]{1,})": "^"}
+        for title_expression, level in titles.items():
+            found = re.findall(title_expression, changes)
+            for grouping in found:
+                fixed_grouping = (
+                    str(grouping[0]).replace("(", r"\(").replace(")", r"\)")
+                )
+                search = rf"({fixed_grouping})\n([\{level}]{'{' + str(len(grouping[1])) + '}'})"
+                replacement = f"{'##' if level == '-' else '###'} {grouping[0]}"
+                changes = re.sub(search, replacement, changes)
+
+        link_expressions = r"[\`]{1}([\w\s]+)\s<(.+)>`\_"
+        found = re.findall(link_expressions, changes)
+        for grouping in found:
+            search = rf"`{grouping[0]} <.+>`\_"
+            replacement = f"[{str(grouping[0]).strip()}]({grouping[1]})"
+            changes = re.sub(search, replacement, changes)
+
+    if not file:
+        return changes
+    if isinstance(file, Path | os.PathLike):
+        file = Path(file).open("w")
+    print(changes, file=file)
+
+
+def show_versions(
+    file: os.PathLike | StringIO | TextIO | None = None,
+    deps: list | None = None,
+) -> str | None:
+    """Print the versions of xscen and its dependencies.
+
+    Parameters
+    ----------
+    file : {os.PathLike, StringIO, TextIO}, optional
+        If provided, prints to the given file-like object. Otherwise, returns a string.
+    deps : list, optional
+        A list of dependencies to gather and print version information from. Otherwise, prints `xscen` dependencies.
+ + Returns + ------- + str or None + """ + + def _get_xscen_dependencies(): + xscen_metadata = importlib.metadata.metadata("xscen") + requires = xscen_metadata.get_all("Requires-Dist") + requires = [ + req.split("[")[0] + .split(";")[0] + .split(">")[0] + .split("<")[0] + .split("=")[0] + .split("!")[0] + for req in requires + ] + + return ["xscen"] + requires + + if deps is None: + deps = _get_xscen_dependencies() + + return _show_versions(file=file, deps=deps) diff --git a/src/xscen/utils.py b/src/xscen/utils.py index d68bebec..c9e86701 100644 --- a/src/xscen/utils.py +++ b/src/xscen/utils.py @@ -11,11 +11,9 @@ from collections.abc import Sequence from copy import deepcopy from datetime import datetime -from io import StringIO from itertools import chain from pathlib import Path from types import ModuleType -from typing import TextIO import cftime import flox.xarray @@ -28,7 +26,6 @@ from xclim.core.options import METADATA_LOCALES from xclim.core.options import OPTIONS as XC_OPTIONS from xclim.core.utils import uses_dask -from xclim.testing.utils import show_versions as _show_versions from .config import parse_config @@ -46,7 +43,6 @@ "maybe_unstack", "minimum_calendar", "natural_sort", - "publish_release_notes", "stack_drop_nans", "standardize_periods", "translate_time_chunk", @@ -1089,88 +1085,6 @@ def clean_up( # noqa: C901 return ds -def publish_release_notes( - style: str = "md", - file: os.PathLike | StringIO | TextIO | None = None, - changes: str | os.PathLike | None = None, -) -> str | None: - """Format release history in Markdown or ReStructuredText. - - Parameters - ---------- - style : {"rst", "md"} - Use ReStructuredText (`rst`) or Markdown (`md`) formatting. Default: Markdown. - file : {os.PathLike, StringIO, TextIO, None} - If provided, prints to the given file-like object. Otherwise, returns a string. - changes : {str, os.PathLike}, optional - If provided, manually points to the file where the changelog can be found. - Assumes a relative path otherwise. - - Returns - ------- - str, optional - - Notes - ----- - This function exists solely for development purposes. Adapted from xclim.testing.utils.publish_release_notes. 
-    """
-    if isinstance(changes, str | Path):
-        changes_file = Path(changes).absolute()
-    else:
-        changes_file = Path(__file__).absolute().parents[2].joinpath("CHANGELOG.rst")
-
-    if not changes_file.exists():
-        raise FileNotFoundError("Changes file not found in xscen file tree.")
-
-    with Path(changes_file).open(encoding="utf-8") as f:
-        changes = f.read()
-
-    if style == "rst":
-        hyperlink_replacements = {
-            r":issue:`([0-9]+)`": r"`GH/\1 <https://github.com/Ouranosinc/xscen/issues/\1>`_",
-            r":pull:`([0-9]+)`": r"`PR/\1 <https://github.com/Ouranosinc/xscen/pull/\1>`_",
-            r":user:`([a-zA-Z0-9_.-]+)`": r"`@\1 <https://github.com/\1>`_",
-        }
-    elif style == "md":
-        hyperlink_replacements = {
-            r":issue:`([0-9]+)`": r"[GH/\1](https://github.com/Ouranosinc/xscen/issues/\1)",
-            r":pull:`([0-9]+)`": r"[PR/\1](https://github.com/Ouranosinc/xscen/pull/\1)",
-            r":user:`([a-zA-Z0-9_.-]+)`": r"[@\1](https://github.com/\1)",
-        }
-    else:
-        raise NotImplementedError()
-
-    for search, replacement in hyperlink_replacements.items():
-        changes = re.sub(search, replacement, changes)
-
-    if style == "md":
-        changes = changes.replace("=========\nChangelog\n=========", "# Changelog")
-
-        titles = {r"\n(.*?)\n([\-]{1,})": "-", r"\n(.*?)\n([\^]{1,})": "^"}
-        for title_expression, level in titles.items():
-            found = re.findall(title_expression, changes)
-            for grouping in found:
-                fixed_grouping = (
-                    str(grouping[0]).replace("(", r"\(").replace(")", r"\)")
-                )
-                search = rf"({fixed_grouping})\n([\{level}]{'{' + str(len(grouping[1])) + '}'})"
-                replacement = f"{'##' if level == '-' else '###'} {grouping[0]}"
-                changes = re.sub(search, replacement, changes)
-
-        link_expressions = r"[\`]{1}([\w\s]+)\s<(.+)>`\_"
-        found = re.findall(link_expressions, changes)
-        for grouping in found:
-            search = rf"`{grouping[0]} <.+>`\_"
-            replacement = f"[{str(grouping[0]).strip()}]({grouping[1]})"
-            changes = re.sub(search, replacement, changes)
-
-    if not file:
-        return changes
-    if isinstance(file, Path | os.PathLike):
-        file = Path(file).open("w")
-    print(changes, file=file)
-
-
 def unstack_dates(  # noqa: C901
     ds: xr.Dataset,
     seasons: dict[int, str] | None = None,
@@ -1346,99 +1260,6 @@ def reshape_da(da):

     return dso.assign_coords(**new_coords)


-def show_versions(
-    file: os.PathLike | StringIO | TextIO | None = None,
-    deps: list | None = None,
-) -> str | None:
-    """Print the versions of xscen and its dependencies.
-
-    Parameters
-    ----------
-    file : {os.PathLike, StringIO, TextIO}, optional
-        If provided, prints to the given file-like object. Otherwise, returns a string.
-    deps : list, optional
-        A list of dependencies to gather and print version information from. Otherwise, prints `xscen` dependencies.
-
-    Returns
-    -------
-    str or None
-    """
-    if deps is None:
-        deps = [
-            "xscen",
-            # Main packages
-            "cartopy",
-            "cftime",
-            "cf_xarray",
-            "clisops",
-            "dask",
-            "flox",
-            "fsspec",
-            "geopandas",
-            "h5netcdf",
-            "h5py",
-            "intake_esm",
-            "matplotlib",
-            "netCDF4",
-            "numcodecs",
-            "numpy",
-            "pandas",
-            "parse",
-            "pyyaml",
-            "rechunker",
-            "scipy",
-            "shapely",
-            "sparse",
-            "toolz",
-            "xarray",
-            "xclim",
-            "xesmf",
-            "zarr",
-            # Opt
-            "nc-time-axis",
-            "pyarrow",
-            # Dev
-            "babel",
-            "black",
-            "blackdoc",
-            "bump-my-version",
-            "coverage",
-            "coveralls",
-            "flake8",
-            "flake8-rst-docstrings",
-            "ipykernel",
-            "ipython",
-            "isort",
-            "jupyter_client",
-            "nbsphinx",
-            "nbval",
-            "pandoc",
-            "pooch",
-            "pre-commit",
-            "pytest",
-            "pytest-cov",
-            "ruff",
-            "setuptools",
-            "setuptools-scm",
-            "sphinx",
-            "sphinx-autoapi",
-            "sphinx-rtd-theme",
-            "sphinxcontrib-napoleon",
-            "sphinx-codeautolink",
-            "sphinx-copybutton",
-            "sphinx-mdinclude",
-            "watchdog",
-            "xdoctest",
-            "tox",
-            "build",
-            "wheel",
-            "pip",
-            "flake8-alphabetize",
-        ]
-
-    return _show_versions(file=file, deps=deps)
-
-
 def ensure_correct_time(ds: xr.Dataset, xrfreq: str) -> xr.Dataset:
     """Ensure a dataset has the correct time coordinate, as expected for the given frequency.
@@ -1616,3 +1437,29 @@ def rechunk_for_resample(obj: xr.DataArray | xr.Dataset, **resample_kwargs):
     res = obj.resample(**resample_kwargs)

     return flox.xarray.rechunk_for_blockwise(obj, res._dim, res._codes)
+
+
+def publish_release_notes(*args, **kwargs):
+    """Backward compatibility for the old function."""
+    warnings.warn(
+        "'xscen.utils.publish_release_notes' has been moved to 'xscen.testing.publish_release_notes'. "
+        "Support for this function will be removed in xscen v0.12.0.",
+        FutureWarning,
+    )
+
+    from .testing import publish_release_notes as prn
+
+    return prn(*args, **kwargs)
+
+
+def show_versions(*args, **kwargs):
+    """Backward compatibility for the old function."""
+    warnings.warn(
+        "'xscen.utils.show_versions' has been moved to 'xscen.testing.show_versions'. "
+ "Support for this function will be removed in xscen v0.12.0.", + FutureWarning, + ) + + from .testing import show_versions as sv + + return sv(*args, **kwargs) diff --git a/tests/test_ensembles.py b/tests/test_ensembles.py index e21a409a..43dc383e 100644 --- a/tests/test_ensembles.py +++ b/tests/test_ensembles.py @@ -5,26 +5,17 @@ import pytest import xarray as xr import xclim as xc -from packaging.version import Version try: import xesmf as xe except ImportError: xe = None -# temp fix for changes to xclim-testdata -from functools import partial -from xclim.testing import open_dataset from xclim.testing.helpers import test_timeseries as timeseries +from xclim.testing.utils import nimbus import xscen as xs -# FIXME: Remove if-else when updating minimum xclim version to 0.53 -if Version(xc.__version__) < Version("0.53.0"): - # Hack to revert to old testdata with old xclim - open_dataset = partial(open_dataset, branch="v2023.12.14") - - LOGGER = logging.getLogger(__name__) @@ -1103,7 +1094,7 @@ def test_build_partition_data(self, samplecat, tmp_path): class TestReduceEnsemble: def test_with_criteria(self): - ds = open_dataset("EnsembleReduce/TestEnsReduceCriteria.nc") + ds = xr.open_dataset(nimbus().fetch("EnsembleReduce/TestEnsReduceCriteria.nc")) selected, clusters, fig_data = xs.reduce_ensemble( ds["data"], method="kmeans", max_clusters=3 ) @@ -1122,7 +1113,9 @@ def test_without_criteria(self, horizon): "CNRM-CM5": "EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc", } for d in datasets: - ds = open_dataset(datasets[d]).isel(lon=slice(0, 4), lat=slice(0, 4)) + ds = xr.open_dataset(nimbus().fetch(datasets[d])).isel( + lon=slice(0, 4), lat=slice(0, 4) + ) ds = xs.climatological_op( ds, op="mean", @@ -1146,7 +1139,7 @@ def test_without_criteria(self, horizon): assert fig_data == {} def test_errors(self): - ds = open_dataset("EnsembleReduce/TestEnsReduceCriteria.nc") + ds = xr.open_dataset(nimbus().fetch("EnsembleReduce/TestEnsReduceCriteria.nc")) with pytest.raises( ValueError, match="Data must have a 'horizon' dimension to be subsetted." 
    ):
diff --git a/tests/test_io.py b/tests/test_io.py
index 7db611c9..87e46a4c 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -1,9 +1,175 @@
+import datetime
+from pathlib import Path
+
 import numpy as np
+import pandas as pd
 import pytest
 import xarray as xr
 import xclim as xc
+from xclim.testing.helpers import test_timeseries as timeseries

 import xscen as xs
+from xscen.testing import datablock_3d
+
+
+@pytest.mark.parametrize("suffix", [".zarr", ".zarr.zip", "h5", "nc"])
+def test_get_engine(tmpdir, suffix):
+    if suffix in [".zarr", ".zarr.zip"]:
+        path = "some/path" + suffix
+        assert xs.io.get_engine(path) == "zarr"
+    else:
+        ds = timeseries(
+            np.zeros(60),
+            variable="tas",
+            as_dataset=True,
+        )
+        ds.to_netcdf(
+            Path(tmpdir) / "test.nc",
+            engine="netcdf4" if suffix == "nc" else "h5netcdf",
+        )
+        assert xs.io.get_engine(Path(tmpdir) / "test.nc") in [
+            "netcdf4",
+            "h5netcdf",
+        ]  # Hard to predict which one
+
+
+class TestEstimateChunks:
+    ds = datablock_3d(
+        np.zeros((50, 100, 150)),
+        variable="tas",
+        x="lon",
+        x_start=-70,
+        x_step=0.1,
+        y="lat",
+        y_start=45,
+        y_step=-0.1,
+        as_dataset=True,
+    )
+    ds2 = ds.copy()
+    ds2["tas"] = ds2["tas"].astype(np.float32)
+    ds["just_a_variable"] = xr.DataArray(np.zeros(50), dims="new_dim")
+
+    def test_normal(self):
+        out1 = xs.io.estimate_chunks(self.ds, dims=["time", "lat", "lon"], target_mb=1)
+        assert out1 == {"time": 30, "lat": 55, "lon": 85}
+        out2 = xs.io.estimate_chunks(self.ds2, dims=["time", "lat", "lon"], target_mb=1)
+        assert out2 == {"time": 35, "lat": 70, "lon": 105}
+        out3 = xs.io.estimate_chunks(self.ds, dims=["lat", "lon"], target_mb=1)
+        assert out3 == {"lon": 65, "lat": 40, "time": -1}
+        out4 = xs.io.estimate_chunks(self.ds2, dims=["time"], target_mb=1)
+        assert out4 == {"time": 15, "lat": -1, "lon": -1}
+
+    @pytest.mark.parametrize("chunk_per_variable", [True, False])
+    @pytest.mark.parametrize("as_file", [True, False])
+    def test_multiple_vars(self, tmpdir, chunk_per_variable, as_file):
+        ds = self.ds.copy()
+        ds["pr"] = ds["tas"].isel(time=0)
+
+        if as_file:
+            ds.to_netcdf(Path(tmpdir) / "test.nc")
+            ds = Path(tmpdir) / "test.nc"
+
+        out = xs.io.estimate_chunks(
+            ds, dims=["lat", "lon"], target_mb=1, chunk_per_variable=chunk_per_variable
+        )
+        if chunk_per_variable is False:
+            assert out == {"lon": 65, "lat": 40, "time": -1}
+        else:
+            assert out == {
+                "tas": {"lon": 65, "lat": 40, "time": -1},
+                "pr": {"lon": 150, "lat": 100},
+            }
+
+
+class TestSubsetMaxsize:
+    def test_normal(self):
+        ds = datablock_3d(
+            np.zeros((1500, 5, 5)),
+            variable="tas",
+            x="lon",
+            x_start=-70,
+            x_step=0.1,
+            y="lat",
+            y_start=45,
+            y_step=-0.1,
+            as_dataset=True,
+        )
+        ds["pr"] = ds["tas"]
+        # First, test with a dataset that is already small enough
+        out = xs.io.subset_maxsize(ds, maxsize_gb=1)
+        assert len(out) == 1
+        assert out[0].equals(ds)
+
+        out = xs.io.subset_maxsize(ds, maxsize_gb=0.0005)
+        assert len(out) == 2
+        assert xr.concat(out, dim="time").equals(ds)
+
+    def test_error(self):
+        ds = datablock_3d(
+            np.zeros((1, 50, 10)),
+            variable="tas",
+            x="lon",
+            x_start=-70,
+            x_step=0.1,
+            y="lat",
+            y_start=45,
+            y_step=-0.1,
+            as_dataset=True,
+        )
+        ds = ds.isel(time=0)
+
+        with pytest.raises(NotImplementedError, match="does not contain a"):
+            xs.io.subset_maxsize(ds, maxsize_gb=1e-15)
+
+
+class TestCleanIncomplete:
+    @pytest.mark.parametrize("which", ["complete", "incomplete"])
+    def test_complete(self, tmpdir, which):
+        ds = datablock_3d(
+            np.ones((5, 5, 5)),
+            variable="tas",
+            x="lon",
+            x_start=-70,
+            x_step=0.1,
+            y="lat",
+            y_start=45,
+            y_step=-0.1,
+            as_dataset=True,
+        )
+        ds["pr"] = ds["tas"].copy()
+        ds.to_zarr(Path(tmpdir) / "test.zarr")
+
+        if which == "complete":
+            xs.io.clean_incomplete(Path(tmpdir) / "test.zarr", complete=["tas"])
+        else:
+            xs.io.clean_incomplete(Path(tmpdir) / "test.zarr", incomplete=["pr"])
+        assert (Path(tmpdir) / "test.zarr/tas").exists()
+        assert not (Path(tmpdir) / "test.zarr/pr").exists()
+        assert (Path(tmpdir) / "test.zarr/.zmetadata").exists()
+
+        ds2 = xr.open_zarr(Path(tmpdir) / "test.zarr")
+        assert "pr" not in ds2
+        assert ds2.equals(ds[["tas"]])
+
+    def test_error(self, tmpdir):
+        ds = datablock_3d(
+            np.ones((5, 5, 5)),
+            variable="tas",
+            x="lon",
+            x_start=-70,
+            x_step=0.1,
+            y="lat",
+            y_start=45,
+            y_step=-0.1,
+            as_dataset=True,
+        )
+        ds["pr"] = ds["tas"].copy()
+        ds.to_zarr(Path(tmpdir) / "test.zarr")
+
+        with pytest.raises(ValueError, match="Use either"):
+            xs.io.clean_incomplete(
+                Path(tmpdir) / "test.zarr", complete=["tas"], incomplete=["pr"]
+            )


 class TestRechunkForSaving:
@@ -16,7 +182,7 @@ class TestRechunkForSaving:
             (["rlon", "rlat"], False),
         ],
     )
-    def test_options(self, datablock_3d, dims, xy):
+    def test_options(self, dims, xy):
         ds = datablock_3d(
             np.random.random((30, 30, 50)),
             variable="tas",
@@ -38,7 +204,7 @@ def test_options(self, dims, xy):
             )
             assert chunks[0] == new_chunks[dim]

-    def test_variables(self, datablock_3d):
+    def test_variables(self):
         ds = datablock_3d(
             np.random.random((30, 30, 50)),
             variable="tas",
@@ -73,7 +239,7 @@ class TestToTable:
         xr.merge(
             [
                 xs.testing.datablock_3d(
-                    np.random.random_sample((20, 3, 2)),
+                    np.ones((20, 3, 2)),
                     v,
                     "lon",
                     0,
@@ -96,7 +262,7 @@ class TestToTable:
     @pytest.mark.parametrize(
         "multiple, as_dataset", [(True, True), (False, True), (False, False)]
     )
-    def test_normal(self, multiple, as_dataset):
+    def test_normal(self, tmpdir, multiple, as_dataset):
         if multiple is False:
             if as_dataset:
                 ds = self.ds[["tas"]].copy()
@@ -106,9 +272,25 @@ def test_normal(self, tmpdir, multiple, as_dataset):
             ds = self.ds.copy()

         # Default
+        xs.save_to_table(ds, Path(tmpdir) / "test.csv")
+        saved = pd.read_csv(Path(tmpdir) / "test.csv")
         tab = xs.io.to_table(ds)
-        assert tab.shape == (120, 5 if multiple else 3)  # 3 vars + 2 aux coords
+
+        assert tab.shape == (
+            120,
+            5 if multiple else 3,
+        )  # 3 variables + 2 coords that are not dimensions
+        assert saved.shape == (
+            120,
+            8 if multiple else 6,
+        )  # everything gets mapped, so dimensions are included in the columns
         assert tab.columns.names == ["variable"] if multiple else [None]
+        assert set(saved.columns) == (
+            {"season", "time", "site", "lat", "lon", "pr", "snw", "tas"}
+            if multiple
+            else {"season", "time", "site", "tas"}
+        )
         assert tab.index.names == ["season", "time", "site"]
         # Season order is chronological, rather than alphabetical
         np.testing.assert_array_equal(
             tab.xs("1993", level="time")
             .xs("tas", axis=1)
             .index.get_level_values("season"),
             ["JFM", "AMJ", "JAS", "OND"],
         )
+        np.testing.assert_array_equal(saved.loc[0, "season"], "JFM")

         if multiple:
             # Variable in the index, thus no coords
+            xs.save_to_table(
+                ds,
+                Path(tmpdir) / "test.xlsx",
+                row=["time", "variable"],
+                column=["season", "site"],
+                coords=False,
+            )
             tab = xs.io.to_table(
                 ds, row=["time", "variable"], column=["season", "site"], coords=False
             )
+            saved = pd.read_excel(Path(tmpdir) / "test.xlsx")
+
             assert tab.shape == (15, 24)
+            assert saved.shape == (17, 26)  # Because of the headers
             assert tab.columns.names == ["season", "site"]
np.testing.assert_array_equal( tab.loc[("1993", "pr"), ("JFM",)], ds.pr.sel(time="1993", season="JFM") @@ -137,8 +330,23 @@ def test_normal(self, multiple, as_dataset): ) == 0 ) + # Excel is not the prettiest thing to test + np.testing.assert_array_equal(saved.iloc[2, 2:], np.tile([1], 24)) + assert saved.iloc[0, 2] == "a" + assert saved.iloc[2, 0] == datetime.datetime(1993, 1, 1, 0, 0) - def test_sheet(self): + def test_sheet(self, tmpdir): + xs.save_to_table( + self.ds, + Path(tmpdir) / "test.xlsx", + row=["time", "variable"], + column=["season"], + sheet="site", + coords=False, + ) + saved = pd.read_excel( + Path(tmpdir) / "test.xlsx", sheet_name=["a", "b", "c", "d", "e", "f"] + ) # This is a test by itself tab = xs.io.to_table( self.ds, row=["time", "variable"], @@ -146,31 +354,77 @@ def test_sheet(self): sheet="site", coords=False, ) + assert set(tab.keys()) == {("a",), ("b",), ("c",), ("d",), ("e",), ("f",)} assert tab[("a",)].shape == (15, 4) # 5 time * 3 variable X 4 season + assert saved["a"].shape == (15, 6) # Because of the headers - def test_error(self): + def test_kwargs(self, tmpdir): + xs.save_to_table( + self.ds, + Path(tmpdir) / "test.xlsx", + row=["time", "variable"], + column=["season", "site"], + coords=False, + datetime_format="dd/mm/yyyy", + ) + saved = pd.read_excel(Path(tmpdir) / "test.xlsx") + assert saved.iloc[2, 0] == datetime.datetime( + 1993, 1, 1, 0, 0 + ) # No real way to test the format + + def test_multiindex(self, tmpdir): + xs.save_to_table( + self.ds, + Path(tmpdir) / "test.csv", + row=["time", "variable"], + column=["season", "site"], + coords=False, + row_sep="|", + col_sep=";", + ) + out = pd.read_csv(Path(tmpdir) / "test.csv") + assert out.shape == (15, 25) + assert out.columns[0] == "time|variable" + assert out.columns[1] == "JFM;a" + + def test_error(self, tmpdir): with pytest.raises(ValueError, match="Repeated dimension names."): - xs.io.to_table( - self.ds, row=["time", "variable"], column=["season", "site", "time"] + xs.save_to_table( + self.ds, + Path(tmpdir) / "test.xlsx", + row=["time", "variable"], + column=["season", "site", "time"], ) with pytest.raises(ValueError, match="Passed row, column and sheet"): - xs.io.to_table( - self.ds, row=["time", "variable"], column=["season", "site", "foo"] + xs.save_to_table( + self.ds, + Path(tmpdir) / "test.xlsx", + row=["time", "variable"], + column=["season", "site", "foo"], ) with pytest.raises( NotImplementedError, match="Keeping auxiliary coords is not implemented when", ): - xs.io.to_table( + xs.save_to_table( self.ds, + Path(tmpdir) / "test.xlsx", row=["time", "variable"], column=["season", "site"], coords=True, ) + with pytest.raises(ValueError, match="Output format could not be inferred"): + xs.save_to_table(self.ds, Path(tmpdir) / "test") + with pytest.raises( + ValueError, match="is only valid with excel as the output format" + ): + xs.save_to_table(self.ds, Path(tmpdir) / "test.csv", sheet="site") + with pytest.raises(ValueError, match="but the output format is not Excel."): + xs.save_to_table(self.ds, Path(tmpdir) / "test.csv", add_toc=True) @pytest.mark.parametrize("as_dataset", [True, False]) - def test_make_toc(self, as_dataset): + def test_make_toc(self, tmpdir, as_dataset): ds = self.ds.copy() for v in ds.data_vars: ds[v].attrs["long_name"] = f"Long name for {v}" @@ -180,7 +434,10 @@ def test_make_toc(self, as_dataset): ds = ds["tas"] with xc.set_options(metadata_locales="fr"): - toc = xs.io.make_toc(ds) + xs.save_to_table(ds, Path(tmpdir) / "test.xlsx", add_toc=True) + + toc = 
pd.read_excel(Path(tmpdir) / "test.xlsx", sheet_name="Contenu") + toc = toc.set_index("Unnamed: 0" if as_dataset else "Variable") if as_dataset: assert toc.shape == (8, 2) @@ -189,7 +446,7 @@ def test_make_toc(self, as_dataset): "tas", "pr", "snw", - "", + np.nan, "Attributs globaux", "foo", "baz", @@ -205,7 +462,7 @@ def test_make_toc(self, as_dataset): assert toc.loc["tas", "Unités"] == "K" -def test_round_bits(datablock_3d): +def test_round_bits(): da = datablock_3d( np.random.random((30, 30, 50)), variable="tas", @@ -241,3 +498,295 @@ def test_guess_bitround(self, vname, vtype, bitr, exp): xs.io._get_keepbits(bitr, vname, vtype) else: assert xs.io._get_keepbits(bitr, vname, vtype) == exp + + @pytest.mark.parametrize("mode", ["f", "o", "a"]) + @pytest.mark.parametrize("itervar", [True, False]) + def test_mode(self, tmpdir, mode, itervar): + ds1 = timeseries( + np.arange(1, 5), + variable="tas", + as_dataset=True, + ) + xs.save_to_zarr(ds1, Path(tmpdir) / "test.zarr") + + ds2 = timeseries( + np.arange(10, 14), + variable="tas", + as_dataset=True, + ) + ds2["pr"] = ds2["tas"].copy() + ds2 = ds2[["pr", "tas"]] + + if mode == "f": + with pytest.raises(ValueError, match="exists in dataset"): + xs.save_to_zarr( + ds2, Path(tmpdir) / "test.zarr", mode=mode, itervar=itervar + ) + assert not (Path(tmpdir) / "test.zarr/pr").exists() + if itervar: + # Essentially just to reach 100% coverage and make sure the function doesn't crash with mode="f" and itervar=True + xs.save_to_zarr( + ds2, Path(tmpdir) / "test2.zarr", mode=mode, itervar=itervar + ) + ds3 = xr.open_zarr(Path(tmpdir) / "test2.zarr") + np.testing.assert_array_almost_equal(ds3.tas.isel(time=0), [10]) + np.testing.assert_array_almost_equal(ds3.pr.isel(time=0), [10]) + + elif mode == "o": + xs.save_to_zarr(ds2, Path(tmpdir) / "test.zarr", mode=mode, itervar=itervar) + ds3 = xr.open_zarr(Path(tmpdir) / "test.zarr") + np.testing.assert_array_almost_equal(ds3.tas.isel(time=0), [10]) + np.testing.assert_array_almost_equal(ds3.pr.isel(time=0), [10]) + + elif mode == "a": + # First, try only with variables that are already in the dataset + xs.save_to_zarr( + ds2[["tas"]], Path(tmpdir) / "test.zarr", mode=mode, itervar=itervar + ) + ds3 = xr.open_zarr(Path(tmpdir) / "test.zarr") + np.testing.assert_array_almost_equal(ds3.tas.isel(time=0), [1]) + + # Now, try with a new variable + xs.save_to_zarr(ds2, Path(tmpdir) / "test.zarr", mode=mode, itervar=itervar) + ds3 = xr.open_zarr(Path(tmpdir) / "test.zarr") + np.testing.assert_array_almost_equal(ds3.tas.isel(time=0), [1]) + np.testing.assert_array_almost_equal(ds3.pr.isel(time=0), [10]) + + @pytest.mark.parametrize("append", [True, False]) + def test_append(self, tmpdir, append): + ds1 = datablock_3d( + np.array([[[1, 2], [3, 4]]]), + variable="tas", + x="lon", + x_start=-70, + y="lat", + y_start=45, + as_dataset=True, + ) + ds2 = datablock_3d( + np.array([[[11, 12], [13, 14]]]), + variable="tas", + x="lon", + x_start=-70, + y="lat", + y_start=45, + start="2005-01-01", + as_dataset=True, + ) + ds2["pr"] = ds2["tas"].copy() + xs.save_to_zarr( + ds1, Path(tmpdir) / "test.zarr", encoding={"tas": {"dtype": "float32"}} + ) + + encoding = { + "tas": {"dtype": "int32"} + } # This should be ignored, as the variable is already in the dataset + if append: + with pytest.raises( + ValueError, + match="is set in zarr_kwargs, all variables must already exist in the dataset.", + ): + xs.save_to_zarr( + ds2, + Path(tmpdir) / "test.zarr", + mode="a", + zarr_kwargs={"append_dim": "time"}, + encoding=encoding, + ) + 
xs.save_to_zarr( + ds2[["tas"]], + Path(tmpdir) / "test.zarr", + mode="a", + zarr_kwargs={"append_dim": "time"}, + encoding=encoding, + ) + out = xr.open_zarr(Path(tmpdir) / "test.zarr") + np.testing.assert_array_equal( + out.tas, np.array([[[1, 2], [3, 4]], [[11, 12], [13, 14]]]) + ) + else: + xs.save_to_zarr( + ds2, Path(tmpdir) / "test.zarr", mode="a", encoding=encoding + ) + out = xr.open_zarr(Path(tmpdir) / "test.zarr") + np.testing.assert_array_equal(out.tas, np.array([[[1, 2], [3, 4]]])) + np.testing.assert_array_equal(out.pr, np.array([[[11, 12], [13, 14]]])) + assert out.tas.dtype == np.float32 + + def test_skip(self, tmpdir): + ds1 = timeseries( + np.arange(1, 5), + variable="tas", + as_dataset=True, + ) + ds2 = timeseries( + np.arange(10, 14), + variable="tas", + as_dataset=True, + ) + xs.save_to_zarr(ds1, Path(tmpdir) / "test.zarr") + xs.save_to_zarr(ds2, Path(tmpdir) / "test.zarr", mode="a") + ds3 = xr.open_zarr(Path(tmpdir) / "test.zarr") + np.testing.assert_array_almost_equal(ds3.tas.isel(time=0), [1]) + + +@pytest.mark.parametrize("engine", ["netcdf", "zarr"]) +def test_savefuncs_normal(tmpdir, engine): + ds = datablock_3d( + np.tile(np.arange(1111, 1121), 15).reshape(15, 5, 2) * 1e-7, + variable="tas", + x="lon", + x_start=-70, + y="lat", + y_start=45, + as_dataset=True, + ) + ds["pr"] = ds["tas"].copy() + ds["other"] = ds["tas"].copy() + ds["other"].encoding = {"dtype": "float32"} + ds.attrs["foo"] = {"bar": 1} + ds["pr"].attrs["foo"] = {"bar": 2} + + ds = ds.assign_coords( + some_coord=("lat", np.array(["hi", "how", "are", "you", "doing"])) + ) + ds["some_coord"] = ds["some_coord"].astype(object) + ds["some_coord"].encoding = {"source": "this is a source"} + + rechunk = {"time": 5, "lon": 2, "lat": 2} + bitround = {"tas": 2, "pr": 3} + if engine == "netcdf": + xs.save_to_netcdf( + ds, + Path(tmpdir) / "test.nc", + rechunk=rechunk, + bitround=bitround, + ) + ds2 = xr.open_dataset(Path(tmpdir) / "test.nc", chunks={}) + else: + xs.save_to_zarr( + ds, + Path(tmpdir) / "test.zarr", + rechunk=rechunk, + bitround=bitround, + ) + ds2 = xr.open_zarr(Path(tmpdir) / "test.zarr") + + # Chunks + assert ds2.tas.chunks == ((5, 5, 5), (2, 2, 1), (2,)) + + # Dtype + assert ds2.tas.dtype == np.float64 + assert ds2.other.dtype == np.float32 + + # Bitround + np.testing.assert_array_almost_equal( + ds2.tas.isel(time=0, lat=0, lon=0), [0.00010681], decimal=8 + ) + assert ds2.tas.attrs["_QuantizeBitRoundNumberOfSignificantDigits"] == 2 + np.testing.assert_array_almost_equal( + ds2.pr.isel(time=0, lat=0, lon=0), [0.00011444], decimal=8 + ) + assert ds2.pr.attrs["_QuantizeBitRoundNumberOfSignificantDigits"] == 3 + np.testing.assert_array_almost_equal( + ds2.other.isel(time=0, lat=0, lon=0), [0.0001111], decimal=8 + ) + assert ds2.other.attrs["_QuantizeBitRoundNumberOfSignificantDigits"] == 12 + + # Attributes + assert ds2.attrs["foo"] == "{'bar': 1}" + assert ds2.pr.attrs["foo"] == "{'bar': 2}" + + if engine == "netcdf": + assert ds.some_coord.encoding == {"source": "this is a source"} + else: + assert ds.some_coord.encoding == {} + + +class TestRechunk: + @pytest.mark.parametrize("engine", ["nc", "zarr"]) + def test_rechunk(self, tmpdir, engine): + ds = datablock_3d( + np.tile(np.arange(1111, 1121), 15).reshape(15, 5, 2) * 1e-7, + variable="tas", + x="lon", + x_start=-70, + y="lat", + y_start=45, + as_dataset=True, + ) + ds["pr"] = ds["tas"].copy() + + if engine == "nc": + xs.save_to_netcdf( + ds, + Path(tmpdir) / "test.nc", + ) + else: + xs.save_to_zarr( + ds, + Path(tmpdir) / "test.zarr", + ) 
+
+        (Path(tmpdir) / "test2.zarr").mkdir()
+
+        xs.io.rechunk(
+            Path(tmpdir) / f"test.{engine}",
+            Path(tmpdir) / "test2.zarr",
+            chunks_over_dim={"time": 5, "lon": 2, "lat": 2},
+            overwrite=True,
+            worker_mem="1GB",
+            temp_store=Path(tmpdir) / "temp",
+        )
+        xs.io.rechunk(
+            Path(tmpdir) / f"test.{engine}",
+            Path(tmpdir) / "test3.zarr",
+            chunks_over_var={"tas": {"time": 5, "lon": 2, "lat": 2}},
+            overwrite=True,
+            worker_mem="1GB",
+            temp_store=Path(tmpdir) / "temp",
+        )
+
+        ds2 = xr.open_zarr(Path(tmpdir) / "test2.zarr")
+        ds3 = xr.open_zarr(Path(tmpdir) / "test3.zarr")
+        assert ds2.tas.chunks == ((5, 5, 5), (2, 2, 1), (2,))
+        assert ds2.pr.chunks == ((5, 5, 5), (2, 2, 1), (2,))
+        assert ds3.tas.chunks == ((5, 5, 5), (2, 2, 1), (2,))
+        assert ds3.pr.chunks == ((15,), (5,), (2,))
+
+    def test_error(self, tmpdir):
+        ds = datablock_3d(
+            np.tile(np.arange(1111, 1121), 15).reshape(15, 5, 2) * 1e-7,
+            variable="tas",
+            x="lon",
+            x_start=-70,
+            y="lat",
+            y_start=45,
+            as_dataset=True,
+        )
+        with pytest.raises(ValueError, match="No chunks given. "):
+            xs.io.rechunk(ds, Path(tmpdir) / "test.nc", worker_mem="1GB")
+
+
+def test_zip_zip(tmpdir):
+    ds = datablock_3d(
+        np.tile(np.arange(1111, 1121), 15).reshape(15, 5, 2) * 1e-7,
+        variable="tas",
+        x="lon",
+        x_start=-70,
+        y="lat",
+        y_start=45,
+        as_dataset=True,
+    )
+    xs.save_to_zarr(ds, Path(tmpdir) / "test.zarr")
+    xs.io.zip_directory(
+        Path(tmpdir) / "test.zarr", Path(tmpdir) / "test.zarr.zip", delete=True
+    )
+    assert not (Path(tmpdir) / "test.zarr").exists()
+
+    with xr.open_zarr(Path(tmpdir) / "test.zarr.zip") as ds2:
+        assert ds2.equals(ds)
+
+    xs.io.unzip_directory(Path(tmpdir) / "test.zarr.zip", Path(tmpdir) / "test2.zarr")
+    with xr.open_zarr(Path(tmpdir) / "test2.zarr") as ds3:
+        assert ds3.equals(ds)
diff --git a/tests/test_testing.py b/tests/test_testing.py
new file mode 100644
index 00000000..db95d242
--- /dev/null
+++ b/tests/test_testing.py
@@ -0,0 +1,68 @@
+from pathlib import Path
+
+import pytest
+
+import xscen as xs
+
+
+class TestPublish:
+    @pytest.mark.requires_netcdf
+    @pytest.mark.parametrize("fmt", ["md", "rst"])
+    def test_normal(self, fmt):
+        out = xs.testing.publish_release_notes(
+            fmt,
+            changes=Path(__file__).parent.parent.joinpath("CHANGELOG.rst"),
+            latest=False,
+        )
+        if fmt == "md":
+            assert out.startswith("# Changelog\n\n")
+            assert "[PR/413](https://github.com/Ouranosinc/xscen/pull/413)" in out
+        elif fmt == "rst":
+            assert out.startswith("=========\nChangelog\n=========\n\n")
+            assert "`PR/413 <https://github.com/Ouranosinc/xscen/pull/413>`_" in out
+
+    def test_error(self):
+        with pytest.raises(FileNotFoundError):
+            xs.testing.publish_release_notes("md", changes="foo")
+        with pytest.raises(NotImplementedError):
+            xs.testing.publish_release_notes(
+                "foo", changes=Path(__file__).parent.parent.joinpath("CHANGELOG.rst")
+            )
+
+    @pytest.mark.requires_netcdf
+    def test_file(self, tmpdir):
+        xs.testing.publish_release_notes(
+            "md",
+            file=tmpdir / "foo.md",
+            changes=Path(__file__).parent.parent.joinpath("CHANGELOG.rst"),
+        )
+        with Path(tmpdir).joinpath("foo.md").open(encoding="utf-8") as f:
+            assert f.read().startswith("# Changelog\n\n")
+
+    @pytest.mark.parametrize("latest", [True, False])
+    @pytest.mark.requires_netcdf
+    def test_latest(self, tmpdir, latest):
+        out = xs.testing.publish_release_notes(
+            "md",
+            changes=Path(__file__).parent.parent.joinpath("CHANGELOG.rst"),
+            latest=latest,
+        )
+        if latest:
+            assert len(out.split("\n\n## v0.")) == 2
+        else:
+            assert len(out.split("\n\n## v0.")) > 2
+
+
+def test_show_version(tmpdir):
+    xs.testing.show_versions(file=tmpdir / "versions.txt")
+    with Path(tmpdir).joinpath("versions.txt").open(encoding="utf-8") as f:
+        out = f.read()
+    assert "xscen" in out
+    assert "xclim" in out
+    assert "xesmf" in out
+    assert "xarray" in out
+    assert "numpy" in out
+    assert "pandas" in out
+    assert "dask" in out
+    assert "cftime" in out
+    assert "netCDF4" in out
diff --git a/tests/test_utils.py b/tests/test_utils.py
index b5fff0f8..9b49b6c3 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -71,7 +71,7 @@ class TestDateParser:
         ("2001", True, "datetime", pd.Timestamp("2001-12-31 23:59:59")),
         ("150004", True, "datetime", pd.Timestamp("1500-04-30 23:59:59")),
         ("31231212", None, "datetime", pd.Timestamp("3123-12-12")),
-        ("2001-07-08", None, "period", pd.Period("2001-07-08", "H")),
+        ("2001-07-08", None, "period", pd.Period("2001-07-08", "h")),
         (pd.Timestamp("1993-05-20T12:07"), None, "str", "1993-05-20"),
         (
             cftime.Datetime360Day(1981, 2, 30),
@@ -94,7 +94,7 @@ class TestDateParser:
         ("abc", None, "datetime", pd.Timestamp("NaT")),
         ("", True, "datetime", pd.Timestamp("NaT")),
         (
-            pd.Period("2001-07-08", "H"),
+            pd.Period("2001-07-08", "h"),
             None,
             "datetime",
             pd.Timestamp("2001-07-08"),
@@ -825,39 +825,6 @@ def test_change_prefix(self, change_prefix):
         }


-class TestPublish:
-    @pytest.mark.requires_netcdf
-    @pytest.mark.parametrize("fmt", ["md", "rst"])
-    def test_normal(self, fmt):
-        out = xs.utils.publish_release_notes(
-            fmt, changes=Path(__file__).parent.parent.joinpath("CHANGELOG.rst")
-        )
-        if fmt == "md":
-            assert out.startswith("# Changelog\n\n")
-            assert "[PR/413](https://github.com/Ouranosinc/xscen/pull/413)" in out
-        elif fmt == "rst":
-            assert out.startswith("=========\nChangelog\n=========\n\n")
-            assert "`PR/413 <https://github.com/Ouranosinc/xscen/pull/413>`_" in out
-
-    def test_error(self):
-        with pytest.raises(FileNotFoundError):
-            xs.utils.publish_release_notes("md", changes="foo")
-        with pytest.raises(NotImplementedError):
-            xs.utils.publish_release_notes(
-                "foo", changes=Path(__file__).parent.parent.joinpath("CHANGELOG.rst")
-            )
-
-    @pytest.mark.requires_netcdf
-    def test_file(self, tmpdir):
-        xs.utils.publish_release_notes(
-            "md",
-            file=tmpdir / "foo.md",
-            changes=Path(__file__).parent.parent.joinpath("CHANGELOG.rst"),
-        )
-        with Path(tmpdir).joinpath("foo.md").open(encoding="utf-8") as f:
-            assert f.read().startswith("# Changelog\n\n")
-
-
 class TestUnstackDates:
     @pytest.mark.parametrize(
         "freq", ["MS", "2MS", "3MS", "QS-DEC", "QS", "2QS", "YS", "YS-DEC", "4YS"]
@@ -1043,20 +1010,6 @@ def test_errors(self):
         xs.utils.unstack_dates(ds)


-def test_show_version(tmpdir):
-    xs.utils.show_versions(file=tmpdir / "versions.txt")
-    with Path(tmpdir).joinpath("versions.txt").open(encoding="utf-8") as f:
-        out = f.read()
-    assert "xscen" in out
-    assert "xclim" in out
-    assert "xarray" in out
-    assert "numpy" in out
-    assert "pandas" in out
-    assert "dask" in out
-    assert "cftime" in out
-    assert "netCDF4" in out
-
-
 class TestEnsureTime:
     def test_xrfreq_ok(self):
         ds = timeseries(