diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index aa6a87364..83e4c3099 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,7 +19,7 @@ on: - submitted env: - XCLIM_TESTDATA_BRANCH: v2023.12.14 + XCLIM_TESTDATA_BRANCH: v2024.8.23 concurrency: # For a given workflow, if we push to the same branch, cancel all previous builds on that branch except on main. diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b5930a36f..deb1b6128 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,14 +2,29 @@ Changelog ========= -v0.53.0 +v0.53.0 (unreleased) -------------------- -Contributors to this version: Adrien Lamarche (:user:`LamAdr`). +Contributors to this version: Adrien Lamarche (:user:`LamAdr`), Trevor James Smith (:user:`Zeitsperre`). Bug fixes ^^^^^^^^^ * Fixed a small inefficiency in ``_otc_adjust`` (:pull:`1890`). +Breaking changes +^^^^^^^^^^^^^^^^ +* `platformdirs` is no longer a direct dependency of `xclim`, but `pooch` is required to use many of the new testing functions (installable via `pip install pooch` or `pip install 'xclim[dev]'`). (:pull:`1889`). + +Internal changes +^^^^^^^^^^^^^^^^ +* The `Ouranosinc/xclim-testdata` repository has been restructured for better organization and to make better use of `pooch` and data registries for testing data fetching (see: `xclim-testdata PR/29 <https://github.com/Ouranosinc/xclim-testdata/pull/29>`_). (:pull:`1889`). +* The ``xclim.testing`` module has been refactored to make use of `pooch` with file registries. Several testing functions have been removed as a result: (:pull:`1889`) + * ``xclim.testing.utils.open_dataset`` now uses a `pooch` instance to deliver locally-stored datasets. Its call signature has also changed. + * ``xclim`` now accepts more environment variables to control the behaviour of the testing setup functions. These include ``XCLIM_TESTDATA_BRANCH``, ``XCLIM_TESTDATA_REPO_URL``, and ``XCLIM_TESTDATA_CACHE_DIR``. 
+ * ``xclim.testing.utils.get_file``, ``xclim.testing.utils.get_local_testdata``, ``xclim.testing.utils.list_datasets``, and ``xclim.testing.utils.file_md5_checksum`` have been removed. + * ``xclim.testing.utils.nimbus`` replaces much of this functionality. See the `xclim` documentation for more information. +* Many tests focused on evaluating the normal operation of remote file access tools under ``xclim.testing`` have been removed. (:pull:`1889`). +* Setup and teardown functions that were found under ``tests/conftest.py`` have been optimized to reduce redundant calls when running ``pytest xclim``. Some obsolete `pytest` fixtures have also been removed. (:pull:`1889`). + v0.52.0 (2024-08-08) -------------------- Contributors to this version: David Huard (:user:`huard`), Trevor James Smith (:user:`Zeitsperre`), Hui-Min Wang (:user:`Hem-W`), Éric Dupuis (:user:`coxipi`), Sarah Gammon (:user:`SarahG-579462`), Pascal Bourgault (:user:`aulemahal`), Juliette Lavoie (:user:`juliettelavoie`), Adrien Lamarche (:user:`LamAdr`). diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 1ecc47a54..df93ee2f3 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -269,9 +269,10 @@ Updating Testing Data If your code changes require changes to the testing data of `xclim` (i.e.: modifications to existing datasets or new datasets), these changes must be made via a Pull Request at the `xclim-testdata repository`_. 
-`xclim` allows for developers to test specific branches/versions of `xclim-testdata` via the `XCLIM_TESTDATA_BRANCH` environment variable, either through export, e.g.:: +`xclim` allows for developers to test specific branches/versions or forks of the `xclim-testdata` repository via the `XCLIM_TESTDATA_BRANCH` and `XCLIM_TESTDATA_REPO_URL` environment variables, respectively, either through export, e.g.:: $ export XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" + $ export XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" $ pytest # or, alternatively: @@ -279,11 +280,11 @@ If your code changes require changes to the testing data of `xclim` (i.e.: modif or by setting the variable at runtime:: - $ env XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" pytest + $ env XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" pytest # or, alternatively: - $ env XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" tox + $ env XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" tox -This will ensure that tests load the testing data from this branch before running. +This will ensure that tests load the appropriate testing data from this branch or repository before running. If you anticipate not having internet access, we suggest prefetching the testing data from `xclim-testdata repository`_ and storing it in your local cache. 
This can be done by running the following console command:: @@ -296,7 +297,7 @@ If your development branch relies on a specific branch of `Ouranosinc/xclim-test or, alternatively, with the `--branch` option:: - $ xclim prefetch_testing_data --branch my_new_branch_of_testing_data + $ xclim prefetch_testing_data --branch my_new_branch_of_testing_data --repo "https://github.com/my_username/xclim-testdata" If you wish to test a specific branch using GitHub CI, this can be set in `.github/workflows/main.yml`: @@ -306,7 +307,7 @@ If you wish to test a specific branch using GitHub CI, this can be set in `.gith XCLIM_TESTDATA_BRANCH: my_new_branch_of_testing_data .. warning:: - In order for a Pull Request to be allowed to merge to main development branch, this variable must match the latest tagged commit name on `xclim-testdata repository`_. + In order for a Pull Request to be allowed to merge to the `main` development branch, this variable must match the latest tagged commit name on `xclim-testdata repository`_. We suggest merging changed testing data first, tagging a new version of `xclim-testdata`, then re-running tests on your Pull Request at `Ouranosinc/xclim` with the newest tag. Running Tests in Offline Mode @@ -323,8 +324,8 @@ or, alternatively, using `tox` :: $ tox -e offline -These options will disable all network calls and skip tests marked with the `requires_internet` marker. -The `--allow-unix-socket` option is required to allow the `pytest-xdist`_ plugin to function properly. +These options will disable all network calls and skip tests marked with the ``requires_internet`` marker. +The ``--allow-unix-socket`` option is required to allow the `pytest-xdist`_ plugin to function properly. 
Tips ---- diff --git a/docs/notebooks/analogs.ipynb b/docs/notebooks/analogs.ipynb index c60f6f28e..2c53f01a8 100644 --- a/docs/notebooks/analogs.ipynb +++ b/docs/notebooks/analogs.ipynb @@ -24,9 +24,9 @@ "from __future__ import annotations\n", "\n", "import matplotlib.pyplot as plt\n", + "from xarray.coding.calendar_ops import convert_calendar\n", "\n", "from xclim import analog\n", - "from xclim.core.calendar import convert_calendar\n", "from xclim.testing import open_dataset" ] }, @@ -105,7 +105,7 @@ "outputs": [], "source": [ "fig, axs = plt.subplots(nrows=3, figsize=(6, 6), sharex=True)\n", - "sim_std = convert_calendar(sim, \"default\")\n", + "sim_std = convert_calendar(sim, \"standard\")\n", "obs_chibou = obs.sel(lat=sim.lat, lon=sim.lon, method=\"nearest\")\n", "\n", "for ax, var in zip(axs, obs_chibou.data_vars.keys()):\n", @@ -258,7 +258,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/docs/notebooks/cli.ipynb b/docs/notebooks/cli.ipynb index aac26b04e..560ed6771 100644 --- a/docs/notebooks/cli.ipynb +++ b/docs/notebooks/cli.ipynb @@ -90,8 +90,6 @@ "metadata": {}, "outputs": [], "source": [ - "from __future__ import annotations\n", - "\n", "import warnings\n", "\n", "import numpy as np\n", diff --git a/docs/notebooks/customize.ipynb b/docs/notebooks/customize.ipynb index a5f8a9b34..1b1ed7a51 100644 --- a/docs/notebooks/customize.ipynb +++ b/docs/notebooks/customize.ipynb @@ -19,8 +19,7 @@ "\n", "import xarray as xr\n", "\n", - "import xclim\n", - "from xclim.testing import open_dataset" + "import xclim" ] }, { diff --git a/docs/notebooks/ensembles.ipynb b/docs/notebooks/ensembles.ipynb index 10cc60a49..381b3af14 100644 --- a/docs/notebooks/ensembles.ipynb +++ b/docs/notebooks/ensembles.ipynb @@ -155,8 +155,6 @@ }, "outputs": [], "source": [ - "from pathlib import Path\n", - "\n", "import xarray as xr\n", "\n", "# Set display to HTML style (for 
fancy output)\n", @@ -165,10 +163,10 @@ "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "\n", - "%matplotlib inline\n", - "\n", "from xclim import ensembles\n", "\n", + "%matplotlib inline\n", + "\n", "ens = ensembles.create_ensemble(data_folder.glob(\"ens_tas_m*.nc\")).load()\n", "ens.close()" ] diff --git a/docs/notebooks/extendxclim.ipynb b/docs/notebooks/extendxclim.ipynb index aa2b679c9..d6ef80233 100644 --- a/docs/notebooks/extendxclim.ipynb +++ b/docs/notebooks/extendxclim.ipynb @@ -599,7 +599,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.5" }, "toc": { "base_numbering": 1, diff --git a/docs/notebooks/sdba-advanced.ipynb b/docs/notebooks/sdba-advanced.ipynb index 97456b6bd..2c214c9a3 100644 --- a/docs/notebooks/sdba-advanced.ipynb +++ b/docs/notebooks/sdba-advanced.ipynb @@ -54,7 +54,6 @@ "from __future__ import annotations\n", "\n", "import matplotlib.pyplot as plt\n", - "import nc_time_axis\n", "import numpy as np\n", "import xarray as xr\n", "\n", @@ -429,8 +428,9 @@ "metadata": {}, "outputs": [], "source": [ + "from xarray.coding.calendar_ops import convert_calendar\n", + "\n", "import xclim.sdba as sdba\n", - "from xclim.core.calendar import convert_calendar\n", "from xclim.core.units import convert_units_to\n", "from xclim.testing import open_dataset\n", "\n", @@ -751,7 +751,6 @@ "source": [ "from matplotlib import pyplot as plt\n", "\n", - "import xclim as xc\n", "from xclim import sdba\n", "from xclim.testing import open_dataset\n", "\n", @@ -880,7 +879,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.12.5" }, "toc": { "base_numbering": 1, diff --git a/docs/notebooks/sdba.ipynb b/docs/notebooks/sdba.ipynb index 345d5c446..72c7df816 100644 --- a/docs/notebooks/sdba.ipynb +++ b/docs/notebooks/sdba.ipynb @@ -808,7 +808,7 @@ "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.12.5" }, "toc": { "base_numbering": 1, diff --git a/docs/notebooks/units.ipynb b/docs/notebooks/units.ipynb index beaaa6a4b..014180b8a 100644 --- a/docs/notebooks/units.ipynb +++ b/docs/notebooks/units.ipynb @@ -18,7 +18,6 @@ "import xarray as xr\n", "\n", "import xclim\n", - "from xclim import indices\n", "from xclim.core import units\n", "from xclim.testing import open_dataset\n", "\n", diff --git a/docs/notebooks/usage.ipynb b/docs/notebooks/usage.ipynb index 90aad6b6b..527b016a6 100644 --- a/docs/notebooks/usage.ipynb +++ b/docs/notebooks/usage.ipynb @@ -26,7 +26,7 @@ "import xarray as xr\n", "\n", "import xclim.indices\n", - "from xclim import testing" + "from xclim.testing import open_dataset" ] }, { @@ -48,7 +48,7 @@ "# ds = xr.open_dataset(\"your_file.nc\")\n", "\n", "# For this example, let's use a test dataset from xclim:\n", - "ds = testing.open_dataset(\"ERA5/daily_surface_cancities_1990-1993.nc\")\n", + "ds = open_dataset(\"ERA5/daily_surface_cancities_1990-1993.nc\")\n", "ds.tas" ] }, @@ -164,11 +164,6 @@ "Resampling to a daily frequency and running the same indicator succeeds, but we will still get warnings from the CF metadata checks." 
] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -387,7 +382,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/environment.yml b/environment.yml index 070f86943..9a7c4ea35 100644 --- a/environment.yml +++ b/environment.yml @@ -28,9 +28,9 @@ dependencies: - flox - lmoments3 # Required for some Jupyter notebooks # Testing and development dependencies - - black ==24.4.2 + - black ==24.8.0 - blackdoc ==0.3.9 - - bump-my-version >=0.24.3 + - bump-my-version >=0.25.4 - cairosvg - codespell ==2.3.0 - coverage >=7.5.0 @@ -54,7 +54,6 @@ dependencies: - nc-time-axis >=1.4.1 - notebook - pandas-stubs >=2.2 - - platformdirs >=3.2 - pooch >=1.8.0 - pre-commit >=3.7 - pybtex >=0.24.0 @@ -74,7 +73,7 @@ dependencies: - tokenize-rt >=5.2.0 - tox >=4.16.0 # - tox-conda # Will be added when a tox@v4.0+ compatible plugin is released. - - vulture # ==2.11 # The conda-forge version is out of date. 
+ - vulture ==2.11 - xdoctest >=1.1.5 - yamllint >=1.35.1 - pip >=24.0 diff --git a/pyproject.toml b/pyproject.toml index 2dc0f85a6..b64384994 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,6 @@ dependencies = [ "packaging >=24.0", "pandas >=2.2", "pint >=0.18", - "platformdirs >=3.2", "pyarrow >=15.0.0", # Strongly encouraged for pandas v2.2.0+ "pyyaml >=6.0.1", "scikit-learn >=0.21.3", @@ -79,7 +78,6 @@ dev = [ "nbval >=0.11.0", "pandas-stubs >=2.2", "pip >=24.0", - "platformdirs >=3.2", "pooch >=1.8.0", "pre-commit >=3.7", "pylint >=3.2.4", @@ -138,7 +136,7 @@ target-version = [ ] [tool.bumpversion] -current_version = "0.52.1-dev.0" +current_version = "0.52.1-dev.1" commit = true commit_args = "--no-verify" tag = false diff --git a/tests/conftest.py b/tests/conftest.py index 32f7c978b..dc2b25fc6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,11 +12,21 @@ from xclim.core import indicator from xclim.core.calendar import max_doy -from xclim.testing import helpers -from xclim.testing.helpers import test_timeseries -from xclim.testing.utils import _default_cache_dir # noqa -from xclim.testing.utils import get_file +from xclim.testing.helpers import ( + add_ensemble_dataset_objects, + generate_atmos, + test_timeseries, +) +from xclim.testing.utils import ( + TESTDATA_BRANCH, + TESTDATA_CACHE_DIR, + TESTDATA_REPO_URL, + default_testdata_cache, + gather_testing_data, +) +from xclim.testing.utils import nimbus as _nimbus from xclim.testing.utils import open_dataset as _open_dataset +from xclim.testing.utils import testing_setup_warnings @pytest.fixture @@ -24,16 +34,6 @@ def random() -> np.random.Generator: return np.random.default_rng(seed=list(map(ord, "π•½π”žπ–“π”‘π–”π”ͺ"))) -@pytest.fixture -def tmp_netcdf_filename(tmpdir) -> Path: - yield Path(tmpdir).joinpath("testfile.nc") - - -@pytest.fixture(autouse=True, scope="session") -def threadsafe_data_dir(tmp_path_factory) -> Path: - yield 
Path(tmp_path_factory.getbasetemp().joinpath("data")) - - @pytest.fixture def lat_series(): def _lat_series(values): @@ -48,6 +48,11 @@ def _lat_series(values): return _lat_series +@pytest.fixture +def timeseries(): + return test_timeseries + + @pytest.fixture def tas_series(): """Return mean temperature time series.""" @@ -300,32 +305,38 @@ def rlus_series(): @pytest.fixture(scope="session") -def cmip3_day_tas(threadsafe_data_dir): - # xr.set_options(enable_cftimeindex=False) - ds = _open_dataset( - "cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc", - cache_dir=threadsafe_data_dir, - branch=helpers.TESTDATA_BRANCH, - engine="h5netcdf", +def threadsafe_data_dir(tmp_path_factory): + return Path(tmp_path_factory.getbasetemp().joinpath("data")) + + +@pytest.fixture(scope="session") +def nimbus(threadsafe_data_dir, worker_id): + return _nimbus( + repo=TESTDATA_REPO_URL, + branch=TESTDATA_BRANCH, + cache_dir=( + TESTDATA_CACHE_DIR if worker_id == "master" else threadsafe_data_dir + ), ) - yield ds.tas - ds.close() @pytest.fixture(scope="session") -def open_dataset(threadsafe_data_dir): - def _open_session_scoped_file( - file: str | os.PathLike, branch: str = helpers.TESTDATA_BRANCH, **xr_kwargs - ): +def open_dataset(nimbus): + def _open_session_scoped_file(file: str | os.PathLike, **xr_kwargs): + xr_kwargs.setdefault("cache", True) xr_kwargs.setdefault("engine", "h5netcdf") return _open_dataset( - file, cache_dir=threadsafe_data_dir, branch=branch, **xr_kwargs + file, + branch=TESTDATA_BRANCH, + repo=TESTDATA_REPO_URL, + cache_dir=nimbus.path, + **xr_kwargs, ) return _open_session_scoped_file -@pytest.fixture +@pytest.fixture(scope="session") def official_indicators(): # Remove unofficial indicators (as those created during the tests, and those from YAML-built modules) registry_cp = indicator.registry.copy() @@ -335,44 +346,16 @@ def official_indicators(): return registry_cp -@pytest.fixture(scope="function") -def atmosds(threadsafe_data_dir) -> xr.Dataset: - return 
_open_dataset( - threadsafe_data_dir.joinpath("atmosds.nc"), - cache_dir=threadsafe_data_dir, - branch=helpers.TESTDATA_BRANCH, - engine="h5netcdf", - ).load() - - -@pytest.fixture(scope="function") -def ensemble_dataset_objects() -> dict[str, str]: - edo = dict() - edo["nc_files_simple"] = [ - "EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_1950-2100_tg_mean_YS.nc", - ] - edo["nc_files_extra"] = [ - "EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc" - ] - edo["nc_files"] = edo["nc_files_simple"] + edo["nc_files_extra"] - return edo - - -@pytest.fixture(scope="session") -def lafferty_sriver_ds() -> xr.Dataset: +@pytest.fixture +def lafferty_sriver_ds(nimbus) -> xr.Dataset: """Get data from Lafferty & Sriver unit test. Notes ----- https://github.com/david0811/lafferty-sriver_2023_npjCliAtm/tree/main/unit_test """ - fn = get_file( + fn = nimbus.fetch( "uncertainty_partitioning/seattle_avg_tas.csv", - cache_dir=_default_cache_dir, - branch=helpers.TESTDATA_BRANCH, ) df = pd.read_csv(fn, parse_dates=["time"]).rename( @@ -385,36 +368,43 @@ def lafferty_sriver_ds() -> xr.Dataset: ) -@pytest.fixture(scope="session", autouse=True) -def gather_session_data(threadsafe_data_dir, worker_id): - """Gather testing data on pytest run. +@pytest.fixture +def atmosds(nimbus) -> xr.Dataset: + """Get synthetic atmospheric dataset.""" + return _open_dataset( + "atmosds.nc", + cache_dir=nimbus.path, + engine="h5netcdf", + ).load() - When running pytest with multiple workers, one worker will copy data remotely to _default_cache_dir while - other workers wait using lockfile. 
Once the lock is released, all workers will then copy data to their local - threadsafe_data_dir.As this fixture is scoped to the session, it will only run once per pytest run. - Additionally, this fixture is also used to generate the `atmosds` synthetic testing dataset. - """ - helpers.testing_setup_warnings() - helpers.gather_testing_data(threadsafe_data_dir, worker_id) - helpers.generate_atmos(threadsafe_data_dir) +@pytest.fixture(scope="session") +def ensemble_dataset_objects() -> dict[str, str]: + return add_ensemble_dataset_objects() + + +@pytest.fixture(autouse=True, scope="session") +def gather_session_data(request, nimbus, worker_id): + """Gather testing data on pytest run. + When running pytest with multiple workers, one worker will copy data remotely to default cache dir while + other workers wait using lockfile. Once the lock is released, all workers will then copy data to their local + threadsafe_data_dir. As this fixture is scoped to the session, it will only run once per pytest run. -@pytest.fixture(scope="session", autouse=True) -def cleanup(request): - """Cleanup a testing file once we are finished. + Due to the lack of UNIX sockets on Windows, the lockfile mechanism is not supported, requiring users on + Windows to run `$ xclim prefetch_testing_data` before running any tests for the first time to populate the + default cache dir. - This flag prevents remote data from being downloaded multiple times in the same pytest run. + Additionally, this fixture is also used to generate the `atmosds` synthetic testing dataset. 
""" + testing_setup_warnings() + gather_testing_data(worker_cache_dir=nimbus.path, worker_id=worker_id) + generate_atmos(branch=TESTDATA_BRANCH, cache_dir=nimbus.path) def remove_data_written_flag(): - flag = _default_cache_dir.joinpath(".data_written") + """Cleanup cache folder once we are finished.""" + flag = default_testdata_cache.joinpath(".data_written") if flag.exists(): flag.unlink() request.addfinalizer(remove_data_written_flag) - - -@pytest.fixture -def timeseries(): - return test_timeseries diff --git a/tests/test_analog.py b/tests/test_analog.py index 72857b007..2608df226 100644 --- a/tests/test_analog.py +++ b/tests/test_analog.py @@ -58,8 +58,8 @@ def test_exact_randn(exact_randn): @pytest.mark.slow @pytest.mark.parametrize("method", xca.metrics.keys()) def test_spatial_analogs(method, open_dataset): - diss = open_dataset("SpatialAnalogs/dissimilarity") - data = open_dataset("SpatialAnalogs/indicators") + diss = open_dataset("SpatialAnalogs/dissimilarity.nc") + data = open_dataset("SpatialAnalogs/indicators.nc") target = data.sel(lat=46.1875, lon=-72.1875, time=slice("1970", "1990")) candidates = data.sel(time=slice("1970", "1990")) @@ -75,7 +75,7 @@ def test_spatial_analogs(method, open_dataset): def test_unsupported_spatial_analog_method(open_dataset): method = "KonMari" - data = open_dataset("SpatialAnalogs/indicators") + data = open_dataset("SpatialAnalogs/indicators.nc") target = data.sel(lat=46.1875, lon=-72.1875, time=slice("1970", "1990")) candidates = data.sel(time=slice("1970", "1990")) @@ -87,8 +87,8 @@ def test_unsupported_spatial_analog_method(open_dataset): def test_spatial_analogs_multi_index(open_dataset): # Test multi-indexes - diss = open_dataset("SpatialAnalogs/dissimilarity") - data = open_dataset("SpatialAnalogs/indicators") + diss = open_dataset("SpatialAnalogs/dissimilarity.nc") + data = open_dataset("SpatialAnalogs/indicators.nc") target = data.sel(lat=46.1875, lon=-72.1875, time=slice("1970", "1990")) candidates = 
data.sel(time=slice("1970", "1990")) diff --git a/tests/test_atmos.py b/tests/test_atmos.py index 10d5d0efe..23929550d 100644 --- a/tests/test_atmos.py +++ b/tests/test_atmos.py @@ -94,7 +94,7 @@ def test_humidex(tas_series): def test_heat_index(atmosds): - # Keep just Montreal values for summertime as we need tas > 20 degC + # Keep just Montreal values for summer as we need tas > 20 degC tas = atmosds.tasmax[1][150:170] hurs = atmosds.hurs[1][150:170] diff --git a/tests/test_ensembles.py b/tests/test_ensembles.py index 79033b441..896340634 100644 --- a/tests/test_ensembles.py +++ b/tests/test_ensembles.py @@ -26,7 +26,6 @@ from xclim import ensembles from xclim.indices.stats import get_dist -from xclim.testing.helpers import TESTDATA_BRANCH # sklearn's KMeans doesn't accept the standard numpy Generator, so we create a special fixture for these tests @@ -38,9 +37,7 @@ def random_state(): class TestEnsembleStats: - def test_create_ensemble( - self, open_dataset, ensemble_dataset_objects, threadsafe_data_dir - ): + def test_create_ensemble(self, open_dataset, ensemble_dataset_objects, nimbus): ds_all = [] for n in ensemble_dataset_objects["nc_files_simple"]: ds = open_dataset(n, decode_times=False) @@ -62,10 +59,7 @@ def test_create_ensemble( ens1 = ensembles.create_ensemble(ds_all, realizations=reals) # Kinda a hack? Alternative is to open and rewrite in a temp folder. 
- files = [ - threadsafe_data_dir / TESTDATA_BRANCH / "EnsembleStats" / Path(f).name - for f in ensemble_dataset_objects["nc_files_simple"] - ] + files = [nimbus.fetch(f) for f in ensemble_dataset_objects["nc_files_simple"]] ens2 = ensembles.create_ensemble(dict(zip(reals, files))) xr.testing.assert_identical(ens1, ens2) diff --git a/tests/test_indices.py b/tests/test_indices.py index f6e38cf25..70191bfda 100644 --- a/tests/test_indices.py +++ b/tests/test_indices.py @@ -2697,12 +2697,14 @@ def test_simple(self, open_dataset, ind, exp): out = ind(ds.tas.sel(location="Victoria")) np.testing.assert_almost_equal(out[0], exp, decimal=4) - def test_indice_against_icclim(self, cmip3_day_tas): + def test_indice_against_icclim(self, open_dataset): from xclim.indicators import icclim # noqa + cmip3_tas = open_dataset("cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc").tas + with set_options(cf_compliance="log"): - ind = xci.tg_mean(cmip3_day_tas) - icclim = icclim.TG(cmip3_day_tas) + ind = xci.tg_mean(cmip3_tas) + icclim = icclim.TG(cmip3_tas) np.testing.assert_array_equal(icclim, ind) diff --git a/tests/test_testing_utils.py b/tests/test_testing_utils.py index 3bbc044e3..6fb5d2dd1 100644 --- a/tests/test_testing_utils.py +++ b/tests/test_testing_utils.py @@ -1,17 +1,15 @@ from __future__ import annotations import platform -import sys from pathlib import Path -from urllib.error import URLError import numpy as np import pytest from xarray import Dataset -import xclim.testing.utils as utilities from xclim import __version__ as __xclim_version__ from xclim.testing.helpers import test_timeseries as timeseries +from xclim.testing.utils import open_dataset, publish_release_notes, show_versions class TestFixtures: @@ -29,80 +27,35 @@ def test_timeseries_made_up_variable(self): class TestFileRequests: - @pytest.mark.requires_internet - def test_get_failure(self, tmp_path): - bad_repo_address = "https://github.com/beard/of/zeus/" - with pytest.raises(FileNotFoundError): - 
utilities._get( - Path("san_diego", "60_percent_of_the_time_it_works_everytime"), - bad_repo_address, - "main", - ".rudd", - tmp_path, - ) - @pytest.mark.requires_internet - def test_open_dataset_with_bad_file(self, tmp_path): - cmip3_folder = tmp_path.joinpath("main", "cmip3") - cmip3_folder.mkdir(parents=True) - - cmip3_file = "tas.sresb1.giss_model_e_r.run1.atm.da.nc" - Path(cmip3_folder, cmip3_file).write_text("This file definitely isn't right.") - - cmip3_md5 = f"{cmip3_file}.md5" - bad_cmip3_md5 = "bc51206e6462fc8ed08fd4926181274c" - Path(cmip3_folder, cmip3_md5).write_text(bad_cmip3_md5) - - # Check for raised warning for local file md5 sum and remote md5 sum - with pytest.warns(UserWarning): - new_cmip3_file = utilities._get( - Path("cmip3", cmip3_file), - github_url="https://github.com/Ouranosinc/xclim-testdata", - suffix=".nc", - branch="main", - cache_dir=tmp_path, - ) - - # Ensure that the new cmip3 file is in the cache directory - assert ( - utilities.file_md5_checksum(Path(cmip3_folder, new_cmip3_file)) - != bad_cmip3_md5 - ) + @staticmethod + def file_md5_checksum(f_name): + import hashlib - # Ensure that the md5 file was updated at the same time - assert ( - utilities.file_md5_checksum(Path(cmip3_folder, new_cmip3_file)) - == Path(cmip3_folder, cmip3_md5).read_text() - ) + hash_md5 = hashlib.md5() # noqa: S324 + with open(f_name, "rb") as f: + hash_md5.update(f.read()) + return hash_md5.hexdigest() @pytest.mark.requires_internet - def test_open_testdata(self): - ds = utilities.open_dataset( - Path("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712"), engine="h5netcdf" + def test_open_testdata( + self, + ): + from xclim.testing.utils import default_testdata_cache, default_testdata_version + + # Test with top-level default engine + ds = open_dataset( + Path("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc"), + cache_dir=default_testdata_cache.joinpath(default_testdata_version), + engine="h5netcdf", ) assert ds.lon.size == 128 - # Not that this 
test is super slow, but there is no real value in spamming GitHub's API for no reason. - @pytest.mark.slow - @pytest.mark.xfail(reason="Test is rate limited by GitHub.") - def test_list_datasets(self): - out = utilities.list_datasets() - - assert list(out.columns) == ["size", "url"] - np.testing.assert_allclose( - out.loc["cmip6/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.nc"][ - "size" - ], - 845.021484375, - ) - - -class TestFileAssertions: def test_md5_sum(self): test_data = Path(__file__).parent / "data" callendar = test_data / "callendar_1938.txt" - md5_sum = utilities.file_md5_checksum(callendar) - if sys.platform == "win32": + md5_sum = self.file_md5_checksum(callendar) + if platform.system() == "Windows": # Windows has a different line ending (CR-LF) than Unix (LF) assert md5_sum == "38083271c2d4c85dea6bd6baf23d34de" # noqa else: @@ -112,7 +65,7 @@ def test_md5_sum(self): class TestReleaseSupportFuncs: def test_show_version_file(self, tmp_path): temp_filename = tmp_path.joinpath("version_info.txt") - utilities.show_versions(file=temp_filename) + show_versions(file=temp_filename) with open(temp_filename) as f: contents = f.readlines().copy() @@ -125,7 +78,7 @@ def test_show_version_file(self, tmp_path): @pytest.mark.requires_docs def test_release_notes_file(self, tmp_path): temp_filename = tmp_path.joinpath("version_info.txt") - utilities.publish_release_notes(style="md", file=temp_filename) + publish_release_notes(style="md", file=temp_filename) with open(temp_filename) as f: assert "# Changelog" in f.readlines()[0] @@ -134,23 +87,4 @@ def test_release_notes_file(self, tmp_path): def test_release_notes_file_not_implemented(self, tmp_path): temp_filename = tmp_path.joinpath("version_info.txt") with pytest.raises(NotImplementedError): - utilities.publish_release_notes(style="qq", file=temp_filename) - - -class TestTestingFileAccessors: - def test_unsafe_urls(self): - with pytest.raises( - ValueError, match="GitHub URL not secure: 
'ftp://domain.does.not.exist/'." - ): - utilities.open_dataset( - "doesnt_exist.nc", github_url="ftp://domain.does.not.exist/" - ) - - def test_malicious_urls(self): - with pytest.raises( - URLError, - match="urlopen error OPeNDAP URL is not well-formed: 'doesnt_exist.nc'", - ): - utilities.open_dataset( - "doesnt_exist.nc", dap_url="Robert'); DROP TABLE STUDENTS; --" - ) + publish_release_notes(style="qq", file=temp_filename) diff --git a/xclim/__init__.py b/xclim/__init__.py index 6957bfbe9..fd3e98cf2 100644 --- a/xclim/__init__.py +++ b/xclim/__init__.py @@ -13,7 +13,7 @@ __author__ = """Travis Logan""" __email__ = "logan.travis@ouranos.ca" -__version__ = "0.52.1-dev.0" +__version__ = "0.52.1-dev.1" with _resources.as_file(_resources.files("xclim.data")) as _module_data: diff --git a/xclim/cli.py b/xclim/cli.py index 67a6da1eb..a7fcc1174 100644 --- a/xclim/cli.py +++ b/xclim/cli.py @@ -11,13 +11,22 @@ import click import xarray as xr -from dask.diagnostics import ProgressBar +from dask.diagnostics.progress import ProgressBar import xclim as xc from xclim.core.dataflags import DataQualityException, data_flags, ecad_compliant from xclim.core.utils import InputKind -from xclim.testing.helpers import TESTDATA_BRANCH, populate_testing_data -from xclim.testing.utils import _default_cache_dir, publish_release_notes, show_versions +from xclim.testing.utils import ( + TESTDATA_BRANCH, + TESTDATA_CACHE_DIR, + TESTDATA_REPO_URL, + default_testdata_cache, + default_testdata_repo_url, + default_testdata_version, + populate_testing_data, + publish_release_notes, + show_versions, +) distributed = False try: @@ -151,25 +160,47 @@ def show_version_info(ctx): @click.command(short_help="Prefetch xclim testing data for development purposes.") +@click.option( + "-r", + "--repo", + help="The xclim-testdata repo to be fetched and cached. 
If not specified, defaults to " + f"`XCLIM_TESTDATA_REPO_URL` (if set) or `{default_testdata_repo_url}`.", +) @click.option( "-b", "--branch", help="The xclim-testdata branch to be fetched and cached. If not specified, defaults to " - "`XCLIM_TESTING_DATA_BRANCH` (if set) or `main`.", + f"`XCLIM_TESTDATA_BRANCH` (if set) or `{default_testdata_version}`.", +) +@click.option( + "-c", + "--cache-dir", + help="The xclim-testdata branch to be fetched and cached. If not specified, defaults to " + f"`XCLIM_TESTDATA_CACHE` (if set) or `{default_testdata_cache}`.", ) @click.pass_context -def prefetch_testing_data(ctx, branch): +def prefetch_testing_data(ctx, repo, branch, cache_dir): """Prefetch xclim testing data for development purposes.""" + if repo: + testdata_repo = repo + else: + testdata_repo = TESTDATA_REPO_URL if branch: testdata_branch = branch else: testdata_branch = TESTDATA_BRANCH + if cache_dir: + testdata_cache_dir = cache_dir + else: + testdata_cache_dir = TESTDATA_CACHE_DIR + click.echo(f"Gathering testing data from {testdata_repo}/{testdata_branch} ...") click.echo( - f"Gathering testing data from xclim-testdata `{testdata_branch}` branch..." 
+ populate_testing_data( + repo=testdata_repo, branch=testdata_branch, local_cache=testdata_cache_dir + ) ) - click.echo(populate_testing_data(branch=testdata_branch)) - click.echo(f"Testing data saved to `{_default_cache_dir}`.") + click.echo(f"Testing data saved to `{testdata_cache_dir}`.") ctx.exit() diff --git a/xclim/ensembles/_partitioning.py b/xclim/ensembles/_partitioning.py index ce957d672..5c18b5102 100644 --- a/xclim/ensembles/_partitioning.py +++ b/xclim/ensembles/_partitioning.py @@ -197,7 +197,7 @@ def hawkins_sutton_09_weighting(da, obs, baseline=("1971", "2000")): def lafferty_sriver( da: xr.DataArray, - sm: xr.DataArray = None, + sm: xr.DataArray | None = None, bb13: bool = False, ) -> tuple[xr.DataArray, xr.DataArray]: """Return the mean and partitioned variance of an ensemble based on method from Lafferty and Sriver (2023). diff --git a/xclim/testing/conftest.py b/xclim/testing/conftest.py index 12af10934..095a22efd 100644 --- a/xclim/testing/conftest.py +++ b/xclim/testing/conftest.py @@ -10,33 +10,59 @@ import pytest -from xclim.testing import helpers -from xclim.testing.utils import _default_cache_dir # noqa +from xclim.testing.helpers import ( + add_doctest_filepaths, + add_example_file_paths, + generate_atmos, +) +from xclim.testing.utils import ( + TESTDATA_BRANCH, + TESTDATA_CACHE_DIR, + TESTDATA_REPO_URL, + gather_testing_data, +) +from xclim.testing.utils import nimbus as _nimbus from xclim.testing.utils import open_dataset as _open_dataset +from xclim.testing.utils import testing_setup_warnings @pytest.fixture(autouse=True, scope="session") -def threadsafe_data_dir(tmp_path_factory) -> Path: +def threadsafe_data_dir(tmp_path_factory): """Return a threadsafe temporary directory for storing testing data.""" yield Path(tmp_path_factory.getbasetemp().joinpath("data")) @pytest.fixture(scope="session") -def open_dataset(threadsafe_data_dir): - """Return a function that opens a dataset from the test data directory.""" +def 
nimbus(threadsafe_data_dir, worker_id): + """Return a nimbus object for the test data.""" + return _nimbus( + repo=TESTDATA_REPO_URL, + branch=TESTDATA_BRANCH, + cache_dir=( + TESTDATA_CACHE_DIR if worker_id == "master" else threadsafe_data_dir + ), + ) - def _open_session_scoped_file( - file: str | os.PathLike, branch: str = helpers.TESTDATA_BRANCH, **xr_kwargs - ): + +@pytest.fixture(scope="session") +def open_dataset(nimbus): + """Return a function that opens a dataset from the test data.""" + + def _open_session_scoped_file(file: str | os.PathLike, **xr_kwargs): + xr_kwargs.setdefault("cache", True) xr_kwargs.setdefault("engine", "h5netcdf") return _open_dataset( - file, cache_dir=threadsafe_data_dir, branch=branch, **xr_kwargs + file, + branch=TESTDATA_BRANCH, + repo=TESTDATA_REPO_URL, + cache_dir=nimbus.path, + **xr_kwargs, ) return _open_session_scoped_file -@pytest.fixture(autouse=True, scope="session") +@pytest.fixture(scope="session", autouse=True) def is_matplotlib_installed(xdoctest_namespace) -> None: """Skip tests that require matplotlib if it is not installed.""" @@ -51,14 +77,14 @@ def _is_matplotlib_installed(): xdoctest_namespace["is_matplotlib_installed"] = _is_matplotlib_installed -@pytest.fixture(autouse=True, scope="session") -def doctest_setup( - xdoctest_namespace, threadsafe_data_dir, worker_id, open_dataset -) -> None: +@pytest.fixture(scope="session", autouse=True) +def doctest_setup(xdoctest_namespace, nimbus, worker_id, open_dataset) -> None: """Gather testing data on doctest run.""" - helpers.testing_setup_warnings() - helpers.gather_testing_data(threadsafe_data_dir, worker_id) - xdoctest_namespace.update(helpers.generate_atmos(threadsafe_data_dir)) + testing_setup_warnings() + gather_testing_data(worker_cache_dir=nimbus.path, worker_id=worker_id) + xdoctest_namespace.update( + generate_atmos(branch=TESTDATA_BRANCH, cache_dir=nimbus.path) + ) class AttrDict(dict): def __init__(self, *args, **kwargs): @@ -68,5 +94,5 @@ def 
__init__(self, *args, **kwargs): xdoctest_namespace["open_dataset"] = open_dataset xdoctest_namespace["xr"] = AttrDict() xdoctest_namespace["xr"].update({"open_dataset": open_dataset}) - xdoctest_namespace.update(helpers.add_doctest_filepaths()) - xdoctest_namespace.update(helpers.add_example_file_paths()) + xdoctest_namespace.update(add_doctest_filepaths()) + xdoctest_namespace.update(add_example_file_paths()) diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 34e10823d..5dd312e4e 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -2,134 +2,57 @@ from __future__ import annotations +import logging import os -import re -import time import warnings -from datetime import datetime as dt from pathlib import Path -from shutil import copytree -from sys import platform +from typing import Any import numpy as np import pandas as pd import xarray as xr -from dask.diagnostics import Callback -from filelock import FileLock -from packaging.version import Version +from dask.callbacks import Callback import xclim -from xclim import __version__ as __xclim_version__ +import xclim.testing.utils as xtu from xclim.core import calendar from xclim.core.utils import VARIABLES from xclim.indices import ( longwave_upwelling_radiation_from_net_downwelling, shortwave_upwelling_radiation_from_net_downwelling, ) -from xclim.testing.utils import _default_cache_dir # noqa -from xclim.testing.utils import get_file as _get_file -from xclim.testing.utils import get_local_testdata as _get_local_testdata -from xclim.testing.utils import open_dataset as _open_dataset -TESTDATA_BRANCH = os.getenv("XCLIM_TESTDATA_BRANCH", "main") -"""Sets the branch of Ouranosinc/xclim-testdata to use when fetching testing datasets. +logger = logging.getLogger("xclim") -Notes ------ -When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: - -.. 
code-block:: console - - $ export XCLIM_TESTDATA_BRANCH="my_testing_branch" - -or setting the variable at runtime: - -.. code-block:: console - - $ env XCLIM_TESTDATA_BRANCH="my_testing_branch" pytest - -""" - -PREFETCH_TESTING_DATA = os.getenv("XCLIM_PREFETCH_TESTING_DATA", False) -"""Indicates whether the testing data should be downloaded when running tests. - -Notes ------ -When running tests multiple times, this flag allows developers to significantly speed up the pytest suite -by preventing sha256sum checks for all downloaded files. Proceed with caution. - -This can be set for both `pytest` and `tox` by exporting the variable: - -.. code-block:: console - - $ export XCLIM_PREFETCH_TESTING_DATA=1 - -or setting the variable at runtime: - -.. code-block:: console - - $ env XCLIM_PREFETCH_TESTING_DATA=1 pytest - -""" __all__ = [ - "PREFETCH_TESTING_DATA", - "TESTDATA_BRANCH", + "add_doctest_filepaths", + "add_ensemble_dataset_objects", "add_example_file_paths", "assert_lazy", "generate_atmos", - "populate_testing_data", "test_timeseries", ] -def testing_setup_warnings(): - """Warn users about potential incompatibilities between xclim and xclim-testdata versions.""" - if re.match(r"^\d+\.\d+\.\d+$", __xclim_version__) and TESTDATA_BRANCH == "main": - # This does not need to be emitted on GitHub Workflows and ReadTheDocs - if not os.getenv("CI") and not os.getenv("READTHEDOCS"): - warnings.warn( - f'`xclim` {__xclim_version__} is running tests against the "main" branch of `Ouranosinc/xclim-testdata`. ' - "It is possible that changes in xclim-testdata may be incompatible with test assertions in this version. 
" - "Please be sure to check https://github.com/Ouranosinc/xclim-testdata for more information.", - UserWarning, - ) - - if re.match(r"^v\d+\.\d+\.\d+", TESTDATA_BRANCH): - # Find the date of last modification of xclim source files to generate a calendar version - install_date = dt.strptime( - time.ctime(os.path.getmtime(xclim.__file__)), - "%a %b %d %H:%M:%S %Y", - ) - install_calendar_version = ( - f"{install_date.year}.{install_date.month}.{install_date.day}" - ) - - if Version(TESTDATA_BRANCH) > Version(install_calendar_version): - warnings.warn( - f"Installation date of `xclim` ({install_date.ctime()}) " - f"predates the last release of `xclim-testdata` ({TESTDATA_BRANCH}). " - "It is very likely that the testing data is incompatible with this build of `xclim`.", - UserWarning, - ) - - -def generate_atmos(cache_dir: Path) -> dict[str, xr.DataArray]: +def generate_atmos( + branch: str | os.PathLike[str] | Path, + cache_dir: str | os.PathLike[str] | Path, +) -> dict[str, xr.DataArray]: """Create the `atmosds` synthetic testing dataset.""" - with _open_dataset( + with xtu.open_dataset( "ERA5/daily_surface_cancities_1990-1993.nc", + branch=branch, cache_dir=cache_dir, - branch=TESTDATA_BRANCH, engine="h5netcdf", ) as ds: + rsus = shortwave_upwelling_radiation_from_net_downwelling(ds.rss, ds.rsds) + rlus = longwave_upwelling_radiation_from_net_downwelling(ds.rls, ds.rlds) tn10 = calendar.percentile_doy(ds.tasmin, per=10) t10 = calendar.percentile_doy(ds.tas, per=10) t90 = calendar.percentile_doy(ds.tas, per=90) tx90 = calendar.percentile_doy(ds.tasmax, per=90) - rsus = shortwave_upwelling_radiation_from_net_downwelling(ds.rss, ds.rsds) - rlus = longwave_upwelling_radiation_from_net_downwelling(ds.rls, ds.rlds) - ds = ds.assign( rsus=rsus, rlus=rlus, @@ -144,130 +67,53 @@ def generate_atmos(cache_dir: Path) -> dict[str, xr.DataArray]: ds.to_netcdf(atmos_file, engine="h5netcdf") # Give access to dataset variables by name in namespace - namespace = dict() - with 
_open_dataset( - atmos_file, branch=TESTDATA_BRANCH, cache_dir=cache_dir, engine="h5netcdf" + with xtu.open_dataset( + atmos_file, branch=branch, cache_dir=cache_dir, engine="h5netcdf" ) as ds: - for variable in ds.data_vars: - namespace[f"{variable}_dataset"] = ds.get(variable) + namespace = {f"{var}_dataset": ds[var] for var in ds.data_vars} return namespace -def populate_testing_data( - temp_folder: Path | None = None, - branch: str = TESTDATA_BRANCH, - _local_cache: Path = _default_cache_dir, -): - """Perform `_get_file` or `get_local_dataset` calls to GitHub to download or copy relevant testing data.""" - if _local_cache.joinpath(".data_written").exists(): - # This flag prevents multiple calls from re-attempting to download testing data in the same pytest run - return - - data_entries = [ - "CanESM2_365day/pr_day_CanESM2_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc", - "ERA5/daily_surface_cancities_1990-1993.nc", - "EnsembleReduce/TestEnsReduceCriteria.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc", - "FWI/GFWED_sample_2017.nc", - "FWI/cffdrs_test_fwi.nc", - "FWI/cffdrs_test_wDC.nc", - "HadGEM2-CC_360day/pr_day_HadGEM2-CC_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc", - "NRCANdaily/nrcan_canada_daily_pr_1990.nc", - "NRCANdaily/nrcan_canada_daily_tasmax_1990.nc", - "NRCANdaily/nrcan_canada_daily_tasmin_1990.nc", - "Raven/q_sim.nc", - "SpatialAnalogs/CanESM2_ScenGen_Chibougamau_2041-2070.nc", - "SpatialAnalogs/NRCAN_SECan_1981-2010.nc", - "SpatialAnalogs/dissimilarity.nc", - "SpatialAnalogs/indicators.nc", - 
"cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc", - "cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc", - "sdba/CanESM2_1950-2100.nc", - "sdba/ahccd_1950-2013.nc", - "sdba/nrcan_1950-2013.nc", - "uncertainty_partitioning/cmip5_pr_global_mon.nc", - "uncertainty_partitioning/seattle_avg_tas.csv", - ] - - data = dict() - for filepattern in data_entries: - if temp_folder is None: - try: - data[filepattern] = _get_file( - filepattern, branch=branch, cache_dir=_local_cache - ) - except FileNotFoundError: - warnings.warn( - "File {filepattern} was not found. Consider verifying the file exists." - ) - continue - elif temp_folder: - try: - data[filepattern] = _get_local_testdata( - filepattern, - temp_folder=temp_folder, - branch=branch, - _local_cache=_local_cache, - ) - except FileNotFoundError: - warnings.warn("File {filepattern} was not found.") - continue - return - - -def gather_testing_data(threadsafe_data_dir: Path, worker_id: str): - """Gather testing data across workers.""" - if ( - not _default_cache_dir.joinpath(TESTDATA_BRANCH).exists() - or PREFETCH_TESTING_DATA - ): - if PREFETCH_TESTING_DATA: - print("`XCLIM_PREFETCH_TESTING_DATA` set. Prefetching testing data...") - if platform == "win32": - raise OSError( - "UNIX-style file-locking is not supported on Windows. " - "Consider running `$ xclim prefetch_testing_data` to download testing data." 
- ) - elif worker_id in ["master"]: - populate_testing_data(branch=TESTDATA_BRANCH) - else: - _default_cache_dir.mkdir(exist_ok=True, parents=True) - lockfile = _default_cache_dir.joinpath(".lock") - test_data_being_written = FileLock(lockfile) - with test_data_being_written: - # This flag prevents multiple calls from re-attempting to download testing data in the same pytest run - populate_testing_data(branch=TESTDATA_BRANCH) - _default_cache_dir.joinpath(".data_written").touch() - with test_data_being_written.acquire(): - if lockfile.exists(): - lockfile.unlink() - copytree(_default_cache_dir, threadsafe_data_dir) +def add_ensemble_dataset_objects() -> dict[str, str]: + """Create a dictionary of xclim ensemble-related datasets to be patched into the xdoctest namespace.""" + namespace = { + "nc_files_simple": [ + "EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", + "EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", + "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", + "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_1950-2100_tg_mean_YS.nc", + ], + "nc_files_extra": [ + "EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc" + ], + } + namespace["nc_files"] = namespace["nc_files_simple"] + namespace["nc_files_extra"] + return namespace def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: - """Create a dictionary of relevant datasets to be patched into the xdoctest namespace.""" - namespace: dict = dict() - namespace["path_to_ensemble_file"] = "EnsembleReduce/TestEnsReduceCriteria.nc" - namespace["path_to_pr_file"] = "NRCANdaily/nrcan_canada_daily_pr_1990.nc" - namespace["path_to_sfcWind_file"] = "ERA5/daily_surface_cancities_1990-1993.nc" - namespace["path_to_tas_file"] = "ERA5/daily_surface_cancities_1990-1993.nc" - namespace["path_to_tasmax_file"] = 
"NRCANdaily/nrcan_canada_daily_tasmax_1990.nc" - namespace["path_to_tasmin_file"] = "NRCANdaily/nrcan_canada_daily_tasmin_1990.nc" + """Create a dictionary of doctest-relevant datasets to be patched into the xdoctest namespace.""" + namespace = { + "path_to_ensemble_file": "EnsembleReduce/TestEnsReduceCriteria.nc", + "path_to_pr_file": "NRCANdaily/nrcan_canada_daily_pr_1990.nc", + "path_to_sfcWind_file": "ERA5/daily_surface_cancities_1990-1993.nc", + "path_to_tas_file": "ERA5/daily_surface_cancities_1990-1993.nc", + "path_to_tasmax_file": "NRCANdaily/nrcan_canada_daily_tasmax_1990.nc", + "path_to_tasmin_file": "NRCANdaily/nrcan_canada_daily_tasmin_1990.nc", + "path_to_example_py": ( + Path(__file__).parent.parent.parent.parent + / "docs" + / "notebooks" + / "example.py" + ), + } # For core.utils.load_module example - namespace["path_to_example_py"] = ( - Path(__file__).parent.parent.parent.parent / "docs" / "notebooks" / "example.py" - ) - - time = xr.cftime_range("1990-01-01", "2049-12-31", freq="D") + sixty_years = xr.cftime_range("1990-01-01", "2049-12-31", freq="D") namespace["temperature_datasets"] = [ xr.DataArray( - 12 * np.random.random_sample(time.size) + 273, - coords={"time": time}, + 12 * np.random.random_sample(sixty_years.size) + 273, + coords={"time": sixty_years}, name="tas", dims=("time",), attrs={ @@ -277,8 +123,8 @@ def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: }, ), xr.DataArray( - 12 * np.random.random_sample(time.size) + 273, - coords={"time": time}, + 12 * np.random.random_sample(sixty_years.size) + 273, + coords={"time": sixty_years}, name="tas", dims=("time",), attrs={ @@ -288,12 +134,11 @@ def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: }, ), ] - return namespace -def add_doctest_filepaths(): - """Add filepaths to the xdoctest namespace.""" +def add_doctest_filepaths() -> dict[str, Any]: + """Overload some libraries directly into the xdoctest namespace.""" namespace: dict = dict() namespace["np"] 
= np namespace["xclim"] = xclim @@ -301,7 +146,6 @@ def add_doctest_filepaths(): np.random.rand(365) * 20 + 253.15, variable="tas" ) namespace["pr"] = test_timeseries(np.random.rand(365) * 5, variable="pr") - return namespace diff --git a/xclim/testing/registry.txt b/xclim/testing/registry.txt new file mode 100644 index 000000000..ec0fbbfd4 --- /dev/null +++ b/xclim/testing/registry.txt @@ -0,0 +1,51 @@ +CanESM2_365day/pr_day_CanESM2_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:16dafec260dd74bf38f87482baa34cc35a1689facfb5557ebfc7d2c928618fc7 +CanESM2_365day/tasmax_day_CanESM2_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:0c57c56e38a9e5b0623180c3def9406e9ddabbe7b1c01b282f1a34c4a61ea357 +CanESM2_365day/tasmin_day_CanESM2_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:5d43ec47759bf9d118942277fe8d7c632765c3a0ba02dc828b0610e1f2030a63 +cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc sha256:e709552beeeccafcfe280759edf5477ae5241c698409ca051b0899c16e92c95e +cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc sha256:7471770e4e654997225ab158f2b24aa0510b6f06006fb757b9ea7c0d4a47e1f2 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_200512-203011.nc sha256:3cb54d67bf89cdf542a7b93205785da3800f9a77eaa8436f4ee74af13b248b95 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_203012-205511.nc sha256:31b9a4139574012acbc9d7fdb210af8d00d45119a9b98ebcab67905262543c6d +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_205512-208011.nc sha256:8c18253f8039dfda0aba71f69e5fde367453fc8a239936ee54c6d32db184f3b9 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_208012-209912.nc sha256:bd7e419c8d6b60dbe700517a16453f787b147bb15cfdebf0519e882fa967f5a0 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_209912-212411.nc sha256:54dda14b6c2d8dce8e3a2ff526ffba8cc54bf5de5ace96eec93d060256fd63b6 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_212412-214911.nc sha256:35791a451c392d3dae69ecb789c4a952eff761dddab934389c7d0686feeb6e72 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_214912-217411.nc 
sha256:156577a84d82c23f65e019ba58fcdbb7677f1a1128f4745d72441896d0485a11 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_217412-219911.nc sha256:b6378f082aa6d877fae46be9663e1fe3bf82e0d596aaf501afa6217fcc300878 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_219912-222411.nc sha256:21c8db59941ad5481433b69eae5c9efed534c0fc35062ab767a481be9da503b6 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_222412-224911.nc sha256:e8d406cc7b87d0899236610e1a9ddecde8279d0d26316114496f159565fb78ba +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_224912-227411.nc sha256:abbe16349870c501335f7f17a5372703f82e8db84f911d29c31783bb07100e6e +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_227412-229911.nc sha256:ecf52dc8ac13e04d0b643fc53cc5b367b32e68a311e6718686eaa87088788f98 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_229912-229912.nc sha256:3fa657483072d8a04363b8718bc9c4e63e6354617a4ab3d627b25222a4cd094c +cmip6/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.nc sha256:cfff189d4986289efb2b88f418cd6d65b26b59355b67b73ca26ac8fa12a9f83f +cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc sha256:58a03aa401f80751ad60c8950f14bcf717aeb6ef289169cb5ae3081bb4689825 +CRCM5/tasmax_bby_198406_se.nc sha256:9a80cc19ed212428ef90ce0cc40790fbf0d1fc301df0abdf578da45843dae93d +EnsembleReduce/TestEnsReduceCriteria.nc sha256:ae7a70b9d5c54ab072f1cfbfab91d430a41c5067db3c1968af57ea2122cfe8e7 +EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc sha256:ca0cc893cf91db7c6dfe3df10d605684eabbea55b7e26077c10142d302e55aed +EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc sha256:c796276f563849c31bf388a3beb4a440eeb72062a84b4cf9760c854d1e990ca4 +EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc sha256:9cfa9bc4e81e936eb680a55db428ccd9f0a6d366d4ae2c4a9064bfa5d71e5ca7 +EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_1950-2100_tg_mean_YS.nc sha256:ca36aafb3c63ddb6bfc8537abb854b71f719505c1145d5c81c3315eb1a13647c 
+EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc sha256:623eab96d75d8cc8abd59dfba1c14cfb06fd7c0fe9ce86788d3c8b0891684df2 +ERA5/daily_surface_cancities_1990-1993.nc sha256:049d54ace3d229a96cc621189daa3e1a393959ab8d988221cfc7b2acd7ab94b2 +FWI/GFWED_sample_2017.nc sha256:cf3bde795825663894fa7619a028d5a14fee307c623968235f25393f7afe159e +FWI/cffdrs_test_fwi.nc sha256:147be24e080aa67f17261f61f05a5dfb381a66a23785a327e47e2303667ca3ab +FWI/cffdrs_test_wDC.nc sha256:ebadcad1dd6a1a1e93c29a1143d7caefd46593ea2fbeb721015245981cce90c3 +HadGEM2-CC_360day/pr_day_HadGEM2-CC_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:c45ff4c17ba9fd92392bb08a7705789071a0bec40bde48f5a838ff12413cc33b +HadGEM2-CC_360day/tasmax_day_HadGEM2-CC_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:aa3eb54ea69bb00330de1037a48ac13dbc5b72f346c801d97731dec8260f400c +HadGEM2-CC_360day/tasmin_day_HadGEM2-CC_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:5c8fa666603fd68f614d95ac8c5a0dbdfb9f8e2e86666a270516a38526c1aa20 +NRCANdaily/nrcan_canada_daily_pr_1990.nc sha256:144479ec7a976cfecb6a10762d128a771356093d72caf5f075508ee86d25a1b0 +NRCANdaily/nrcan_canada_daily_tasmax_1990.nc sha256:84880205b798740e37a102c7f40e595d7a4fde6e35fb737a1ef68b8dad447526 +NRCANdaily/nrcan_canada_daily_tasmin_1990.nc sha256:13d61fc54cdcb4c1617ec777ccbf59575d8fdc24754f914042301bc1b024d7f7 +Raven/q_sim.nc sha256:f7a0ae73c498235e1c3e7338a184c5ca3729941b81521e606aa60b2c639f6e71 +sdba/CanESM2_1950-2100.nc sha256:b41fe603676e70d16c747ec207eb75ec86a39b665de401dcb23b5969ab3e1b32 +sdba/adjusted_external.nc sha256:ff325c88eca96844bc85863744e4e08bcdf3d257388255636427ad5e11960d2e +sdba/ahccd_1950-2013.nc sha256:7e9a1f61c1d04ca257b09857a82715f1fa3f0550d77f97b7306d4eaaf0c70239 +sdba/nrcan_1950-2013.nc sha256:4ce2dcfdac09b028db0f3e348272a496d796c36d4f3c4a412ebcca11449b7237 +uncertainty_partitioning/cmip5_pr_global_mon.nc 
sha256:7e585c995e95861979fd23dd9346f78a879403ea1d1d15acaa627802b4c5f1f4 +uncertainty_partitioning/cmip5_pr_pnw_mon.nc sha256:1cdfe74f5bd5cf71cd0737c190277821ea90e4e79de5b37367bf2b82c35a66c9 +uncertainty_partitioning/cmip5_tas_global_mon.nc sha256:41ba79a43bab169a0487e3f3f66a68a699bef9355a13e26a87fdb65744555cb5 +uncertainty_partitioning/cmip5_tas_pnw_mon.nc sha256:eeb48765fd430186f3634e7f779b4be45ab3df73e806a4cbb743fefb13279398 +SpatialAnalogs/CanESM2_ScenGen_Chibougamau_2041-2070.nc sha256:b6cfc4a963d68b6da8978acd26ffb506f33c9c264d8057badd90bf47cd9f3f3d +SpatialAnalogs/NRCAN_SECan_1981-2010.nc sha256:bde680ddad84106caad3a2e83a70ecdd8138578a70e875d77c2ec6d3ff868fee +SpatialAnalogs/dissimilarity.nc sha256:200ab9b7d43d41e6db917c54d35b43e3c5853e0df701e44efd5b813e47590110 +SpatialAnalogs/indicators.nc sha256:3bcbb0e4540d4badc085ac42b9d04a353e815fb55c62271eb73275b889c80a15 +uncertainty_partitioning/seattle_avg_tas.csv sha256:157d6721f9925eec8268848e34548df2b1da50935f247a9b136d251ef53898d7 diff --git a/xclim/testing/utils.py b/xclim/testing/utils.py index bcc3691f8..13fa86563 100644 --- a/xclim/testing/utils.py +++ b/xclim/testing/utils.py @@ -3,424 +3,139 @@ ====================================== """ -# Some of this code was copied and adapted from xarray from __future__ import annotations -import hashlib -import json +import importlib.resources as ilr import logging import os import platform import re import sys +import time import warnings from collections.abc import Sequence +from datetime import datetime as dt from importlib import import_module from io import StringIO from pathlib import Path -from shutil import copy +from shutil import copytree from typing import TextIO from urllib.error import HTTPError, URLError from urllib.parse import urljoin, urlparse -from urllib.request import urlopen, urlretrieve +from urllib.request import urlretrieve -import pandas as pd -from platformdirs import user_cache_dir +from filelock import FileLock +from packaging.version import 
Version from xarray import Dataset from xarray import open_dataset as _open_dataset +import xclim +from xclim import __version__ as __xclim_version__ + try: from pytest_socket import SocketBlockedError except ImportError: SocketBlockedError = None -_xclim_deps = [ - "xclim", - "xarray", - "statsmodels", - "sklearn", - "scipy", - "pint", - "pandas", - "numpy", - "numba", - "lmoments3", - "jsonpickle", - "flox", - "dask", - "cf_xarray", - "cftime", - "clisops", - "click", - "bottleneck", - "boltons", -] - +try: + import pooch +except ImportError: + warnings.warn( + "The `pooch` library is not installed. " + "The default cache directory for testing data will not be set." + ) + pooch = None -_default_cache_dir = Path(user_cache_dir("xclim-testdata")) logger = logging.getLogger("xclim") + __all__ = [ - "_default_cache_dir", + "TESTDATA_BRANCH", + "TESTDATA_CACHE_DIR", + "TESTDATA_REPO_URL", "audit_url", - "get_file", - "get_local_testdata", - "list_datasets", + "default_testdata_cache", + "default_testdata_repo_url", + "default_testdata_version", + "gather_testing_data", "list_input_variables", + "nimbus", "open_dataset", + "populate_testing_data", "publish_release_notes", "run_doctests", "show_versions", + "testing_setup_warnings", ] +default_testdata_version = "v2024.8.23" +"""Default version of the testing data to use when fetching datasets.""" -def file_md5_checksum(f_name): - hash_md5 = hashlib.md5() # noqa: S324 - with open(f_name, "rb") as f: - hash_md5.update(f.read()) - return hash_md5.hexdigest() - - -def audit_url(url: str, context: str = None) -> str: - """Check if the URL is well-formed. - - Raises - ------ - URLError - If the URL is not well-formed. 
- """ - msg = "" - result = urlparse(url) - if result.scheme == "http": - msg = f"{context if context else ''} URL is not using secure HTTP: '{url}'".strip() - if not all([result.scheme, result.netloc]): - msg = f"{context if context else ''} URL is not well-formed: '{url}'".strip() - - if msg: - logger.error(msg) - raise URLError(msg) - return url - - -def get_file( - name: str | os.PathLike[str] | Sequence[str | os.PathLike[str]], - github_url: str = "https://github.com/Ouranosinc/xclim-testdata", - branch: str = "main", - cache_dir: Path = _default_cache_dir, -) -> Path | list[Path]: - """Return a file from an online GitHub-like repository. - - If a local copy is found then always use that to avoid network traffic. - - Parameters - ---------- - name : str | os.PathLike[str] | Sequence[str | os.PathLike[str]] - Name of the file or list/tuple of names of files containing the dataset(s) including suffixes. - github_url : str - URL to GitHub repository where the data is stored. - branch : str, optional - For GitHub-hosted files, the branch to download from. - cache_dir : Path - The directory in which to search for and write cached data. - - Returns - ------- - Path | list[Path] - """ - if isinstance(name, (str, os.PathLike)): - name = [name] - - files = [] - for n in name: - fullname = Path(n) - suffix = fullname.suffix - files.append( - _get( - fullname=fullname, - github_url=github_url, - branch=branch, - suffix=suffix, - cache_dir=cache_dir, - ) - ) - if len(files) == 1: - return files[0] - return files - - -def get_local_testdata( - patterns: str | Sequence[str], - temp_folder: str | os.PathLike, - branch: str = "master", - _local_cache: str | os.PathLike = _default_cache_dir, -) -> Path | list[Path]: - """Copy specific testdata from a default cache to a temporary folder. - - Return files matching `pattern` in the default cache dir and move to a local temp folder. 
- - Parameters - ---------- - patterns : str | Sequence[str] - Glob patterns, which must include the folder. - temp_folder : str | os.PathLike - Target folder to copy files and filetree to. - branch : str - For GitHub-hosted files, the branch to download from. - _local_cache : str | os.PathLike - Local cache of testing data. +default_testdata_repo_url = "https://github.com/Ouranosinc/xclim-testdata" +"""Default URL of the testing data repository to use when fetching datasets.""" - Returns - ------- - Path | list[Path] - """ - temp_paths = [] +try: + default_testdata_cache = Path(pooch.os_cache("xclim-testdata")) + """Default location for the testing data cache.""" +except AttributeError: + default_testdata_cache = None - if isinstance(patterns, str): - patterns = [patterns] +TESTDATA_REPO_URL = str(os.getenv("XCLIM_TESTDATA_REPO_URL", default_testdata_repo_url)) +"""Sets the URL of the testing data repository to use when fetching datasets. - for pattern in patterns: - potential_paths = [ - path for path in Path(temp_folder).joinpath(branch).glob(pattern) - ] - if potential_paths: - temp_paths.extend(potential_paths) - continue +Notes +----- +When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: - testdata_path = Path(_local_cache) - if not testdata_path.exists(): - raise RuntimeError(f"{testdata_path} does not exists") - paths = [path for path in testdata_path.joinpath(branch).glob(pattern)] - if not paths: - raise FileNotFoundError( - f"No data found for {pattern} at {testdata_path}/{branch}." - ) +.. 
code-block:: console - main_folder = Path(temp_folder).joinpath(branch).joinpath(Path(pattern).parent) - main_folder.mkdir(exist_ok=True, parents=True) + $ export XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" - for file in paths: - temp_file = main_folder.joinpath(file.name) - if not temp_file.exists(): - copy(file, main_folder) - temp_paths.append(temp_file) +or setting the variable at runtime: - # Return item directly when singleton, for convenience - return temp_paths[0] if len(temp_paths) == 1 else temp_paths +.. code-block:: console + $ env XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" pytest +""" -def _get( - fullname: Path, - github_url: str, - branch: str, - suffix: str, - cache_dir: Path, -) -> Path: - cache_dir = cache_dir.absolute() - local_file = cache_dir / branch / fullname - md5_name = fullname.with_suffix(f"{suffix}.md5") - md5_file = cache_dir / branch / md5_name +TESTDATA_BRANCH = str(os.getenv("XCLIM_TESTDATA_BRANCH", default_testdata_version)) +"""Sets the branch of the testing data repository to use when fetching datasets. - if not github_url.startswith("https"): - raise ValueError(f"GitHub URL not secure: '{github_url}'.") +Notes +----- +When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: - if local_file.is_file(): - local_md5 = file_md5_checksum(local_file) - try: - url = "/".join((github_url, "raw", branch, md5_name.as_posix())) - msg = f"Attempting to fetch remote file md5: {md5_name.as_posix()}" - logger.info(msg) - urlretrieve(audit_url(url), md5_file) # noqa: S310 - with open(md5_file) as f: - remote_md5 = f.read() - if local_md5.strip() != remote_md5.strip(): - local_file.unlink() - msg = ( - f"MD5 checksum for {local_file.as_posix()} does not match upstream md5. " - "Attempting new download." - ) - warnings.warn(msg) - except HTTPError: - msg = ( - f"{md5_name.as_posix()} not accessible in remote repository. 
" - "Unable to determine validity with upstream repo." - ) - warnings.warn(msg) - except URLError: - msg = ( - f"{md5_name.as_posix()} not found in remote repository. " - "Unable to determine validity with upstream repo." - ) - warnings.warn(msg) - except SocketBlockedError: - msg = f"Unable to access {md5_name.as_posix()} online. Testing suite is being run with `--disable-socket`." - warnings.warn(msg) - - if not local_file.is_file(): - # This will always leave this directory on disk. - # We may want to add an option to remove it. - local_file.parent.mkdir(exist_ok=True, parents=True) - - url = "/".join((github_url, "raw", branch, fullname.as_posix())) - msg = f"Fetching remote file: {fullname.as_posix()}" - logger.info(msg) - try: - urlretrieve(audit_url(url), local_file) # noqa: S310 - except HTTPError as e: - msg = f"{fullname.as_posix()} not accessible in remote repository. Aborting file retrieval." - raise FileNotFoundError(msg) from e - except URLError as e: - msg = ( - f"{fullname.as_posix()} not found in remote repository. " - "Verify filename and repository address. Aborting file retrieval." - ) - raise FileNotFoundError(msg) from e - except SocketBlockedError as e: - msg = ( - f"Unable to access {fullname.as_posix()} online. Testing suite is being run with `--disable-socket`. " - f"If you intend to run tests with this option enabled, please download the file beforehand with the " - f"following console command: `xclim prefetch_testing_data`." - ) - raise FileNotFoundError(msg) from e - try: - url = "/".join((github_url, "raw", branch, md5_name.as_posix())) - msg = f"Fetching remote file md5: {md5_name.as_posix()}" - logger.info(msg) - urlretrieve(audit_url(url), md5_file) # noqa: S310 - except (HTTPError, URLError) as e: - msg = ( - f"{md5_name.as_posix()} not accessible online. " - "Unable to determine validity of file from upstream repo. " - "Aborting file retrieval." - ) - local_file.unlink() - raise FileNotFoundError(msg) from e +.. 
code-block:: console - local_md5 = file_md5_checksum(local_file) - try: - with open(md5_file) as f: - remote_md5 = f.read() - if local_md5.strip() != remote_md5.strip(): - local_file.unlink() - msg = ( - f"{local_file.as_posix()} and md5 checksum do not match. " - "There may be an issue with the upstream origin data." - ) - raise OSError(msg) - except OSError as e: - logger.error(e) + $ export XCLIM_TESTDATA_BRANCH="my_testing_branch" - return local_file +or setting the variable at runtime: +.. code-block:: console -# idea copied from raven that it borrowed from xclim that borrowed it from xarray that was borrowed from Seaborn -def open_dataset( - name: str | os.PathLike[str], - suffix: str | None = None, - dap_url: str | None = None, - github_url: str = "https://github.com/Ouranosinc/xclim-testdata", - branch: str = "main", - cache: bool = True, - cache_dir: Path = _default_cache_dir, - **kwargs, -) -> Dataset: - r"""Open a dataset from the online GitHub-like repository. + $ env XCLIM_TESTDATA_BRANCH="my_testing_branch" pytest +""" - If a local copy is found then always use that to avoid network traffic. +TESTDATA_CACHE_DIR = os.getenv("XCLIM_TESTDATA_CACHE_DIR", default_testdata_cache) +"""Sets the directory to store the testing datasets. - Parameters - ---------- - name : str or os.PathLike - Name of the file containing the dataset. - suffix : str, optional - If no suffix is given, assumed to be netCDF ('.nc' is appended). For no suffix, set "". - dap_url : str, optional - URL to OPeNDAP folder where the data is stored. If supplied, supersedes github_url. - github_url : str - URL to GitHub repository where the data is stored. - branch : str, optional - For GitHub-hosted files, the branch to download from. - cache_dir : Path - The directory in which to search for and write cached data. - cache : bool - If True, then cache data locally for use on subsequent calls. - \*\*kwargs - For NetCDF files, keywords passed to :py:func:`xarray.open_dataset`. 
+If not set, the default location will be used (based on ``platformdirs``, see :func:`pooch.os_cache`). - Returns - ------- - Union[Dataset, Path] +Notes +----- +When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: - See Also - -------- - xarray.open_dataset - """ - if isinstance(name, (str, os.PathLike)): - name = Path(name) - if suffix is None: - suffix = ".nc" - fullname = name.with_suffix(suffix) - - if dap_url is not None: - dap_file_address = urljoin(dap_url, str(name)) - try: - ds = _open_dataset(audit_url(dap_file_address, context="OPeNDAP"), **kwargs) - return ds - except URLError: - raise - except OSError: - msg = f"OPeNDAP file not read. Verify that the service is available: '{dap_file_address}'" - logger.error(msg) - raise OSError(msg) - - local_file = _get( - fullname=fullname, - github_url=github_url, - branch=branch, - suffix=suffix, - cache_dir=cache_dir, - ) +.. code-block:: console - try: - ds = _open_dataset(local_file, **kwargs) - if not cache: - ds = ds.load() - local_file.unlink() - return ds - except OSError as err: - raise err + $ export XCLIM_TESTDATA_CACHE_DIR="/path/to/my/data" +or setting the variable at runtime: -def list_datasets(github_repo="Ouranosinc/xclim-testdata", branch="main"): - """Return a DataFrame listing all xclim test datasets available on the GitHub repo for the given branch. +.. code-block:: console - The result includes the filepath, as passed to `open_dataset`, the file size (in KB) and the html url to the file. - This uses an unauthenticated call to GitHub's REST API, so it is limited to 60 requests per hour (per IP). - A single call of this function triggers one request per subdirectory, so use with parsimony. 
- """ - with urlopen( # noqa: S310 - audit_url(f"https://api.github.com/repos/{github_repo}/contents?ref={branch}") - ) as res: - base = json.loads(res.read().decode()) - records = [] - for folder in base: - if folder["path"].startswith(".") or folder["size"] > 0: - # drop hidden folders and other files. - continue - with urlopen(audit_url(folder["url"])) as res: # noqa: S310 - listing = json.loads(res.read().decode()) - for file in listing: - if file["path"].endswith(".nc"): - records.append( - { - "name": file["path"], - "size": file["size"] / 2**10, - "url": file["html_url"], - } - ) - df = pd.DataFrame.from_records(records).set_index("name") - print(f"Found {len(df)} datasets.") - return df + $ env XCLIM_TESTDATA_CACHE_DIR="/path/to/my/data" pytest +""" def list_input_variables( @@ -478,18 +193,7 @@ def list_input_variables( return variables -def run_doctests(): - """Run the doctests for the module.""" - import pytest - - cmd = [ - f"--rootdir={Path(__file__).absolute().parent}", - "--numprocesses=0", - "--xdoctest", - f"{Path(__file__).absolute().parents[1]}", - ] - - sys.exit(pytest.main(cmd)) +# Publishing Tools ### def publish_release_notes( @@ -577,6 +281,29 @@ def publish_release_notes( return None +_xclim_deps = [ + "xclim", + "xarray", + "statsmodels", + "sklearn", + "scipy", + "pint", + "pandas", + "numpy", + "numba", + "lmoments3", + "jsonpickle", + "flox", + "dask", + "cf_xarray", + "cftime", + "clisops", + "click", + "bottleneck", + "boltons", +] + + def show_versions( file: os.PathLike | StringIO | TextIO | None = None, deps: list[str] | None = None, @@ -638,3 +365,338 @@ def show_versions( else: print(message, file=file) return None + + +# Test Data Utilities ### + + +def run_doctests(): + """Run the doctests for the module.""" + import pytest + + cmd = [ + f"--rootdir={Path(__file__).absolute().parent}", + "--numprocesses=0", + "--xdoctest", + f"{Path(__file__).absolute().parents[1]}", + ] + + sys.exit(pytest.main(cmd)) + + +def 
def testing_setup_warnings():
    """Warn users about potential incompatibilities between xclim and xclim-testdata versions.

    Two independent checks are performed:

    1. If the installed `xclim` version looks like a stable release (``X.Y.Z``) while ``TESTDATA_BRANCH``
       differs from the default testing data version, a warning is emitted. This warning is skipped when
       either the ``CI`` or the ``READTHEDOCS`` environment variable is set.
    2. If ``TESTDATA_BRANCH`` looks like a tagged release (``vX.Y.Z``) and is newer than the calendar version
       derived from the last modification time of the installed `xclim` package, a warning is emitted.
    """
    if (
        re.match(r"^\d+\.\d+\.\d+$", __xclim_version__)
        and TESTDATA_BRANCH != default_testdata_version
    ):
        # This does not need to be emitted on GitHub Workflows and ReadTheDocs
        if not os.getenv("CI") and not os.getenv("READTHEDOCS"):
            warnings.warn(
                f"`xclim` stable ({__xclim_version__}) is running tests against a non-default branch of the testing data. "
                "It is possible that changes to the testing data may be incompatible with some assertions in this version. "
                f"Please be sure to check {TESTDATA_REPO_URL} for more information.",
            )

    if re.match(r"^v\d+\.\d+\.\d+", TESTDATA_BRANCH):
        # Use the last modification time of the xclim package as a proxy for its installation date.
        # Building the datetime straight from the timestamp avoids a locale-dependent
        # format/parse round-trip through day and month names.
        install_date = dt.fromtimestamp(int(os.path.getmtime(xclim.__file__)))
        install_calendar_version = (
            f"{install_date.year}.{install_date.month}.{install_date.day}"
        )

        if Version(TESTDATA_BRANCH) > Version(install_calendar_version):
            warnings.warn(
                f"The installation date of `xclim` ({install_date.ctime()}) "
                f"predates the last release of testing data ({TESTDATA_BRANCH}). "
                "It is very likely that the testing data is incompatible with this build of `xclim`.",
            )


def load_registry(
    branch: str = TESTDATA_BRANCH, repo: str = TESTDATA_REPO_URL
) -> dict[str, str]:
    """Load the registry file for the test data.

    When both `branch` and `repo` are the defaults, the registry shipped with the package
    (``xclim/testing/registry.txt``) is read. Otherwise, the registry is downloaded from
    ``{repo}/raw/{branch}/data/registry.txt`` into ``xclim/testing/{branch}/registry.txt`` and read from there,
    so that the packaged registry is never overwritten.

    Parameters
    ----------
    branch : str
        Branch of the repository to use when fetching the registry.
    repo : str
        URL of the repository to use when fetching the registry.

    Returns
    -------
    dict
        Dictionary of filenames and hashes.

    Raises
    ------
    FileNotFoundError
        If the registry file does not exist on disk.
    URLError
        If the remote registry URL is rejected by :py:func:`audit_url`.
    """
    remote_registry = audit_url(f"{repo}/raw/{branch}/data/registry.txt")

    if branch != default_testdata_version or repo != default_testdata_repo_url:
        # Non-default source: keep its registry in a dedicated folder and read *that* file.
        custom_registry_folder = Path(
            str(ilr.files("xclim").joinpath(f"testing/{branch}"))
        )
        custom_registry_folder.mkdir(parents=True, exist_ok=True)
        registry_file = custom_registry_folder.joinpath("registry.txt")
        urlretrieve(remote_registry, registry_file)  # noqa: S310
    else:
        registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt")))

    if not registry_file.exists():
        raise FileNotFoundError(f"Registry file not found: {registry_file}")

    # Load the registry file, one "<filename> <hash>" entry per line
    registry = {}
    with registry_file.open(encoding="utf-8") as f:
        for line in f:
            entry = line.split()
            # Blank lines and comment lines carry no entry
            if not entry or entry[0].startswith("#"):
                continue
            registry[entry[0]] = entry[1]
    return registry


def nimbus(  # noqa: PR01
    repo: str = TESTDATA_REPO_URL,
    branch: str = TESTDATA_BRANCH,
    cache_dir: str | Path = TESTDATA_CACHE_DIR,
    data_updates: bool = True,
):
    """Pooch registry instance for xclim test data.

    Parameters
    ----------
    repo : str
        URL of the repository to use when fetching testing datasets.
    branch : str
        Branch of repository to use when fetching testing datasets.
    cache_dir : str or Path
        The path to the directory where the data files are stored.
    data_updates : bool
        If True, allow updates to the data files. Default is True.

    Returns
    -------
    pooch.Pooch
        The Pooch instance for accessing the xclim testing data.

    Raises
    ------
    ImportError
        If `pooch` is not installed.

    Notes
    -----
    There are three environment variables that can be used to control the behaviour of this registry:

    - ``XCLIM_TESTDATA_CACHE_DIR``: If this environment variable is set, it will be used as the base directory to
      store the data files. The directory should be an absolute path (i.e., it should start with ``/``).
      Otherwise, the default location will be used (based on ``platformdirs``, see :py:func:`pooch.os_cache`).
    - ``XCLIM_TESTDATA_REPO_URL``: If this environment variable is set, it will be used as the URL of the repository
      to use when fetching datasets. Otherwise, the default repository will be used.
    - ``XCLIM_TESTDATA_BRANCH``: If this environment variable is set, it will be used as the branch of the repository
      to use when fetching datasets. Otherwise, the default branch will be used.

    Examples
    --------
    Using the registry to download a file:

    .. code-block:: python

        import xarray as xr
        from xclim.testing.utils import nimbus

        example_file = nimbus().fetch("example.nc")
        data = xr.open_dataset(example_file)
    """
    if pooch is None:
        raise ImportError(
            "The `pooch` package is required to fetch the xclim testing data. "
            "You can install it with `pip install pooch` or `pip install xclim[dev]`."
        )

    remote = audit_url(f"{repo}/raw/{branch}/data")
    return pooch.create(
        path=cache_dir,
        base_url=remote,
        version=default_testdata_version,
        version_dev=branch,
        allow_updates=data_updates,
        registry=load_registry(branch=branch, repo=repo),
    )


# idea copied from raven that it borrowed from xclim that borrowed it from xarray that was borrowed from Seaborn
def open_dataset(
    name: str | os.PathLike[str],
    dap_url: str | None = None,
    branch: str = TESTDATA_BRANCH,
    repo: str = TESTDATA_REPO_URL,
    cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE_DIR,
    **kwargs,
) -> Dataset:
    r"""Open a dataset from the online GitHub-like repository.

    If a local copy is found then always use that to avoid network traffic.

    Parameters
    ----------
    name : str or os.PathLike
        Name of the file containing the dataset.
    dap_url : str, optional
        URL to OPeNDAP folder where the data is stored. If supplied, supersedes `repo` and `branch`.
    branch : str
        Branch of the repository to use when fetching datasets.
    repo : str
        URL of the repository to use when fetching testing datasets.
    cache_dir : str or os.PathLike, optional
        The directory in which to search for and write cached data.
    \*\*kwargs
        For NetCDF files, keywords passed to :py:func:`xarray.open_dataset`.

    Returns
    -------
    Dataset

    Raises
    ------
    ValueError
        If `cache_dir` is None.
    URLError
        If the OPeNDAP address is rejected by :py:func:`audit_url` or cannot be reached.
    OSError
        If the file is not found in the cache directory or cannot be read.

    See Also
    --------
    xarray.open_dataset
    """
    if cache_dir is None:
        raise ValueError(
            "The cache directory must be set. "
            "Please set the `cache_dir` parameter or the `XCLIM_TESTDATA_CACHE_DIR` environment variable."
        )

    if dap_url:
        dap_file_address = urljoin(dap_url, str(name))
        try:
            return _open_dataset(
                audit_url(dap_file_address, context="OPeNDAP"), **kwargs
            )
        except URLError:
            # URLError is a subclass of OSError: let it through untouched
            raise
        except OSError as e:
            msg = f"OPeNDAP file not read. Verify that the service is available: '{dap_file_address}'"
            raise OSError(msg) from e

    local_file = Path(cache_dir).joinpath(name)
    if not local_file.exists():
        try:
            # Registry keys are plain strings with forward slashes
            local_file = nimbus(branch=branch, repo=repo, cache_dir=cache_dir).fetch(
                Path(name).as_posix()
            )
        except OSError as e:
            raise OSError(
                f"File not found locally. Verify that the testing data is available in remote: {local_file}"
            ) from e
    return _open_dataset(local_file, **kwargs)


def populate_testing_data(
    temp_folder: Path | None = None,
    repo: str = TESTDATA_REPO_URL,
    branch: str = TESTDATA_BRANCH,
    local_cache: Path = TESTDATA_CACHE_DIR,
) -> None:
    """Populate the local cache with the testing data.

    Parameters
    ----------
    temp_folder : Path, optional
        Path to a temporary folder to use as the local cache. If not provided, `local_cache` is used.
    repo : str, optional
        URL of the repository to use when fetching testing datasets.
    branch : str, optional
        Branch of xclim-testdata to use when fetching testing datasets.
    local_cache : Path
        The path to the local cache. Defaults to ``TESTDATA_CACHE_DIR``.
        The testing data will be downloaded to this local cache.

    Returns
    -------
    None

    Raises
    ------
    SocketBlockedError
        If network access is blocked while a file needs to be downloaded.
    """
    # Create the Pooch instance
    n = nimbus(repo=repo, branch=branch, cache_dir=temp_folder or local_cache)

    # Download the files listed in the registry of the *requested* repository and branch
    errored_files = []
    for file in load_registry(branch=branch, repo=repo):
        try:
            n.fetch(file)
        except HTTPError:
            logger.error("File `%s` not accessible in remote repository.", file)
            errored_files.append(file)
        except SocketBlockedError as e:  # noqa
            msg = (
                "Unable to access registry file online. Testing suite is being run with `--disable-socket`. "
                "If you intend to run tests with this option enabled, please download the file beforehand with the "
                "following console command: `$ xclim prefetch_testing_data`."
            )
            raise SocketBlockedError(msg) from e

    if errored_files:
        logger.error(
            "The following files were unable to be downloaded: %s",
            errored_files,
        )
    else:
        logger.info("Files were downloaded successfully.")


def gather_testing_data(
    worker_cache_dir: str | os.PathLike[str] | Path,
    worker_id: str,
    _cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE_DIR,
):
    """Gather testing data across workers.

    Parameters
    ----------
    worker_cache_dir : str or os.PathLike or Path
        Destination directory into which the testing data is copied for a non-master worker.
    worker_id : str
        Identifier of the current worker. The value ``"master"`` only populates the shared cache.
    _cache_dir : str or os.PathLike, optional
        The shared cache directory holding the testing data.

    Raises
    ------
    ValueError
        If `_cache_dir` is None.
    FileNotFoundError
        On Windows, if the testing data is not already present in the shared cache.
    """
    if _cache_dir is None:
        raise ValueError(
            "The cache directory must be set. "
            "Please set the `_cache_dir` parameter or the `XCLIM_TESTDATA_CACHE_DIR` environment variable."
        )
    cache_dir = Path(_cache_dir)

    if worker_id == "master":
        populate_testing_data(branch=TESTDATA_BRANCH, local_cache=cache_dir)
    else:
        if platform.system() == "Windows":
            if not cache_dir.joinpath(default_testdata_version).exists():
                raise FileNotFoundError(
                    "Testing data not found and UNIX-style file-locking is not supported on Windows. "
                    "Consider running `$ xclim prefetch_testing_data` to download testing data beforehand."
                )
        else:
            cache_dir.mkdir(exist_ok=True, parents=True)
            lockfile = cache_dir.joinpath(".lock")
            test_data_being_written = FileLock(lockfile)
            with test_data_being_written:
                # Only one worker at a time populates the shared cache.
                populate_testing_data(branch=TESTDATA_BRANCH, local_cache=cache_dir)
                # Marker file recording that population completed.
                # NOTE(review): nothing in this function reads the marker back - confirm whether it is still needed.
                cache_dir.joinpath(".data_written").touch()
            with test_data_being_written.acquire():
                if lockfile.exists():
                    lockfile.unlink()
        # Give the worker its own copy of the versioned data folder
        copytree(cache_dir.joinpath(default_testdata_version), worker_cache_dir)


# Testing Utilities ###


def audit_url(url: str, context: str | None = None) -> str:
    """Check if the URL is well-formed.

    A URL is rejected when its scheme is plain ``http`` or when it lacks a scheme or a network location.

    Parameters
    ----------
    url : str
        The URL to check.
    context : str, optional
        Label prepended to the error message (e.g. ``"OPeNDAP"``).

    Returns
    -------
    str
        The unmodified URL, when it passes the checks.

    Raises
    ------
    URLError
        If the URL is not well-formed.
    """
    msg = ""
    result = urlparse(url)
    if result.scheme == "http":
        msg = f"{context if context else ''} URL is not using secure HTTP: '{url}'".strip()
    # A malformed URL takes precedence over the insecure-scheme message
    if not all([result.scheme, result.netloc]):
        msg = f"{context if context else ''} URL is not well-formed: '{url}'".strip()

    if msg:
        logger.error(msg)
        raise URLError(msg)
    return url