Merge pull request #251 from Ouranosinc/test_extract
Test search_data_catalogs
RondeauG authored Sep 13, 2023
2 parents f305ca5 + b0542e4 commit 2d1a7fd
Showing 5 changed files with 329 additions and 53 deletions.
5 changes: 3 additions & 2 deletions HISTORY.rst
@@ -12,18 +12,19 @@ Announcements

New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* N/A
* Added the ability to search for simulations that reach a given warming level. (:pull:`251`).

Breaking changes
^^^^^^^^^^^^^^^^
* N/A

Bug fixes
^^^^^^^^^
* N/A
* Fixed a bug in ``xs.search_data_catalogs`` when searching for fixed fields and specific experiments/members. (:pull:`251`).

Internal changes
^^^^^^^^^^^^^^^^
* Continued work on adding tests. (:pull:`251`).
* Fixed pre-commit's pretty-format-json so it ignores notebooks. (:pull:`254`).
* Fixed the labeler so docs/CI isn't automatically added for contributions by new collaborators. (:pull:`254`).
* Made it so that `tests` are no longer treated as an installable package. (:pull:`248`).
6 changes: 4 additions & 2 deletions docs/notebooks/1_catalog.ipynb
@@ -254,7 +254,7 @@
"- `allow_conversion` is used to allow searching for calculable variables, in the case where the requested variable would not be available.\n",
"- `restrict_resolution` is used to limit the results to the finest or coarsest resolution available for each source.\n",
"- `restrict_members` is used to limit the results to a maximum number of realizations for each source.\n",
"- `restrict_warming_level` is used to limit the results to only datasets that are present in the csv used for calculating warming levels.\n",
"- `restrict_warming_level` is used to limit the results to only datasets that are present in the csv used for calculating warming levels. You can also pass a dict to verify that a given warming level is reached.\n",
"\n",
"Note that compared to `search`, the result of `search_data_catalog` is a dictionary with one entry per unique ID. A given unique ID might contain multiple datasets as per `intake-esm`'s definition, because it groups catalog lines per *id - domain - processing_level - xrfreq*. Thus, it would separate model data that exists at different frequencies.\n",
"\n",
@@ -390,7 +390,9 @@
" data_catalogs=[f\"{Path().absolute()}/samples/pangeo-cmip6.json\"],\n",
" variables_and_freqs=variables_and_freqs,\n",
" match_hist_and_fut=True,\n",
" restrict_warming_level=True, # In this case all models exist in our database, so nothing gets eliminated.\n",
" restrict_warming_level={\n",
" \"wl\": 2\n",
" }, # SSP126 gets eliminated, since it doesn't reach +2°C by 2100.\n",
")\n",
"\n",
"cat_sim"
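For reference, a minimal standalone sketch (not part of this commit) of the two `restrict_warming_level` modes shown in the notebook cell above, assuming the working directory is docs/notebooks so that the sample pangeo-cmip6.json catalog path resolves:

from pathlib import Path

import xscen as xs

# Assumed path: the sample catalog shipped with the xscen notebooks,
# resolved relative to docs/notebooks.
cat_path = f"{Path().absolute()}/samples/pangeo-cmip6.json"

# Boolean mode: keep only datasets present in the warming-level database.
cat_all = xs.search_data_catalogs(
    data_catalogs=[cat_path],
    variables_and_freqs={"tasmax": "D"},
    match_hist_and_fut=True,
    restrict_warming_level=True,
)

# Dict mode: additionally require that +2°C is reached
# (per the notebook, SSP126 is eliminated since it stays below +2°C by 2100).
cat_wl2 = xs.search_data_catalogs(
    data_catalogs=[cat_path],
    variables_and_freqs={"tasmax": "D"},
    match_hist_and_fut=True,
    restrict_warming_level={"wl": 2},
)

# Each call returns a dictionary with one entry per unique dataset ID.
print(len(cat_all), len(cat_wl2))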
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -26,7 +26,7 @@ def remove_data_folder():
request.addfinalizer(remove_data_folder)


@pytest.mark.requires_docs
@pytest.mark.requires_netcdf
@pytest.fixture(scope="session")
def samplecat():
"""Generate a sample catalog with the tutorial netCDFs."""
262 changes: 262 additions & 0 deletions tests/test_extract.py
@@ -1,12 +1,274 @@
from copy import deepcopy

import numpy as np
import pandas as pd
import pytest
from conftest import notebooks
from xclim.testing.helpers import test_timeseries as timeseries

import xscen as xs


class TestSearchDataCatalogs:
cat = xs.DataCatalog(notebooks / "samples" / "pangeo-cmip6.json")

@pytest.mark.parametrize(
"variables_and_freqs, other_arg",
[
({"tasmin": "D"}, None),
({"sftlf": "fx"}, "other"),
({"tasmin": "D", "sftlf": "fx"}, "exclusion"),
],
)
def test_basic(self, variables_and_freqs, other_arg):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs=variables_and_freqs,
other_search_criteria={"experiment": ["ssp585"]}
if other_arg == "other"
else None,
exclusions={"member": "r2.*"} if other_arg == "exclusion" else None,
)
        assert len(out) == (13 if other_arg is None else 2 if other_arg == "other" else 6)

@pytest.mark.parametrize(
"periods, coverage_kwargs",
[
([["2020", "2030"], ["2035", "2040"]], None),
([["1900", "2030"], ["2035", "2040"]], None),
([["2020", "2130"]], {"coverage": 0.70}),
],
)
def test_periods(self, periods, coverage_kwargs):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
periods=periods,
coverage_kwargs=coverage_kwargs,
)
assert len(out) == (0 if periods[0] == ["1900", "2030"] else 5)

def test_ids(self):
out = xs.search_data_catalogs(
data_catalogs=deepcopy(self.cat),
variables_and_freqs={"tasmin": "D"},
id_columns=["source"],
)
assert len(out) == 3
assert len(out["NorESM2-MM"].df) == 5

@pytest.mark.parametrize("allow_resampling", [True, False])
def test_allow_resampling(self, allow_resampling):
out = xs.search_data_catalogs(
data_catalogs=deepcopy(self.cat),
variables_and_freqs={"tasmin": "YS"},
allow_resampling=allow_resampling,
)
assert len(out) == (13 if allow_resampling else 0)

@pytest.mark.parametrize(
"restrict_warming_level",
[
True,
{"wl": 2, "ignore_member": True},
{"wl": 4},
],
)
def test_warminglevel(self, restrict_warming_level):
cat = deepcopy(self.cat)
new_line = deepcopy(cat.df.iloc[13])
new_line["experiment"] = "ssp245"
new_line["id"] = xs.catalog.generate_id(new_line.to_frame().T).iloc[0]
cat.esmcat._df = pd.concat([cat.df, new_line.to_frame().T], ignore_index=True)

out = xs.search_data_catalogs(
data_catalogs=cat,
variables_and_freqs={"tasmax": "D"},
restrict_warming_level=restrict_warming_level,
)
if isinstance(restrict_warming_level, bool):
assert len(out) == 5
elif restrict_warming_level == {"wl": 2, "ignore_member": True}:
assert len(out) == 5
elif restrict_warming_level == {"wl": 4}:
assert len(out) == 2

@pytest.mark.parametrize("restrict_resolution", [None, "finest", "coarsest"])
def test_restrict_resolution(self, restrict_resolution):
cat = deepcopy(self.cat)
for i in range(2):
new_line = deepcopy(cat.df.iloc[0])
new_line["mip_era"] = "CMIP5"
new_line["activity"] = "CORDEX"
new_line["institution"] = "CCCma"
new_line["driving_model"] = "CanESM2"
new_line["source"] = "CRCM5"
new_line["experiment"] = "rcp85"
new_line["member"] = "r1i1p1"
new_line["domain"] = "NAM-22" if i == 0 else "NAM-11"
new_line["frequency"] = "day"
new_line["xrfreq"] = "D"
new_line["variable"] = ("tasmin",)
new_line["id"] = xs.catalog.generate_id(new_line.to_frame().T).iloc[0]

cat.esmcat._df = pd.concat(
[cat.df, new_line.to_frame().T], ignore_index=True
)

out = xs.search_data_catalogs(
data_catalogs=cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={
"source": ["GFDL-CM4", "CRCM5"],
"experiment": ["ssp585", "rcp85"],
},
restrict_resolution=restrict_resolution,
)
if restrict_resolution is None:
assert len(out) == 4
elif restrict_resolution == "finest":
assert len(out) == 2
assert any("NAM-11" in x for x in out)
assert any("_gr1" in x for x in out)
elif restrict_resolution == "coarsest":
assert len(out) == 2
assert any("NAM-22" in x for x in out)
assert any("_gr2" in x for x in out)

@pytest.mark.parametrize("restrict_members", [None, {"ordered": 2}])
def test_restrict_members(self, restrict_members):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={
"source": ["NorESM2-LM"],
"experiment": ["historical"],
},
restrict_members=restrict_members,
)
assert len(out) == (3 if restrict_members is None else 2)
if restrict_members is not None:
assert all(
o in out.keys()
for o in [
"CMIP_NCC_NorESM2-LM_historical_r1i1p1f1_gn",
"CMIP_NCC_NorESM2-LM_historical_r2i1p1f1_gn",
]
)

# Make sure that those with fewer members are still returned
assert (
len(
xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={
"source": ["GFDL-CM4"],
"experiment": ["ssp585"],
"domain": "gr1",
},
restrict_members=restrict_members,
)
)
== 1
)

@pytest.mark.parametrize("allow_conversion", [True, False])
def test_allow_conversion(self, allow_conversion):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"evspsblpot": "D"},
other_search_criteria={
"institution": ["NOAA-GFDL"],
"experiment": ["ssp585"],
},
allow_conversion=allow_conversion,
)
assert len(out) == (2 if allow_conversion else 0)
if allow_conversion:
assert all(
v in out[list(out.keys())[0]].unique("variable")
for v in ["tasmin", "tasmax"]
)
assert "tas" not in out[list(out.keys())[0]].unique("variable")

def test_no_match(self):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tas": "YS"},
allow_resampling=False,
)
assert isinstance(out, dict)
assert len(out) == 0
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tas": "D"},
other_search_criteria={"experiment": "not_real"},
)
assert isinstance(out, dict)
assert len(out) == 0

def test_input_types(self, samplecat):
data_catalogs_2 = notebooks / "samples" / "pangeo-cmip6.json"

assert (
xs.search_data_catalogs(
data_catalogs=[samplecat, data_catalogs_2],
variables_and_freqs={"tas": "D"},
other_search_criteria={
"experiment": "ssp585",
"source": "NorESM.*",
"member": "r1i1p1f1",
},
).keys()
== xs.search_data_catalogs(
data_catalogs=[samplecat, self.cat],
variables_and_freqs={"tas": "D"},
other_search_criteria={
"experiment": "ssp585",
"source": "NorESM.*",
"member": "r1i1p1f1",
},
).keys()
)

def test_match_histfut(self):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={"experiment": "ssp585", "source": "GFDL-CM4"},
match_hist_and_fut=True,
)
k = list(out.keys())[0]
assert str(sorted(out[k].unique("date_start"))[0]) == "1985-01-01 00:00:00"
assert str(sorted(out[k].unique("date_start"))[1]) == "2015-01-01 00:00:00"

def test_fx(self):
cat = deepcopy(self.cat)
new_line = deepcopy(cat.df.iloc[0])
new_line["id"] = new_line["id"].replace(
new_line["experiment"], "another_experiment"
)
new_line["experiment"] = "another_experiment"
cat.esmcat._df = pd.concat([cat.df, new_line.to_frame().T], ignore_index=True)

with pytest.warns(
UserWarning,
match="doesn't have the fixed field sftlf, but it can be acquired from ",
):
out = xs.search_data_catalogs(
data_catalogs=cat,
variables_and_freqs={"sftlf": "fx"},
other_search_criteria={"experiment": "another_experiment"},
)
assert len(out) == 1
k = list(out.keys())[0]
np.testing.assert_array_equal(
out[k].df["experiment"],
"another_experiment",
)


class TestGetWarmingLevel:
def test_list(self):
out = xs.get_warming_level(