Merge pull request #251 from Ouranosinc/test_extract
Test search_data_catalogs
RondeauG authored Sep 13, 2023
2 parents f305ca5 + b0542e4 commit 2d1a7fd
Showing 5 changed files with 329 additions and 53 deletions.
5 changes: 3 additions & 2 deletions HISTORY.rst
@@ -12,18 +12,19 @@ Announcements

New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* N/A
* Added the ability to search for simulations that reach a given warming level. (:pull:`251`).

Breaking changes
^^^^^^^^^^^^^^^^
* N/A

Bug fixes
^^^^^^^^^
* N/A
* Fixed a bug in ``xs.search_data_catalogs`` when searching for fixed fields and specific experiments/members. (:pull:`251`).

Internal changes
^^^^^^^^^^^^^^^^
* Continued work on adding tests. (:pull:`251`).
* Fixed pre-commit's pretty-format-json so it ignores notebooks. (:pull:`254`).
* Fixed the labeler so docs/CI isn't automatically added for contributions by new collaborators. (:pull:`254`).
* Made it so that `tests` are no longer treated as an installable package. (:pull:`248`).
6 changes: 4 additions & 2 deletions docs/notebooks/1_catalog.ipynb
@@ -254,7 +254,7 @@
"- `allow_conversion` is used to allow searching for calculable variables, in the case where the requested variable would not be available.\n",
"- `restrict_resolution` is used to limit the results to the finest or coarsest resolution available for each source.\n",
"- `restrict_members` is used to limit the results to a maximum number of realizations for each source.\n",
"- `restrict_warming_level` is used to limit the results to only datasets that are present in the csv used for calculating warming levels.\n",
"- `restrict_warming_level` is used to limit the results to only datasets that are present in the csv used for calculating warming levels. You can also pass a dict to verify that a given warming level is reached.\n",
"\n",
"Note that compared to `search`, the result of `search_data_catalog` is a dictionary with one entry per unique ID. A given unique ID might contain multiple datasets as per `intake-esm`'s definition, because it groups catalog lines per *id - domain - processing_level - xrfreq*. Thus, it would separate model data that exists at different frequencies.\n",
"\n",
@@ -390,7 +390,9 @@
" data_catalogs=[f\"{Path().absolute()}/samples/pangeo-cmip6.json\"],\n",
" variables_and_freqs=variables_and_freqs,\n",
" match_hist_and_fut=True,\n",
" restrict_warming_level=True, # In this case all models exist in our database, so nothing gets eliminated.\n",
" restrict_warming_level={\n",
" \"wl\": 2\n",
" }, # SSP126 gets eliminated, since it doesn't reach +2°C by 2100.\n",
")\n",
"\n",
"cat_sim"
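For reference, a minimal standalone sketch (not part of this commit) of the two `restrict_warming_level` modes shown in the notebook cell above, assuming the working directory is docs/notebooks so that the sample pangeo-cmip6.json catalog path resolves:

from pathlib import Path

import xscen as xs

# Assumed path: the sample catalog shipped with the xscen notebooks,
# resolved relative to docs/notebooks.
cat_path = f"{Path().absolute()}/samples/pangeo-cmip6.json"

# Boolean mode: keep only datasets present in the warming-level database.
cat_all = xs.search_data_catalogs(
    data_catalogs=[cat_path],
    variables_and_freqs={"tasmax": "D"},
    match_hist_and_fut=True,
    restrict_warming_level=True,
)

# Dict mode: additionally require that +2°C is reached
# (per the notebook, SSP126 is eliminated since it stays below +2°C by 2100).
cat_wl2 = xs.search_data_catalogs(
    data_catalogs=[cat_path],
    variables_and_freqs={"tasmax": "D"},
    match_hist_and_fut=True,
    restrict_warming_level={"wl": 2},
)

# Each call returns a dictionary with one entry per unique dataset ID.
print(len(cat_all), len(cat_wl2))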
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -26,7 +26,7 @@ def remove_data_folder():
request.addfinalizer(remove_data_folder)


@pytest.mark.requires_docs
@pytest.mark.requires_netcdf
@pytest.fixture(scope="session")
def samplecat():
"""Generate a sample catalog with the tutorial netCDFs."""
262 changes: 262 additions & 0 deletions tests/test_extract.py
@@ -1,12 +1,274 @@
from copy import deepcopy

import numpy as np
import pandas as pd
import pytest
from conftest import notebooks
from xclim.testing.helpers import test_timeseries as timeseries

import xscen as xs


class TestSearchDataCatalogs:
cat = xs.DataCatalog(notebooks / "samples" / "pangeo-cmip6.json")

@pytest.mark.parametrize(
"variables_and_freqs, other_arg",
[
({"tasmin": "D"}, None),
({"sftlf": "fx"}, "other"),
({"tasmin": "D", "sftlf": "fx"}, "exclusion"),
],
)
def test_basic(self, variables_and_freqs, other_arg):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs=variables_and_freqs,
other_search_criteria={"experiment": ["ssp585"]}
if other_arg == "other"
else None,
exclusions={"member": "r2.*"} if other_arg == "exclusion" else None,
)
        assert len(out) == (13 if other_arg is None else 2 if other_arg == "other" else 6)

@pytest.mark.parametrize(
"periods, coverage_kwargs",
[
([["2020", "2030"], ["2035", "2040"]], None),
([["1900", "2030"], ["2035", "2040"]], None),
([["2020", "2130"]], {"coverage": 0.70}),
],
)
def test_periods(self, periods, coverage_kwargs):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
periods=periods,
coverage_kwargs=coverage_kwargs,
)
assert len(out) == (0 if periods[0] == ["1900", "2030"] else 5)

def test_ids(self):
out = xs.search_data_catalogs(
data_catalogs=deepcopy(self.cat),
variables_and_freqs={"tasmin": "D"},
id_columns=["source"],
)
assert len(out) == 3
assert len(out["NorESM2-MM"].df) == 5

@pytest.mark.parametrize("allow_resampling", [True, False])
def test_allow_resampling(self, allow_resampling):
out = xs.search_data_catalogs(
data_catalogs=deepcopy(self.cat),
variables_and_freqs={"tasmin": "YS"},
allow_resampling=allow_resampling,
)
assert len(out) == (13 if allow_resampling else 0)

@pytest.mark.parametrize(
"restrict_warming_level",
[
True,
{"wl": 2, "ignore_member": True},
{"wl": 4},
],
)
def test_warminglevel(self, restrict_warming_level):
cat = deepcopy(self.cat)
new_line = deepcopy(cat.df.iloc[13])
new_line["experiment"] = "ssp245"
new_line["id"] = xs.catalog.generate_id(new_line.to_frame().T).iloc[0]
cat.esmcat._df = pd.concat([cat.df, new_line.to_frame().T], ignore_index=True)

out = xs.search_data_catalogs(
data_catalogs=cat,
variables_and_freqs={"tasmax": "D"},
restrict_warming_level=restrict_warming_level,
)
if isinstance(restrict_warming_level, bool):
assert len(out) == 5
elif restrict_warming_level == {"wl": 2, "ignore_member": True}:
assert len(out) == 5
elif restrict_warming_level == {"wl": 4}:
assert len(out) == 2

@pytest.mark.parametrize("restrict_resolution", [None, "finest", "coarsest"])
def test_restrict_resolution(self, restrict_resolution):
cat = deepcopy(self.cat)
for i in range(2):
new_line = deepcopy(cat.df.iloc[0])
new_line["mip_era"] = "CMIP5"
new_line["activity"] = "CORDEX"
new_line["institution"] = "CCCma"
new_line["driving_model"] = "CanESM2"
new_line["source"] = "CRCM5"
new_line["experiment"] = "rcp85"
new_line["member"] = "r1i1p1"
new_line["domain"] = "NAM-22" if i == 0 else "NAM-11"
new_line["frequency"] = "day"
new_line["xrfreq"] = "D"
new_line["variable"] = ("tasmin",)
new_line["id"] = xs.catalog.generate_id(new_line.to_frame().T).iloc[0]

cat.esmcat._df = pd.concat(
[cat.df, new_line.to_frame().T], ignore_index=True
)

out = xs.search_data_catalogs(
data_catalogs=cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={
"source": ["GFDL-CM4", "CRCM5"],
"experiment": ["ssp585", "rcp85"],
},
restrict_resolution=restrict_resolution,
)
if restrict_resolution is None:
assert len(out) == 4
elif restrict_resolution == "finest":
assert len(out) == 2
assert any("NAM-11" in x for x in out)
assert any("_gr1" in x for x in out)
elif restrict_resolution == "coarsest":
assert len(out) == 2
assert any("NAM-22" in x for x in out)
assert any("_gr2" in x for x in out)

@pytest.mark.parametrize("restrict_members", [None, {"ordered": 2}])
def test_restrict_members(self, restrict_members):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={
"source": ["NorESM2-LM"],
"experiment": ["historical"],
},
restrict_members=restrict_members,
)
assert len(out) == (3 if restrict_members is None else 2)
if restrict_members is not None:
assert all(
o in out.keys()
for o in [
"CMIP_NCC_NorESM2-LM_historical_r1i1p1f1_gn",
"CMIP_NCC_NorESM2-LM_historical_r2i1p1f1_gn",
]
)

# Make sure that those with fewer members are still returned
assert (
len(
xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={
"source": ["GFDL-CM4"],
"experiment": ["ssp585"],
"domain": "gr1",
},
restrict_members=restrict_members,
)
)
== 1
)

@pytest.mark.parametrize("allow_conversion", [True, False])
def test_allow_conversion(self, allow_conversion):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"evspsblpot": "D"},
other_search_criteria={
"institution": ["NOAA-GFDL"],
"experiment": ["ssp585"],
},
allow_conversion=allow_conversion,
)
assert len(out) == (2 if allow_conversion else 0)
if allow_conversion:
assert all(
v in out[list(out.keys())[0]].unique("variable")
for v in ["tasmin", "tasmax"]
)
assert "tas" not in out[list(out.keys())[0]].unique("variable")

def test_no_match(self):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tas": "YS"},
allow_resampling=False,
)
assert isinstance(out, dict)
assert len(out) == 0
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tas": "D"},
other_search_criteria={"experiment": "not_real"},
)
assert isinstance(out, dict)
assert len(out) == 0

def test_input_types(self, samplecat):
data_catalogs_2 = notebooks / "samples" / "pangeo-cmip6.json"

assert (
xs.search_data_catalogs(
data_catalogs=[samplecat, data_catalogs_2],
variables_and_freqs={"tas": "D"},
other_search_criteria={
"experiment": "ssp585",
"source": "NorESM.*",
"member": "r1i1p1f1",
},
).keys()
== xs.search_data_catalogs(
data_catalogs=[samplecat, self.cat],
variables_and_freqs={"tas": "D"},
other_search_criteria={
"experiment": "ssp585",
"source": "NorESM.*",
"member": "r1i1p1f1",
},
).keys()
)

def test_match_histfut(self):
out = xs.search_data_catalogs(
data_catalogs=self.cat,
variables_and_freqs={"tasmin": "D"},
other_search_criteria={"experiment": "ssp585", "source": "GFDL-CM4"},
match_hist_and_fut=True,
)
k = list(out.keys())[0]
assert str(sorted(out[k].unique("date_start"))[0]) == "1985-01-01 00:00:00"
assert str(sorted(out[k].unique("date_start"))[1]) == "2015-01-01 00:00:00"

def test_fx(self):
cat = deepcopy(self.cat)
new_line = deepcopy(cat.df.iloc[0])
new_line["id"] = new_line["id"].replace(
new_line["experiment"], "another_experiment"
)
new_line["experiment"] = "another_experiment"
cat.esmcat._df = pd.concat([cat.df, new_line.to_frame().T], ignore_index=True)

with pytest.warns(
UserWarning,
match="doesn't have the fixed field sftlf, but it can be acquired from ",
):
out = xs.search_data_catalogs(
data_catalogs=cat,
variables_and_freqs={"sftlf": "fx"},
other_search_criteria={"experiment": "another_experiment"},
)
assert len(out) == 1
k = list(out.keys())[0]
np.testing.assert_array_equal(
out[k].df["experiment"],
"another_experiment",
)


class TestGetWarmingLevel:
def test_list(self):
out = xs.get_warming_level(