diff --git a/.cruft.json b/.cruft.json index 919e8b66..f67b403f 100644 --- a/.cruft.json +++ b/.cruft.json @@ -11,7 +11,7 @@ "project_slug": "xscen", "project_short_description": "A climate change scenario-building analysis framework, built with xclim/xarray.", "pypi_username": "RondeauG", - "version": "0.7.4-beta", + "version": "0.7.5-beta", "use_pytest": "y", "use_black": "y", "add_pyup_badge": "n", diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d82123ad..2dfe235a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: rev: v3.10.1 hooks: - id: pyupgrade - args: [--py39-plus] + args: [ '--py39-plus' ] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: @@ -27,8 +27,8 @@ repos: rev: v1.10.0 hooks: - id: rst-inline-touching-normal - - repo: https://github.com/psf/black - rev: 23.7.0 + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 23.9.1 hooks: - id: black exclude: ^docs/ @@ -53,7 +53,7 @@ repos: rev: v0.3.8 hooks: - id: blackdoc - additional_dependencies: [ 'black==23.3.0' ] + additional_dependencies: [ 'black==23.9.1' ] exclude: config.py - repo: https://github.com/adrienverge/yamllint.git rev: v1.32.0 diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 50a353e3..303148b8 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -114,9 +114,14 @@ Ready to contribute? Here's how to set up `xscen` for local development. # or to simply generate the html $ cd docs/ $ make html + +.. note:: -9. Submit a pull request through the GitHub website. + When building the documentation, the default behaviour is to evaluate notebooks ('nbsphinx_execute = "always"'), rather than simply parse the content ('nbsphinx_execute = "never"'). Due to their complexity, this can sometimes be a very computationally demanding task and should only be performed when necessary (i.e.: when the notebooks have been modified). + + In order to speed up documentation builds, setting a value for the environment variable "SKIP_NOTEBOOKS" (e.g. "$ export SKIP_NOTEBOOKS=1") will prevent the notebooks from being evaluated on all subsequent "$ tox -e docs" or "$ make docs" invocations. +9. Submit a pull request through the GitHub website. Translating xscen ~~~~~~~~~~~~~~~~~ diff --git a/HISTORY.rst b/HISTORY.rst index baf99fe0..4bab7fbf 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -12,7 +12,7 @@ Announcements New features and enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -* N/A +* Added the ability to search for simulations that reach a given warming level. (:pull:`251`). Breaking changes ^^^^^^^^^^^^^^^^ @@ -20,10 +20,12 @@ Breaking changes Bug fixes ^^^^^^^^^ -* N/A +* Fixed a bug in ``xs.search_data_catalogs`` when searching for fixed fields and specific experiments/members. (:pull:`251`). +* Fixed a bug in the documentation build configuration that prevented stable/latest and tagged documentation builds from resolving on ReadTheDocs. (:pull:`256`). Internal changes ^^^^^^^^^^^^^^^^ +* Continued work on adding tests. (:pull:`251`). * Fixed pre-commit's pretty-format-json so it ignores notebooks. (:pull:`254`). * Fixed the labeler so docs/CI isn't automatically added for contributions by new collaborators. (:pull:`254`). * Made it so that `tests` are no longer treated as an installable package. (:pull:`248`). 
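The warming-level search noted in the HISTORY entry above can be exercised roughly as follows (a minimal sketch, assuming the tutorial catalog from docs/notebooks/samples is available; the +2 °C threshold and the 30-year window are illustrative values echoing the docstring example further down in this changeset):

    from pathlib import Path

    import xscen as xs

    # Illustrative path to the sample catalog shipped with the tutorial notebooks.
    cat_path = Path("docs/notebooks/samples/pangeo-cmip6.json")

    # A dict with a "wl" key is forwarded to xs.get_warming_level, so only the
    # datasets that actually reach the requested warming level are kept.
    results = xs.search_data_catalogs(
        data_catalogs=[cat_path],
        variables_and_freqs={"tasmax": "D"},
        match_hist_and_fut=True,
        restrict_warming_level={"wl": 2, "window": 30},
    )
    print(list(results))  # one key per dataset id that reaches +2 °C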
diff --git a/docs/conf.py b/docs/conf.py index c0eeccfd..dcee2304 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,14 +21,15 @@ import os import sys import warnings +from datetime import datetime from pathlib import Path -sys.path.insert(0, os.path.abspath('..')) -if os.environ.get('READTHEDOCS') and 'ESMFMKFILE' not in os.environ: +sys.path.insert(0, os.path.abspath("..")) +if os.environ.get("READTHEDOCS") and "ESMFMKFILE" not in os.environ: # RTD doesn't activate the env, and esmpy depends on a env var set there # We assume the `os` package is in {ENV}/lib/pythonX.X/os.py # See conda-forge/esmf-feedstock#91 and readthedocs/readthedocs.org#4067 - os.environ['ESMFMKFILE'] = str(Path(os.__file__).parent.parent / 'esmf.mk') + os.environ["ESMFMKFILE"] = str(Path(os.__file__).parent.parent / "esmf.mk") import xscen # noqa import xarray # noqa @@ -53,7 +54,7 @@ "sphinx.ext.viewcode", "nbsphinx", "sphinx_codeautolink", - "sphinx_copybutton" + "sphinx_copybutton", ] # To ensure that underlined fields (e.g. `_field`) are shown in the docs. @@ -68,25 +69,29 @@ autosectionlabel_maxdepth = 2 autosummary_generate = True -nbsphinx_execute = "always" -# To avoid running notebooks on linkcheck +nbsphinx_execute = "always" +# To avoid running notebooks on linkcheck and when building PDF. try: skip_notebooks = int(os.getenv("SKIP_NOTEBOOKS")) except TypeError: skip_notebooks = False if skip_notebooks: - warnings.warn("Not executing notebooks.") + warnings.warn("SKIP_NOTEBOOKS is set. Not executing notebooks.") nbsphinx_execute = "never" +elif os.getenv("READTHEDOCS_VERSION_NAME") in ["latest", "stable"] or os.getenv( + "READTHEDOCS_VERSION_TYPE" +) in ["tag"]: + if os.getenv("READTHEDOCS_OUTPUT") in ["pdf"]: + warnings.warn("Generating PDF version. Not executing notebooks.") + nbsphinx_execute = "never" # if skip_notebooks or os.getenv("READTHEDOCS_VERSION_TYPE") in [ # "branch", # "external", # ]: -# elif os.getenv("READTHEDOCS_VERSION_NAME") in ["latest", "stable"]: -# nbsphinx_execute = "always" -# else: -# nbsphinx_execute = "auto" +# warnings.warn("Not executing notebooks.") +# nbsphinx_execute = "never" # To avoid having to install these and burst memory limit on ReadTheDocs. # autodoc_mock_imports = [ @@ -136,14 +141,14 @@ # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = ['.rst'] +source_suffix = [".rst"] # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'xscen' -copyright = "2022, Ouranos Inc., Gabriel Rondeau-Genesse, and contributors" +project = "xscen" +copyright = f"2022-{datetime.now().year}, Ouranos Inc., Gabriel Rondeau-Genesse, and contributors" author = "Gabriel Rondeau-Genesse" # The version info for the project you're documenting, acts as replacement @@ -172,7 +177,7 @@ ] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -201,13 +206,13 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # -- Options for HTMLHelp output --------------------------------------- # Output file base name for HTML help builder. 
-htmlhelp_basename = 'xscendoc' +htmlhelp_basename = "xscendoc" # -- Options for LaTeX output ------------------------------------------ @@ -216,15 +221,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -234,9 +236,13 @@ # (source start file, target name, title, author, documentclass # [howto, manual, or own class]). latex_documents = [ - (master_doc, 'xscen.tex', - 'xscen Documentation', - 'Gabriel Rondeau-Genesse', 'manual'), + ( + master_doc, + "xscen.tex", + "xscen Documentation", + "Gabriel Rondeau-Genesse", + "manual", + ), ] @@ -244,11 +250,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'xscen', - 'xscen Documentation', - [author], 1) -] +man_pages = [(master_doc, "xscen", "xscen Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------- @@ -257,10 +259,13 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'xscen', - 'xscen Documentation', - author, - 'xscen', - 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "xscen", + "xscen Documentation", + author, + "xscen", + "One line description of project.", + "Miscellaneous", + ), ] diff --git a/docs/notebooks/1_catalog.ipynb b/docs/notebooks/1_catalog.ipynb index 458deba8..063b89ee 100644 --- a/docs/notebooks/1_catalog.ipynb +++ b/docs/notebooks/1_catalog.ipynb @@ -254,7 +254,7 @@ "- `allow_conversion` is used to allow searching for calculable variables, in the case where the requested variable would not be available.\n", "- `restrict_resolution` is used to limit the results to the finest or coarsest resolution available for each source.\n", "- `restrict_members` is used to limit the results to a maximum number of realizations for each source.\n", - "- `restrict_warming_level` is used to limit the results to only datasets that are present in the csv used for calculating warming levels.\n", + "- `restrict_warming_level` is used to limit the results to only datasets that are present in the csv used for calculating warming levels. You can also pass a dict to verify that a given warming level is reached.\n", "\n", "Note that compared to `search`, the result of `search_data_catalog` is a dictionary with one entry per unique ID. A given unique ID might contain multiple datasets as per `intake-esm`'s definition, because it groups catalog lines per *id - domain - processing_level - xrfreq*. Thus, it would separate model data that exists at different frequencies.\n", "\n", @@ -390,7 +390,9 @@ " data_catalogs=[f\"{Path().absolute()}/samples/pangeo-cmip6.json\"],\n", " variables_and_freqs=variables_and_freqs,\n", " match_hist_and_fut=True,\n", - " restrict_warming_level=True, # In this case all models exist in our database, so nothing gets eliminated.\n", + " restrict_warming_level={\n", + " \"wl\": 2\n", + " }, # SSP126 gets eliminated, since it doesn't reach +2°C by 2100.\n", ")\n", "\n", "cat_sim" diff --git a/setup.cfg b/setup.cfg index a037532e..c7349779 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.7.4-beta +current_version = 0.7.5-beta commit = True tag = False parse = (?P\d+)\.(?P\d+).(?P\d+)(\-(?P[a-z]+))? 
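For reference, the notebook-execution rules that docs/conf.py now applies can be reproduced outside Sphinx with a small helper (a sketch mirroring the logic added above; the function name is invented for illustration):

    import os
    import warnings


    def resolve_nbsphinx_execute() -> str:
        """Return "never" when notebooks should be skipped, "always" otherwise."""
        try:
            skip_notebooks = int(os.getenv("SKIP_NOTEBOOKS"))
        except TypeError:  # the variable is unset, so os.getenv returns None
            skip_notebooks = False

        if skip_notebooks:
            warnings.warn("SKIP_NOTEBOOKS is set. Not executing notebooks.")
            return "never"
        if os.getenv("READTHEDOCS_VERSION_NAME") in ["latest", "stable"] or os.getenv(
            "READTHEDOCS_VERSION_TYPE"
        ) in ["tag"]:
            if os.getenv("READTHEDOCS_OUTPUT") in ["pdf"]:
                warnings.warn("Generating PDF version. Not executing notebooks.")
                return "never"
        return "always"


    print(resolve_nbsphinx_execute())

Running this with SKIP_NOTEBOOKS=1 in the environment prints "never"; on a ReadTheDocs "stable" or tagged PDF build it also prints "never", and "always" everywhere else.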
diff --git a/setup.py b/setup.py index aa597b63..5971b887 100644 --- a/setup.py +++ b/setup.py @@ -102,6 +102,6 @@ def run(self): test_suite="tests", extras_require={"dev": dev_requirements}, url="https://github.com/Ouranosinc/xscen", - version="0.7.4-beta", + version="0.7.5-beta", zip_safe=False, ) diff --git a/tests/conftest.py b/tests/conftest.py index 96781d5b..6050ebb3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,7 +26,7 @@ def remove_data_folder(): request.addfinalizer(remove_data_folder) -@pytest.mark.requires_docs +@pytest.mark.requires_netcdf @pytest.fixture(scope="session") def samplecat(): """Generate a sample catalog with the tutorial netCDFs.""" diff --git a/tests/test_extract.py b/tests/test_extract.py index da5f3780..983f6454 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,12 +1,274 @@ from copy import deepcopy import numpy as np +import pandas as pd import pytest +from conftest import notebooks from xclim.testing.helpers import test_timeseries as timeseries import xscen as xs +class TestSearchDataCatalogs: + cat = xs.DataCatalog(notebooks / "samples" / "pangeo-cmip6.json") + + @pytest.mark.parametrize( + "variables_and_freqs, other_arg", + [ + ({"tasmin": "D"}, None), + ({"sftlf": "fx"}, "other"), + ({"tasmin": "D", "sftlf": "fx"}, "exclusion"), + ], + ) + def test_basic(self, variables_and_freqs, other_arg): + out = xs.search_data_catalogs( + data_catalogs=self.cat, + variables_and_freqs=variables_and_freqs, + other_search_criteria={"experiment": ["ssp585"]} + if other_arg == "other" + else None, + exclusions={"member": "r2.*"} if other_arg == "exclusion" else None, + ) + assert len(out) == 13 if other_arg is None else 2 if other_arg == "other" else 6 + + @pytest.mark.parametrize( + "periods, coverage_kwargs", + [ + ([["2020", "2030"], ["2035", "2040"]], None), + ([["1900", "2030"], ["2035", "2040"]], None), + ([["2020", "2130"]], {"coverage": 0.70}), + ], + ) + def test_periods(self, periods, coverage_kwargs): + out = xs.search_data_catalogs( + data_catalogs=self.cat, + variables_and_freqs={"tasmin": "D"}, + periods=periods, + coverage_kwargs=coverage_kwargs, + ) + assert len(out) == (0 if periods[0] == ["1900", "2030"] else 5) + + def test_ids(self): + out = xs.search_data_catalogs( + data_catalogs=deepcopy(self.cat), + variables_and_freqs={"tasmin": "D"}, + id_columns=["source"], + ) + assert len(out) == 3 + assert len(out["NorESM2-MM"].df) == 5 + + @pytest.mark.parametrize("allow_resampling", [True, False]) + def test_allow_resampling(self, allow_resampling): + out = xs.search_data_catalogs( + data_catalogs=deepcopy(self.cat), + variables_and_freqs={"tasmin": "YS"}, + allow_resampling=allow_resampling, + ) + assert len(out) == (13 if allow_resampling else 0) + + @pytest.mark.parametrize( + "restrict_warming_level", + [ + True, + {"wl": 2, "ignore_member": True}, + {"wl": 4}, + ], + ) + def test_warminglevel(self, restrict_warming_level): + cat = deepcopy(self.cat) + new_line = deepcopy(cat.df.iloc[13]) + new_line["experiment"] = "ssp245" + new_line["id"] = xs.catalog.generate_id(new_line.to_frame().T).iloc[0] + cat.esmcat._df = pd.concat([cat.df, new_line.to_frame().T], ignore_index=True) + + out = xs.search_data_catalogs( + data_catalogs=cat, + variables_and_freqs={"tasmax": "D"}, + restrict_warming_level=restrict_warming_level, + ) + if isinstance(restrict_warming_level, bool): + assert len(out) == 5 + elif restrict_warming_level == {"wl": 2, "ignore_member": True}: + assert len(out) == 5 + elif restrict_warming_level == {"wl": 
4}: + assert len(out) == 2 + + @pytest.mark.parametrize("restrict_resolution", [None, "finest", "coarsest"]) + def test_restrict_resolution(self, restrict_resolution): + cat = deepcopy(self.cat) + for i in range(2): + new_line = deepcopy(cat.df.iloc[0]) + new_line["mip_era"] = "CMIP5" + new_line["activity"] = "CORDEX" + new_line["institution"] = "CCCma" + new_line["driving_model"] = "CanESM2" + new_line["source"] = "CRCM5" + new_line["experiment"] = "rcp85" + new_line["member"] = "r1i1p1" + new_line["domain"] = "NAM-22" if i == 0 else "NAM-11" + new_line["frequency"] = "day" + new_line["xrfreq"] = "D" + new_line["variable"] = ("tasmin",) + new_line["id"] = xs.catalog.generate_id(new_line.to_frame().T).iloc[0] + + cat.esmcat._df = pd.concat( + [cat.df, new_line.to_frame().T], ignore_index=True + ) + + out = xs.search_data_catalogs( + data_catalogs=cat, + variables_and_freqs={"tasmin": "D"}, + other_search_criteria={ + "source": ["GFDL-CM4", "CRCM5"], + "experiment": ["ssp585", "rcp85"], + }, + restrict_resolution=restrict_resolution, + ) + if restrict_resolution is None: + assert len(out) == 4 + elif restrict_resolution == "finest": + assert len(out) == 2 + assert any("NAM-11" in x for x in out) + assert any("_gr1" in x for x in out) + elif restrict_resolution == "coarsest": + assert len(out) == 2 + assert any("NAM-22" in x for x in out) + assert any("_gr2" in x for x in out) + + @pytest.mark.parametrize("restrict_members", [None, {"ordered": 2}]) + def test_restrict_members(self, restrict_members): + out = xs.search_data_catalogs( + data_catalogs=self.cat, + variables_and_freqs={"tasmin": "D"}, + other_search_criteria={ + "source": ["NorESM2-LM"], + "experiment": ["historical"], + }, + restrict_members=restrict_members, + ) + assert len(out) == (3 if restrict_members is None else 2) + if restrict_members is not None: + assert all( + o in out.keys() + for o in [ + "CMIP_NCC_NorESM2-LM_historical_r1i1p1f1_gn", + "CMIP_NCC_NorESM2-LM_historical_r2i1p1f1_gn", + ] + ) + + # Make sure that those with fewer members are still returned + assert ( + len( + xs.search_data_catalogs( + data_catalogs=self.cat, + variables_and_freqs={"tasmin": "D"}, + other_search_criteria={ + "source": ["GFDL-CM4"], + "experiment": ["ssp585"], + "domain": "gr1", + }, + restrict_members=restrict_members, + ) + ) + == 1 + ) + + @pytest.mark.parametrize("allow_conversion", [True, False]) + def test_allow_conversion(self, allow_conversion): + out = xs.search_data_catalogs( + data_catalogs=self.cat, + variables_and_freqs={"evspsblpot": "D"}, + other_search_criteria={ + "institution": ["NOAA-GFDL"], + "experiment": ["ssp585"], + }, + allow_conversion=allow_conversion, + ) + assert len(out) == (2 if allow_conversion else 0) + if allow_conversion: + assert all( + v in out[list(out.keys())[0]].unique("variable") + for v in ["tasmin", "tasmax"] + ) + assert "tas" not in out[list(out.keys())[0]].unique("variable") + + def test_no_match(self): + out = xs.search_data_catalogs( + data_catalogs=self.cat, + variables_and_freqs={"tas": "YS"}, + allow_resampling=False, + ) + assert isinstance(out, dict) + assert len(out) == 0 + out = xs.search_data_catalogs( + data_catalogs=self.cat, + variables_and_freqs={"tas": "D"}, + other_search_criteria={"experiment": "not_real"}, + ) + assert isinstance(out, dict) + assert len(out) == 0 + + def test_input_types(self, samplecat): + data_catalogs_2 = notebooks / "samples" / "pangeo-cmip6.json" + + assert ( + xs.search_data_catalogs( + data_catalogs=[samplecat, data_catalogs_2], + 
variables_and_freqs={"tas": "D"}, + other_search_criteria={ + "experiment": "ssp585", + "source": "NorESM.*", + "member": "r1i1p1f1", + }, + ).keys() + == xs.search_data_catalogs( + data_catalogs=[samplecat, self.cat], + variables_and_freqs={"tas": "D"}, + other_search_criteria={ + "experiment": "ssp585", + "source": "NorESM.*", + "member": "r1i1p1f1", + }, + ).keys() + ) + + def test_match_histfut(self): + out = xs.search_data_catalogs( + data_catalogs=self.cat, + variables_and_freqs={"tasmin": "D"}, + other_search_criteria={"experiment": "ssp585", "source": "GFDL-CM4"}, + match_hist_and_fut=True, + ) + k = list(out.keys())[0] + assert str(sorted(out[k].unique("date_start"))[0]) == "1985-01-01 00:00:00" + assert str(sorted(out[k].unique("date_start"))[1]) == "2015-01-01 00:00:00" + + def test_fx(self): + cat = deepcopy(self.cat) + new_line = deepcopy(cat.df.iloc[0]) + new_line["id"] = new_line["id"].replace( + new_line["experiment"], "another_experiment" + ) + new_line["experiment"] = "another_experiment" + cat.esmcat._df = pd.concat([cat.df, new_line.to_frame().T], ignore_index=True) + + with pytest.warns( + UserWarning, + match="doesn't have the fixed field sftlf, but it can be acquired from ", + ): + out = xs.search_data_catalogs( + data_catalogs=cat, + variables_and_freqs={"sftlf": "fx"}, + other_search_criteria={"experiment": "another_experiment"}, + ) + assert len(out) == 1 + k = list(out.keys())[0] + np.testing.assert_array_equal( + out[k].df["experiment"], + "another_experiment", + ) + + class TestGetWarmingLevel: def test_list(self): out = xs.get_warming_level( diff --git a/tests/test_xscen.py b/tests/test_xscen.py index 74ac8c37..5bbcbc69 100644 --- a/tests/test_xscen.py +++ b/tests/test_xscen.py @@ -28,4 +28,4 @@ def test_package_metadata(self): contents = f.read() assert """Gabriel Rondeau-Genesse""" in contents assert '__email__ = "rondeau-genesse.gabriel@ouranos.ca"' in contents - assert '__version__ = "0.7.4-beta"' in contents + assert '__version__ = "0.7.5-beta"' in contents diff --git a/xscen/__init__.py b/xscen/__init__.py index ea232cc3..7e61e09c 100644 --- a/xscen/__init__.py +++ b/xscen/__init__.py @@ -52,7 +52,7 @@ __author__ = """Gabriel Rondeau-Genesse""" __email__ = "rondeau-genesse.gabriel@ouranos.ca" -__version__ = "0.7.4-beta" +__version__ = "0.7.5-beta" # monkeypatch so that warnings.warn() doesn't mention itself diff --git a/xscen/extract.py b/xscen/extract.py index 2bf82bc2..c0eb00a6 100644 --- a/xscen/extract.py +++ b/xscen/extract.py @@ -552,11 +552,13 @@ def search_data_catalogs( Currently only supports {"ordered": int} format. restrict_warming_level : bool, dict Used to restrict the results only to datasets that exist in the csv used to compute warming levels in `subset_warming_level`. - If True, this will only keep the datasets that have a mip_era, source, experiment - and member combination that exist in the csv. This does not guarantees that a given warming level will be reached, only that the datasets have corresponding columns in the csv. + If True, this will only keep the datasets that have a mip_era, source, experiment and member combination that exist in the csv. + This does not guarantee that a given warming level will be reached, only that the datasets have corresponding columns in the csv. More option can be added by passing a dictionary instead of a boolean. If {'ignore_member':True}, it will disregard the member when trying to match the dataset to a column. 
If {tas_csv: Path_to_csv}, it will use an alternative csv instead of the default one provided by xscen. + If 'wl' is a provided key, then `xs.get_warming_level` will be called and only datasets that reach the given warming level will be kept. + This can be combined with other arguments of the function, for example {'wl': 1.5, 'window': 30}. Notes ----- @@ -586,43 +588,31 @@ def search_data_catalogs( "registry": registry_from_module(load_xclim_module(conversion_yaml)) } - # Cast paths to single item list - if isinstance(data_catalogs, (str, Path)): + # Cast single items to a list + if isinstance(data_catalogs, (str, os.PathLike, DataCatalog)): data_catalogs = [data_catalogs] + # Open the catalogs given as paths + for i, dc in enumerate(data_catalogs): + if isinstance(dc, (str, os.PathLike)): + data_catalogs[i] = ( + DataCatalog(dc, **cat_kwargs) + if Path(dc).suffix == ".json" + else DataCatalog.from_df(dc) + ) - # Prepare a unique catalog to search from, with the DerivedCat added if required - if isinstance(data_catalogs, DataCatalog): - catalog = DataCatalog( - {"esmcat": data_catalogs.esmcat.dict(), "df": data_catalogs.df}, - **cat_kwargs, - ) - data_catalogs = [catalog] # simply for a meaningful logging line - elif isinstance(data_catalogs, list) and all( + if not isinstance(data_catalogs, list) or not all( isinstance(dc, DataCatalog) for dc in data_catalogs ): - catalog = DataCatalog( - { - "esmcat": data_catalogs[0].esmcat.dict(), - "df": pd.concat([dc.df for dc in data_catalogs], ignore_index=True), - }, - **cat_kwargs, - ) - elif isinstance(data_catalogs, list) and all( - isinstance(dc, str) for dc in data_catalogs - ): - data_catalogs = [ - DataCatalog(path) if path.endswith(".json") else DataCatalog.from_df(path) - for path in data_catalogs - ] - catalog = DataCatalog( - { - "esmcat": data_catalogs[0].esmcat.dict(), - "df": pd.concat([dc.df for dc in data_catalogs], ignore_index=True), - }, - **cat_kwargs, - ) - else: raise ValueError("Catalogs type not recognized.") + + # Prepare a unique catalog to search from, with the DerivedCat added if required + catalog = DataCatalog( + { + "esmcat": data_catalogs[0].esmcat.dict(), + "df": pd.concat([dc.df for dc in data_catalogs], ignore_index=True), + }, + **cat_kwargs, + ) logger.info(f"Catalog opened: {catalog} from {len(data_catalogs)} files.") if match_hist_and_fut: @@ -630,17 +620,18 @@ def search_data_catalogs( catalog = _dispatch_historical_to_future(catalog, id_columns) # Cut entries that do not match search criteria - if other_search_criteria: - catalog = catalog.search(**other_search_criteria) - logger.info( - f"{len(catalog.df)} assets matched the criteria : {other_search_criteria}." - ) if exclusions: ex = catalog.search(**exclusions) catalog.esmcat._df = pd.concat([catalog.df, ex.df]).drop_duplicates(keep=False) logger.info( f"Removing {len(ex.df)} assets based on exclusion dict : {exclusions}." ) + full_catalog = deepcopy(catalog) # Used for searching for fixed fields + if other_search_criteria: + catalog = catalog.search(**other_search_criteria) + logger.info( + f"{len(catalog.df)} assets matched the criteria : {other_search_criteria}." + ) if restrict_warming_level: if isinstance(restrict_warming_level, bool): restrict_warming_level = {} @@ -654,11 +645,16 @@ def search_data_catalogs( # Recreate id from user specifications catalog.df["id"] = ids else: - # Only fill in the missing IDs + # Only fill in the missing IDs. + # Unreachable line if 'id' is in the aggregation control columns, but this is a safety measure. 
catalog.df["id"] = catalog.df["id"].fillna(ids) if catalog.df.empty: - logger.warning("Found no match corresponding to the 'other' search criteria.") + warnings.warn( + "Found no match corresponding to the search criteria.", + UserWarning, + stacklevel=1, + ) return {} coverage_kwargs = coverage_kwargs or {} @@ -687,11 +683,14 @@ def search_data_catalogs( scat_id = { i: scat.df[i].iloc[0] for i in id_columns or ID_COLUMNS - if i in scat.df.columns + if ( + (i in scat.df.columns) + and (not pd.isnull(scat.df[i].iloc[0])) + ) } scat_id.pop("experiment", None) scat_id.pop("member", None) - varcat = catalog.search( + varcat = full_catalog.search( **scat_id, xrfreq=xrfreq, variable=var_id, @@ -700,8 +699,10 @@ def search_data_catalogs( if len(varcat) > 1: varcat.esmcat._df = varcat.df.iloc[[0]] if len(varcat) == 1: - logger.warning( - f"Dataset {sim_id} doesn't have the fixed field {var_id}, but it can be acquired from {varcat.df['id'].iloc[0]}." + warnings.warn( + f"Dataset {sim_id} doesn't have the fixed field {var_id}, but it can be acquired from {varcat.df['id'].iloc[0]}.", + UserWarning, + stacklevel=1, ) for i in {"member", "experiment", "id"}.intersection( varcat.df.columns @@ -851,7 +852,7 @@ def get_warming_level( tas_baseline_period : list [start, end] of the base period. The warming is calculated with respect to it. The default is ["1850", "1900"]. ignore_member : bool - Only used for Datasets. Decides whether to ignore the member when searching for the model run in tas_csv. + Decides whether to ignore the member when searching for the model run in tas_csv. tas_csv : str Path to a csv of annual global mean temperature with a row for each year and a column for each dataset. If None, it will default to data/IPCC_annual_global_tas.csv which was built from @@ -901,6 +902,8 @@ def get_warming_level( info["experiment"], info["member"], ) = real.split("_") + if ignore_member: + info["member"] = ".*" elif isinstance(real, dict) and set(real.keys()).issuperset( (set(FIELDS) - {"member"}) if ignore_member else FIELDS ): @@ -1353,7 +1356,7 @@ def _restrict_wl(df, restrictions: dict): # open csv annual_tas = pd.read_csv(tas_csv, index_col="year") - if restrictions["ignore_member"]: + if restrictions["ignore_member"] and "wl" not in restrictions: df["csv_name"] = df["mip_era"].str.cat( [df["source"], df["experiment"]], sep="_" ) @@ -1364,7 +1367,15 @@ def _restrict_wl(df, restrictions: dict): ) csv_source = list(annual_tas.columns[1:]) - to_keep = df["csv_name"].isin(csv_source) + if "wl" in restrictions: + to_keep = pd.Series( + [ + get_warming_level(x, **restrictions)[0] is not None + for x in df["csv_name"] + ] + ) + else: + to_keep = df["csv_name"].isin(csv_source) removed = pd.unique(df[~to_keep]["id"]) df = df[to_keep]