From fde3888d7804ae8894303072c1cebd5c7d9b7e67 Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Thu, 15 Sep 2022 17:44:25 -0600 Subject: [PATCH 1/3] Update environment --- docs/environment.yml => ci/environment-docs.yml | 17 ++++++++++------- readthedocs.yml | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) rename docs/environment.yml => ci/environment-docs.yml (79%) diff --git a/docs/environment.yml b/ci/environment-docs.yml similarity index 79% rename from docs/environment.yml rename to ci/environment-docs.yml index 6b4b2648..a69e9c52 100644 --- a/docs/environment.yml +++ b/ci/environment-docs.yml @@ -4,27 +4,30 @@ channels: - nodefaults dependencies: - cftime - - furo + - distributed + - fsspec>=2022.7.0 + - gcsfs + - intake>=0.6.6 - jupyterlab - matplotlib - myst-nb - pip + - pydantic>=1.9 - python-graphviz - - python=3.9 + - python=3.10 - s3fs - - fsspec>=2022.7.0 - - intake>=0.6.6 - - pydantic>=1.9 - sphinx - sphinx-copybutton - sphinx-design + - watermark - - xarray-datatree + - xarray-datatree>=0.0.9 - xarray>=2022.06 - zarr>=2.12 - pip: + - furo>=2022.09.15 + - tornado>=6.2 - sphinxext-opengraph - autodoc_pydantic - - -r ../requirements.txt - -e .. 
diff --git a/readthedocs.yml b/readthedocs.yml index 89f13eb4..68e58958 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,6 +1,6 @@ version: 2 conda: - environment: docs/environment.yml + environment: ci/environment-docs.yml build: os: 'ubuntu-20.04' tools: From d59d9afbcaceed81a9b2f76fa3e7b8fd52ae13fd Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Thu, 15 Sep 2022 19:20:19 -0600 Subject: [PATCH 2/3] Re-organize docs --- ci/environment-docs.yml | 3 +- docs/source/conf.py | 1 + ...earch-query-criteria-via-require-all-on.md | 2 +- ...catalog-by-substring-and-regex-criteria.md | 15 ++- docs/source/how-to/index.md | 17 --- docs/source/how-to/manipulate-catalog.md | 27 ++--- docs/source/index.md | 114 ++++++++++++++++-- .../esm-catalog-spec.md | 0 docs/source/reference/index.md | 11 -- .../{index.md => loading-cmip6-data.md} | 31 ++--- 10 files changed, 143 insertions(+), 78 deletions(-) delete mode 100644 docs/source/how-to/index.md rename docs/source/{explanation => reference}/esm-catalog-spec.md (100%) delete mode 100644 docs/source/reference/index.md rename docs/source/tutorials/{index.md => loading-cmip6-data.md} (95%) diff --git a/ci/environment-docs.yml b/ci/environment-docs.yml index a69e9c52..33bd2b6c 100644 --- a/ci/environment-docs.yml +++ b/ci/environment-docs.yml @@ -5,8 +5,8 @@ channels: dependencies: - cftime - distributed + - ecgtools - fsspec>=2022.7.0 - - gcsfs - intake>=0.6.6 - jupyterlab @@ -20,7 +20,6 @@ dependencies: - sphinx - sphinx-copybutton - sphinx-design - - watermark - xarray-datatree>=0.0.9 - xarray>=2022.06 diff --git a/docs/source/conf.py b/docs/source/conf.py index 6b99f1a9..83e48394 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -34,6 +34,7 @@ nb_execution_mode = 'cache' nb_execution_timeout = 600 +nb_execution_raise_on_error = True extlinks = { 'issue': ('https://github.com/intake/intake-esm/issues/%s', 'GH#'), diff --git a/docs/source/how-to/enforce-search-query-criteria-via-require-all-on.md 
b/docs/source/how-to/enforce-search-query-criteria-via-require-all-on.md index e2d14736..b7be3cbb 100644 --- a/docs/source/how-to/enforce-search-query-criteria-via-require-all-on.md +++ b/docs/source/how-to/enforce-search-query-criteria-via-require-all-on.md @@ -12,7 +12,7 @@ kernelspec: ```{code-cell} ipython3 import intake -url = "https://gist.githubusercontent.com/andersy005/7f416e57acd8319b20fc2b88d129d2b8/raw/987b4b336d1a8a4f9abec95c23eed3bd7c63c80e/pangeo-gcp-subset.json" +url = "https://raw.githubusercontent.com/intake/intake-esm/main/tutorial-catalogs/GOOGLE-CMIP6.json" cat = intake.open_esm_datastore(url) cat ``` diff --git a/docs/source/how-to/filter-catalog-by-substring-and-regex-criteria.md b/docs/source/how-to/filter-catalog-by-substring-and-regex-criteria.md index dfcb69fc..63977ee4 100644 --- a/docs/source/how-to/filter-catalog-by-substring-and-regex-criteria.md +++ b/docs/source/how-to/filter-catalog-by-substring-and-regex-criteria.md @@ -16,6 +16,10 @@ import intake url = "https://ncar-cesm-lens.s3-us-west-2.amazonaws.com/catalogs/aws-cesm1-le.json" cat = intake.open_esm_datastore(url) +cat +``` + +```{code-cell} ipython3 cat.df.head() ``` @@ -24,7 +28,7 @@ By default, the and is case sensitive: ```{code-cell} ipython3 -cat.search(experiment="20C", long_name="wind").df +cat.search(experiment="20C", long_name="wind") ``` As you can see, the example above returns an empty catalog. @@ -40,7 +44,7 @@ a given column. 
Let's search for: - all entries whose variable long name **contains** `wind` ```{code-cell} ipython3 -cat.search(experiment="20C", long_name="wind*").df +cat.search(experiment="20C", long_name="wind*") ``` Now, let's search for: @@ -49,7 +53,12 @@ Now, let's search for: - all entries whose variable long name **starts** with `wind` ```{code-cell} ipython3 -cat.search(experiment="20C", long_name="^wind").df +cat_subset = cat.search(experiment="20C", long_name="^wind") +cat_subset +``` + +```{code-cell} ipython3 +cat_subset.df ``` ```{code-cell} ipython3 diff --git a/docs/source/how-to/index.md b/docs/source/how-to/index.md deleted file mode 100644 index 2c16c3c6..00000000 --- a/docs/source/how-to/index.md +++ /dev/null @@ -1,17 +0,0 @@ -# How to - -How to: - -```{toctree} ---- -maxdepth: 1 ---- - -install-intake-esm.md -build-a-catalog-from-timeseries-files.md -define-and-use-derived-variable-registry.md -use-catalogs-with-assets-containing-multiple-variables.md -filter-catalog-by-substring-and-regex-criteria.md -enforce-search-query-criteria-via-require-all-on.md -manipulate-catalog.md -``` diff --git a/docs/source/how-to/manipulate-catalog.md b/docs/source/how-to/manipulate-catalog.md index a3958c25..78bdb334 100644 --- a/docs/source/how-to/manipulate-catalog.md +++ b/docs/source/how-to/manipulate-catalog.md @@ -17,7 +17,7 @@ The in-memory representation of an Earth System Model (ESM) catalog is a pandas dataframe, and is accessible via the `.df` property: ```{code-cell} ipython3 -url = "https://gist.githubusercontent.com/andersy005/7f416e57acd8319b20fc2b88d129d2b8/raw/987b4b336d1a8a4f9abec95c23eed3bd7c63c80e/pangeo-gcp-subset.json" +url ="https://raw.githubusercontent.com/intake/intake-esm/main/tutorial-catalogs/GOOGLE-CMIP6.json" cat = intake.open_esm_datastore(url) cat.df.head() ``` @@ -31,8 +31,7 @@ Let's say we are interested in datasets with the following attributes: - `experiment_id=["historical"]` - `table_id="Amon"` -- `variable_id="tas"` -- 
`source_id=['TaiESM1', 'AWI-CM-1-1-MR', 'AWI-ESM-1-1-LR', 'BCC-CSM2-MR', 'BCC-ESM1', 'CAMS-CSM1-0', 'CAS-ESM2-0', 'UKESM1-0-LL']` +- `variable_id="ua"` In addition to these attributes, **we are interested in the first ensemble member (member_id) of each model (source_id) only**. @@ -47,17 +46,7 @@ We can run a query against the catalog: cat_subset = cat.search( experiment_id=["historical"], table_id="Amon", - variable_id="tas", - source_id=[ - "TaiESM1", - "AWI-CM-1-1-MR", - "AWI-ESM-1-1-LR", - "BCC-CSM2-MR", - "BCC-ESM1", - "CAMS-CSM1-0", - "CAS-ESM2-0", - "UKESM1-0-LL", - ], + variable_id="ua", ) cat_subset ``` @@ -83,6 +72,10 @@ df = grouped.first().reset_index() df.groupby("source_id")["member_id"].nunique() ``` +```{code-cell} ipython3 +df +``` + ### Step 3: Attach the new dataframe to our catalog object ```{code-cell} ipython3 @@ -93,18 +86,18 @@ cat_subset Let's load the subsetted catalog into a dictionary of datasets: ```{code-cell} ipython3 -dsets = cat_subset.to_dataset_dict(xarray_open_kwargs={"consolidated": True}) +dsets = cat_subset.to_dataset_dict() [key for key in dsets] ``` ```{code-cell} ipython3 -dsets["CMIP.CAS.CAS-ESM2-0.historical.Amon.gn"] +dsets["CMIP.IPSL.IPSL-CM6A-LR.historical.Amon.gr"] ``` ```{code-cell} ipython3 --- tags: [hide-input, hide-output] --- -import intake_esm # just to display version information +import intake_esm intake_esm.show_versions() ``` diff --git a/docs/source/index.md b/docs/source/index.md index 205d33dc..47041a4e 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,33 +1,133 @@ -# Welcome to Intake-esm's documentation! 
+--- +sd_hide_title: true +--- + +# Overview + +::::{grid} +:reverse: +:gutter: 3 4 4 4 +:margin: 1 2 1 2 + +:::{grid-item} +:columns: 12 4 4 4 + +```{image} _static/images/NSF_4-Color_bitmap_Logo.png +:width: 200px +:class: sd-m-auto +``` -`intake-esm` is a data cataloging utility built on top of [intake](https://github.com/intake/intake), [pandas](https://pandas.pydata.org/), and [xarray](https://xarray.pydata.org/en/stable/), and it's pretty awesome! +::: + +:::{grid-item} +:columns: 12 8 8 8 +:child-align: justify +:class: sd-fs-5 + +```{rubric} Intake-ESM + +``` + +A data cataloging utility built on top of [intake](https://github.com/intake/intake), [pandas](https://pandas.pydata.org/), and [xarray](https://xarray.pydata.org/en/stable/), and it's pretty awesome! + +```{button-ref} how-to/install-intake-esm +:ref-type: doc +:color: primary +:class: sd-rounded-pill + +Get Started +``` + +::: + +:::: + +--- + +## Motivation + +Computer simulations of the Earth’s climate and weather generate huge amounts of data. +These data are often persisted on HPC systems or in the cloud across multiple data +assets of a variety of formats ([netCDF](https://www.unidata.ucar.edu/software/netcdf/), [zarr](https://zarr.readthedocs.io/en/stable/), etc.). Finding, investigating, and +loading these data assets into compute-ready data containers cost time and effort. +The data user needs to know what data sets are available and the attributes describing +each data set before loading a specific data set and analyzing it. + +Finding, investigating, and loading these assets into data array containers +such as xarray can be a daunting task due to the large number of files +a user may be interested in. Intake-esm aims to address these issues by +providing the necessary functionality for searching, discovering, and accessing/loading data. 
+ +--- ## Get in touch - If you encounter any errors or problems with **intake-esm**, please open an issue at the GitHub [main repository](http://github.com/intake/intake-esm/issues). - If you have a question like “How do I find x?”, ask on [GitHub discussions](https://github.com/intake/intake-esm/discussions). Please include a self-contained reproducible example if possible. +--- + ```{toctree} --- maxdepth: 1 +caption: Tutorials hidden: --- +tutorials/loading-cmip6-data.md +``` + +```{toctree} +--- +maxdepth: 2 +caption: How to Guides and Examples +hidden: +--- + +how-to/install-intake-esm.md +how-to/build-a-catalog-from-timeseries-files.md +how-to/define-and-use-derived-variable-registry.md +how-to/use-catalogs-with-assets-containing-multiple-variables.md +how-to/filter-catalog-by-substring-and-regex-criteria.md +how-to/enforce-search-query-criteria-via-require-all-on.md +how-to/manipulate-catalog.md +``` + +```{toctree} +--- +maxdepth: 2 +caption: Reference +hidden: +--- + +reference/esm-catalog-spec.md +reference/api.md +reference/faq.md +reference/cmip_ap.md + -tutorials/index.md -how-to/index.md -explanation/index.md -reference/index.md ``` ```{toctree} --- maxdepth: 2 -caption: Contribute to intake-esm +caption: Development hidden: --- contributing.md +reference/changelog.md + +``` + +```{toctree} +--- +maxdepth: 2 +caption: Project Links +hidden: +--- + + GitHub Repo GitHub discussions diff --git a/docs/source/explanation/esm-catalog-spec.md b/docs/source/reference/esm-catalog-spec.md similarity index 100% rename from docs/source/explanation/esm-catalog-spec.md rename to docs/source/reference/esm-catalog-spec.md diff --git a/docs/source/reference/index.md b/docs/source/reference/index.md deleted file mode 100644 index 1446a1e2..00000000 --- a/docs/source/reference/index.md +++ /dev/null @@ -1,11 +0,0 @@ -# Reference - -```{toctree} ---- -maxdepth: 1 ---- -faq.md -cmip_ap.md -changelog.md -api.md -``` diff --git a/docs/source/tutorials/index.md 
b/docs/source/tutorials/loading-cmip6-data.md similarity index 95% rename from docs/source/tutorials/index.md rename to docs/source/tutorials/loading-cmip6-data.md index 7ab9b66d..478e1934 100644 --- a/docs/source/tutorials/index.md +++ b/docs/source/tutorials/loading-cmip6-data.md @@ -7,7 +7,7 @@ kernelspec: name: python3 --- -# Tutorial: loading CMIP6 data with Intake-esm +# Accessing CMIP6 data with intake-esm This notebook demonstrates how to access Google Cloud CMIP6 data using intake-esm. @@ -78,23 +78,14 @@ unique ``` ```{code-cell} ipython3 ---- -tags: [show-input, hide-output] ---- unique['source_id'] ``` ```{code-cell} ipython3 ---- -tags: [show-input, hide-output] ---- unique['experiment_id'] ``` ```{code-cell} ipython3 ---- -tags: [show-input, hide-output] ---- unique['table_id'] ``` @@ -171,7 +162,7 @@ Let’s create a quick plot for a slice of the data: ds.o2.isel(time=0, lev=0, member_id=range(1, 24, 4)).plot(col="member_id", col_wrap=3, robust=True) ``` -## Using custom preprocessing functions +## Use custom preprocessing functions When comparing many models it is often necessary to preprocess (e.g. rename certain variables) them before running some analysis step. The `preprocess` @@ -223,20 +214,20 @@ for k, ds in dset_dict_fixed.items(): This was just an example for one dimension. -## Load the data into a datatree +```{note} +Check out [xmip package](https://github.com/jbusecke/xMIP) +for a full renaming function for all available CMIP6 models and some other +utilities. 
+``` + +## Load datasets into an xarray-datatree using `to_datatree()` -We can also load our data into an xarray-datatree object using the following: +We can also load our data into an [xarray-datatree](https://xarray-datatree.readthedocs.io/en/latest/) object using the following: ```{code-cell} ipython3 tree = cat_pp.to_datatree(xarray_open_kwargs={"consolidated": True}, preprocess=helper_func) -tree -``` - -```{note} -Check out [xmip package](https://github.com/jbusecke/xMIP) -for a full renaming function for all available CMIP6 models and some other -utilities. +print(tree) ``` ```{code-cell} ipython3 From e46a9280c47f3dec56132012960a959a5534c7e6 Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Thu, 15 Sep 2022 19:22:24 -0600 Subject: [PATCH 3/3] fix image path --- docs/source/explanation/index.md | 8 -------- docs/source/index.md | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) delete mode 100644 docs/source/explanation/index.md diff --git a/docs/source/explanation/index.md b/docs/source/explanation/index.md deleted file mode 100644 index 94ac8dd4..00000000 --- a/docs/source/explanation/index.md +++ /dev/null @@ -1,8 +0,0 @@ -# Explanation - -```{toctree} ---- -maxdepth: 1 ---- -esm-catalog-spec.md -``` diff --git a/docs/source/index.md b/docs/source/index.md index 47041a4e..7f72d695 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -12,7 +12,7 @@ sd_hide_title: true :::{grid-item} :columns: 12 4 4 4 -```{image} _static/images/NSF_4-Color_bitmap_Logo.png +```{image} ../_static/images/NSF_4-Color_bitmap_Logo.png :width: 200px :class: sd-m-auto ```