diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3b5a87b..0807701 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-latest', 'macos-latest', 'windows-latest'] - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.9', '3.10', '3.11'] env: MPLBACKEND: Agg # https://github.com/orgs/community/discussions/26434 steps: diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..f919fef --- /dev/null +++ b/docs/index.md @@ -0,0 +1,10 @@ +# Zampy + +A tool for downloading Land Surface Model input data. + +### Name origin + +Named after *Zam*; [the Avestan language term for the Zoroastrian concept of "earth"](https://en.wikipedia.org/wiki/Zam). + +## How to use Zampy +See the section ["using Zampy"](using_zampy.md). diff --git a/docs/using_zampy.md b/docs/using_zampy.md new file mode 100644 index 0000000..52bde20 --- /dev/null +++ b/docs/using_zampy.md @@ -0,0 +1,49 @@ +# Using Zampy + +## Installing Zampy +Zampy can be installed by doing: +```bash +pip install git+https://github.com/EcoExtreML/zampy +``` + +## Configuration +Zampy needs to be configured with a simple configuration file. + +You need to create this file in your *user home* `.config` directory: `~/.config/zampy/zampy_config.yml`. It should contain the following: + +```yaml +working_directory: /path_to_a_working_directory/ #for example: /home/bart/Zampy +``` + +## Formulating a recipe +A "recipe" is a file with the `yml` extension and has the following structure: + +```yaml +name: "test_recipe" + +download: + years: [2020, 2020] + bbox: [54, 6, 50, 3] # NESW + + datasets: + era5: + variables: + - 10m_v_component_of_wind + - surface_pressure + +convert: + convention: ALMA + frequency: 1H # outputs at 1 hour frequency. Pandas-like freq-keyword. + resolution: 0.5 # output resolution in degrees. 
+``` + +You can specify multiple datasets and multiple variables per dataset. + +## Running a recipe +Save this recipe to disk and run the following code in your shell: + +```bash +zampy --filename /home/username/path_to_file/simple_recipe.yml +``` + +This will execute the recipe (i.e. download, ingest, convert, resample and save the data). diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..17d0134 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,57 @@ +site_name: Zampy Documentation + +theme: + name: material + highlightjs: true + hljs_languages: + - yaml + - python + - bash + features: + - navigation.instant + - navigation.tabs + - navigation.tabs.sticky + - content.code.copy + + palette: + # Palette toggle for light mode + - scheme: default + toggle: + icon: material/weather-sunny + name: Switch to dark mode + primary: light green + accent: green + + # Palette toggle for dark mode + - scheme: slate + toggle: + icon: material/weather-night + name: Switch to light mode + primary: blue grey + accent: teal + +plugins: + - mkdocs-jupyter: + include_source: True + - search + - mkdocstrings: + handlers: + python: + options: + docstring_style: google + docstring_options: + ignore_init_summary: no + merge_init_into_class: yes + show_submodules: no + +markdown_extensions: + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences + +extra: + generator: false diff --git a/pyproject.toml b/pyproject.toml index ad83aae..11b5773 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ name = "zampy" description = "python package for getting Land Surface Model input data." 
readme = "README.md" license = "Apache-2.0" -requires-python = ">=3.8, <3.11" +requires-python = ">=3.9, <3.12" authors = [ {email = "b.schilperoort@esciencecenter.nl"}, {name = "Bart Schilperoort, Yang Liu, Fakhereh Alidoost"} @@ -43,17 +43,18 @@ classifiers = [ "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", ] dependencies = [ "requests", + "pyyaml", "netcdf4", "numpy", "pandas", "matplotlib", "xarray", + "scipy", # required for xarray.interpolate "rioxarray", # required for TIFF files "tqdm", "dask[diagnostics]", @@ -66,6 +67,9 @@ dependencies = [ ] dynamic = ["version"] +[project.scripts] +zampy="zampy.cli:run_recipe" + [project.optional-dependencies] dev = [ "bump2version", @@ -75,10 +79,19 @@ dev = [ "mypy", "types-requests", # type stubs for request lib "types-urllib3", # type stubs for url lib + "types-PyYAML", "pytest", "pytest-cov", + "pytest-mock", "pre-commit", ] +docs = [ + "mkdocs", + "mkdocs-material", + "mkdocs-jupyter", + "mkdocstrings[python]", + "mkdocs-gen-files", +] [tool.hatch.envs.default] features = ["dev"] @@ -99,6 +112,13 @@ coverage = [ "pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml tests/", ] +[tool.hatch.envs.docs] +features = ["docs"] + +[tool.hatch.envs.docs.scripts] +build = ["mkdocs build"] +serve = ["mkdocs serve"] + # [tool.hatch.envs.conda] # type = "conda" # python = "3.10" diff --git a/sonar-project.properties b/sonar-project.properties index c5a0059..8f280f2 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -10,4 +10,4 @@ sonar.links.ci=https://github.com/EcoExtreML/zampy/actions sonar.python.coverage.reportPaths=coverage.xml sonar.python.xunit.reportPath=xunit-result.xml sonar.python.pylint.reportPaths=pylint-report.txt -sonar.python.version=3.8, 3.9, 3.10 \ No newline at end of 
file +sonar.python.version=3.9, 3.10, 3.11 \ No newline at end of file diff --git a/src/zampy/cli.py b/src/zampy/cli.py new file mode 100644 index 0000000..e7b4564 --- /dev/null +++ b/src/zampy/cli.py @@ -0,0 +1,17 @@ +"""Implements CLI interface for Zampy.""" +from pathlib import Path +import click +from zampy.recipe import RecipeManager + + +@click.command() +@click.argument("recipe", type=click.Path(exists=True, path_type=Path)) +def run_recipe(recipe: Path) -> None: + """Run the recipe using the CLI.""" + click.echo(f"Executing recipe: {recipe}") + rm = RecipeManager(recipe) + rm.run() + + +if __name__ == "__main__": + run_recipe() diff --git a/src/zampy/datasets/__init__.py b/src/zampy/datasets/__init__.py index e1b1724..ccf538e 100644 --- a/src/zampy/datasets/__init__.py +++ b/src/zampy/datasets/__init__.py @@ -6,3 +6,11 @@ __all__ = ["dataset_protocol", "validation", "EthCanopyHeight", "ERA5"] + + +# This object tracks which datasets are available. +DATASETS: dict[str, type[dataset_protocol.Dataset]] = { + # All lowercase key. 
+ "era5": ERA5, + "eth_canopy_height": EthCanopyHeight, +} diff --git a/src/zampy/recipe.py b/src/zampy/recipe.py new file mode 100644 index 0000000..cac5274 --- /dev/null +++ b/src/zampy/recipe.py @@ -0,0 +1,133 @@ +""""All functionality to read and execute Zampy recipes.""" +from pathlib import Path +from typing import Any +import numpy as np +import yaml +from zampy.datasets import DATASETS +from zampy.datasets import converter +from zampy.datasets.dataset_protocol import Dataset +from zampy.datasets.dataset_protocol import SpatialBounds +from zampy.datasets.dataset_protocol import TimeBounds + + +def recipe_loader(recipe_path: Path) -> dict: + """Load the yaml recipe into a dictionary, and do some validation.""" + with recipe_path.open() as f: + recipe: dict = yaml.safe_load(f) + + if not all(key in recipe.keys() for key in ["name", "download", "convert"]): + msg = ( + "One of the following items are missing from the recipe:\n" + "name, download, convert." + ) + raise ValueError(msg) + + if "datasets" not in recipe["download"].keys(): + msg = "No dataset entry found in the recipe." + raise ValueError(msg) + + if not all( + key in recipe["convert"].keys() + for key in ["convention", "frequency", "resolution"] + ): + msg = ( + "One of the following items are missing from the recipe:\n" + "name, download, convert." + ) + raise ValueError(msg) + + return recipe + + +def config_loader() -> dict: + """Load the zampty config and validate the contents.""" + config_path = Path.home() / ".config" / "zampy" / "zampy_config.yml" + + if not config_path.exists(): + msg = f"No config file was found at '{config_path}'" + raise FileNotFoundError(msg) + + with config_path.open() as f: + config: dict = yaml.safe_load(f) + + if not isinstance(config, dict) or "working_directory" not in config.keys(): + msg = "No `working_directory` key found in the config file." 
+ raise ValueError(msg) + + return config + + +class RecipeManager: + """The recipe manager is used to get the required info, and then run the recipe.""" + + def __init__(self, recipe_path: Path) -> None: + """Instantiate the recipe manager, using a prepared recipe.""" + # Load & parse recipe + recipe = recipe_loader(recipe_path) + + self.start_year, self.end_year = recipe["download"]["years"] + self.timebounds = TimeBounds( + np.datetime64(f"{self.start_year}-01-01T00:00"), + np.datetime64(f"{self.end_year}-12-31T23:59"), + ) + self.spatialbounds = SpatialBounds(*recipe["download"]["bbox"]) + + self.datasets: dict[str, Any] = recipe["download"]["datasets"] + + self.convention = recipe["convert"]["convention"] + self.frequency = recipe["convert"]["frequency"] + self.resolution = recipe["convert"]["resolution"] + + # Load & parse config + config = config_loader() + self.download_dir = Path(config["working_directory"]) / "download" + self.ingest_dir = Path(config["working_directory"]) / "ingest" + self.data_dir = ( + Path(config["working_directory"]) / "output" / str(recipe["name"]) + ) + + # Create required directories if they do not exist yet: + for dir in [self.data_dir, self.download_dir, self.ingest_dir]: + dir.mkdir(parents=True, exist_ok=True) + + def run(self) -> None: + """Run the full recipe.""" + for dataset_name in self.datasets: + _dataset = DATASETS[dataset_name.lower()] + dataset: Dataset = _dataset() + variables: list[str] = self.datasets[dataset_name]["variables"] + + # Download datset + dataset.download( + download_dir=self.download_dir, + time_bounds=self.timebounds, + spatial_bounds=self.spatialbounds, + variable_names=variables, + ) + + dataset.ingest(self.download_dir, self.ingest_dir) + + ds = dataset.load( + ingest_dir=self.ingest_dir, + time_bounds=self.timebounds, + spatial_bounds=self.spatialbounds, + variable_names=variables, + resolution=self.resolution, + regrid_method="flox", + ) + + ds = converter.convert(ds, dataset, 
convention=self.convention) + + ds = ds.resample(time=self.frequency).mean() + + comp = dict(zlib=True, complevel=5) + encoding = {var: comp for var in ds.data_vars} + fname = ( # e.g. "era5_2010-2020.nc" + f"{dataset_name.lower()}_{self.start_year}-{self.end_year}.nc" + ) + ds.to_netcdf(path=self.data_dir / fname, encoding=encoding) + + print( + "Finished running the recipe. Output data can be found at:\n" + f" {self.data_dir}" + ) diff --git a/src/zampy/utils/regrid.py b/src/zampy/utils/regrid.py index 969b538..c52af19 100644 --- a/src/zampy/utils/regrid.py +++ b/src/zampy/utils/regrid.py @@ -105,7 +105,7 @@ def _groupby_regrid( ds_out = ds_out.swap_dims( {"latitude_bins": "latitude", "longitude_bins": "longitude"} ) - ds_out = ds_out.drop(["latitude_bins", "longitude_bins"]) + ds_out = ds_out.drop_vars(["latitude_bins", "longitude_bins"]) return ds_out.transpose("time", "latitude", "longitude", ...) diff --git a/tests/test_recipes/generate_test_data.py b/tests/test_recipes/generate_test_data.py new file mode 100644 index 0000000..46b9b96 --- /dev/null +++ b/tests/test_recipes/generate_test_data.py @@ -0,0 +1,72 @@ +"""Generates test data for running the recipe tests.""" +from pathlib import Path +import numpy as np +import pandas as pd +import xarray as xr +from zampy.datasets.dataset_protocol import SpatialBounds +from zampy.datasets.dataset_protocol import TimeBounds + + +def generate_era5_file( + varname: str, + time_bounds: TimeBounds, + spatial_bounds: SpatialBounds, + test_value: float, + resolution: float, + time_res="1H", +) -> xr.Dataset: + time_coords = pd.date_range( + start=time_bounds.start, end=time_bounds.end, freq=time_res, inclusive="left" + ) + lat_coords = np.arange( + start=np.round(spatial_bounds.south - 1), + stop=np.round(spatial_bounds.north + 1), + step=resolution, + ) + lon_coords = np.arange( + start=np.round(spatial_bounds.west - 1), + stop=np.round(spatial_bounds.east + 1), + step=resolution, + ) + data = np.zeros((len(lon_coords), 
len(lat_coords), len(time_coords))) + test_value + + ds = xr.Dataset( + data_vars={ERA5_LOOKUP[varname][1]: (("longitude", "latitude", "time"), data)}, + coords={ + "longitude": lon_coords, + "latitude": lat_coords, + "time": time_coords, + }, + ) + ds[ERA5_LOOKUP[varname][1]].attrs["units"] = ERA5_LOOKUP[varname][0] + ds["latitude"].attrs["units"] = "degrees_north" + ds["longitude"].attrs["units"] = "degrees_east" + + return ds + + +ERA5_LOOKUP = { # name: (unit, fname) + "10m_u_component_of_wind": ("m s**-1", "u10"), + "10m_v_component_of_wind": ("m s**-1", "v10"), + "surface_pressure": ("Pa", "sp"), +} + + +def generate_era5_files( + directory: Path, + variables: list[str], + spatial_bounds: SpatialBounds, + time_bounds: TimeBounds, +) -> None: + data_dir_era5 = directory / "era5" + data_dir_era5.mkdir() + + for var in variables: + ds = generate_era5_file( + varname=var, + time_bounds=time_bounds, + spatial_bounds=spatial_bounds, + test_value=1.0, + resolution=0.25, + ) + ds.to_netcdf(path=data_dir_era5 / f"era5_{var}.nc") diff --git a/tests/test_recipes/recipes/era5_recipe.yml b/tests/test_recipes/recipes/era5_recipe.yml new file mode 100644 index 0000000..576fb15 --- /dev/null +++ b/tests/test_recipes/recipes/era5_recipe.yml @@ -0,0 +1,16 @@ +name: "era5_recipe" + +download: + years: [2020, 2020] + bbox: [51, 4, 50, 3] # NESW + + datasets: + era5: + variables: + - 10m_v_component_of_wind + - surface_pressure + +convert: + convention: ALMA + frequency: 1H # outputs at 1 hour frequency. Pandas-like freq-keyword. + resolution: 0.5 # output resolution in degrees. 
diff --git a/tests/test_recipes/test_config_loader.py b/tests/test_recipes/test_config_loader.py new file mode 100644 index 0000000..a3d34ad --- /dev/null +++ b/tests/test_recipes/test_config_loader.py @@ -0,0 +1,41 @@ +from pathlib import Path +import pytest +from zampy.recipe import config_loader + + +def test_valid_config(tmp_path: Path, mocker): + mocker.patch( + "pathlib.Path.home", + return_value=tmp_path, + ) + config_dir = tmp_path / ".config" / "zampy" + config_dir.mkdir(parents=True) + valid_config = f"working_directory: {tmp_path}\n" + with (config_dir / "zampy_config.yml").open("w") as f: + f.write(valid_config) + + config = config_loader() + assert config == {"working_directory": str(tmp_path)} + + +def test_missing_config(tmp_path: Path, mocker): + mocker.patch( + "pathlib.Path.home", + return_value=tmp_path, + ) + with pytest.raises(FileNotFoundError): + config_loader() + + +def test_missing_key(tmp_path: Path, mocker): + mocker.patch( + "pathlib.Path.home", + return_value=tmp_path, + ) + config_dir = tmp_path / ".config" / "zampy" + config_dir.mkdir(parents=True) + with (config_dir / "zampy_config.yml").open("w") as f: + f.write("nonsense") + + with pytest.raises(ValueError, match="No `working_directory` key"): + config_loader() diff --git a/tests/test_recipes/test_recipe_loader.py b/tests/test_recipes/test_recipe_loader.py new file mode 100644 index 0000000..1f3d7cc --- /dev/null +++ b/tests/test_recipes/test_recipe_loader.py @@ -0,0 +1,80 @@ +"""Test the recipe loader.""" +import pytest +from zampy.recipe import recipe_loader + + +valid_recipe = """ +name: "Test recipe 2" +download: + years: [2020, 2020] + bbox: [54, 6, 50, 3] # NESW + datasets: + era5: + variables: + - 10m_v_component_of_wind + - surface_pressure +convert: + convention: ALMA + frequency: 1H + resolution: 0.5 +""" + +recipe_missing_datasets = """ +name: "Test recipe 2" +download: + years: [2020, 2020] + bbox: [54, 6, 50, 3] # NESW +convert: + convention: ALMA + frequency: 1H + 
resolution: 0.5 +""" + +recipe_missing_name = """ +download: + years: [2020, 2020] + bbox: [54, 6, 50, 3] # NESW + datasets: + era5: + variables: + - 10m_v_component_of_wind + - surface_pressure +convert: + convention: ALMA + frequency: 1H + resolution: 0.5 +""" + +recipe_missing_convention = """ +name: "Test recipe 2" +download: + years: [2020, 2020] + bbox: [54, 6, 50, 3] # NESW + datasets: + era5: + variables: + - 10m_v_component_of_wind + - surface_pressure +convert: + frequency: 1H + resolution: 0.5 +""" + + +def test_valid_recipe(tmp_path): + recipe_path = tmp_path / "valid_recipe.yml" + with recipe_path.open("w") as f: + f.write(valid_recipe) + recipe_loader(recipe_path) + + +@pytest.mark.parametrize( + "recipe", [recipe_missing_convention, recipe_missing_datasets, recipe_missing_name] +) +def test_invalid_recipes(tmp_path, recipe): + recipe_path = tmp_path / "invalid_recipe.yml" + with recipe_path.open("w") as f: + f.write(recipe) + + with pytest.raises(ValueError): + recipe_loader(recipe_path) diff --git a/tests/test_recipes/test_simple_recipe.py b/tests/test_recipes/test_simple_recipe.py new file mode 100644 index 0000000..064c7b9 --- /dev/null +++ b/tests/test_recipes/test_simple_recipe.py @@ -0,0 +1,44 @@ +"""Testing a simple recipe.""" +from pathlib import Path +from unittest.mock import patch +import generate_test_data +import numpy as np +import xarray as xr +from zampy.datasets import DATASETS +from zampy.datasets.dataset_protocol import SpatialBounds +from zampy.datasets.dataset_protocol import TimeBounds +from zampy.datasets.dataset_protocol import write_properties_file +from zampy.recipe import RecipeManager + + +RECIPE_FILE = Path(__file__).parent / "recipes" / "era5_recipe.yml" + + +def test_recipe(tmp_path: Path, mocker): + with (patch.object(DATASETS["era5"], "download"),): + mocker.patch( + "zampy.recipe.config_loader", + return_value={"working_directory": str(tmp_path.absolute())}, + ) + rm = RecipeManager(RECIPE_FILE.absolute()) + + 
spatial_bounds = SpatialBounds(51, 4, 50, 3) + time_bounds = TimeBounds( + np.datetime64("2020-01-01T00:00"), np.datetime64("2020-12-31T23:59") + ) + variables = ["10m_v_component_of_wind", "surface_pressure"] + + generate_test_data.generate_era5_files( + directory=tmp_path / "download", + variables=variables, + spatial_bounds=spatial_bounds, + time_bounds=time_bounds, + ) + write_properties_file( + tmp_path / "download" / "era5", spatial_bounds, time_bounds, variables + ) + + rm.run() + + ds = xr.open_mfdataset(str(tmp_path / "output" / "era5_recipe" / "*.nc")) + assert all(var in ds.data_vars for var in ["Psurf", "Wind_N"])