From ddef6200764fab11f49ab0841fece74d05b6c948 Mon Sep 17 00:00:00 2001
From: SarahAlidoost
Date: Mon, 9 Dec 2024 16:15:37 +0100
Subject: [PATCH 1/3] use h5netcdf xarray engine

---
 pyproject.toml                           |  2 +-
 src/zampy/datasets/cds_utils.py          |  4 ++--
 src/zampy/datasets/ecmwf_dataset.py      |  6 ++++--
 src/zampy/datasets/eth_canopy_height.py  | 11 +++++------
 src/zampy/datasets/fapar_lai.py          | 11 ++++++++---
 src/zampy/datasets/land_cover.py         | 10 ++++++----
 src/zampy/datasets/prism_dem.py          |  5 +++--
 src/zampy/recipe.py                      |  4 +++-
 tests/test_datasets/test_fapar_lai.py    |  4 +++-
 tests/test_recipes/test_simple_recipe.py | 20 +++++++++++++++-----
 10 files changed, 50 insertions(+), 27 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ac61248..f678f3e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,7 +53,7 @@ dependencies = [
     "numpy",
     "pandas",
     "matplotlib",
-    "xarray",
+    "xarray[io]",
     "scipy",  # required for xarray.interpolate
     "rioxarray",  # required for TIFF files
     "tqdm",
diff --git a/src/zampy/datasets/cds_utils.py b/src/zampy/datasets/cds_utils.py
index 4966344..c49ca9c 100644
--- a/src/zampy/datasets/cds_utils.py
+++ b/src/zampy/datasets/cds_utils.py
@@ -391,7 +391,7 @@ def convert_to_zampy(
         ds = parse_nc_file(file)
         # Rename the vswl data:
         ncfile = Path(str(ncfile).replace("volumetric_soil_water", "soil_moisture"))
-        ds.to_netcdf(path=ncfile)
+        ds.to_netcdf(path=ncfile, engine="h5netcdf")
 
 
 var_reference_ecmwf_to_zampy = {
@@ -444,7 +444,7 @@ def parse_nc_file(file: Path) -> xr.Dataset:
         CF/Zampy formatted xarray Dataset
     """
     # Open chunked: will be dask array -> file writing can be parallelized.
-    ds = xr.open_dataset(file, chunks={"x": 50, "y": 50})
+    ds = xr.open_dataset(file, chunks={"x": 50, "y": 50}, engine="h5netcdf")
 
     for variable in ds.variables:
         if variable in var_reference_ecmwf_to_zampy:
diff --git a/src/zampy/datasets/ecmwf_dataset.py b/src/zampy/datasets/ecmwf_dataset.py
index 0ffd19c..4897677 100644
--- a/src/zampy/datasets/ecmwf_dataset.py
+++ b/src/zampy/datasets/ecmwf_dataset.py
@@ -119,7 +119,9 @@ def load(
             if var in variable_names:
                 files += (ingest_dir / self.name).glob(f"{self.name}_{var}*.nc")
 
-        ds = xr.open_mfdataset(files, chunks={"latitude": 200, "longitude": 200})
+        ds = xr.open_mfdataset(
+            files, chunks={"latitude": 200, "longitude": 200}, engine="h5netcdf"
+        )
 
         # rename valid_time to time
         if "valid_time" in ds.dims:
@@ -152,7 +154,7 @@ def convert(
         for file in data_files:
             # start conversion process
             print(f"Start processing file `{file.name}`.")
-            ds = xr.open_dataset(file, chunks={"x": 50, "y": 50})
+            ds = xr.open_dataset(file, chunks={"x": 50, "y": 50}, engine="h5netcdf")
             ds = converter.convert(ds, dataset=self, convention=convention)
             # TODO: support derived variables
             # TODO: other calculations
diff --git a/src/zampy/datasets/eth_canopy_height.py b/src/zampy/datasets/eth_canopy_height.py
index ede1a7e..82d0674 100644
--- a/src/zampy/datasets/eth_canopy_height.py
+++ b/src/zampy/datasets/eth_canopy_height.py
@@ -135,7 +135,9 @@ def load(
         if self.variable_names[1] in variable_names:
             files += (ingest_dir / self.name).glob("*Map_SD.nc")
 
-        ds = xr.open_mfdataset(files, chunks={"latitude": 2000, "longitude": 2000})
+        ds = xr.open_mfdataset(
+            files, chunks={"latitude": 2000, "longitude": 2000}, engine="h5netcdf"
+        )
         ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))
 
         grid = xarray_regrid.create_regridding_dataset(
@@ -159,7 +161,7 @@ def convert(
         for file in data_files:
             # start conversion process
             print(f"Start processing file `{file.name}`.")
-            ds = xr.open_dataset(file, chunks={"x": 2000, "y": 2000})
+            ds = xr.open_dataset(file, chunks={"x": 2000, "y": 2000}, engine="h5netcdf")
             ds = converter.convert(ds, dataset=self, convention=convention)
             # TODO: support derived variables
             # TODO: other calculations
@@ -249,10 +251,7 @@ def convert_tiff_to_netcdf(
     ds = ds.interpolate_na(dim="longitude", limit=1)
     ds = ds.interpolate_na(dim="latitude", limit=1)
 
-    ds.to_netcdf(
-        path=ncfile,
-        encoding=ds.encoding,
-    )
+    ds.to_netcdf(path=ncfile, encoding=ds.encoding, engine="h5netcdf")
 
 
 def parse_tiff_file(file: Path, sd_file: bool = False) -> xr.Dataset:
diff --git a/src/zampy/datasets/fapar_lai.py b/src/zampy/datasets/fapar_lai.py
index f33df96..6d2bb15 100644
--- a/src/zampy/datasets/fapar_lai.py
+++ b/src/zampy/datasets/fapar_lai.py
@@ -150,7 +150,7 @@ def load(
         variable_names: list[str],
     ) -> xr.Dataset:
         files = list((ingest_dir / self.name).glob("*.nc"))
-        ds = xr.open_mfdataset(files)  # see issue 65
+        ds = xr.open_mfdataset(files, engine="h5netcdf")  # see issue 65
         ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))
 
         grid = xarray_regrid.create_regridding_dataset(
@@ -175,7 +175,9 @@ def convert(  # Will be removed, see issue #43.
         for file in data_files:
             # start conversion process
             print(f"Start processing file `{file.name}`.")
-            ds = xr.open_dataset(file, chunks={"latitude": 2000, "longitude": 2000})
+            ds = xr.open_dataset(
+                file, chunks={"latitude": 2000, "longitude": 2000}, engine="h5netcdf"
+            )
             ds = converter.convert(ds, dataset=self, convention=convention)
 
         return True
@@ -248,7 +250,9 @@ def download_fapar_lai(
 def ingest_ncfile(ncfile: Path, ingest_folder: Path) -> None:
     """Ingest the 'raw' netCDF file to the Zampy standard format."""
     print(f"Converting file {ncfile.name}...")
-    ds = xr.open_dataset(ncfile, decode_times=False, chunks={"lat": 5000, "lon": 5000})
+    ds = xr.open_dataset(
+        ncfile, decode_times=False, chunks={"lat": 5000, "lon": 5000}, engine="h5netcdf"
+    )
     ds = ds.rename(
         {
             "LAI": "leaf_area_index",
@@ -260,6 +264,7 @@ def ingest_ncfile(ncfile: Path, ingest_folder: Path) -> None:
     ds[["leaf_area_index"]].to_netcdf(
         path=ingest_folder / ncfile.name,
         encoding={"leaf_area_index": {"zlib": True, "complevel": 3}},
+        engine="h5netcdf",
     )
     ds.close()  # explicitly close to release file to system (for Windows)
 
diff --git a/src/zampy/datasets/land_cover.py b/src/zampy/datasets/land_cover.py
index ec30cc4..f91b733 100644
--- a/src/zampy/datasets/land_cover.py
+++ b/src/zampy/datasets/land_cover.py
@@ -136,7 +136,9 @@ def load(
             )
             raise ValueError(msg)
         files = list((ingest_dir / self.name).glob(f"{self.name}_*.nc"))
-        ds = xr.open_mfdataset(files, chunks={"latitude": 200, "longitude": 200})
+        ds = xr.open_mfdataset(
+            files, chunks={"latitude": 200, "longitude": 200}, engine="h5netcdf"
+        )
         ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))
 
         grid = xarray_regrid.create_regridding_dataset(
@@ -173,7 +175,7 @@ def convert(
 
         for file in data_files:
             print(f"Start processing file `{file.name}`.")
-            ds = xr.open_dataset(file)
+            ds = xr.open_dataset(file, engine="h5netcdf")
             ds = converter.convert(ds, dataset=self, convention=convention)
 
         return True
@@ -197,7 +199,7 @@ def unzip_raw_to_netcdf(
         print(f"File '{ncfile.name}' already exists, skipping...")
     else:
         ds = extract_netcdf_to_zampy(file)
-        ds.to_netcdf(path=ncfile)
+        ds.to_netcdf(path=ncfile, engine="h5netcdf")
 
 
 def extract_netcdf_to_zampy(file: Path) -> xr.Dataset:
@@ -220,7 +222,7 @@ def extract_netcdf_to_zampy(file: Path) -> xr.Dataset:
         zip_object.extract(zipped_file_name, path=unzip_folder)
 
         # only keep land cover class variable
-        with xr.open_dataset(unzip_folder / zipped_file_name) as ds:
+        with xr.open_dataset(unzip_folder / zipped_file_name, engine="h5netcdf") as ds:
             var_list = list(ds.data_vars)
             raw_variable = "lccs_class"
             var_list.remove(raw_variable)
diff --git a/src/zampy/datasets/prism_dem.py b/src/zampy/datasets/prism_dem.py
index 0952ad8..45ebbca 100644
--- a/src/zampy/datasets/prism_dem.py
+++ b/src/zampy/datasets/prism_dem.py
@@ -144,7 +144,7 @@ def preproc(ds: xr.Dataset) -> xr.Dataset:
             """Remove overlapping coordinates on the edges."""
             return ds.isel(latitude=slice(None, -1), longitude=slice(None, -1))
 
-        ds = xr.open_mfdataset(files, preprocess=preproc)
+        ds = xr.open_mfdataset(files, preprocess=preproc, engine="h5netcdf")
 
         grid = xarray_regrid.create_regridding_dataset(
             utils.make_grid(spatial_bounds, resolution)
@@ -168,7 +168,7 @@ def convert(
         for file in data_files:
             # start conversion process
             print(f"Start processing file `{file.name}`.")
-            ds = xr.open_dataset(file)
+            ds = xr.open_dataset(file, engine="h5netcdf")
             ds = converter.convert(ds, dataset=self, convention=convention)
 
         return True
@@ -216,6 +216,7 @@ def convert_raw_dem_to_netcdf(
     ds.to_netcdf(
         path=ncfile,
         encoding=ds.encoding,
+        engine="h5netcdf",
    )
 
 
diff --git a/src/zampy/recipe.py b/src/zampy/recipe.py
index 782363e..8cd41d9 100644
--- a/src/zampy/recipe.py
+++ b/src/zampy/recipe.py
@@ -162,7 +162,9 @@ def run(self) -> None:
             time_end = str(self.timebounds.end.astype("datetime64[Y]"))
             # e.g. "era5_2010-2020.nc"
             fname = f"{dataset_name.lower()}_{time_start}-{time_end}.nc"
-            ds.to_netcdf(path=self.data_dir / fname, encoding=encoding)
+            ds.to_netcdf(
+                path=self.data_dir / fname, encoding=encoding, engine="h5netcdf"
+            )
             del ds
 
             print(
diff --git a/tests/test_datasets/test_fapar_lai.py b/tests/test_datasets/test_fapar_lai.py
index 3d0ad16..cc0673a 100644
--- a/tests/test_datasets/test_fapar_lai.py
+++ b/tests/test_datasets/test_fapar_lai.py
@@ -89,7 +89,9 @@ def test_ingest(self, dummy_dir):
         lai_dataset = FaparLAI()
         lai_dataset.ingest(download_dir=data_folder, ingest_dir=dummy_dir)
 
-        ds = xr.open_mfdataset((dummy_dir / "fapar-lai").glob("*.nc"))
+        ds = xr.open_mfdataset(
+            (dummy_dir / "fapar-lai").glob("*.nc"), engine="h5netcdf"
+        )
         assert isinstance(ds, xr.Dataset)
 
     def test_load(self, dummy_dir):
diff --git a/tests/test_recipes/test_simple_recipe.py b/tests/test_recipes/test_simple_recipe.py
index 413525b..bd432c4 100644
--- a/tests/test_recipes/test_simple_recipe.py
+++ b/tests/test_recipes/test_simple_recipe.py
@@ -48,7 +48,9 @@ def test_recipe(tmp_path: Path, mocker):
 
         rm.run()
 
-        ds = xr.open_mfdataset(str(tmp_path / "output" / "era5_recipe" / "*.nc"))
+        ds = xr.open_mfdataset(
+            str(tmp_path / "output" / "era5_recipe" / "*.nc"), engine="h5netcdf"
+        )
         assert all(var in ds.data_vars for var in ["Psurf", "Wind_N"])
         # Check if time frequency is correct
         assert ds.time.diff("time").min() == np.timedelta64(1, "h")
@@ -85,7 +87,9 @@ def test_recipe_with_lower_frequency(tmp_path: Path, mocker):
 
         rm.run()
 
-        ds = xr.open_mfdataset(str(tmp_path / "output" / "era5_recipe" / "*.nc"))
+        ds = xr.open_mfdataset(
+            str(tmp_path / "output" / "era5_recipe" / "*.nc"), engine="h5netcdf"
+        )
 
         # check the length of the time dimension, mean values are used
         assert len(ds.time) == 4
@@ -121,7 +125,9 @@ def test_recipe_with_higher_frequency(tmp_path: Path, mocker):
 
         rm.run()
 
-        ds = xr.open_mfdataset(str(tmp_path / "output" / "era5_recipe" / "*.nc"))
+        ds = xr.open_mfdataset(
+            str(tmp_path / "output" / "era5_recipe" / "*.nc"), engine="h5netcdf"
+        )
 
         # check the length of the time dimension, data is interpolated
         assert len(ds.time) == 47
@@ -156,7 +162,9 @@ def test_recipe_with_two_time_values(tmp_path: Path, mocker):
 
         rm.run()
 
-        ds = xr.open_mfdataset(str(tmp_path / "output" / "era5_recipe" / "*.nc"))
+        ds = xr.open_mfdataset(
+            str(tmp_path / "output" / "era5_recipe" / "*.nc"), engine="h5netcdf"
+        )
 
         # check the length of the time dimension
         assert len(ds.time) == 2
@@ -191,7 +199,9 @@ def test_recipe_with_one_time_values(tmp_path: Path, mocker):
 
         rm.run()
 
-        ds = xr.open_mfdataset(str(tmp_path / "output" / "era5_recipe" / "*.nc"))
+        ds = xr.open_mfdataset(
+            str(tmp_path / "output" / "era5_recipe" / "*.nc"), engine="h5netcdf"
+        )
         # check the length of the time dimension, should not do interpolation or
         # extrapolation in time
         assert len(ds.time) == 1

From 0d05eb1ee85b5c40005d1d91b542501a3d6725db Mon Sep 17 00:00:00 2001
From: SarahAlidoost
Date: Tue, 10 Dec 2024 11:08:29 +0100
Subject: [PATCH 2/3] use h5netcdf instead of xarray io

---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f678f3e..5ac9606 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,13 +47,14 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
 ]
 dependencies = [
+    "h5netcdf",  # see issue 65
     "requests",
     "pyyaml",
     "netcdf4",
     "numpy",
     "pandas",
     "matplotlib",
-    "xarray[io]",
+    "xarray",
     "scipy",  # required for xarray.interpolate
     "rioxarray",  # required for TIFF files
     "tqdm",

From c7b91d3618bc0a29bdf54169ec6fe1e4adb3f6bf Mon Sep 17 00:00:00 2001
From: SarahAlidoost
Date: Tue, 10 Dec 2024 11:08:49 +0100
Subject: [PATCH 3/3] add changes to changelog

---
 CHANGELOG.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5b0e787..bcc6373 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,8 +6,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
 
 ## Unreleased
 
+## 0.3.0 (...)
+
+Zampy works with [new CDS and ADS
+API](https://confluence.ecmwf.int/display/CKB/Please+read%3A+CDS+and+ADS+migrating+to+new+infrastructure%3A+Common+Data+Store+%28CDS%29+Engine).
+
+### Added
+
+The zampy test dataset is generated using data from the new API, [PR
+  #62](https://github.com/EcoExtreML/zampy/pull/62).
+
+### Fixed
+
+- Add support for the new CDS API [PR
+  #62](https://github.com/EcoExtreML/zampy/pull/62)
+- Fixed segmentation fault error [PR
+  #66](https://github.com/EcoExtreML/zampy/pull/66)
 ## 0.2.0 (2024-09-02)
+
 First release of `zampy`. `zampy` is designed to retrieve data for LSM model
 input. It can help you prepare the data within the following steps: