From dcfd5d3bc75707a65f3d36bc7ab3ec2062ee9f8c Mon Sep 17 00:00:00 2001
From: Yang
Date: Tue, 17 Oct 2023 12:06:23 +0200
Subject: [PATCH] coarsen land cover and finish ingest

---
Review note: this patch was amended during review. extract_netcdf_to_zampy
now closes the extracted netCDF file before deleting it, loads the regridded
result into memory first, and removes the extracted file even when the
conversion fails. The blob hashes on the "index" lines and the commit hash
above still refer to the original commit.

 pyproject.toml                   |  1 +
 src/zampy/datasets/land_cover.py | 79 +++++++++++++++++++++++++++++---
 2 files changed, 74 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e787454..2c180a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -65,6 +65,7 @@ dependencies = [
     "pint-xarray",
     "flox", # For better groupby methods.
     "cdsapi",
+    "xarray-regrid", # for land cover data regridding
 ]
 dynamic = ["version"]
 
diff --git a/src/zampy/datasets/land_cover.py b/src/zampy/datasets/land_cover.py
index 469f9a5..cf7b571 100644
--- a/src/zampy/datasets/land_cover.py
+++ b/src/zampy/datasets/land_cover.py
@@ -1,8 +1,11 @@
 """Land cover classification dataset."""
 
+import os
 from pathlib import Path
-import numpy as np
 from zipfile import ZipFile
+import numpy as np
+import xarray as xr
+import xarray_regrid
 from zampy.datasets import cds_utils
 from zampy.datasets import validation
 from zampy.datasets.dataset_protocol import SpatialBounds
@@ -18,7 +21,7 @@
 # ruff: noqa: D102
 
 
-class LandCover: # noqa: D101
+class LandCover:
     """Land cover classification gridded maps."""
 
     name = "land-cover"
@@ -129,10 +132,74 @@ def unzip_raw_to_netcdf(
     if ncfile.exists() and not overwrite:
         print(f"File '{ncfile.name}' already exists, skipping...")
     else:
-        extract_netcdf_to_zampy(file, ingest_folder)
+        ds = extract_netcdf_to_zampy(ingest_folder, file)
+        ds.to_netcdf(path=ncfile)
+
+
+def extract_netcdf_to_zampy(ingest_folder: Path, file: Path) -> xr.Dataset:
+    """Extract zipped data and convert to zampy format.
 
+    The first member of the archive is extracted into the ingest folder,
+    reduced to the land cover class variable and regridded to a global
+    0.25 degree grid. The extracted file is removed again afterwards.
+
+    Args:
+        ingest_folder: Folder where the files have to be written to.
+        file: Path to the land cover .zip archive.
 
-def extract_netcdf_to_zampy(file, ingest_folder):
-    with ZipFile(file, 'r') as zip_object:
+    Returns:
+        Coarse land cover data satisfying zampy standard.
+
+    Raises:
+        ValueError: If the extracted file has no "lccs_class" variable.
+    """
+    with ZipFile(file, "r") as zip_object:
         zipped_file_name = zip_object.namelist()[0]
-        zip_object.extract(zipped_file_name, path = ingest_folder)
+        zip_object.extract(zipped_file_name, path=ingest_folder)
+
+    extracted_file = ingest_folder / zipped_file_name
+    raw_variable = "lccs_class"
+    variable_name = "land_cover"
+
+    try:
+        # close the file handle before the extracted file is removed below
+        with xr.open_dataset(extracted_file) as ds_raw:
+            if raw_variable not in ds_raw.data_vars:
+                msg = f"Variable '{raw_variable}' not found in '{zipped_file_name}'."
+                raise ValueError(msg)
+
+            # only keep land cover class variable
+            other_variables = [var for var in ds_raw.data_vars if var != raw_variable]
+            ds = ds_raw.drop_vars(other_variables)
+
+            # coarsen to fit into memory
+            ds = ds.sortby(["lat", "lon"])
+            ds = ds.rename({"lat": "latitude", "lon": "longitude"})
+            new_grid = xarray_regrid.Grid(
+                north=90,
+                east=180,
+                south=-90,
+                west=-180,
+                resolution_lat=0.25,
+                resolution_lon=0.25,
+            )
+            target_dataset = xarray_regrid.create_regridding_dataset(new_grid)
+            ds_regrid = ds.regrid.most_common(
+                target_dataset, time_dim="time", max_mem=1e9
+            )
+
+            # the result must not depend on the file that is removed below
+            ds_regrid = ds_regrid.load()
+    finally:
+        os.remove(extracted_file)
+
+    # rename variable to follow the zampy convention
+    ds_regrid = ds_regrid.rename({raw_variable: variable_name})
+    ds_regrid[variable_name].attrs["units"] = str(
+        VARIABLE_REFERENCE_LOOKUP[variable_name].unit
+    )
+    ds_regrid[variable_name].attrs["description"] = VARIABLE_REFERENCE_LOOKUP[
+        variable_name
+    ].desc
+
+    return ds_regrid