From 1f592aa3bd52f622e8eceef9f1626601df541c58 Mon Sep 17 00:00:00 2001
From: Yang
Date: Mon, 16 Oct 2023 17:17:06 +0200
Subject: [PATCH] add ingest function and unzip

---
 src/zampy/datasets/land_cover.py | 87 ++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/zampy/datasets/land_cover.py b/src/zampy/datasets/land_cover.py
index 725abb0..469f9a5 100644
--- a/src/zampy/datasets/land_cover.py
+++ b/src/zampy/datasets/land_cover.py
@@ -2,6 +2,7 @@
 
 from pathlib import Path
+from zipfile import ZipFile
 import numpy as np
 from zampy.datasets import cds_utils
 from zampy.datasets import validation
 from zampy.datasets.dataset_protocol import SpatialBounds
@@ -85,3 +86,89 @@ def download(
         )
 
         return True
+
+    def ingest(
+        self,
+        download_dir: Path,
+        ingest_dir: Path,
+        overwrite: bool = False,
+    ) -> bool:
+        """Unpack the downloaded land cover archives into the ingest directory.
+
+        Args:
+            download_dir: Root folder holding the raw downloads; archives are read
+                from the subfolder named after this dataset.
+            ingest_dir: Root folder for ingested data; files are written to the
+                subfolder named after this dataset (created when missing).
+            overwrite: Replace files that already exist in the ingest folder. If
+                False, archives whose netCDF file already exists are skipped.
+
+        Returns:
+            True once all matching archives have been processed.
+        """
+        download_folder = download_dir / self.name
+        ingest_folder = ingest_dir / self.name
+        ingest_folder.mkdir(parents=True, exist_ok=True)
+
+        archive_files = sorted(download_folder.glob(f"{self.name}_*.zip"))
+
+        for file in archive_files:
+            unzip_raw_to_netcdf(
+                ingest_folder,
+                file=file,
+                overwrite=overwrite,
+            )
+
+        copy_properties_file(download_folder, ingest_folder)
+
+        return True
+
+
+def unzip_raw_to_netcdf(
+    ingest_folder: Path,
+    file: Path,
+    overwrite: bool = False,
+) -> None:
+    """Unpack a downloaded land cover zip archive into the ingest folder.
+
+    The netCDF file is stored under the archive's own name with a ".nc" suffix, so
+    the existence check below looks at the same file that a previous run produced.
+
+    Args:
+        ingest_folder: Folder where the files have to be written to.
+        file: Path to the land cover .zip archive.
+        overwrite: Overwrite all existing files. If False, files that already exist
+            will be skipped.
+    """
+    ncfile = ingest_folder / file.with_suffix(".nc").name
+    if ncfile.exists() and not overwrite:
+        print(f"File '{ncfile.name}' already exists, skipping...")
+        return
+
+    extracted = extract_netcdf_to_zampy(file, ingest_folder)
+    if extracted != ncfile:
+        # The member name inside the archive need not match the archive name.
+        extracted.replace(ncfile)
+
+
+def extract_netcdf_to_zampy(file: Path, ingest_folder: Path) -> Path:
+    """Extract the first netCDF member of a zip archive.
+
+    Args:
+        file: Path to the .zip archive.
+        ingest_folder: Folder the member is extracted into.
+
+    Returns:
+        Path of the extracted file.
+
+    Raises:
+        ValueError: If the archive contains no ".nc" file.
+    """
+    with ZipFile(file, "r") as zip_object:
+        netcdf_members = [
+            name for name in zip_object.namelist() if name.lower().endswith(".nc")
+        ]
+        if not netcdf_members:
+            msg = f"No netCDF file found in archive '{file}'."
+            raise ValueError(msg)
+        return Path(zip_object.extract(netcdf_members[0], path=ingest_folder))