Add support for land cover data (#73)
* Add LAI to global data selection, tests

* Correct test data generation

* Add instructions, download script & parsing nb

* Move global data operations to separate folder

* Modify nearest_non_nan to check max_distance

* Fix typing issue

* Start split of global data, add verification

* Finish refactoring global data

* Fix bugs

* Add checks for tile existence for DEM and h_canopy

* Compress txt assets

* Improve err msgs. Add tests for raises

* Fix linter issues

* Add landcover table, download+parse scripts

* Add land cover data support & test data

* Add checks, cleanup landcover parse notebook

* Apply suggestions from Sarah's code review

Co-authored-by: SarahAlidoost <[email protected]>

---------

Co-authored-by: SarahAlidoost <[email protected]>
BSchilperoort and SarahAlidoost authored Apr 4, 2023
1 parent 4d5ec56 commit a5be424
Showing 11 changed files with 481 additions and 31,784 deletions.
2 changes: 2 additions & 0 deletions PyStemmusScope/global_data/__init__.py
@@ -1,5 +1,6 @@
"""Module for operations related to the 'global' datasets."""
from PyStemmusScope.global_data import cams_co2
from PyStemmusScope.global_data import cci_landcover
from PyStemmusScope.global_data import copernicus_lai
from PyStemmusScope.global_data import era5
from PyStemmusScope.global_data import eth_canopy_height
@@ -16,4 +17,5 @@
     "prism_dem",
     "cams_co2",
     "copernicus_lai",
+    "cci_landcover",
 ]
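
With the import added above, cci_landcover is reachable through the package namespace like the other global-data modules. A minimal sketch (the exact import alias is an assumption; the hunk in global_data_selection.py below uses gd for the same purpose):

from PyStemmusScope import global_data as gd

# The land cover loader now sits next to era5, cams_co2, etc.
print(gd.cci_landcover.retrieve_landcover_data)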
24 changes: 24 additions & 0 deletions PyStemmusScope/global_data/assets/lccs_to_igbp_table.csv
@@ -0,0 +1,24 @@
"lccs_class","IGBP_STEMMUS_SCOPE"
0,"No vegetation"
10,"Croplands"
20,"Croplands"
30,"Cropland mosaics"
40,"Cropland mosaics"
50,"Evergreen Broadleaf Forest"
60,"Deciduous Broadleaf Forests"
70,"Evergreen Needleleaf Forests"
80,"Deciduous needleleaf forests"
90,"Mixed Forests"
100,"Woody savannas"
110,"Savannas"
120,"Closed shrublands"
130,"Grasslands"
140,"Not available in IGBP"
150,"Open shrublands"
160,"Permanent Wetlands"
170,"Permanent Wetlands"
180,"Permanent Wetlands"
190,"Urban and built-up lands"
200,"Barren"
210,"Water bodies"
220,"Snow and ice"
160 changes: 160 additions & 0 deletions PyStemmusScope/global_data/cci_landcover.py
@@ -0,0 +1,160 @@
"""Module for loading and validating the ESA CCI land cover dataset."""
from pathlib import Path
from typing import Dict
from typing import List
from typing import Tuple
from typing import Union
import numpy as np
import pandas as pd
import xarray as xr
from PyStemmusScope.global_data import utils


RESOLUTION_CCI = 1 / 360 # Resolution of the dataset in degrees
FILEPATH_LANDCOVER_TABLE = Path(__file__).parent / "assets" / "lccs_to_igbp_table.csv"


def retrieve_landcover_data(
global_data_dir: Path,
latlon: Union[Tuple[int, int], Tuple[float, float]],
time_range: Tuple[np.datetime64, np.datetime64],
timestep: str,
) -> Dict[str, np.ndarray]:
"""Get the land cover data from the CCI netCDF files.
Args:
global_data_dir: Path to the directory containing the global datasets.
latlon: Latitude and longitude of the site.
time_range: Start and end time of the model run.
timestep: Desired timestep of the model, this is derived from the forcing data.
In a pandas-timedelta compatible format. For example: "1800S"
Returns:
Dictionary containing IGBP and LCCS land cover classes.
"""
files_cci = list((global_data_dir / "landcover").glob("*.nc"))

if len(files_cci) == 0:
raise FileNotFoundError(
f"No netCDF files found in the folder '{global_data_dir / 'landcover'}'"
)

return extract_landcover_data(
files_cci=files_cci,
latlon=latlon,
time_range=time_range,
timestep=timestep,
)


def extract_landcover_data(
    files_cci: List[Path],
    latlon: Union[Tuple[int, int], Tuple[float, float]],
    time_range: Tuple[np.datetime64, np.datetime64],
    timestep: str,
) -> Dict[str, np.ndarray]:
    """Extract the land cover data from the CCI netCDF files.

    Args:
        files_cci: List of CCI land cover files.
        latlon: Latitude and longitude of the site.
        time_range: Start and end time of the model run.
        timestep: Desired timestep of the model; this is derived from the forcing
            data. In a pandas-timedelta compatible format. For example: "1800S"

    Returns:
        Dictionary containing IGBP and LCCS land cover classes.
    """
    cci_dataset = xr.open_mfdataset(files_cci)

    check_cci_dataset(cci_dataset, latlon, time_range)  # Assert bounds

    lat_bounds = cci_dataset["lat_bounds"].load()  # Load so that they are not
    lon_bounds = cci_dataset["lon_bounds"].load()  # dask arrays
    lat_idx = np.logical_and(  # type: ignore
        lat_bounds.isel(bounds=0) >= latlon[0], lat_bounds.isel(bounds=1) < latlon[0]
    ).argmax(dim="lat")
    lon_idx = np.logical_and(  # type: ignore
        lon_bounds.isel(bounds=0) <= latlon[1], lon_bounds.isel(bounds=1) > latlon[1]
    ).argmax(dim="lon")

    lccs_id = cci_dataset.isel(lat=lat_idx, lon=lon_idx)["lccs_class"]

    # If time is size 1, interp fails. Adding an extra datapoint prevents this.
    if lccs_id["time"].size == 1:
        data_copy = lccs_id.copy()
        data_copy["time"] = lccs_id["time"] + np.timedelta64(1, "D")
        lccs_id = xr.concat((lccs_id, data_copy), dim="time")

    lccs_id = lccs_id.interp(
        time=pd.date_range(time_range[0], time_range[1], freq=timestep),
        method="nearest",
        kwargs={"fill_value": "extrapolate", "bounds_error": False},
    )

    landcover_lookup_table = get_landcover_table(cci_dataset)
    igbp_lookup_table = get_lccs_to_igbp_table()

    return {
        "LCCS_landcover": np.array(
            [landcover_lookup_table[_id] for _id in lccs_id.to_numpy()]
        ),
        "IGBP_veg_long": np.array(
            [igbp_lookup_table[_id] for _id in lccs_id.to_numpy()]
        ),
    }


def get_lccs_to_igbp_table() -> Dict[int, str]:
    """Read the land cover translation table, and turn it into a lookup dictionary."""
    df = pd.read_csv(FILEPATH_LANDCOVER_TABLE, index_col="lccs_class")
    return df.to_dict()["IGBP_STEMMUS_SCOPE"]


def get_landcover_table(cci_dataset: xr.Dataset) -> Dict[int, str]:
    """Get the lookup table to convert the flag values to a land cover name.

    The lookup table for the land cover classes is contained in the netCDF file,
    under the lccs_class variable's attributes. This function extracts it and
    turns it into a lookup dictionary.

    Args:
        cci_dataset: The CCI land cover dataset.

    Returns:
        The land cover class lookup table.
    """
    flag_meanings = cci_dataset["lccs_class"].attrs["flag_meanings"].split(" ")
    flag_values = cci_dataset["lccs_class"].attrs["flag_values"]
    return dict(zip(flag_values, flag_meanings))


def check_cci_dataset(
    cci_dataset: xr.Dataset,
    latlon: Union[Tuple[int, int], Tuple[float, float]],
    time_range: Tuple[np.datetime64, np.datetime64],
) -> None:
    """Validate the spatial and temporal bounds of the CCI dataset."""
    # Assert spatial bounds
    if (
        latlon[0] > cci_dataset["lat"].max() or latlon[0] < cci_dataset["lat"].min()
    ) or (latlon[1] > cci_dataset["lon"].max() or latlon[1] < cci_dataset["lon"].min()):
        raise utils.MissingDataError(
            f"\nThe specified location {latlon} was not within bounds of the CCI"
            "\nland cover dataset."
            "\nPlease check the netCDF files or select a different location."
        )

    # Assert temporal bounds
    first_year_cci = pd.to_datetime(cci_dataset["time"].min().to_numpy()).year
    last_year_cci = pd.to_datetime(cci_dataset["time"].max().to_numpy()).year
    first_year_range = pd.to_datetime(time_range[0]).year
    last_year_range = pd.to_datetime(time_range[-1]).year
    # As the data is yearly, allow some leeway with the time bounds
    if (first_year_range + 1 < first_year_cci) or (last_year_range - 1 > last_year_cci):
        raise utils.MissingDataError(
            f"\nThe specified time range {time_range} was not within the range of"
            "\nthe CCI land cover dataset:"
            f"\n({cci_dataset['time'].min().to_numpy()}, "
            f"{cci_dataset['time'].max().to_numpy()})"
            "\nPlease check the netCDF files or select a different time range."
        )
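
A usage sketch for the new module; the data directory, coordinates, and time range below are illustrative assumptions. The only layout requirement is that the CCI netCDF files sit in a landcover subfolder of the global data directory:

from pathlib import Path
import numpy as np
from PyStemmusScope.global_data import cci_landcover

data = cci_landcover.retrieve_landcover_data(
    global_data_dir=Path("/data/global"),  # assumed path; needs a 'landcover' subfolder
    latlon=(52.0, 4.9),  # assumed site coordinates
    time_range=(np.datetime64("2014-01-01"), np.datetime64("2014-12-31")),
    timestep="1800S",  # pandas-timedelta compatible, taken from the forcing data
)
# Both arrays hold one class name per model timestep:
print(data["IGBP_veg_long"][0], data["LCCS_landcover"][0])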
12 changes: 10 additions & 2 deletions PyStemmusScope/global_data/global_data_selection.py
@@ -100,7 +100,15 @@ def collect_datasets(
     data["latitude"] = latlon[0]
     data["longitude"] = latlon[1]
 
-    # TODO: Add land cover data retrieval.
-    data["IGBP_veg_long"] = "Evergreen Needleleaf Forests"
+    landcover_data = gd.cci_landcover.retrieve_landcover_data(
+        global_data_dir,
+        latlon,
+        time_range,
+        timestep,
+    )
+    # TODO see issue github.com/EcoExtreML/STEMMUS_SCOPE/issues/137
+    # for now, we only use the first value of the time series
+    data["IGBP_veg_long"] = landcover_data["IGBP_veg_long"][0]
+    data["LCCS_landcover"] = landcover_data["LCCS_landcover"][0]
 
     return data
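
Note that only the first element of each land cover time series ends up in the model input, so the site keeps a single static class for the whole run until the linked issue is resolved. For example (class names illustrative):

# landcover_data["IGBP_veg_long"] -> array(['Grasslands', 'Grasslands', ...])
# data["IGBP_veg_long"]           -> 'Grasslands'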
123 changes: 123 additions & 0 deletions global_data/data_analysis_notebooks/parse_landcover.ipynb

Large diffs are not rendered by default.
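
Since the notebook is not rendered here, the following is only a rough, hypothetical sketch of the kind of parsing step the loader in cci_landcover.py depends on: subsetting the large CDS file to the variables actually read (lccs_class plus the coordinate bounds) and writing a smaller netCDF into the landcover folder. The file names and variable subset are assumptions, not the notebook's verified content:

import xarray as xr

ds = xr.open_dataset("C3S-LC-L4-LCCS-Map-300m-P1Y-2013-v2.0.7cds.nc")  # assumed name
ds[["lccs_class", "lat_bounds", "lon_bounds"]].to_netcdf("landcover/landcover_2013.nc")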

31 changes: 31 additions & 0 deletions global_data/download_scripts/download_cds_landcover.py
@@ -0,0 +1,31 @@
"""Download land cover data using the cdsapi."""
from pathlib import Path
import cdsapi


with (Path.home() / ".cdsloginrc").open(encoding="utf8") as f:
uid = f.readline().strip()
api_key = f.readline().strip()


c = cdsapi.Client(
url="https://cds.climate.copernicus.eu/api/v2",
key=f"{uid}:{api_key}",
verify=True,
)


years = [2013]


for year in years:
c.retrieve(
"satellite-land-cover",
{
"variable": "all",
"format": "zip",
"year": f"{year}",
"version": "v2.0.7cds",
},
f"cds_landcover_{year}.zip",
)
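
The script reads CDS credentials from a plain two-line file, ~/.cdsloginrc, with the user ID on the first line and the API key on the second (placeholders shown):

12345
00000000-0000-0000-0000-000000000000

Running python download_cds_landcover.py then fetches one zip archive per entry in years; each archive holds the netCDF data that can be parsed (see the notebook above) into the landcover folder that cci_landcover.py expects.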
