fix dask PerformanceWarning: Slicing is producing a large chunk

EcoExtreML · Feb 2, 2024 · fa998c6
1 parent 5b9585c
commit fa998c6
Show file tree

Hide file tree

Showing 7 changed files with 21 additions and 0 deletions.
diff --git a/PyStemmusScope/global_data/cams_co2.py b/PyStemmusScope/global_data/cams_co2.py
@@ -1,10 +1,13 @@
 """Module for loading and validating the CAMS CO2 dataset."""
 from pathlib import Path
 from typing import Union
+import dask
 import numpy as np
 import xarray as xr
 from PyStemmusScope.global_data import utils
 
+# see https://docs.dask.org/en/latest/array-slicing.html#efficiency
+dask.config.set(**{'array.slicing.split_large_chunks': True})
 
 RESOLUTION_CAMS = 0.75  # Resolution of the dataset in degrees
 

diff --git a/PyStemmusScope/global_data/cci_landcover.py b/PyStemmusScope/global_data/cci_landcover.py
@@ -1,11 +1,14 @@
 """Module for loading and validating the ESA CCI land cover dataset."""
 from pathlib import Path
 from typing import Union
+import dask
 import numpy as np
 import pandas as pd
 import xarray as xr
 from PyStemmusScope.global_data import utils
 
+# see https://docs.dask.org/en/latest/array-slicing.html#efficiency
+dask.config.set(**{'array.slicing.split_large_chunks': True})
 
 RESOLUTION_CCI = 1 / 360  # Resolution of the dataset in degrees
 FILEPATH_LANDCOVER_TABLE = Path(__file__).parent / "assets" / "lccs_to_igbp_table.csv"

diff --git a/PyStemmusScope/global_data/copernicus_lai.py b/PyStemmusScope/global_data/copernicus_lai.py
@@ -1,10 +1,13 @@
 """Module for loading and validating the Copernicus LAI dataset."""
 from pathlib import Path
 from typing import Union
+import dask
 import numpy as np
 import xarray as xr
 from PyStemmusScope.global_data import utils
 
+# see https://docs.dask.org/en/latest/array-slicing.html#efficiency
+dask.config.set(**{'array.slicing.split_large_chunks': True})
 
 RESOLUTION_LAI = 1 / 112  # Resolution of the LAI dataset in degrees
 

diff --git a/PyStemmusScope/global_data/era5.py b/PyStemmusScope/global_data/era5.py
@@ -2,11 +2,14 @@
 from pathlib import Path
 from typing import Literal
 from typing import Union
+import dask
 import numpy as np
 import PyStemmusScope.variable_conversion as vc
 import xarray as xr
 from PyStemmusScope.global_data import utils
 
+# see https://docs.dask.org/en/latest/array-slicing.html#efficiency
+dask.config.set(**{'array.slicing.split_large_chunks': True})
 
 ERA5_VARIABLES = ["u10", "v10", "mtpr", "sp", "ssrd", "strd"]
 ERA5LAND_VARIABLES = ["t2m", "d2m"]

diff --git a/PyStemmusScope/global_data/eth_canopy_height.py b/PyStemmusScope/global_data/eth_canopy_height.py
@@ -2,9 +2,12 @@
 import gzip
 from pathlib import Path
 from typing import Union
+import dask
 import xarray as xr
 from PyStemmusScope.global_data import utils
 
+# see https://docs.dask.org/en/latest/array-slicing.html#efficiency
+dask.config.set(**{'array.slicing.split_large_chunks': True})
 
 MAX_DISTANCE = 0.01  # Maximum lat/lon distance to be considered nearby.
 

diff --git a/PyStemmusScope/global_data/prism_dem.py b/PyStemmusScope/global_data/prism_dem.py
@@ -2,9 +2,12 @@
 import gzip
 from pathlib import Path
 from typing import Union
+import dask
 import xarray as xr
 from PyStemmusScope.global_data import utils
 
+# see https://docs.dask.org/en/latest/array-slicing.html#efficiency
+dask.config.set(**{'array.slicing.split_large_chunks': True})
 
 MAX_DISTANCE = 0.01  #  Maximum lat/lon distance to be considered nearby. Approx 1km.
 

diff --git a/PyStemmusScope/global_data/utils.py b/PyStemmusScope/global_data/utils.py
@@ -1,8 +1,11 @@
 """Utility functions for the global data IO."""
 from typing import Union
+import dask
 import numpy as np
 import xarray as xr
 
+# see https://docs.dask.org/en/latest/array-slicing.html#efficiency
+dask.config.set(**{'array.slicing.split_large_chunks': True})
 
 class MissingDataError(Exception):
     """Error to be raised when requested data is missing."""