From 076f7717f737aad1b4258ceee1cf504efca2498a Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Mon, 4 Nov 2024 22:12:11 +0000 Subject: [PATCH 1/3] add check for corrupt satellite data --- pvnet_app/data/satellite.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pvnet_app/data/satellite.py b/pvnet_app/data/satellite.py index ed56d0d..8f6ba48 100644 --- a/pvnet_app/data/satellite.py +++ b/pvnet_app/data/satellite.py @@ -15,6 +15,9 @@ sat_5_path = "sat_5_min.zarr" sat_15_path = "sat_15_min.zarr" +# The maximum acceptable fraction of zeros per time step in the satellite data (0.1 = 10%) +ERROR_ZERO_PERCENTAGE = 0.1 + def download_all_sat_data() -> bool: """Download the sat data and return whether it was successful @@ -339,9 +342,36 @@ def preprocess_sat_data(t0: pd.Timestamp, use_legacy: bool = False) -> pd.Dateti # non-nan timestamp extend_satellite_data_with_nans(t0) + # Check for zeros in the satellite data + check_for_zeros() + return sat_timestamps +def check_for_zeros(): + """Check the satellite data for zeros and raise an exception + + This sometimes happens when the satellite data is corrupt + + Note that in the UK, even at night, the values are not zero. 
+ """ + # check satellite for zeros + logger.info("Checking satellite data for zeros") + ds_sat = xr.open_zarr(sat_path) + shape = ds_sat.data.shape + n_data_points_per_timestep = shape[1] * shape[2] * shape[3] + n_time_steps = shape[0] + for i in range(n_time_steps): + data = ds_sat.data[i].values + if (data == 0).sum() / n_data_points_per_timestep > ERROR_ZERO_PERCENTAGE: + time = ds_sat.time[i].values + message = ( + f"Satellite data contains zeros, (greater than {ERROR_ZERO_PERCENTAGE})" + f"This is for time step {time}" + ) + raise Exception(message) + + def scale_satellite_data() -> None: """Scale the satellite data to be between 0 and 1""" From 3f99ddad563d38f44b95e822f9d5e042fa39a871 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Mon, 4 Nov 2024 22:18:10 +0000 Subject: [PATCH 2/3] change test data to ones --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 0194e15..e79bbcf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -158,7 +158,7 @@ def make_sat_data(test_t0, delay_mins, freq_mins): # Add data to dataset ds["data"] = xr.DataArray( - np.zeros([len(ds[c]) for c in ds.xindexes]), + np.ones([len(ds[c]) for c in ds.xindexes]), coords=[ds[c] for c in ds.xindexes], ) From 76bbd721a9a08dbc764cd34c63e66cf14cfaa3a8 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Mon, 4 Nov 2024 22:26:16 +0000 Subject: [PATCH 3/3] add test --- pvnet_app/data/satellite.py | 2 +- tests/conftest.py | 10 +++++++++- tests/data/test_satellite.py | 25 +++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/pvnet_app/data/satellite.py b/pvnet_app/data/satellite.py index 8f6ba48..7f95d9b 100644 --- a/pvnet_app/data/satellite.py +++ b/pvnet_app/data/satellite.py @@ -366,7 +366,7 @@ def check_for_zeros(): if (data == 0).sum() / n_data_points_per_timestep > ERROR_ZERO_PERCENTAGE: time = ds_sat.time[i].values message = ( - f"Satellite data contains zeros, (greater 
than {ERROR_ZERO_PERCENTAGE})" + f"Satellite data contains zeros (greater than {ERROR_ZERO_PERCENTAGE}), " f"This is for time step {time}" ) raise Exception(message) diff --git a/tests/conftest.py b/tests/conftest.py index e79bbcf..00d6820 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -137,12 +137,16 @@ def config_filename(): return f"{os.path.dirname(os.path.abspath(__file__))}/test_data/test.yaml" -def make_sat_data(test_t0, delay_mins, freq_mins): +def make_sat_data(test_t0, delay_mins, freq_mins, small=False): # Load dataset which only contains coordinates, but no data ds = xr.open_zarr( f"{os.path.dirname(os.path.abspath(__file__))}/test_data/non_hrv_shell.zarr" ) + if small: + # only select 10 by 10 + ds = ds.isel(x_geostationary=slice(0, 10), y_geostationary=slice(0, 10)) + # remove tim dim and expand time dim to be len 36 = 3 hours of 5 minute data ds = ds.drop_vars("time") n_hours = 3 @@ -185,6 +189,10 @@ def sat_5_data_delayed(test_t0): def sat_15_data(test_t0): return make_sat_data(test_t0, delay_mins=0, freq_mins=15) +@pytest.fixture() +def sat_15_data_small(test_t0): + return make_sat_data(test_t0, delay_mins=0, freq_mins=15,small=True) + @pytest.fixture() def gsp_yields_and_systems(db_session, test_t0): diff --git a/tests/data/test_satellite.py b/tests/data/test_satellite.py index b43c753..8b1d550 100644 --- a/tests/data/test_satellite.py +++ b/tests/data/test_satellite.py @@ -14,6 +14,8 @@ import os import tempfile + +import pytest import zarr import numpy as np import pandas as pd @@ -237,3 +239,26 @@ def test_extend_satellite_data_with_nans_over_3_hours(sat_5_data, test_t0): ds = xr.open_zarr(filename) assert len(time) + 3*12 == len(ds.time) assert ds.time.values[-1] == t0 + + +def test_zeros_in_sat_data(sat_15_data_small, test_t0): + """Check that preprocessing raises when the satellite data contains too many zeros""" + + # make temporary directory + with tempfile.TemporaryDirectory() as tmpdirname: + + # Change to temporary working directory + 
os.chdir(tmpdirname) + + # make half the values zeros + sat_15_data_small.data[::2] = 0 + + # Make 15-minutely satellite data available + save_to_zarr_zip(sat_15_data_small, filename="latest.zarr.zip") + + os.environ["SATELLITE_ZARR_PATH"] = "latest.zarr.zip" + download_all_sat_data() + + # check an error is raised + with pytest.raises(Exception): + preprocess_sat_data(test_t0)