From b6c9761c9357ac54ea70e01bd41cf01d29d17e84 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 17 Dec 2024 13:26:28 +0000 Subject: [PATCH 1/5] remove any nans in the satellite data --- pvnet_app/consts.py | 6 ++++++ pvnet_app/data/satellite.py | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/pvnet_app/consts.py b/pvnet_app/consts.py index 0a32205..7ca8af6 100644 --- a/pvnet_app/consts.py +++ b/pvnet_app/consts.py @@ -1,3 +1,9 @@ sat_path = "sat.zarr" nwp_ukv_path = "nwp_ukv.zarr" nwp_ecmwf_path = "nwp_ecmwf.zarr" + + +uk_box = dict( + x_geostationary=[-996_133.85, -480_064.6], + y_geostationary=[4_512_606.3, 5_058_679.8], +) diff --git a/pvnet_app/data/satellite.py b/pvnet_app/data/satellite.py index 7f95d9b..032a427 100644 --- a/pvnet_app/data/satellite.py +++ b/pvnet_app/data/satellite.py @@ -8,7 +8,7 @@ import ocf_blosc2 from ocf_datapipes.config.load import load_yaml_configuration -from pvnet_app.consts import sat_path +from pvnet_app.consts import sat_path, uk_box logger = logging.getLogger(__name__) @@ -312,6 +312,37 @@ def check_model_satellite_inputs_available( return available +def remove_any_nans_in_satellite(): + """ Remove any NaNs in the satellite data""" + + ds_sat = xr.open_zarr(sat_path) + + # slice to uk box + ds_sat_uk = ds_sat.sel( + x=slice(uk_box["x_geostationary"][0], uk_box["x_geostationary"][1]), + y=slice(uk_box["y_geostationary"][0], uk_box["y_geostationary"][1]), + ) + + # remove any nans + ds_sat_uk = ds_sat_uk.dropna(dim="time", how="any") + + # see which timestamps have been dropped + dropped_timestamps = np.setdiff1d(ds_sat.time, ds_sat_uk.time) + if len(dropped_timestamps) > 0: + logger.info(f"Removing NaNs from satellite data." + f" The following timestamps have been dropped: {dropped_timestamps}") + + # remove dropped timstamps from original dataset + ds_sat = ds_sat.sel(time=~ds_sat.time.isin(dropped_timestamps)) + + # save + os.system(f"rm -rf {sat_path}") + ds_sat.to_zarr(sat_path) + + else: + logger.info("No NaNs found in satellite data.") + + def preprocess_sat_data(t0: pd.Timestamp, use_legacy: bool = False) -> pd.DatetimeIndex: """Combine and 5- and 15-minutely satellite data and extend to t0 if required @@ -327,6 +358,9 @@ def preprocess_sat_data(t0: pd.Timestamp, use_legacy: bool = False) -> pd.Dateti # Deal with switching between the 5 and 15 minutely satellite data combine_5_and_15_sat_data() + # check for any nans in the satellite data + remove_any_nans_in_satellite() + # Interpolate missing satellite timestamps interpolate_missing_satellite_timestamps(pd.Timedelta("15min")) From 93fa7578717c952858f42627901259cd02f82a2e Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 17 Dec 2024 13:29:20 +0000 Subject: [PATCH 2/5] raise hard error, if any nans --- pvnet_app/data/satellite.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pvnet_app/data/satellite.py b/pvnet_app/data/satellite.py index 032a427..988f61d 100644 --- a/pvnet_app/data/satellite.py +++ b/pvnet_app/data/satellite.py @@ -313,7 +313,7 @@ def check_model_satellite_inputs_available( def remove_any_nans_in_satellite(): - """ Remove any NaNs in the satellite data""" + """Remove any NaNs in the satellite data""" ds_sat = xr.open_zarr(sat_path) @@ -329,15 +329,21 @@ def remove_any_nans_in_satellite(): # see which timestamps have been dropped dropped_timestamps = np.setdiff1d(ds_sat.time, ds_sat_uk.time) if len(dropped_timestamps) > 0: - logger.info(f"Removing NaNs from satellite data." - f" The following timestamps have been dropped: {dropped_timestamps}") + logger.info( + f"Removing NaNs from satellite data." + f" The following timestamps have been dropped: {dropped_timestamps}" + ) - # remove dropped timstamps from original dataset - ds_sat = ds_sat.sel(time=~ds_sat.time.isin(dropped_timestamps)) + raise Exception("There are some nans in the satellite data, so lets not run the forecast") - # save - os.system(f"rm -rf {sat_path}") - ds_sat.to_zarr(sat_path) + # other options + # remove dropped timestamps from original dataset + # we just remove them, as later on we might be able to interpolate them + # ds_sat = ds_sat.sel(time=~ds_sat.time.isin(dropped_timestamps)) + # + # # save + # os.system(f"rm -rf {sat_path}") + # ds_sat.to_zarr(sat_path) else: logger.info("No NaNs found in satellite data.") From 0d3b1ca955596c6ad4646b3ad0a5444ac4fa642d Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 17 Dec 2024 13:39:14 +0000 Subject: [PATCH 3/5] reuse same code for zero checking --- pvnet_app/data/satellite.py | 53 ++++++------------------------------ tests/data/test_satellite.py | 25 +++++++++++++++-- 2 files changed, 31 insertions(+), 47 deletions(-) diff --git a/pvnet_app/data/satellite.py b/pvnet_app/data/satellite.py index 988f61d..52b7e08 100644 --- a/pvnet_app/data/satellite.py +++ b/pvnet_app/data/satellite.py @@ -312,43 +312,6 @@ def check_model_satellite_inputs_available( return available -def remove_any_nans_in_satellite(): - """Remove any NaNs in the satellite data""" - - ds_sat = xr.open_zarr(sat_path) - - # slice to uk box - ds_sat_uk = ds_sat.sel( - x=slice(uk_box["x_geostationary"][0], uk_box["x_geostationary"][1]), - y=slice(uk_box["y_geostationary"][0], uk_box["y_geostationary"][1]), - ) - - # remove any nans - ds_sat_uk = ds_sat_uk.dropna(dim="time", how="any") - - # see which timestamps have been dropped - dropped_timestamps = np.setdiff1d(ds_sat.time, ds_sat_uk.time) - if len(dropped_timestamps) > 0: - logger.info( - f"Removing NaNs from satellite data." - f" The following timestamps have been dropped: {dropped_timestamps}" - ) - - raise Exception("There are some nans in the satellite data, so lets not run the forecast") - - # other options - # remove dropped timestamps from original dataset - # we just remove them, as later on we might be able to interpolate them - # ds_sat = ds_sat.sel(time=~ds_sat.time.isin(dropped_timestamps)) - # - # # save - # os.system(f"rm -rf {sat_path}") - # ds_sat.to_zarr(sat_path) - - else: - logger.info("No NaNs found in satellite data.") - - def preprocess_sat_data(t0: pd.Timestamp, use_legacy: bool = False) -> pd.DatetimeIndex: """Combine and 5- and 15-minutely satellite data and extend to t0 if required @@ -364,8 +327,8 @@ def preprocess_sat_data(t0: pd.Timestamp, use_legacy: bool = False) -> pd.Dateti # Deal with switching between the 5 and 15 minutely satellite data combine_5_and_15_sat_data() - # check for any nans in the satellite data - remove_any_nans_in_satellite() + # Check for nans in the satellite data + check_for_constant_values(value=np.nan, threshold=0) # Interpolate missing satellite timestamps interpolate_missing_satellite_timestamps(pd.Timedelta("15min")) @@ -383,30 +346,30 @@ def preprocess_sat_data(t0: pd.Timestamp, use_legacy: bool = False) -> pd.Dateti extend_satellite_data_with_nans(t0) # Check for zeros in the satellite data - check_for_zeros() + check_for_constant_values() return sat_timestamps -def check_for_zeros(): - """Check the satellite data for zeros and raise an exception +def check_for_constant_values(value: Optional[float] = 0, threshold: Optional[float] = ERROR_ZERO_PERCENTAGE) -> None: + """Check the satellite data for constant values and raise an exception This sometimes happen when the satellite data is corrupt Note that in the UK, even at night, the values are not zero. """ # check satellite for zeros - logger.info("Checking satellite data for zeros") + logger.info("Checking satellite data for constant value ({value})") ds_sat = xr.open_zarr(sat_path) shape = ds_sat.data.shape n_data_points_per_timestep = shape[1] * shape[2] * shape[3] n_time_steps = shape[0] for i in range(n_time_steps): data = ds_sat.data[i].values - if (data == 0).sum() / n_data_points_per_timestep > ERROR_ZERO_PERCENTAGE: + if (data == value).sum() / n_data_points_per_timestep > threshold: time = ds_sat.time[i].values message = ( - f"Satellite data contains zeros (greater than {ERROR_ZERO_PERCENTAGE}), " + f"Satellite data contains zeros (greater than {threshold}), " f"This is for time step {time}" ) raise Exception(message) diff --git a/tests/data/test_satellite.py b/tests/data/test_satellite.py index 8b1d550..e12a2e9 100644 --- a/tests/data/test_satellite.py +++ b/tests/data/test_satellite.py @@ -28,7 +28,7 @@ sat_path, sat_5_path, sat_15_path, - extend_satellite_data_with_nans + extend_satellite_data_with_nans, ) @@ -242,7 +242,7 @@ def test_extend_satellite_data_with_nans_over_3_hours(sat_5_data, test_t0): def test_zeros_in_sat_data(sat_15_data_small, test_t0): - """Download and process only the 15 minute satellite data""" + """Check error is made if data has zeros""" # make temporary directory with tempfile.TemporaryDirectory() as tmpdirname: @@ -262,3 +262,24 @@ def test_zeros_in_sat_data(sat_15_data_small, test_t0): # check an error is made with pytest.raises(Exception): preprocess_sat_data(test_t0) + + +def test_remove_satellite_data(sat_15_data_small, test_t0): + """Check error is made if data has zeros""" + # make temporary directory + with tempfile.TemporaryDirectory() as tmpdirname: + # Change to temporary working directory + os.chdir(tmpdirname) + + # make half the values zeros + sat_15_data_small.data[::2] = np.nan + + # Make 15-minutely satellite data available + save_to_zarr_zip(sat_15_data_small, filename="latest.zarr.zip") + + os.environ["SATELLITE_ZARR_PATH"] = "latest.zarr.zip" + download_all_sat_data() + + # check an error is made + with pytest.raises(Exception): + preprocess_sat_data(test_t0) From 073b7a9327e00b5b88bf414c007d9bd340139f51 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 17 Dec 2024 13:56:30 +0000 Subject: [PATCH 4/5] self PR comments --- pvnet_app/data/satellite.py | 2 +- tests/data/test_satellite.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pvnet_app/data/satellite.py b/pvnet_app/data/satellite.py index 52b7e08..e3cd6c6 100644 --- a/pvnet_app/data/satellite.py +++ b/pvnet_app/data/satellite.py @@ -359,7 +359,7 @@ def check_for_constant_values(value: Optional[float] = 0, threshold: Optional[fl Note that in the UK, even at night, the values are not zero. """ # check satellite for zeros - logger.info("Checking satellite data for constant value ({value})") + logger.info(f"Checking satellite data for constant value ({value})") ds_sat = xr.open_zarr(sat_path) shape = ds_sat.data.shape n_data_points_per_timestep = shape[1] * shape[2] * shape[3] diff --git a/tests/data/test_satellite.py b/tests/data/test_satellite.py index e12a2e9..14d0ceb 100644 --- a/tests/data/test_satellite.py +++ b/tests/data/test_satellite.py @@ -265,7 +265,7 @@ def test_zeros_in_sat_data(sat_15_data_small, test_t0): def test_remove_satellite_data(sat_15_data_small, test_t0): - """Check error is made if data has zeros""" + """Check error is made if data has nans""" # make temporary directory with tempfile.TemporaryDirectory() as tmpdirname: # Change to temporary working directory From ecf112bd19cc1e2507b557f93bf3007da1a17148 Mon Sep 17 00:00:00 2001 From: peterdudfield Date: Tue, 17 Dec 2024 14:00:26 +0000 Subject: [PATCH 5/5] remove satellite box --- pvnet_app/data/satellite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pvnet_app/data/satellite.py b/pvnet_app/data/satellite.py index e3cd6c6..db278c7 100644 --- a/pvnet_app/data/satellite.py +++ b/pvnet_app/data/satellite.py @@ -8,7 +8,7 @@ import ocf_blosc2 from ocf_datapipes.config.load import load_yaml_configuration -from pvnet_app.consts import sat_path, uk_box +from pvnet_app.consts import sat_path logger = logging.getLogger(__name__)