Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add check for corrupt satellite data #150

Merged
merged 3 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions pvnet_app/data/satellite.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
sat_5_path = "sat_5_min.zarr"
sat_15_path = "sat_15_min.zarr"

# Maximum acceptable fraction of zero-valued data points in a single satellite
# time step. Despite the name this is a fraction between 0 and 1 (0.1 means 10%),
# because `check_for_zeros` compares it against `n_zeros / n_data_points`.
ERROR_ZERO_PERCENTAGE = 0.1


def download_all_sat_data() -> bool:
"""Download the sat data and return whether it was successful
Expand Down Expand Up @@ -339,9 +342,36 @@ def preprocess_sat_data(t0: pd.Timestamp, use_legacy: bool = False) -> pd.Dateti
# non-nan timestamp
extend_satellite_data_with_nans(t0)

# Check for zeros in the satellite data
check_for_zeros()

return sat_timestamps


def check_for_zeros() -> None:
    """Check the satellite data for zeros and raise an exception if there are too many

    This sometimes happens when the satellite data is corrupt.

    Note that in the UK, even at night, the values are not zero.

    The zarr store at `sat_path` is opened and inspected one slice at a time along
    the first dimension of the `data` variable, so only a single time step is held
    in memory at once.

    Raises:
        Exception: If, for any time step, the fraction of data points equal to zero
            is greater than `ERROR_ZERO_PERCENTAGE`.
    """
    # check satellite for zeros
    logger.info("Checking satellite data for zeros")

    # Open via a context manager so the store is closed again, including when we raise
    with xr.open_zarr(sat_path) as ds_sat:
        da_sat = ds_sat["data"]

        for i in range(da_sat.shape[0]):
            data = da_sat[i].values

            # Nothing to check (and nothing to divide by) in an empty time step
            if data.size == 0:
                continue

            # Use the array's own size rather than assuming exactly four dimensions
            zero_fraction = (data == 0).sum() / data.size

            if zero_fraction > ERROR_ZERO_PERCENTAGE:
                time = ds_sat.time[i].values
                message = (
                    f"Satellite data contains zeros (greater than {ERROR_ZERO_PERCENTAGE}), "
                    f"This is for time step {time}"
                )
                raise Exception(message)


def scale_satellite_data() -> None:
"""Scale the satellite data to be between 0 and 1"""

Expand Down
12 changes: 10 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,16 @@ def config_filename():
return f"{os.path.dirname(os.path.abspath(__file__))}/test_data/test.yaml"


def make_sat_data(test_t0, delay_mins, freq_mins):
def make_sat_data(test_t0, delay_mins, freq_mins, small=False):
# Load dataset which only contains coordinates, but no data
ds = xr.open_zarr(
f"{os.path.dirname(os.path.abspath(__file__))}/test_data/non_hrv_shell.zarr"
)

if small:
# only select 10 by 10
ds = ds.isel(x_geostationary=slice(0, 10), y_geostationary=slice(0, 10))

# remove time dim and expand time dim to be len 36 = 3 hours of 5 minute data
ds = ds.drop_vars("time")
n_hours = 3
Expand All @@ -158,7 +162,7 @@ def make_sat_data(test_t0, delay_mins, freq_mins):

# Add data to dataset
ds["data"] = xr.DataArray(
np.zeros([len(ds[c]) for c in ds.xindexes]),
np.ones([len(ds[c]) for c in ds.xindexes]),
coords=[ds[c] for c in ds.xindexes],
)

Expand All @@ -185,6 +189,10 @@ def sat_5_data_delayed(test_t0):
def sat_15_data(test_t0):
return make_sat_data(test_t0, delay_mins=0, freq_mins=15)

@pytest.fixture()
def sat_15_data_small(test_t0):
    """15-minutely satellite test data with no delay, built with `small=True`

    With `small=True`, `make_sat_data` only keeps a 10 by 10 spatial selection.
    """
    return make_sat_data(
        test_t0,
        delay_mins=0,
        freq_mins=15,
        small=True,
    )


@pytest.fixture()
def gsp_yields_and_systems(db_session, test_t0):
Expand Down
25 changes: 25 additions & 0 deletions tests/data/test_satellite.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

import os
import tempfile

import pytest
import zarr
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -237,3 +239,26 @@ def test_extend_satellite_data_with_nans_over_3_hours(sat_5_data, test_t0):
ds = xr.open_zarr(filename)
assert len(time) + 3*12 == len(ds.time)
assert ds.time.values[-1] == t0


def test_zeros_in_sat_data(sat_15_data_small, test_t0):
    """Check that preprocessing raises when the satellite data contains too many zeros"""

    # Remember the starting directory so it can be restored once the test is done.
    # The current directory may itself no longer exist, in which case there is
    # nothing sensible to restore.
    try:
        original_cwd = os.getcwd()
    except FileNotFoundError:
        original_cwd = None

    # make temporary directory
    with tempfile.TemporaryDirectory() as tmpdirname:
        try:
            # Change to temporary working directory
            os.chdir(tmpdirname)

            # zero every second slice along the first dimension, so that half the
            # values are zeros
            sat_15_data_small.data[::2] = 0

            # Make 15-minutely satellite data available
            save_to_zarr_zip(sat_15_data_small, filename="latest.zarr.zip")

            os.environ["SATELLITE_ZARR_PATH"] = "latest.zarr.zip"
            download_all_sat_data()

            # check an error is made
            with pytest.raises(Exception):
                preprocess_sat_data(test_t0)
        finally:
            # Leave the temporary directory before it is deleted, so the process
            # is not left inside a directory which no longer exists
            if original_cwd is not None:
                os.chdir(original_cwd)
Loading