Satellite normalisation #97 (Open)

Wants to merge 22 commits into base: main

Changes from 6 commits (22 commits total)
Binary file added: .DS_Store (not shown)
1 change: 1 addition & 0 deletions .github/workflows/workflows.yaml
@@ -18,3 +18,4 @@ jobs:
#sudo_apt_install: "libgeos++-dev libproj-dev proj-data proj-bin"
# brew_install: "proj geos librttopo"
os_list: '["ubuntu-latest"]'
python_version: "['3.11']"
Binary file added: ocf_data_sampler/.DS_Store (not shown)
67 changes: 67 additions & 0 deletions ocf_data_sampler/constants.py
@@ -7,6 +7,10 @@
"ecmwf",
]

SAT_PROVIDERS = [
"rss",
]


def _to_data_array(d):
return xr.DataArray(
@@ -28,6 +32,21 @@ def __getitem__(self, key):
f"Values for {key} not yet available in ocf-data-sampler {list(self.keys())}"
)


class SatStatDict(dict):
"""Custom dictionary class to hold Satellite normalization stats"""

def __getitem__(self, key):
if key not in SAT_PROVIDERS:
raise KeyError(f"{key} is not a supported Satellite provider - {SAT_PROVIDERS}")
elif key in self.keys():
return super().__getitem__(key)
else:
raise KeyError(
f"Values for {key} not yet available in ocf-data-sampler {list(self.keys())}"
)


# ------ UKV
# Means and std computed WITH version_7 and higher, MetOffice values
UKV_STD = {
@@ -49,6 +68,7 @@ def __getitem__(self, key):
"prmsl": 1252.71790539,
"prate": 0.00021497,
}

UKV_MEAN = {
"cdcb": 1412.26599062,
"lcc": 50.08362643,
@@ -97,6 +117,7 @@ def __getitem__(self, key):
"diff_duvrs": 81605.25,
"diff_sr": 818950.6875,
}

ECMWF_MEAN = {
"dlwrf": 27187026.0,
"dswrf": 11458988.0,
@@ -133,3 +154,49 @@ def __getitem__(self, key):
ecmwf=ECMWF_MEAN,
)

# ------ Satellite
# RSS Mean and std values from randomised 20% of 2020 imagery

RSS_STD = {
"HRV": 0.11405209,
"IR_016": 0.21462157,
"IR_039": 0.04618041,
"IR_087": 0.06687243,
"IR_097": 0.0468558,
"IR_108": 0.17482725,
"IR_120": 0.06115861,
"IR_134": 0.04492306,
"VIS006": 0.12184761,
"VIS008": 0.13090034,
"WV_062": 0.16111417,
"WV_073": 0.12924142,
}

RSS_MEAN = {
"HRV": 0.09298719,
"IR_016": 0.17594202,
"IR_039": 0.86167645,
"IR_087": 0.7719318,
"IR_097": 0.8014212,
"IR_108": 0.71254843,
"IR_120": 0.89058584,
"IR_134": 0.944365,
"VIS006": 0.09633306,
"VIS008": 0.11426069,
"WV_062": 0.7359355,
"WV_073": 0.62479186,
}

# Specified to ensure calculation stability
EPSILON = 1e-8

RSS_STD = _to_data_array(RSS_STD)
RSS_MEAN = _to_data_array(RSS_MEAN)

SAT_STDS = SatStatDict(
rss=RSS_STD,
)

SAT_MEANS = SatStatDict(
rss=RSS_MEAN,
)
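
As a rough usage sketch of the lookups above (assuming the module is imported as `ocf_data_sampler.constants`; the `"goes"` key is purely illustrative), a supported provider returns the channel-indexed statistics, while an unknown provider fails fast with a descriptive `KeyError`:

```python
from ocf_data_sampler.constants import SAT_MEANS, SAT_STDS

# Supported provider: returns the per-channel statistics as an xr.DataArray
rss_means = SAT_MEANS["rss"]
rss_stds = SAT_STDS["rss"]

# Unsupported provider: SatStatDict raises a KeyError listing the supported providers
try:
    SAT_MEANS["goes"]
except KeyError as err:
    print(err)  # goes is not a supported Satellite provider - ['rss']
```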
7 changes: 6 additions & 1 deletion ocf_data_sampler/numpy_batch/nwp.py
@@ -1,5 +1,4 @@
"""Convert NWP to NumpyBatch"""

import pandas as pd
import xarray as xr

@@ -19,6 +18,12 @@ class NWPBatchKey:
def convert_nwp_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> dict:
"""Convert from Xarray to NWP NumpyBatch"""

# Missing coordinate checking stage
required_coords = ["y_osgb", "x_osgb"]
for coord in required_coords:
if coord not in da.coords:
raise ValueError(f"Input DataArray missing '{coord}'")

example = {
NWPBatchKey.nwp: da.values,
NWPBatchKey.channel_names: da.channel.values,
9 changes: 8 additions & 1 deletion ocf_data_sampler/numpy_batch/satellite.py
@@ -13,6 +13,13 @@ class SatelliteBatchKey:

def convert_satellite_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None) -> dict:
"""Convert from Xarray to NumpyBatch"""

# Missing coordinate checking stage
required_coords = ["x_geostationary", "y_geostationary"]
for coord in required_coords:
if coord not in da.coords:
raise ValueError(f"Input DataArray missing '{coord}'")

example = {
SatelliteBatchKey.satellite_actual: da.values,
SatelliteBatchKey.time_utc: da.time_utc.values.astype(float),
@@ -27,4 +34,4 @@ def convert_satellite_to_numpy_batch(da: xr.DataArray, t0_idx: int | None = None
if t0_idx is not None:
example[SatelliteBatchKey.t0_idx] = t0_idx

return example
return example
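
As an illustration of the new fail-fast coordinate checks (the same pattern appears in both `convert_nwp_to_numpy_batch` and `convert_satellite_to_numpy_batch`), here is a minimal sketch using a purely illustrative DataArray that lacks the required geostationary coordinates:

```python
import numpy as np
import pandas as pd
import xarray as xr

from ocf_data_sampler.numpy_batch import convert_satellite_to_numpy_batch

# Illustrative satellite DataArray with no x_geostationary / y_geostationary coordinates
da = xr.DataArray(
    np.zeros((2, 1, 4, 4)),
    dims=["time_utc", "channel", "y_geostationary", "x_geostationary"],
    coords={
        "time_utc": pd.date_range("2020-01-01", periods=2, freq="5min"),
        "channel": ["HRV"],
    },
)

try:
    convert_satellite_to_numpy_batch(da)
except ValueError as err:
    print(err)  # Input DataArray missing 'x_geostationary'
```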
28 changes: 22 additions & 6 deletions ocf_data_sampler/torch_datasets/process_and_combine.py
@@ -3,7 +3,7 @@
import xarray as xr

from ocf_data_sampler.config import Configuration
from ocf_data_sampler.constants import NWP_MEANS, NWP_STDS
from ocf_data_sampler.constants import NWP_MEANS, NWP_STDS, SAT_MEANS, SAT_STDS, EPSILON
from ocf_data_sampler.numpy_batch import (
convert_nwp_to_numpy_batch,
convert_satellite_to_numpy_batch,
@@ -13,6 +13,8 @@
)
from ocf_data_sampler.numpy_batch.gsp import GSPBatchKey
from ocf_data_sampler.numpy_batch.nwp import NWPBatchKey
from ocf_data_sampler.numpy_batch.satellite import SatelliteBatchKey

from ocf_data_sampler.select.geospatial import osgb_to_lon_lat
from ocf_data_sampler.select.location import Location
from ocf_data_sampler.utils import minutes
@@ -25,8 +27,8 @@ def process_and_combine_datasets(
location: Location,
target_key: str = 'gsp'
) -> dict:
"""Normalize and convert data to numpy arrays"""

"""Normalise and convert data to numpy arrays"""
numpy_modalities = []

if "nwp" in dataset_dict:
@@ -37,18 +39,32 @@
# Standardise
provider = config.input_data.nwp[nwp_key].provider
da_nwp = (da_nwp - NWP_MEANS[provider]) / NWP_STDS[provider]

# Convert to NumpyBatch
nwp_numpy_modalities[nwp_key] = convert_nwp_to_numpy_batch(da_nwp)

# Combine the NWPs into NumpyBatch
numpy_modalities.append({NWPBatchKey.nwp: nwp_numpy_modalities})


if "sat" in dataset_dict:
# Satellite is already in the range [0-1] so no need to standardise
da_sat = dataset_dict["sat"]

# Convert to NumpyBatch
numpy_modalities.append(convert_satellite_to_numpy_batch(da_sat))
sat_numpy_modalities = dict()

for sat_key, da_sat in dataset_dict["sat"].items():
# Standardise
provider = config.input_data.satellite[sat_key].provider

# Epsilon may not be strictly necessary since the mean and std values are consistently non-zero;
# it is included purely as a safety measure
da_sat = (da_sat - SAT_MEANS[provider]) / (SAT_STDS[provider] + EPSILON)

# Convert to NumpyBatch
sat_numpy_modalities[sat_key] = convert_satellite_to_numpy_batch(da_sat)

# Combine the satellites into NumpyBatch
numpy_modalities.append({SatelliteBatchKey.satellite_actual: sat_numpy_modalities})
Contributor comment: This line could be changed back to numpy_modalities.append(convert_satellite_to_numpy_batch(da_sat)), and then I think it'll be fine.



gsp_config = config.input_data.gsp

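
A minimal sketch of the per-provider standardisation step, assuming `da_sat` is a channel-indexed DataArray already scaled to [0, 1] (the array construction below is illustrative only; the statistics and `EPSILON` come from `ocf_data_sampler.constants` as added above):

```python
import numpy as np
import xarray as xr

from ocf_data_sampler.constants import SAT_MEANS, SAT_STDS, EPSILON

provider = "rss"

# Illustrative satellite DataArray with a channel coordinate matching the RSS_MEAN / RSS_STD keys
da_sat = xr.DataArray(
    np.random.rand(2, 3, 4, 4),
    dims=["time_utc", "channel", "y_geostationary", "x_geostationary"],
    coords={"channel": ["HRV", "VIS006", "IR_108"]},
)

# Per-channel standardisation; EPSILON guards against division by zero,
# although the RSS std values are all non-zero
da_sat = (da_sat - SAT_MEANS[provider]) / (SAT_STDS[provider] + EPSILON)
```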
136 changes: 136 additions & 0 deletions requirements.txt
@@ -0,0 +1,136 @@
aiobotocore
aiohttp
aioitertools
aiosignal
alabaster
altair
anaconda-client
anyio
appdirs==1.4.4
argon2-cffi
arrow
astroid
astropy
attrs
autopep8
Babel
bcrypt
beautifulsoup4
black
blosc2>=2.7.1,<3.0.0
bokeh
botocore
Bottleneck
cachetools
certifi
cffi
chardet
charset-normalizer
click
cloudpickle
colorama
cycler
cytoolz
dask
datashader
debugpy
decorator
defusedxml
dill
distributed
fsspec
gensim
greenlet
h5py
holoviews
hvplot
idna
imagecodecs
imageio
imbalanced-learn
importlib-metadata
ipykernel
ipython
ipywidgets
isort
jedi
joblib
jupyter
jupyter-client
jupyter-core
jupyterlab
jupyterlab-widgets
kiwisolver
lazy-object-proxy
llvmlite
lmdb
locket
lxml
lz4
Markdown
matplotlib==3.9.2
mccabe
mistune
more-itertools
mpmath
msgpack
multidict
networkx
nltk
notebook
numba
numcodecs==0.13.1
numexpr
numpy
numpydoc
openpyxl
packaging
pandas
param
patsy
pexpect
pickleshare
pillow
pkginfo
platformdirs
pluggy
prompt-toolkit
protobuf==4.25.3
psutil
ptyprocess
PyArrow
pydantic
PyYAML
pyzmq
pathy
qtconsole
queuelib
regex
requests
scikit-image
scikit-learn
scipy
seaborn
setuptools==75.1.0
shapely
six
sqlalchemy
statsmodels
sympy
tables
tabulate
threadpoolctl
tifffile
toolz
torch==2.5.1
tornado
tqdm
traitlets
typing-extensions
urllib3
watchdog
wcwidth
xarray
zarr==2.18.3
zict
zstandard
Binary file added: tests/.DS_Store (not shown)
Binary file added: tests/torch_datasets/.DS_Store (not shown)