This repository has been archived by the owner on Sep 11, 2023. It is now read-only.

multiple input data files for pv #562

Merged

25 commits merged into main from issue/554-both-pv on Apr 4, 2022
Commits (25)
787db42
first try and making multiple input data files for pv
peterdudfield Dec 21, 2021
c5752dc
Merge branch 'main' into issue/554-both-pv
peterdudfield Dec 21, 2021
db41f03
fix post init order
peterdudfield Dec 21, 2021
24f907d
add fake test data for 2 pv systems
peterdudfield Dec 21, 2021
f7729aa
update for pv tests
peterdudfield Dec 21, 2021
455be19
remove zarr add nc pv test files
peterdudfield Dec 21, 2021
da595a3
Merge branch 'main' into issue/554-both-pv
peterdudfield Dec 21, 2021
5faf7a6
add index to pv systems ids
peterdudfield Dec 22, 2021
d11498a
add code for checking over lap
peterdudfield Dec 22, 2021
e2549d1
Merge commit '9e29dc30e21fb63acf6814b70759ffe81fb0d6d9' into issue/55…
peterdudfield Dec 22, 2021
ce68eb8
Apply suggestions from code review
peterdudfield Jan 28, 2022
91217c2
Merge branch 'main' into issue/554-both-pv
peterdudfield Jan 28, 2022
eec5ade
try with coverage 6.2
peterdudfield Jan 28, 2022
9509fa3
Merge branch 'main' into issue/554-both-pv
peterdudfield Feb 2, 2022
71bd236
update config model
peterdudfield Feb 2, 2022
3262683
use pv label to change index
peterdudfield Feb 3, 2022
819c026
encode hand craft cleaning of data
peterdudfield Feb 3, 2022
4d235e5
use constant list of pv providers
peterdudfield Feb 3, 2022
2292401
add pvoutput data to on_premises.yaml
peterdudfield Feb 3, 2022
78864be
Merge branch 'main' into issue/554-both-pv
peterdudfield Apr 4, 2022
1babc01
make sure config is back compatiable
peterdudfield Apr 4, 2022
bf7f1d9
Merge branch 'main' into issue/554-both-pv
peterdudfield Apr 4, 2022
d23e64b
add issue #631
peterdudfield Apr 4, 2022
3057f69
fix
peterdudfield Apr 4, 2022
49521d8
fix in manager.py #631
peterdudfield Apr 4, 2022
336 changes: 336 additions & 0 deletions notebooks/2021-09/2021-09-07/gsp_regions_20181031.geojson

Large diffs are not rendered by default.

9 changes: 7 additions & 2 deletions nowcasting_dataset/config/gcp.yaml
@@ -33,8 +33,13 @@ input_data:
pv:
forecast_minutes: 60
history_minutes: 30
pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv
pv_files_groups:
- label: passiv
pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv
- label: pvoutput
pv_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv
get_center: false

#---------------------- Satellite -------------
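For orientation, here is a minimal sketch, not part of the PR, of how a pv_files_groups section like the one above is parsed into the new PVFiles model (defined in nowcasting_dataset/config/model.py below). The PVFiles class here is a simplified stand-in without the real model's defaults and label validator, and it assumes PyYAML and pydantic are installed:

import yaml
from pydantic import BaseModel


class PVFiles(BaseModel):
    """Simplified stand-in for nowcasting_dataset.config.model.PVFiles."""

    pv_filename: str
    pv_metadata_filename: str
    label: str = "pvoutput"


config_text = """
pv_files_groups:
  - label: passiv
    pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
    pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv
  - label: pvoutput
    pv_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc
    pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv
"""

# Each mapping under pv_files_groups becomes one PVFiles instance.
groups = [PVFiles(**group) for group in yaml.safe_load(config_text)["pv_files_groups"]]
assert [g.label for g in groups] == ["passiv", "pvoutput"]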
52 changes: 48 additions & 4 deletions nowcasting_dataset/config/model.py
Expand Up @@ -27,6 +27,7 @@
DEFAULT_N_GSP_PER_EXAMPLE,
DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
NWP_VARIABLE_NAMES,
PV_PROVIDERS,
SAT_VARIABLE_NAMES,
)
from nowcasting_dataset.dataset.split import split
@@ -175,17 +176,32 @@ def check_start_and_end_datetime(cls, values):
return values


class PV(DataSourceMixin, StartEndDatetimeMixin):
"""PV configuration model"""
class PVFiles(BaseModel):
"""Model to hold pv file and metadata file"""

pv_filename: str = Field(
"gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc",
description=("The NetCDF file holding the solar PV power timeseries."),
description="The NetCDF files holding the solar PV power timeseries.",
)
pv_metadata_filename: str = Field(
"gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv",
description="The CSV file describing each PV system.",
description="Tthe CSV files describing each PV system.",
)

label: str = Field("pvoutput", description="Label of where the pv data came from")

@validator("label")
def v_label0(cls, v):
"""Validate 'label'"""
assert v in PV_PROVIDERS
return v


class PV(DataSourceMixin, StartEndDatetimeMixin):
"""PV configuration model"""

pv_files_groups: List[PVFiles] = [PVFiles()]

n_pv_systems_per_example: int = Field(
DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
description="The number of PV systems samples per example. "
@@ -201,6 +217,34 @@ class PV(DataSourceMixin, StartEndDatetimeMixin):
"PVDataSource is used to define the geospatial positions of each example.",
)

pv_filename: str = Field(
None,
description="The NetCDF files holding the solar PV power timeseries.",
)
pv_metadata_filename: str = Field(
None,
description="Tthe CSV files describing each PV system.",
)

@classmethod
def model_validation(cls, v):
"""Move old way of storing filenames to new way"""

if (v.pv_filename is not None) and (v.pv_metadata_filename is not None):
logger.warning(
"Loading pv files the old way, and moving them the new way. "
"Please update configuration file"
)
label = "pvoutput" if "pvoutput" in v.pv_filename.lower() else "passiv"
pv_file = PVFiles(
pv_filename=v.pv_filename, pv_metadata_filename=v.pv_metadata_filename, label=label
)
v.pv_files_groups = [pv_file]
v.pv_filename = None
v.pv_metadata_filename = None

return v


class Satellite(DataSourceMixin, TimeResolutionMixin):
"""Satellite configuration model"""
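To make the backward-compatibility rule above concrete, here is a rough standalone sketch of the migration, using plain dicts; this is an illustration of the logic in PV.model_validation, not the PR's exact pydantic wiring, and the paths below are hypothetical:

def migrate_pv_config(config: dict) -> dict:
    """Fold legacy pv_filename/pv_metadata_filename keys into pv_files_groups."""
    pv_filename = config.pop("pv_filename", None)
    pv_metadata_filename = config.pop("pv_metadata_filename", None)
    if (pv_filename is not None) and (pv_metadata_filename is not None):
        # Guess the provider from the filename, as PV.model_validation does.
        label = "pvoutput" if "pvoutput" in pv_filename.lower() else "passiv"
        config["pv_files_groups"] = [
            {
                "pv_filename": pv_filename,
                "pv_metadata_filename": pv_metadata_filename,
                "label": label,
            }
        ]
    return config


# Hypothetical old-style config (paths are illustrative only).
old_style = {
    "pv_filename": "gs://example-bucket/PV/PVOutput.org/UK_PV_timeseries_batch.nc",
    "pv_metadata_filename": "gs://example-bucket/PV/PVOutput.org/UK_PV_metadata.csv",
}
assert migrate_pv_config(old_style)["pv_files_groups"][0]["label"] == "pvoutput"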
9 changes: 7 additions & 2 deletions nowcasting_dataset/config/on_premises.yaml
@@ -30,8 +30,13 @@ input_data:

#---------------------- PV -------------------
pv:
pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/passiv.netcdf
pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/system_metadata_OCF_ONLY.csv
pv_files_groups:
- label: passiv
pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/passiv.netcdf
pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/system_metadata_OCF_ONLY.csv
- label: pvoutput
pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/PVOutput.org/UK_PV_timeseries_batch.nc
pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/PVOutput.org/UK_PV_metadata.csv
get_center: false
history_minutes: 90
log_level: "INFO"
2 changes: 2 additions & 0 deletions nowcasting_dataset/consts.py
@@ -135,3 +135,5 @@
)

LOG_LEVELS = ("DEBUG", "INFO", "WARNING", "ERROR")

PV_PROVIDERS = ["passiv", "pvoutput"]
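Worth noting (an observation about the design, not stated in the PR): encode_label in pv_data_source.py uses each provider's position in this list as the encoding digit, so new providers should only ever be appended to PV_PROVIDERS. Inserting or reordering entries would silently change the meaning of every previously encoded system id.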
2 changes: 1 addition & 1 deletion nowcasting_dataset/data_sources/datasource_output.py
@@ -99,7 +99,7 @@ def check_dataset_less_than_or_equal_to(
):
"""Check data is less than a certain value"""
if (data > max_value).any():
message = f"Some {self.__class__.__name__} data values are less than {max_value}"
message = f"Some {self.__class__.__name__} data values are more than {max_value}"
if variable_name is not None:
message += f" ({variable_name})"
logger.error(message)
89 changes: 75 additions & 14 deletions nowcasting_dataset/data_sources/pv/pv_data_source.py
@@ -2,7 +2,6 @@

import datetime
import functools
import io
import logging
from dataclasses import dataclass
from numbers import Number
@@ -16,7 +15,8 @@

import nowcasting_dataset.filesystem.utils as nd_fs_utils
from nowcasting_dataset import geospatial
from nowcasting_dataset.consts import DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE
from nowcasting_dataset.config.model import PVFiles
from nowcasting_dataset.consts import DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE, PV_PROVIDERS
from nowcasting_dataset.data_sources.data_source import ImageDataSource
from nowcasting_dataset.data_sources.metadata.metadata_model import SpaceTimeLocation
from nowcasting_dataset.data_sources.pv.pv_model import PV
Expand All @@ -33,8 +33,7 @@ class PVDataSource(ImageDataSource):
defined by image_size_pixels and meters_per_pixel.
"""

filename: Union[str, Path]
metadata_filename: Union[str, Path]
files_groups: List[Union[PVFiles, dict]]
# TODO: Issue #425: Use config to set start_dt and end_dt.
start_datetime: Optional[datetime.datetime] = None
end_datetime: Optional[datetime.datetime] = None
@@ -48,15 +47,20 @@

def __post_init__(self, image_size_pixels: int, meters_per_pixel: int):
"""Post Init"""

if type(self.files_groups[0]) == dict:
self.files_groups = [PVFiles(**files) for files in self.files_groups]

super().__post_init__(image_size_pixels, meters_per_pixel)

self.rng = np.random.default_rng()
self.load()

def check_input_paths_exist(self) -> None:
"""Check input paths exist. If not, raise a FileNotFoundError."""
for filename in [self.filename, self.metadata_filename]:
nd_fs_utils.check_path_exists(filename)
for pv_files in self.files_groups:
for filename in [pv_files.pv_filename, pv_files.pv_metadata_filename]:
nd_fs_utils.check_path_exists(filename)

def load(self):
"""
@@ -73,9 +77,23 @@ def get_data_model_for_batch():

def _load_metadata(self):

logger.debug(f"Loading PV metadata from {self.metadata_filename}")
logger.debug(f"Loading PV metadata from {self.files_groups}")

# collect all metadata together
pv_metadata = []
for pv_files in self.files_groups:
metadata_filename = pv_files.pv_metadata_filename

# read metadata file
metadata = pd.read_csv(metadata_filename, index_col="system_id")

# encode index, to make sure the indexes are unique
metadata.index = encode_label(indexes=metadata.index, label=pv_files.label)

pv_metadata.append(metadata)
pv_metadata = pd.concat(pv_metadata)

pv_metadata = pd.read_csv(self.metadata_filename, index_col="system_id")
# drop any systems with no lon or lat
pv_metadata.dropna(subset=["longitude", "latitude"], how="any", inplace=True)

pv_metadata["location_x"], pv_metadata["location_y"] = geospatial.lat_lon_to_osgb(
@@ -99,15 +117,33 @@ def _load_metadata(self):

def _load_pv_power(self):

logger.debug(f"Loading PV Power data from {self.filename}")
logger.debug(f"Loading PV Power data from {self.files_groups}")

pv_power = load_solar_pv_data(
self.filename, start_dt=self.start_datetime, end_dt=self.end_datetime
)
# collect all PV power timeseries together
pv_power_all = []
for pv_files in self.files_groups:
filename = pv_files.pv_filename

# get pv power data
pv_power = load_solar_pv_data(
filename, start_dt=self.start_datetime, end_dt=self.end_datetime
)

# encode index, to make sure the columns are unique
new_columns = encode_label(indexes=pv_power.columns, label=pv_files.label)
pv_power.columns = new_columns

pv_power_all.append(pv_power)

pv_power = pd.concat(pv_power_all, axis="columns")
assert not pv_power.columns.duplicated().any()

# A bit of hand-crafted cleaning
if 30248 in pv_power.columns:
pv_power[30248]["2018-10-29":"2019-01-03"] = np.NaN
bad_pvoutput_indexes = [30248]
bad_pvoutput_indexes = encode_label(bad_pvoutput_indexes, label="pvoutput")
for bad_index in bad_pvoutput_indexes:
if bad_index in pv_power.columns:
pv_power[bad_index]["2018-10-29":"2019-01-03"] = np.NaN

# Drop columns and rows with all NaNs.
pv_power.dropna(axis="columns", how="all", inplace=True)
@@ -418,3 +454,28 @@ def drop_pv_systems_which_produce_overnight(pv_power: pd.DataFrame) -> pd.DataFrame:
bad_systems = pv_power.columns[pv_above_threshold_at_night]
print(len(bad_systems), "bad PV systems found and removed!")
return pv_power.drop(columns=bad_systems)


def encode_label(indexes: List[str], label: str):
"""
Encode a list of indexes with a provider label.

The new encoding must be integers and unique.
It is also useful if the indexes can be read and deciphered by humans.
This is done by multiplying the original index by 10 and adding the position
of the label in PV_PROVIDERS (0 for 'passiv', 1 for 'pvoutput').

Args:
indexes: list of indexes
label: either 'passiv' or 'pvoutput'

Returns: list of indexes encoded by label
"""
assert label in PV_PROVIDERS
# this encoding only works if there are fewer than 10 pv providers
assert len(PV_PROVIDERS) < 10

label_index = PV_PROVIDERS.index(label)
new_index = [str(int(col) * 10 + label_index) for col in indexes]

return new_index
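A worked example of this encoding, plus a hypothetical decode_label helper that is not part of the PR, assuming encode_label and PV_PROVIDERS as defined above: 'pvoutput' sits at position 1 in PV_PROVIDERS, so system id 30248 becomes '302481', and the provider can always be recovered by peeling off the last digit.

PV_PROVIDERS = ["passiv", "pvoutput"]  # as in nowcasting_dataset/consts.py


def decode_label(encoded_index: str):
    """Hypothetical inverse of encode_label: split off the trailing provider digit."""
    return int(encoded_index[:-1]), PV_PROVIDERS[int(encoded_index[-1])]


assert encode_label([30248], label="pvoutput") == ["302481"]  # 30248 * 10 + 1
assert encode_label([7], label="passiv") == ["70"]  # 7 * 10 + 0
assert decode_label("302481") == (30248, "pvoutput")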
11 changes: 10 additions & 1 deletion nowcasting_dataset/manager/base.py
@@ -69,11 +69,20 @@ def initialize_data_sources(
config_for_data_source, pattern_to_remove=f"^{data_source_name}_"
)

# TODO: #631 remove
if data_source_name == "pv":
config_for_data_source.pop("filename")
config_for_data_source.pop("metadata_filename")

data_source_class = MAP_DATA_SOURCE_NAME_TO_CLASS[data_source_name]
try:
data_source = data_source_class(**config_for_data_source)
except Exception:
logger.exception(f"Exception whilst instantiating {data_source_name}!")
logger.exception(
f"Exception whilst instantiating {data_source_name}! "
f"Tried with configuration {config_for_data_source} "
f"in {data_source_class}"
)
raise
self.data_sources[data_source_name] = data_source

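A rough sketch of what the manager now does for the pv source, with the helper's behaviour assumed from the call above rather than copied from the repository: stripping the pv_ prefix turns pv_files_groups into the files_groups argument of PVDataSource, and the legacy flat filename keys are dropped until issue #631 removes them from the config model:

import re


def prepare_pv_kwargs(config_for_data_source: dict) -> dict:
    """Strip the pv_ prefix and drop legacy keys before building PVDataSource."""
    # Assumed to mirror remove_regex_pattern_from_keys(..., pattern_to_remove="^pv_").
    kwargs = {re.sub(r"^pv_", "", key): value for key, value in config_for_data_source.items()}
    # TODO #631: the config model still carries these fields; PVDataSource does not.
    kwargs.pop("filename", None)
    kwargs.pop("metadata_filename", None)
    return kwargs


assert prepare_pv_kwargs(
    {"pv_filename": None, "pv_metadata_filename": None, "pv_files_groups": []}
) == {"files_groups": []}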
4 changes: 4 additions & 0 deletions requirements.txt
@@ -9,6 +9,10 @@ gcsfs
dask
pvlib
pyproj
pytest
coverage<6.3
pytest-cov
jedi
mypy
pydantic
tqdm