This repository has been archived by the owner on Sep 11, 2023. It is now read-only.

multiple input data files for pv #562

Merged

25 commits merged into main from issue/554-both-pv on Apr 4, 2022
Commits (25)
787db42
first try and making multiple input data files for pv
peterdudfield Dec 21, 2021
c5752dc
Merge branch 'main' into issue/554-both-pv
peterdudfield Dec 21, 2021
db41f03
fix post init order
peterdudfield Dec 21, 2021
24f907d
add fake test data for 2 pv systems
peterdudfield Dec 21, 2021
f7729aa
update for pv tests
peterdudfield Dec 21, 2021
455be19
remove zarr add nc pv test files
peterdudfield Dec 21, 2021
da595a3
Merge branch 'main' into issue/554-both-pv
peterdudfield Dec 21, 2021
5faf7a6
add index to pv systems ids
peterdudfield Dec 22, 2021
d11498a
add code for checking over lap
peterdudfield Dec 22, 2021
e2549d1
Merge commit '9e29dc30e21fb63acf6814b70759ffe81fb0d6d9' into issue/55…
peterdudfield Dec 22, 2021
ce68eb8
Apply suggestions from code review
peterdudfield Jan 28, 2022
91217c2
Merge branch 'main' into issue/554-both-pv
peterdudfield Jan 28, 2022
eec5ade
try with coverage 6.2
peterdudfield Jan 28, 2022
9509fa3
Merge branch 'main' into issue/554-both-pv
peterdudfield Feb 2, 2022
71bd236
update config model
peterdudfield Feb 2, 2022
3262683
use pv label to change index
peterdudfield Feb 3, 2022
819c026
encode hand craft cleaning of data
peterdudfield Feb 3, 2022
4d235e5
use constant list of pv providers
peterdudfield Feb 3, 2022
2292401
add pvoutput data to on_premises.yaml
peterdudfield Feb 3, 2022
78864be
Merge branch 'main' into issue/554-both-pv
peterdudfield Apr 4, 2022
1babc01
make sure config is back compatiable
peterdudfield Apr 4, 2022
bf7f1d9
Merge branch 'main' into issue/554-both-pv
peterdudfield Apr 4, 2022
d23e64b
add issue #631
peterdudfield Apr 4, 2022
3057f69
fix
peterdudfield Apr 4, 2022
49521d8
fix in manager.py #631
peterdudfield Apr 4, 2022
336 changes: 336 additions & 0 deletions notebooks/2021-09/2021-09-07/gsp_regions_20181031.geojson

Large diffs are not rendered by default.

9 changes: 7 additions & 2 deletions nowcasting_dataset/config/gcp.yaml
@@ -33,8 +33,13 @@ input_data:
pv:
forecast_minutes: 60
history_minutes: 30
pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv
pv_files_groups:
- label: passiv
pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv
- label: pvoutput
pv_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv
get_center: false

#---------------------- Satellite -------------
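For orientation, here is a minimal sketch, not part of the PR, of how a pv_files_groups section like the one above is parsed into the new PVFiles model (defined in nowcasting_dataset/config/model.py below). The PVFiles class here is a simplified stand-in without the real model's defaults and label validator, and it assumes PyYAML and pydantic are installed:

import yaml
from pydantic import BaseModel


class PVFiles(BaseModel):
    """Simplified stand-in for nowcasting_dataset.config.model.PVFiles."""

    pv_filename: str
    pv_metadata_filename: str
    label: str = "pvoutput"


config_text = """
pv_files_groups:
  - label: passiv
    pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
    pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv
  - label: pvoutput
    pv_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc
    pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv
"""

# Each mapping under pv_files_groups becomes one PVFiles instance.
groups = [PVFiles(**group) for group in yaml.safe_load(config_text)["pv_files_groups"]]
assert [g.label for g in groups] == ["passiv", "pvoutput"]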
52 changes: 48 additions & 4 deletions nowcasting_dataset/config/model.py
Expand Up @@ -27,6 +27,7 @@
DEFAULT_N_GSP_PER_EXAMPLE,
DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
NWP_VARIABLE_NAMES,
PV_PROVIDERS,
SAT_VARIABLE_NAMES,
)
from nowcasting_dataset.dataset.split import split
@@ -175,17 +176,32 @@ def check_start_and_end_datetime(cls, values):
return values


class PV(DataSourceMixin, StartEndDatetimeMixin):
"""PV configuration model"""
class PVFiles(BaseModel):
"""Model to hold pv file and metadata file"""

pv_filename: str = Field(
"gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc",
description=("The NetCDF file holding the solar PV power timeseries."),
description="The NetCDF files holding the solar PV power timeseries.",
)
pv_metadata_filename: str = Field(
"gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv",
description="The CSV file describing each PV system.",
description="Tthe CSV files describing each PV system.",
)

label: str = Field("pvoutput", description="Label of where the pv data came from")

@validator("label")
def v_label0(cls, v):
"""Validate 'label'"""
assert v in PV_PROVIDERS
return v


class PV(DataSourceMixin, StartEndDatetimeMixin):
"""PV configuration model"""

pv_files_groups: List[PVFiles] = [PVFiles()]

n_pv_systems_per_example: int = Field(
DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
description="The number of PV systems samples per example. "
@@ -201,6 +217,34 @@ class PV(DataSourceMixin, StartEndDatetimeMixin):
"PVDataSource is used to define the geospatial positions of each example.",
)

pv_filename: str = Field(
None,
description="The NetCDF files holding the solar PV power timeseries.",
)
pv_metadata_filename: str = Field(
None,
description="Tthe CSV files describing each PV system.",
)

@classmethod
def model_validation(cls, v):
"""Move old way of storing filenames to new way"""

if (v.pv_filename is not None) and (v.pv_metadata_filename is not None):
logger.warning(
"Loading pv files the old way, and moving them the new way. "
"Please update configuration file"
)
label = "pvoutput" if "pvoutput" in v.pv_filename.lower() else "passiv"
pv_file = PVFiles(
pv_filename=v.pv_filename, pv_metadata_filename=v.pv_metadata_filename, label=label
)
v.pv_files_groups = [pv_file]
v.pv_filename = None
v.pv_metadata_filename = None

return v


class Satellite(DataSourceMixin, TimeResolutionMixin):
"""Satellite configuration model"""
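To make the backward-compatibility rule above concrete, here is a rough standalone sketch of the migration, using plain dicts; this is an illustration of the logic in PV.model_validation, not the PR's exact pydantic wiring, and the paths below are hypothetical:

def migrate_pv_config(config: dict) -> dict:
    """Fold legacy pv_filename/pv_metadata_filename keys into pv_files_groups."""
    pv_filename = config.pop("pv_filename", None)
    pv_metadata_filename = config.pop("pv_metadata_filename", None)
    if (pv_filename is not None) and (pv_metadata_filename is not None):
        # Guess the provider from the filename, as PV.model_validation does.
        label = "pvoutput" if "pvoutput" in pv_filename.lower() else "passiv"
        config["pv_files_groups"] = [
            {
                "pv_filename": pv_filename,
                "pv_metadata_filename": pv_metadata_filename,
                "label": label,
            }
        ]
    return config


# Hypothetical old-style config (paths are illustrative only).
old_style = {
    "pv_filename": "gs://example-bucket/PV/PVOutput.org/UK_PV_timeseries_batch.nc",
    "pv_metadata_filename": "gs://example-bucket/PV/PVOutput.org/UK_PV_metadata.csv",
}
assert migrate_pv_config(old_style)["pv_files_groups"][0]["label"] == "pvoutput"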
9 changes: 7 additions & 2 deletions nowcasting_dataset/config/on_premises.yaml
@@ -30,8 +30,13 @@ input_data:

#---------------------- PV -------------------
pv:
pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/passiv.netcdf
pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/system_metadata_OCF_ONLY.csv
pv_files_groups:
- label: passiv
pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/passiv.netcdf
pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/system_metadata_OCF_ONLY.csv
- label: pvoutput
pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/PVOutput.org/UK_PV_timeseries_batch.nc
pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/PVOutput.org/UK_PV_metadata.csv
get_center: false
history_minutes: 90
log_level: "INFO"
2 changes: 2 additions & 0 deletions nowcasting_dataset/consts.py
@@ -135,3 +135,5 @@
)

LOG_LEVELS = ("DEBUG", "INFO", "WARNING", "ERROR")

PV_PROVIDERS = ["passiv", "pvoutput"]
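Worth noting (an observation about the design, not stated in the PR): encode_label in pv_data_source.py uses each provider's position in this list as the encoding digit, so new providers should only ever be appended to PV_PROVIDERS. Inserting or reordering entries would silently change the meaning of every previously encoded system id.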
2 changes: 1 addition & 1 deletion nowcasting_dataset/data_sources/datasource_output.py
@@ -99,7 +99,7 @@ def check_dataset_less_than_or_equal_to(
):
"""Check data is less than a certain value"""
if (data > max_value).any():
message = f"Some {self.__class__.__name__} data values are less than {max_value}"
message = f"Some {self.__class__.__name__} data values are more than {max_value}"
if variable_name is not None:
message += f" ({variable_name})"
logger.error(message)
89 changes: 75 additions & 14 deletions nowcasting_dataset/data_sources/pv/pv_data_source.py
@@ -2,7 +2,6 @@

import datetime
import functools
import io
import logging
from dataclasses import dataclass
from numbers import Number
@@ -16,7 +15,8 @@

import nowcasting_dataset.filesystem.utils as nd_fs_utils
from nowcasting_dataset import geospatial
from nowcasting_dataset.consts import DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE
from nowcasting_dataset.config.model import PVFiles
from nowcasting_dataset.consts import DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE, PV_PROVIDERS
from nowcasting_dataset.data_sources.data_source import ImageDataSource
from nowcasting_dataset.data_sources.metadata.metadata_model import SpaceTimeLocation
from nowcasting_dataset.data_sources.pv.pv_model import PV
Expand All @@ -33,8 +33,7 @@ class PVDataSource(ImageDataSource):
defined by image_size_pixels and meters_per_pixel.
"""

filename: Union[str, Path]
metadata_filename: Union[str, Path]
files_groups: List[Union[PVFiles, dict]]
# TODO: Issue #425: Use config to set start_dt and end_dt.
start_datetime: Optional[datetime.datetime] = None
end_datetime: Optional[datetime.datetime] = None
@@ -48,15 +47,20 @@

def __post_init__(self, image_size_pixels: int, meters_per_pixel: int):
"""Post Init"""

if type(self.files_groups[0]) == dict:
self.files_groups = [PVFiles(**files) for files in self.files_groups]

super().__post_init__(image_size_pixels, meters_per_pixel)

self.rng = np.random.default_rng()
self.load()

def check_input_paths_exist(self) -> None:
"""Check input paths exist. If not, raise a FileNotFoundError."""
for filename in [self.filename, self.metadata_filename]:
nd_fs_utils.check_path_exists(filename)
for pv_files in self.files_groups:
for filename in [pv_files.pv_filename, pv_files.pv_metadata_filename]:
nd_fs_utils.check_path_exists(filename)

def load(self):
"""
@@ -73,9 +77,23 @@ def get_data_model_for_batch():

def _load_metadata(self):

logger.debug(f"Loading PV metadata from {self.metadata_filename}")
logger.debug(f"Loading PV metadata from {self.files_groups}")

# collect all metadata together
pv_metadata = []
for pv_files in self.files_groups:
metadata_filename = pv_files.pv_metadata_filename

# read metadata file
metadata = pd.read_csv(metadata_filename, index_col="system_id")

# encode index, to make sure the indexes are unique
metadata.index = encode_label(indexes=metadata.index, label=pv_files.label)

pv_metadata.append(metadata)
pv_metadata = pd.concat(pv_metadata)

pv_metadata = pd.read_csv(self.metadata_filename, index_col="system_id")
# drop any systems with no lon or lat
pv_metadata.dropna(subset=["longitude", "latitude"], how="any", inplace=True)

pv_metadata["location_x"], pv_metadata["location_y"] = geospatial.lat_lon_to_osgb(
@@ -99,15 +117,33 @@ def _load_metadata(self):

def _load_pv_power(self):

logger.debug(f"Loading PV Power data from {self.filename}")
logger.debug(f"Loading PV Power data from {self.files_groups}")

pv_power = load_solar_pv_data(
self.filename, start_dt=self.start_datetime, end_dt=self.end_datetime
)
# collect all PV power timeseries together
pv_power_all = []
for pv_files in self.files_groups:
filename = pv_files.pv_filename

# get pv power data
pv_power = load_solar_pv_data(
filename, start_dt=self.start_datetime, end_dt=self.end_datetime
)

# encode index, to make sure the columns are unique
new_columns = encode_label(indexes=pv_power.columns, label=pv_files.label)
pv_power.columns = new_columns

pv_power_all.append(pv_power)

pv_power = pd.concat(pv_power_all, axis="columns")
assert not pv_power.columns.duplicated().any()

# A bit of hand-crafted cleaning
if 30248 in pv_power.columns:
pv_power[30248]["2018-10-29":"2019-01-03"] = np.NaN
bad_pvoutput_indexes = [30248]
bad_pvoutput_indexes = encode_label(bad_pvoutput_indexes, label="pvoutput")
for bad_index in bad_pvoutput_indexes:
if bad_index in pv_power.columns:
pv_power[bad_index]["2018-10-29":"2019-01-03"] = np.NaN

# Drop columns and rows with all NaNs.
pv_power.dropna(axis="columns", how="all", inplace=True)
@@ -418,3 +454,28 @@ def drop_pv_systems_which_produce_overnight(pv_power: pd.DataFrame) -> pd.DataFrame:
bad_systems = pv_power.columns[pv_above_threshold_at_night]
print(len(bad_systems), "bad PV systems found and removed!")
return pv_power.drop(columns=bad_systems)


def encode_label(indexes: List[str], label: str):
"""
Encode a list of indexes with a provider label.

The new encoding must be integers and unique.
It is also useful if the indexes can be read and deciphered by humans.
This is done by multiplying the original index by 10 and adding the position
of the label in PV_PROVIDERS (0 for 'passiv', 1 for 'pvoutput').

Args:
indexes: list of indexes
label: either 'passiv' or 'pvoutput'

Returns: list of indexes encoded by label
"""
assert label in PV_PROVIDERS
# this encoding only works if there are fewer than 10 pv providers
assert len(PV_PROVIDERS) < 10

label_index = PV_PROVIDERS.index(label)
new_index = [str(int(col) * 10 + label_index) for col in indexes]

return new_index
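A worked example of this encoding, plus a hypothetical decode_label helper that is not part of the PR, assuming encode_label and PV_PROVIDERS as defined above: 'pvoutput' sits at position 1 in PV_PROVIDERS, so system id 30248 becomes '302481', and the provider can always be recovered by peeling off the last digit.

PV_PROVIDERS = ["passiv", "pvoutput"]  # as in nowcasting_dataset/consts.py


def decode_label(encoded_index: str):
    """Hypothetical inverse of encode_label: split off the trailing provider digit."""
    return int(encoded_index[:-1]), PV_PROVIDERS[int(encoded_index[-1])]


assert encode_label([30248], label="pvoutput") == ["302481"]  # 30248 * 10 + 1
assert encode_label([7], label="passiv") == ["70"]  # 7 * 10 + 0
assert decode_label("302481") == (30248, "pvoutput")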
11 changes: 10 additions & 1 deletion nowcasting_dataset/manager/base.py
@@ -69,11 +69,20 @@ def initialize_data_sources(
config_for_data_source, pattern_to_remove=f"^{data_source_name}_"
)

# TODO: #631 remove
if data_source_name == "pv":
config_for_data_source.pop("filename")
config_for_data_source.pop("metadata_filename")

data_source_class = MAP_DATA_SOURCE_NAME_TO_CLASS[data_source_name]
try:
data_source = data_source_class(**config_for_data_source)
except Exception:
logger.exception(f"Exception whilst instantiating {data_source_name}!")
logger.exception(
f"Exception whilst instantiating {data_source_name}! "
f"Tried with configuration {config_for_data_source} "
f"in {data_source_class}"
)
raise
self.data_sources[data_source_name] = data_source

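A rough sketch of what the manager now does for the pv source, with the helper's behaviour assumed from the call above rather than copied from the repository: stripping the pv_ prefix turns pv_files_groups into the files_groups argument of PVDataSource, and the legacy flat filename keys are dropped until issue #631 removes them from the config model:

import re


def prepare_pv_kwargs(config_for_data_source: dict) -> dict:
    """Strip the pv_ prefix and drop legacy keys before building PVDataSource."""
    # Assumed to mirror remove_regex_pattern_from_keys(..., pattern_to_remove="^pv_").
    kwargs = {re.sub(r"^pv_", "", key): value for key, value in config_for_data_source.items()}
    # TODO #631: the config model still carries these fields; PVDataSource does not.
    kwargs.pop("filename", None)
    kwargs.pop("metadata_filename", None)
    return kwargs


assert prepare_pv_kwargs(
    {"pv_filename": None, "pv_metadata_filename": None, "pv_files_groups": []}
) == {"files_groups": []}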
4 changes: 4 additions & 0 deletions requirements.txt
@@ -9,6 +9,10 @@ gcsfs
dask
pvlib
pyproj
pytest
coverage<6.3
pytest-cov
jedi
mypy
pydantic
tqdm