Commit 465e518
Merge branch 'data_sampler' of https://github.com/openclimatefix/PVNet into data_sampler

dfulu committed Nov 13, 2024
2 parents 87d5718 + db81147 · commit 465e518
Showing 14 changed files with 294 additions and 254 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,7 +1,7 @@
[bumpversion]
commit = True
tag = True
-current_version = 3.0.53
+current_version = 3.0.63
message = Bump version: {current_version} → {new_version} [skip ci]

[bumpversion:file:pvnet/__init__.py]
5 changes: 4 additions & 1 deletion README.md
@@ -1,6 +1,7 @@
# PVNet 2.1

-[![Python Bump Version & release](https://github.com/openclimatefix/PVNet/actions/workflows/release.yml/badge.svg)](https://github.com/openclimatefix/PVNet/actions/workflows/release.yml)
+[![Python Bump Version & release](https://github.com/openclimatefix/PVNet/actions/workflows/release.yml/badge.svg)](https://github.com/openclimatefix/PVNet/actions/workflows/release.yml) [![ease of contribution: hard](https://img.shields.io/badge/ease%20of%20contribution:%20hard-bb2629)](https://github.com/openclimatefix/ocf-meta-repo?tab=readme-ov-file#overview-of-ocfs-nowcasting-repositories)


This project is used for training PVNet and running PVNet on live data.

@@ -85,6 +86,8 @@ OCF maintains a Zarr formatted version of the German Weather Service's (DWD)
ICON-EU NWP model here:
https://huggingface.co/datasets/openclimatefix/dwd-icon-eu which includes the UK

+Please note that the current version of the [ICON loader](https://github.com/openclimatefix/ocf_datapipes/blob/9ec252eeee44937c12ab52699579bdcace76e72f/ocf_datapipes/load/nwp/providers/icon.py#L9-L30) supports a different format. If you want to use our ICON-EU dataset or your own NWP source, you can create a loader for it using [the instructions here](https://github.com/openclimatefix/ocf_datapipes/tree/main/ocf_datapipes/load#nwp).
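As a rough sketch of what consuming the Zarr-formatted dataset might look like (the path is illustrative and assumes a locally downloaded copy of the archive):

    import xarray as xr

    # Hypothetical local path to a downloaded snapshot of the ICON-EU archive
    ds = xr.open_zarr("dwd-icon-eu/2024-01-01.zarr")
    print(list(ds.data_vars))  # inspect the available NWP variables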

**PV**\
OCF maintains a dataset of PV generation from 1311 private PV installations
here: https://huggingface.co/datasets/openclimatefix/uk_pv
48 changes: 32 additions & 16 deletions experiments/analysis.py → experiments/mae_analysis.py
@@ -1,5 +1,8 @@
"""
-Script to generate a table comparing two run for MAE values for 48 hour 15 minute forecast
+Script to generate analysis of MAE values for multiple model forecasts
+Does this for 48 hour horizon forecasts with 15 minute granularity
"""

import argparse
@@ -10,16 +13,23 @@
import wandb


-def main(runs: list[str], run_names: list[str]) -> None:
+def main(project: str, runs: list[str], run_names: list[str]) -> None:
    """
-    Compare two runs for MAE values for 48 hour 15 minute forecast
+    Compare MAE values for multiple model forecasts for 48 hour horizon with 15 minute granularity
+
+    Args:
+        project: name of W&B project
+        runs: W&B ids of runs
+        run_names: user specified names for runs
    """
    api = wandb.Api()
    dfs = []
+    epoch_num = []
    for run in runs:
-        run = api.run(f"openclimatefix/india/{run}")
+        run = api.run(f"openclimatefix/{project}/{run}")

-        df = run.history()
+        df = run.history(samples=run.lastHistoryStep + 1)
        # Get the columns that are in the format 'MAE_horizon/step_<number>/val`
        mae_cols = [col for col in df.columns if "MAE_horizon/step_" in col and "val" in col]
        # Sort them
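For context on the `run.history()` change above: the W&B public API returns a sampled subset of a run's logged metrics by default (up to 500 rows), so passing `samples=run.lastHistoryStep + 1` requests every logged step. A minimal sketch, with a placeholder entity/project/run id:

    import wandb

    api = wandb.Api()
    run = api.run("openclimatefix/pvnet/abc123")  # hypothetical <entity>/<project>/<run_id>
    sampled = run.history()  # default: up to 500 sampled rows
    full = run.history(samples=run.lastHistoryStep + 1)  # every logged step
    print(len(sampled), len(full))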
@@ -40,6 +50,7 @@ def main(runs: list[str], run_names: list[str]) -> None:
        # Get the step from the column name
        column_timesteps = [int(col.split("_")[-1].split("/")[0]) * 15 for col in mae_cols]
        dfs.append(df)
+        epoch_num.append(min_row_idx)
    # Get the timedelta for each group
    groupings = [
        [0, 0],
@@ -86,36 +97,41 @@ def main(runs: list[str], run_names: list[str]) -> None:
    for idx, df in enumerate(dfs):
        print(f"{run_names[idx]}: {df.mean()*100:0.3f}")

-    # Plot the error on per timestep, and all timesteps
+    # Plot the error per timestep
    plt.figure()
    for idx, df in enumerate(dfs):
-        plt.plot(column_timesteps, df, label=run_names[idx])
+        plt.plot(
+            column_timesteps, df, label=f"{run_names[idx]}, epoch: {epoch_num[idx]}", linestyle="-"
+        )
    plt.legend()
    plt.xlabel("Timestep (minutes)")
    plt.ylabel("MAE %")
    plt.title("MAE % for each timestep")
    plt.savefig("mae_per_timestep.png")
    plt.show()

-    # Plot the error on per timestep, and grouped timesteps
+    # Plot the error per grouped timestep
    plt.figure()
-    for run_name in run_names:
-        plt.plot(groups_df[run_name], label=run_name)
+    for idx, run_name in enumerate(run_names):
+        plt.plot(
+            groups_df[run_name],
+            label=f"{run_name}, epoch: {epoch_num[idx]}",
+            marker="o",
+            linestyle="-",
+        )
    plt.legend()
    plt.xlabel("Timestep (minutes)")
    plt.ylabel("MAE %")
-    plt.title("MAE % for each timestep")
-    plt.savefig("mae_per_timestep.png")
+    plt.title("MAE % for each grouped timestep")
+    plt.savefig("mae_per_grouped_timestep.png")
    plt.show()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    "5llq8iw6"
-    parser.add_argument("--first_run", type=str, default="xdlew7ib")
-    parser.add_argument("--second_run", type=str, default="v3mja33d")
+    parser.add_argument("--project", type=str, default="")
    # Add arguments that are lists of strings
    parser.add_argument("--list_of_runs", nargs="+")
    parser.add_argument("--run_names", nargs="+")
    args = parser.parse_args()
-    main(args.list_of_runs, args.run_names)
+    main(args.project, args.list_of_runs, args.run_names)
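A hypothetical invocation of the renamed script, with illustrative project name, run ids, and run names:

    python experiments/mae_analysis.py --project pvnet --list_of_runs 5llq8iw6 v3mja33d --run_names baseline candidate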
2 changes: 1 addition & 1 deletion pvnet/__init__.py
@@ -1,2 +1,2 @@
"""PVNet"""
__version__ = "3.0.53"
__version__ = "3.0.63"
35 changes: 13 additions & 22 deletions pvnet/data/datamodule.py
@@ -1,40 +1,35 @@
""" Data module for pytorch lightning """
from datetime import datetime
from glob import glob

from lightning.pytorch import LightningDataModule
from torch.utils.data import Dataset, DataLoader
import torch

from ocf_datapipes.batch import batch_to_tensor, stack_np_examples_into_batch, NumpyBatch
from ocf_data_sampler.torch_datasets.pvnet_uk_regional import (
PVNetUKRegionalDataset
)
from lightning.pytorch import LightningDataModule
from ocf_data_sampler.torch_datasets.pvnet_uk_regional import PVNetUKRegionalDataset
from ocf_datapipes.batch import NumpyBatch, batch_to_tensor, stack_np_examples_into_batch
from torch.utils.data import DataLoader, Dataset


class NumpybatchPremadeSamplesDataset(Dataset):
"""Dataset to load NumpyBatch samples"""

def __init__(self, sample_dir):
"""Dataset to load NumpyBatch samples
Args:
sample_dir: Path to the directory of pre-saved samples.
"""
self.sample_paths = glob(f"{sample_dir}/*.pt")



def __len__(self):
return len(self.sample_paths)

def __getitem__(self, idx):
return torch.load(self.sample_paths[idx])


def collate_fn(samples: list[NumpyBatch]):
"""Convert a list of NumpyBatch samples to a tensor batch"""
return batch_to_tensor(stack_np_examples_into_batch(samples))
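A minimal usage sketch of how the pieces above combine, assuming a directory of pre-saved `*.pt` samples (the path and batch size are illustrative):

    dataset = NumpybatchPremadeSamplesDataset("samples/train")
    loader = DataLoader(dataset, batch_size=16, collate_fn=collate_fn)
    batch = next(iter(loader))  # one stacked tensor batch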


class DataModule(LightningDataModule):
    """Datamodule for training pvnet and using pvnet pipeline in `ocf_datapipes`."""
@@ -46,9 +41,8 @@ def __init__(
        batch_size: int = 16,
        num_workers: int = 0,
        prefetch_factor: int | None = None,
-        train_period: list[str|None] = [None, None],
-        val_period: list[str|None] = [None, None],
-
+        train_period: list[str | None] = [None, None],
+        val_period: list[str | None] = [None, None],
    ):
        """Datamodule for training pvnet architecture.
@@ -67,7 +61,6 @@ """
"""
super().__init__()


if not ((sample_dir is not None) ^ (configuration is not None)):
raise ValueError("Exactly one of `sample_dir` or `configuration` must be set.")

@@ -100,21 +93,19 @@ def _get_streamed_samples_dataset(self, start_time, end_time) -> Dataset:
    def _get_premade_samples_dataset(self, subdir) -> Dataset:
        split_dir = f"{self.sample_dir}/{subdir}"
        return NumpybatchPremadeSamplesDataset(split_dir)

    def train_dataloader(self) -> DataLoader:
        """Construct train dataloader"""
        if self.sample_dir is not None:
            dataset = self._get_premade_samples_dataset("train")
        else:
            dataset = self._get_streamed_samples_dataset(*self.train_period)
        return DataLoader(dataset, shuffle=True, **self._common_dataloader_kwargs)

    def val_dataloader(self) -> DataLoader:
        """Construct val dataloader"""
        if self.sample_dir is not None:
            dataset = self._get_premade_samples_dataset("val")
        else:
            dataset = self._get_streamed_samples_dataset(*self.val_period)
        return DataLoader(dataset, shuffle=False, **self._common_dataloader_kwargs)
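To make the two modes concrete, a hypothetical instantiation (paths and dates are placeholders; per the check above, exactly one of `sample_dir` or `configuration` may be set):

    # Streamed samples, built on the fly from a data configuration
    datamodule = DataModule(
        configuration="configs/data_config.yaml",  # placeholder path
        batch_size=16,
        num_workers=2,
        train_period=["2020-01-01", "2022-12-31"],
        val_period=["2023-01-01", "2023-06-30"],
    )
    train_loader = datamodule.train_dataloader()

    # Pre-made samples previously saved to disk
    datamodule = DataModule(sample_dir="path/to/samples", batch_size=16, num_workers=2)
    val_loader = datamodule.val_dataloader()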

