Merge pull request #120 from openclimatefix/readme_clone_depth
update readme and templates to reflect changes in datapipes
zakwatts authored Jan 18, 2024
2 parents 2c05bc2 + f5400fe commit 8924ddf
Showing 5 changed files with 128 additions and 122 deletions.
193 changes: 97 additions & 96 deletions README.md
@@ -28,12 +28,26 @@ cd PVNet
pip install -r requirements.txt
```

The commit history is extensive. To save download time, use a depth of 1:
```bash
git clone --depth 1 https://github.com/openclimatefix/PVNet.git
```
This means only the latest commit and its associated files will be downloaded.
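
If you later need the full history (for example, to browse older commits), the shallow clone can be converted in place:
```bash
# Fetch the rest of the history for an existing shallow clone
git fetch --unshallow
```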

Next, in the PVNet repo, install PVNet as an editable package:

```bash
pip install -e .
```
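
As a quick sanity check that the editable install worked, you can look for the package in the environment (a sketch; the exact distribution name may differ):
```bash
# An editable install shows up with a path back to the local source tree
pip list | grep -i pvnet
```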

### Additional development dependencies

```bash
pip install -r requirements-dev.txt
```



## Getting started with running PVNet

Before running any code within PVNet, copy the example configuration to a
@@ -69,116 +83,103 @@ https://huggingface.co/datasets/openclimatefix/dwd-icon-eu which includes the UK
OCF maintains a dataset of PV generation from 1311 private PV installations
here: https://huggingface.co/datasets/openclimatefix/uk_pv

### Generating pre-made batches of data for training/validation of PVNet

PVNet contains a script for generating batches of data suitable for training the
PVNet models.
### Connecting with ocf_datapipes for batch creation

To run the script you will need to make some modifications to the datamodule
configuration.
Outside the PVNet repo, exit the conda env created for PVNet and clone the ocf_datapipes repo (https://github.com/openclimatefix/ocf_datapipes):
```bash
git clone --depth 1 https://github.com/openclimatefix/ocf_datapipes.git
conda create -n ocf_datapipes python=3.10
```
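
The commands above create the new environment but do not activate it. A minimal sketch of switching over, assuming the PVNet env is currently active:
```bash
# Leave the PVNet environment and enter the newly created one
conda deactivate
conda activate ocf_datapipes
```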

1. First, create your new configuration file in
`./configs/datamodule/configiration/local_configuration.yaml` and paste the
sample config (shown below)
2. Duplicate the `./configs/datamodule/ocf_datapipes.yaml` to
`./configs/datamodule/_local_ocf_datapipes.yaml` and ensure the
`configuration` key points to your newly created configuration file in
step 1.
3. Also in this file, update the train, val & test periods to cover the data you
have access to.
4. To get you started with your own configuration file, see the sample config
below. Update the data paths to the location of your local GSP, NWP and PV
datasets:
Then go inside the ocf_datapipes repo to install its dependencies:

```yaml
general:
  description: Demo config
  name: demo_datamodule_config

input_data:
  default_history_minutes: 60
  default_forecast_minutes: 120

  gsp:
    gsp_zarr_path: /path/to/gsp-data.zarr
    history_minutes: 60
    forecast_minutes: 120
    time_resolution_minutes: 30
    start_datetime: "2019-01-01T00:00:00"
    end_datetime: "2019-01-08T00:00:00"
    metadata_only: false

  nwp:
    ukv:
      nwp_zarr_path: /path/to/nwp-data.zarr
      history_minutes: 60
      forecast_minutes: 120
      time_resolution_minutes: 60
      nwp_channels: # comment out channels as appropriate
        - t # 2-metre temperature
        - dswrf # downwards short-wave radiation flux
        - dlwrf # downwards long-wave radiation flux
        - hcc # high cloud cover
        - mcc # medium cloud cover
        - lcc # low cloud cover
        - vis # visibility
        - r # relative humidity
        - prate # precipitation rate
        - si10 # 10-metre wind speed | live = unknown
      nwp_image_size_pixels_height: 24
      nwp_image_size_pixels_width: 24
      nwp_provider: ukv

  pv:
    pv_files_groups:
      - label: pvoutput.org
        pv_filename: /path/to/pv-data/pv.netcdf
        pv_metadata_filename: /path/to/pv-data/metadata.csv
    history_minutes: 60
    forecast_minutes: 0 # PVNet assumes no future PV generation
    time_resolution_minutes: 5
    start_datetime: "2019-01-01T00:00:00"
    end_datetime: "2019-01-08T00:00:00"
    pv_image_size_meters_height: 24
    pv_image_size_meters_width: 24
    pv_ml_ids: [154,155,156,158,159,160,162,164,165,166,167,168,169,171,173,177,178,179,181,182,185,186,187,188,189,190,191,192,193,197,198,199,200,202,204,205,206,208,209,211,214,215,216,217,218,219,220,221,225,229,230,232,233,234,236,242,243,245,252,254,255,256,257,258,260,261,262,265,267,268,272,273,275,276,277,280,281,282,283,287,289,291,292,293,294,295,296,297,298,301,302,303,304,306,307,309,310,311,317,318,319,320,321,322,323,325,326,329,332,333,335,336,338,340,342,344,345,346,348,349,352,354,355,356,357,360,362,363,368,369,370,371,372,374,375,376,378,380,382,384,385,388,390,391,393,396,397,398,399,400,401,403,404,405,406,407,409,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,429,431,435,437,438,440,441,444,447,450,451,453,456,457,458,459,464,465,466,467,468,470,471,473,474,476,477,479,480,481,482,485,486,488,490,491,492,493,496,498,501,503,506,507,508,509,510,511,512,513,515,516,517,519,520,521,522,524,526,527,528,531,532,536,537,538,540,541,542,543,544,545,549,550,551,552,553,554,556,557,560,561,563,566,568,571,572,575,576,577,579,580,581,582,584,585,588,590,594,595,597,600,602,603,604,606,611,613,614,616,618,620,622,623,624,625,626,628,629,630,631,636,637,638,640,641,642,644,645,646,650,651,652,653,654,655,657,660,661,662,663,666,667,668,670,675,676,679,681,683,684,685,687,696,698,701,702,703,704,706,710,722,723,724,725,727,728,729,730,732,733,734,735,736,737,]
    n_pv_systems_per_example: 128
    get_center: false
    is_live: false

  satellite:
    satellite_zarr_path: "" # Left empty to avoid using satellite data
    history_minutes: 60
    forecast_minutes: 0
    live_delay_minutes: 30
    time_resolution_minutes: 5
    satellite_channels:
      - IR_016
      - IR_039
      - IR_087
      - IR_097
      - IR_108
      - IR_120
      - IR_134
      - VIS006
      - VIS008
      - WV_062
      - WV_073
    satellite_image_size_pixels_height: 24
    satellite_image_size_pixels_width: 24
```
```bash
cd ocf_datapipes
pip install -r requirements.txt -r requirements-dev.txt
```

Then exit this environment, re-enter the PVNet conda environment, and install ocf_datapipes in editable mode (`-e`). This means the package is directly linked to the source code in the ocf_datapipes repo.

```bash
pip install -e <PATH-TO-ocf_datapipes-REPO>
```
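
To confirm the editable install is picked up from inside the PVNet environment, you can check where Python resolves the package from (assuming the import name is `ocf_datapipes`):
```bash
# Should print a path inside your local ocf_datapipes checkout
python -c "import ocf_datapipes; print(ocf_datapipes.__file__)"
```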

## Generating pre-made batches of data for training/validation of PVNet

PVNet contains a script for generating batches of data suitable for training the PVNet models. To run the script you will need to make some modifications to the datamodule configuration.

Make sure you have copied the example configs (as already stated above):
```bash
cp -r configs.example configs
```

### Set up and config example for batch creation

We will use the example of creating batches using data from gcp:
`/PVNet/configs/datamodule/configuration/gcp_configuration.yaml`
Ensure that the file paths are set to the correct locations in
`gcp_configuration.yaml`.

`PLACEHOLDER` is used to indicate where to input the location of the files.

For OCF use cases, file locations can be found in `template_configuration.yaml` located alongside `gcp_configuration.yaml`.
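
To find every placeholder that still needs a real value, you can simply search for the marker string (assuming you copied the example configs to `configs/` as shown above):
```bash
# List every config line still containing the placeholder marker
grep -rn "PLACEHOLDER" configs/
```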

In these configurations you can update the train, val & test periods to cover the data you have access to.


With your configuration in place, you can proceed to create batches. PVNet uses
[hydra](https://hydra.cc/) which enables us to pass variables via the command
line that will override the configuration defined in the `./configs` directory.

Run the save_batches.py script to create batches with the following arguments as
a minimum:
When creating batches, an additional config is used and passed into the batch creation script: the datamodule config, located in `PVNet/configs/datamodule`.

For this example we will be using the `streamed_batches.yaml` config. Like before, a placeholder variable is used when specifying which configuration to use:

`configuration: "PLACEHOLDER.yaml"`

This should be set to the full path of the config on your local machine. For our example, it would be changed to:

`configuration: "/FULL-PATH-TO-REPO/PVNet/configs/datamodule/configuration/gcp_configuration.yaml"`

where `FULL-PATH-TO-REPO` represents the full path to the PVNet repo on your local machine.
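
One way to get the full path without typing it out by hand, assuming your shell is at the PVNet repo root:
```bash
# Print the absolute path to paste into the datamodule config
realpath configs/datamodule/configuration/gcp_configuration.yaml
```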

### Running the batch creation script

Run the `save_batches.py` script to create batches, with the following example arguments:

```bash
python scripts/save_batches.py datamodule=local_ocf_datapipes +batch_output_dir="./output" +num_train_batches=10 +num_val_batches=5
python scripts/save_batches.py datamodule=streamed_batches +batch_output_dir="./output" +num_train_batches=10 +num_val_batches=5
```

In this command, the `datamodule` argument looks for a config under `PVNet/configs/datamodule`. The examples here are `premade_batches` and `streamed_batches`.

It's important that the dates set for training, validation and testing in the datamodule config (`streamed_batches.yaml`) are within the date ranges set for the input features in the configuration (`gcp_configuration.yaml`).
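
A quick way to eyeball both sets of dates side by side is to grep for the relevant keys (a sketch; the exact key names for the train/val/test periods may differ in your copy of `streamed_batches.yaml`):
```bash
# Show the period/datetime settings from both config files
grep -En "datetime|period" \
  configs/datamodule/streamed_batches.yaml \
  configs/datamodule/configuration/gcp_configuration.yaml
```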

If downloading private data from a GCP bucket, make sure to authenticate gcloud (the public satellite data does not need authentication):

```bash
gcloud auth login
```
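
Depending on how `gcsfs` resolves credentials on your machine, you may also need application-default credentials; this extra step is sometimes, but not always, required:
```bash
# Create application-default credentials for client libraries such as gcsfs
gcloud auth application-default login
```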

Files stored in multiple locations can be added as a list. For example, in the `gcp_configuration.yaml` file we can change from satellite data stored on a single bucket:

```yaml
satellite:
  satellite_zarr_path: gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr
```

To satellite data hosted by Google:

```yaml
satellite:
  satellite_zarr_paths:
    - "gs://public-datasets-eumetsat-solar-forecasting/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr"
    - "gs://public-datasets-eumetsat-solar-forecasting/satellite/EUMETSAT/SEVIRI_RSS/v4/2021_nonhrv.zarr"
```
ocf_datapipes is currently set up to use 11 channels from the satellite data; the 12th channel, HRV, is not included in these datasets.
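
As a quick sanity check that the public satellite data is reachable (no authentication should be needed for these buckets):
```bash
# List the top level of the 2020 non-HRV satellite zarr store
gsutil ls gs://public-datasets-eumetsat-solar-forecasting/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr
```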


### Training PVNet

How PVNet is run is determined by the extensive configuration in the config
@@ -15,29 +15,35 @@ input_data:
end_datetime: "2021-09-01T00:00:00"

nwp:
nwp_zarr_path: gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV_intermediate_version_7.zarr
history_minutes: 60
forecast_minutes: 120
time_resolution_minutes: 60
nwp_channels:
- t # live = t2m
- dswrf
- dlwrf
- hcc
- mcc
- lcc
- vis
- r # live = r2
- prate # live ~= rprate
- si10 # 10-metre wind speed | live = unknown
nwp_image_size_pixels_height: 24
nwp_image_size_pixels_width: 24
ukv:
nwp_zarr_path: gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV_intermediate_version_7.zarr
history_minutes: 60
forecast_minutes: 120
time_resolution_minutes: 60
nwp_channels: # comment out channels as appropriate
- t # 2-metre temperature
- dswrf # downwards short-wave radiation flux
- dlwrf # downwards long-wave radiation flux
- hcc # high cloud cover
- mcc # medium cloud cover
- lcc # low cloud cover
- vis # visibility
- r # relative humidity
- prate # precipitation rate
- si10 # 10-metre wind speed | live = unknown
nwp_image_size_pixels_height: 24
nwp_image_size_pixels_width: 24
nwp_provider: ukv
start_datetime: "2020-01-01T00:00:00"
end_datetime: "2021-09-01T00:00:00"

pv:
pv_files_groups:
- label: solar_sheffield_passiv
pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata_OCF_ONLY.csv
# This is the list of pv_ml_ids to be sliced from the PV site level data
pv_ml_ids: [156, 158, 159, 160, 162]
history_minutes: 60
forecast_minutes: 120
time_resolution_minutes: 5
@@ -50,7 +56,9 @@ input_data:
is_live: false

satellite:
satellite_zarr_path: gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr
satellite_zarr_path:
- "gs://public-datasets-eumetsat-solar-forecasting/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr"
- "gs://public-datasets-eumetsat-solar-forecasting/satellite/EUMETSAT/SEVIRI_RSS/v4/2021_nonhrv.zarr"
history_minutes: 60
forecast_minutes: 0
live_delay_minutes: 30
@@ -69,6 +77,3 @@ input_data:
- WV_073
satellite_image_size_pixels_height: 24
satellite_image_size_pixels_width: 24

output_data:
filepath: "not-needed"
1 change: 1 addition & 0 deletions configs.example/datamodule/streamed_batches.yaml
@@ -1,6 +1,7 @@
_target_: pvnet.data.datamodule.DataModule
# Path to the data configuration yaml file. You can find examples in the configuration subdirectory
# in configs.example/datamodule/configuration
# Use the full local path such as: /FULL/PATH/PVNet/configs/datamodule/configuration/gcp_configuration.yaml
configuration: "PLACEHOLDER.yaml"
num_workers: 20
prefetch_factor: 2
7 changes: 2 additions & 5 deletions configs.example/readme.md
@@ -1,8 +1,5 @@
This directory contains example configuration files for the PVNet project. Many paths will need to
be each user. YOu can find these paths by searching for PLACEHOLDER within these logs. Not all of
the values with a placeholder need to be set. For example in the logger subdirectory there are
many different loggers with PLACEHOLDERS. If only one logger is used, then only that placeholder
need be set.
This directory contains example configuration files for the PVNet project. Many paths will need to be unique to each user. You can find these paths by searching for PLACEHOLDER within these config files. Not all of the values with a placeholder need to be set. For example, in the logger subdirectory there are many different loggers with PLACEHOLDERs. If only one logger is used, then only that placeholder needs to be set.

Run experiments by:
`python run.py experiment=example_simple`
2 changes: 2 additions & 0 deletions requirements.txt
@@ -23,3 +23,5 @@ hydra-core
python-dotenv
hydra-optuna-sweeper
rich
# gcsfs is only needed when getting data from Google Cloud Storage
gcsfs
