From dc6f6cd8deba471dcf1bd33927b41bebce7c57d4 Mon Sep 17 00:00:00 2001 From: Felix Peretz Date: Wed, 18 Dec 2024 14:57:35 +0000 Subject: [PATCH] Config updates finalised --- .../configuration/example_configuration.yaml | 194 +++++++----------- .../datamodule/premade_batches.yaml | 8 +- .../datamodule/streamed_batches.yaml | 10 +- pyproject.toml | 2 +- 4 files changed, 84 insertions(+), 130 deletions(-) diff --git a/configs.example/datamodule/configuration/example_configuration.yaml b/configs.example/datamodule/configuration/example_configuration.yaml index 827b0e9d..3d4166ca 100644 --- a/configs.example/datamodule/configuration/example_configuration.yaml +++ b/configs.example/datamodule/configuration/example_configuration.yaml @@ -1,124 +1,48 @@ general: - description: Example data config for creating PVNet batches - name: example_pvnet + description: Example config for producing PVNet samples for a reneweble generation site + name: site_example_config input_data: - default_history_minutes: 120 - default_forecast_minutes: 480 + + # Either use Site OR GSP configuration + site: + # Path to Site data in NetCDF format + file_path: PLACEHOLDER.nc + # Path to metadata in CSV format + metadata_file_path: PLACEHOLDER.csv + time_resolution_minutes: 15 + interval_start_minutes: -60 + # Specified for intraday currently + interval_end_minutes: 480 + dropout_timedeltas_minutes: null + dropout_fraction: 0 # Fraction of samples with dropout gsp: - # Path to the GSP data. This should be a zarr file + # Path to GSP data in zarr format # e.g. gs://solar-pv-nowcasting-data/PV/GSP/v7/pv_gsp.zarr - gsp_zarr_path: PLACEHOLDER.zarr - history_minutes: 120 - forecast_minutes: 480 + zarr_path: PLACEHOLDER.zarr + interval_start_minutes: -60 + # Specified for intraday currently + interval_end_minutes: 480 time_resolution_minutes: 30 - # A random value from the list below will be chosen as the delay when dropout is used + # Random value from the list below will be chosen as the delay when dropout is used # If set to null no dropout is applied. Only values before t0 are dropped out for GSP. # Values after t0 are assumed as targets and cannot be dropped. dropout_timedeltas_minutes: null dropout_fraction: 0 # Fraction of samples with dropout - pv: - pv_files_groups: - - label: solar_sheffield_passiv - # Path to the site-level PV data. This should be a netcdf - # e.g gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf - pv_filename: PLACEHOLDER.netcdf - # Path to the site-level PV metadata. This choudl be a csv - # e.g gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv - pv_metadata_filename: PLACEHOLDER.csv - # This is the list of pv_ml_ids to be sliced from the PV site level data - # The IDs below are 349 of the PV systems which have very little NaN data in the historic data - # and which are still reporting live (as of Oct 2023) - pv_ml_ids: - [ - 154, 155, 156, 158, 159, 160, 162, 164, 165, 166, 167, 168, 169, 171, 173, 177, 178, 179, - 181, 182, 185, 186, 187, 188, 189, 190, 191, 192, 193, 197, 198, 199, 200, 202, 204, 205, - 206, 208, 209, 211, 214, 215, 216, 217, 218, 219, 220, 221, 225, 229, 230, 232, 233, 234, - 236, 242, 243, 245, 252, 254, 255, 256, 257, 258, 260, 261, 262, 265, 267, 268, 272, 273, - 275, 276, 277, 280, 281, 282, 283, 287, 289, 291, 292, 293, 294, 295, 296, 297, 298, 301, - 302, 303, 304, 306, 307, 309, 310, 311, 317, 318, 319, 320, 321, 322, 323, 325, 326, 329, - 332, 333, 335, 336, 338, 340, 342, 344, 345, 346, 348, 349, 352, 354, 355, 356, 357, 360, - 362, 363, 368, 369, 370, 371, 372, 374, 375, 376, 378, 380, 382, 384, 385, 388, 390, 391, - 393, 396, 397, 398, 399, 400, 401, 403, 404, 405, 406, 407, 409, 411, 412, 413, 414, 415, - 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 429, 431, 435, 437, 438, 440, - 441, 444, 447, 450, 451, 453, 456, 457, 458, 459, 464, 465, 466, 467, 468, 470, 471, 473, - 474, 476, 477, 479, 480, 481, 482, 485, 486, 488, 490, 491, 492, 493, 496, 498, 501, 503, - 506, 507, 508, 509, 510, 511, 512, 513, 515, 516, 517, 519, 520, 521, 522, 524, 526, 527, - 528, 531, 532, 536, 537, 538, 540, 541, 542, 543, 544, 545, 549, 550, 551, 552, 553, 554, - 556, 557, 560, 561, 563, 566, 568, 571, 572, 575, 576, 577, 579, 580, 581, 582, 584, 585, - 588, 590, 594, 595, 597, 600, 602, 603, 604, 606, 611, 613, 614, 616, 618, 620, 622, 623, - 624, 625, 626, 628, 629, 630, 631, 636, 637, 638, 640, 641, 642, 644, 645, 646, 650, 651, - 652, 653, 654, 655, 657, 660, 661, 662, 663, 666, 667, 668, 670, 675, 676, 679, 681, 683, - 684, 685, 687, 696, 698, 701, 702, 703, 704, 706, 710, 722, 723, 724, 725, 727, 728, 729, - 730, 732, 733, 734, 735, 736, 737 - ] - history_minutes: 180 - forecast_minutes: 0 - time_resolution_minutes: 5 - # A random value from the list below will be chosen as the delay when dropout is used. - # If set to null no dropout is applied. All PV systems are dropped together with this setting. - dropout_timedeltas_minutes: null - dropout_fraction: 0 # Fraction of samples with dropout - # A random value from the list below will be chosen as the delay when system dropout is used. - # If set to null no dropout is applied. All PV systems are indpendently with this setting. - system_dropout_timedeltas_minutes: null - # For ech sample a differnt dropout probability is used which is uniformly sampled from the min - # and max below - system_dropout_fraction_min: 0 - system_dropout_fraction_max: 0 - nwp: - ukv: - nwp_provider: ukv - nwp_zarr_path: - # Path(s) to UKV NWP data in zarr format - # e.g. gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV_intermediate_version_7.zarr - - PLACEHOLDER.zarr - history_minutes: 120 - forecast_minutes: 480 - time_resolution_minutes: 60 - nwp_channels: - # These variables exist in the CEDA training set and in the live MetOffice live service - - t # 2-metre temperature - - dswrf # downwards short-wave radiation flux - - dlwrf # downwards long-wave radiation flux - - hcc # high cloud cover - - mcc # medium cloud cover - - lcc # low cloud cover - - sde # snow depth water equivalent - - r # relative humidty - - vis # visibility - - si10 # 10-metre wind speed - - wdir10 # 10-metre wind direction - - prate # precipitation rate - # These variables exist in CEDA training data but not in the live MetOffice live service - - hcct # height of convective cloud top, meters above surface. NaN if no clouds - - cdcb # height of lowest cloud base > 3 oktas - - dpt # dew point temperature - - prmsl # mean sea level pressure - - h # geometrical? (maybe geopotential?) height - nwp_image_size_pixels_height: 24 - nwp_image_size_pixels_width: 24 - # A random value from the list below will be chosen as the delay when dropout is used - # If set to null no dropout is applied. Values must be negative. - dropout_timedeltas_minutes: [-180] - # Dropout applied with this probability - dropout_fraction: 1.0 - # How long after the NWP init-time are we still willing to use this forecast - # If null we use each init-time for all steps it covers - max_staleness_minutes: null ecmwf: - nwp_provider: ecmwf + provider: ecmwf # Path to ECMWF NWP data in zarr format # n.b. It is not necessary to use multiple or any NWP data. These entries can be removed - nwp_zarr_path: PLACEHOLDER.zarr - history_minutes: 120 - forecast_minutes: 480 + zarr_path: PLACEHOLDER.zarr + interval_start_minutes: -60 + # Specified for intraday currently + interval_end_minutes: 480 time_resolution_minutes: 60 - nwp_channels: + channels: - t2m # 2-metre temperature - dswrf # downwards short-wave radiation flux - dlwrf # downwards long-wave radiation flux @@ -136,23 +60,55 @@ input_data: - v10 # 10-metre V component of wind speed - v100 # 100-metre V component of wind speed - v200 # 200-metre V component of wind speed - nwp_image_size_pixels_height: 12 # roughly equivalent to UKV 24 pixels - nwp_image_size_pixels_width: 12 - dropout_timedeltas_minutes: [-180] - dropout_fraction: 1.0 + image_size_pixels_height: 24 + image_size_pixels_width: 24 + dropout_timedeltas_minutes: [-360] + dropout_fraction: 1.0 # Fraction of samples with dropout + max_staleness_minutes: null + + ukv: + provider: ukv + # Path to UKV NWP data in zarr format + # e.g. gs://solar-pv-nowcasting-data/NWP/UK_Met_Office/UKV_intermediate_version_7.zarr + # n.b. It is not necessary to use multiple or any NWP data. These entries can be removed + zarr_path: PLACEHOLDER.zarr + interval_start_minutes: -60 + # Specified for intraday currently + interval_end_minutes: 480 + time_resolution_minutes: 60 + channels: + - t # 2-metre temperature + - dswrf # downwards short-wave radiation flux + - dlwrf # downwards long-wave radiation flux + - hcc # high cloud cover + - mcc # medium cloud cover + - lcc # low cloud cover + - sde # snow depth water equivalent + - r # relative humidty + - vis # visibility + - si10 # 10-metre wind speed + - wdir10 # 10-metre wind direction + - prate # precipitation rate + # These variables exist in CEDA training data but not in the live MetOffice live service + - hcct # height of convective cloud top, meters above surface. NaN if no clouds + - cdcb # height of lowest cloud base > 3 oktas + - dpt # dew point temperature + - prmsl # mean sea level pressure + - h # geometrical? (maybe geopotential?) height + image_size_pixels_height: 24 + image_size_pixels_width: 24 + dropout_timedeltas_minutes: [-360] + dropout_fraction: 1.0 # Fraction of samples with dropout max_staleness_minutes: null satellite: - satellite_zarr_path: - # Path(s) to non-HRV satellite data in zarr format - # e.g. gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr - - PLACEHOLDER.zarr - history_minutes: 90 - forecast_minutes: 0 # Deprecated for most use cases - live_delay_minutes: 60 # Only data up to time t0-60minutes is inluced in slice + # Path to Satellite data (non-HRV) in zarr format + # e.g. gs://solar-pv-nowcasting-data/satellite/EUMETSAT/SEVIRI_RSS/v4/2020_nonhrv.zarr + zarr_path: PLACEHOLDER.zarr + interval_start_minutes: -30 + interval_end_minutes: 0 time_resolution_minutes: 5 - satellite_channels: - # Uses for each channel taken from https://resources.eumetrain.org/data/3/311/bsc_s4.pdf + channels: - IR_016 # Surface, cloud phase - IR_039 # Surface, clouds, wind fields - IR_087 # Surface, clouds, atmospheric instability @@ -164,9 +120,7 @@ input_data: - VIS008 # Surface, clouds, wind fields - WV_062 # Water vapor, high level clouds, upper air analysis - WV_073 # Water vapor, atmospheric instability, upper-level dynamics - satellite_image_size_pixels_height: 24 - satellite_image_size_pixels_width: 24 - # A random value from the list below will be chosen as the delay when dropout is used - # If set to null no dropout is applied. Values must be negative. + image_size_pixels_height: 24 + image_size_pixels_width: 24 dropout_timedeltas_minutes: null dropout_fraction: 0 # Fraction of samples with dropout diff --git a/configs.example/datamodule/premade_batches.yaml b/configs.example/datamodule/premade_batches.yaml index f08f5af2..2326edc0 100644 --- a/configs.example/datamodule/premade_batches.yaml +++ b/configs.example/datamodule/premade_batches.yaml @@ -1,8 +1,10 @@ _target_: pvnet.data.datamodule.DataModule configuration: null -# The batch_dir is the location batches were saved to using the save_batches.py script -# The batch_dir should contain train and val subdirectories with batches -batch_dir: "PLACEHOLDER" + +# The sample_dir is the location batches were saved to using the save_batches.py script +# The sample_dir should contain train and val subdirectories with batches + +sample_dir: "PLACEHOLDER" num_workers: 10 prefetch_factor: 2 batch_size: 8 diff --git a/configs.example/datamodule/streamed_batches.yaml b/configs.example/datamodule/streamed_batches.yaml index 14f42bc5..aec3c1bf 100644 --- a/configs.example/datamodule/streamed_batches.yaml +++ b/configs.example/datamodule/streamed_batches.yaml @@ -2,14 +2,15 @@ _target_: pvnet.data.datamodule.DataModule # Path to the data configuration yaml file. You can find examples in the configuration subdirectory # in configs.example/datamodule/configuration # Use the full local path such as: /FULL/PATH/PVNet/configs/datamodule/configuration/gcp_configuration.yaml" + configuration: "PLACEHOLDER.yaml" num_workers: 20 prefetch_factor: 2 batch_size: 8 -batch_output_dir: "PLACEHOLDER" -num_train_batches: 2 -num_val_batches: 1 +sample_output_dir: "PLACEHOLDER" +num_train_samples: 2 +num_val_samples: 1 train_period: - null @@ -17,6 +18,3 @@ train_period: val_period: - "2022-05-08" - "2023-05-08" -test_period: - - "2022-05-08" - - "2023-05-08" diff --git a/pyproject.toml b/pyproject.toml index cc3ea1cf..b931d605 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ dynamic = ["version", "readme"] license={file="LICENCE"} dependencies = [ - "ocf_data_sampler==0.0.26", + "ocf_data_sampler==0.0.32", "ocf_datapipes>=3.3.34", "ocf_ml_metrics>=0.0.11", "numpy",