From e1311652a00da9573c6ab09ada1c6b509638bd7b Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 11:55:30 -0800 Subject: [PATCH 01/14] be more specific in functions --- drivers/download_stage1_psuedo.R | 35 +++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/download_stage1_psuedo.R b/drivers/download_stage1_psuedo.R index 564a9cec9d..4a80a19f65 100644 --- a/drivers/download_stage1_psuedo.R +++ b/drivers/download_stage1_psuedo.R @@ -5,8 +5,10 @@ library(gefs4cast) gdalcubes::gdalcubes_options(parallel=2*parallel::detectCores()) #gdalcubes::gdalcubes_options(parallel=TRUE) +config <- yaml::read_yaml("challenge_configuration.yaml") + sites <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", - "raw/main/USGS_site_metadata.csv"), + "raw/prod/USGS_site_metadata.csv"), col_select = c("site_id", "latitude", "longitude")) Sys.setenv("GEFS_VERSION"="v12") @@ -15,27 +17,40 @@ dates_pseudo <- seq(as.Date("2020-09-24"), Sys.Date(), by=1) message("GEFS v12 stage1-stats") bench::bench_time({ # thelio - s3 <- gefs_s3_dir("stage1-stats") + s3 <- gefs4cast::gefs_s3_dir(product = "stage1-stats", + path = "", # should this path be more specific? the noaa bucket in the config is "drivers/noaa/gefs-v12-reprocess/" + endpoint = config$endpoint, + bucket = config$noaa_forecast_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] - gefs_to_parquet(missing_dates, - ensemble=c("geavg", "gespr"), - path = s3, - sites = sites) + gefs4cast::gefs_to_parquet(dates = missing_dates, + ensemble = c("geavg", "gespr"), + path = s3, + sites = sites) # should partitioning also include the project_id ?? }) message("GEFS v12 pseudo") bench::bench_time({ #32xlarge - s3 <- gefs_s3_dir("pseudo") + s3 <- gefs4cast::gefs_s3_dir(product = "pseudo", + path = "", # same questions as above ^ + endpoint = config$endpoint, + bucket = config$noaa_forecast_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates_pseudo[!(as.character(dates_pseudo) %in% have_dates)] - gefs4cast:::gefs_pseudo_measures(missing_dates, path = s3, sites = sites) + gefs4cast:::gefs_pseudo_measures(dates = missing_dates, + path = s3, + sites = sites) }) message("GEFS v12 stage1") bench::bench_time({ # cirrus ~ 6days for full set - s3 <- gefs_s3_dir("stage1") + s3 <- gefs4cast::gefs_s3_dir(product = "stage1", + path = "", + endpoint = config$endpoint, + bucket = config$noaa_forecast_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] - gefs_to_parquet(missing_dates, path = s3, sites = sites) + gefs4cast::gefs_to_parquet(dates = missing_dates, + path = s3, + sites = sites) }) From efd638550ad4686b83a3bc95206dcd1cd47673d8 Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 14:10:01 -0800 Subject: [PATCH 02/14] make scripts more explicit --- drivers/download_stage1_psuedo.R | 6 ++--- drivers/generate_stage2.R | 41 ++++++++++++++++++++------------ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/drivers/download_stage1_psuedo.R b/drivers/download_stage1_psuedo.R index 4a80a19f65..23c26ab883 100644 --- a/drivers/download_stage1_psuedo.R +++ b/drivers/download_stage1_psuedo.R @@ -20,7 +20,7 @@ bench::bench_time({ # thelio s3 <- gefs4cast::gefs_s3_dir(product = "stage1-stats", path = "", # should this path be more specific? 
the noaa bucket in the config is "drivers/noaa/gefs-v12-reprocess/" endpoint = config$endpoint, - bucket = config$noaa_forecast_bucket) + bucket = config$driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] gefs4cast::gefs_to_parquet(dates = missing_dates, @@ -34,7 +34,7 @@ bench::bench_time({ #32xlarge s3 <- gefs4cast::gefs_s3_dir(product = "pseudo", path = "", # same questions as above ^ endpoint = config$endpoint, - bucket = config$noaa_forecast_bucket) + bucket = config$driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates_pseudo[!(as.character(dates_pseudo) %in% have_dates)] gefs4cast:::gefs_pseudo_measures(dates = missing_dates, @@ -47,7 +47,7 @@ bench::bench_time({ # cirrus ~ 6days for full set s3 <- gefs4cast::gefs_s3_dir(product = "stage1", path = "", endpoint = config$endpoint, - bucket = config$noaa_forecast_bucket) + bucket = config$driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] gefs4cast::gefs_to_parquet(dates = missing_dates, diff --git a/drivers/generate_stage2.R b/drivers/generate_stage2.R index 2655bf60e5..6f2a72ca91 100644 --- a/drivers/generate_stage2.R +++ b/drivers/generate_stage2.R @@ -1,20 +1,27 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # is this branch stable? +source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # is this branch stable? why use ci_upgrade ? + +Sys.setenv("GEFS_VERSION"="v12") site_list <- readr::read_csv("USGS_site_metadata.csv", show_col_types = FALSE) +config <- yaml::read_yaml("challenge_configuration.yaml") + # should this be updated to a usgsrc4cast-drivers path? or are we keeping all drivers in # neon4cast-drivers? 
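One config-driven option for that question, sketched under the assumption that the challenge keeps its own prefix inside driver_bucket (as the later driver_bucket: bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa setting does): split the configured value into a bucket name and an in-bucket prefix, and let gefs_s3_dir() assemble the product path. This is a sketch only, not part of the commit.

# Hedged sketch: build the stage2 location entirely from challenge_configuration.yaml
# so no neon4cast-drivers path is hard-coded in generate_stage2.R.
config <- yaml::read_yaml("challenge_configuration.yaml")

# split "bucket-name/optional/prefix" into the S3 bucket and the prefix inside it
driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/")
driver_path   <- stringr::word(config$driver_bucket, 2, -1, sep = "/")

s3_stage2 <- gefs4cast::gefs_s3_dir(product  = "stage2",
                                    path     = driver_path,
                                    endpoint = config$endpoint,
                                    bucket   = driver_bucket)

With this layout, moving the drivers to a usgsrc4cast-specific tree only requires editing challenge_configuration.yaml, not the scripts.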
-s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) +# s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2", +# endpoint_override = "sdsc.osn.xsede.org", +# access_key= Sys.getenv("OSN_KEY"), +# secret_key= Sys.getenv("OSN_SECRET")) +s3_stage2 <- gefs4cast::gefs_s3_dir(product = "stage2", + path = "", + endpoint = config$endpoint, + bucket = config$driver_bucket) df <- arrow::open_dataset(s3_stage2) |> dplyr::distinct(reference_datetime) |> dplyr::collect() - #stage1_s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1", # endpoint_override = "sdsc.osn.xsede.org", # anonymous = TRUE) @@ -28,9 +35,12 @@ df <- arrow::open_dataset(s3_stage2) |> # dplyr::collect() curr_date <- Sys.Date() -last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7), curr_date - lubridate::days(1), by = "1 day"))) +last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7), + curr_date - lubridate::days(1), + by = "1 day"))) -missing_dates <- dplyr::anti_join(last_week, df, by = "reference_datetime") |> +missing_dates <- dplyr::anti_join(last_week, df, + by = "reference_datetime") |> dplyr::pull(reference_datetime) if(length(missing_dates) > 0){ @@ -38,14 +48,15 @@ if(length(missing_dates) > 0){ print(missing_dates[i]) - bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",missing_dates[i]) + # bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=", + # missing_dates[i]) + bucket <- glue::glue("{config$driver_bucket}/gefs-v12/stage1/reference_datetime={missing_dates[i]}") - endpoint_override <- "https://sdsc.osn.xsede.org" - s3 <- arrow::s3_bucket(paste0(bucket), - endpoint_override = endpoint_override, - anonymous = TRUE) + s3_stage1 <- arrow::s3_bucket(bucket = bucket, + endpoint_override = config$endpoint, + anonymous = TRUE) - site_df <- arrow::open_dataset(s3) |> + site_df <- arrow::open_dataset(s3_stage1) |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> dplyr::filter(site_id %in% site_list$site_id) |> dplyr::collect() |> @@ -58,7 +69,7 @@ if(length(missing_dates) > 0){ reference_datetime = lubridate::as_date(reference_datetime)) |> dplyr::rename(parameter = ensemble) - arrow::write_dataset(hourly_df, + arrow::write_dataset(dataset = hourly_df, path = s3_stage2, partitioning = c("reference_datetime", "site_id")) } From a4c787ebe8f9b786f58ce5db13aeb0aa819d88da Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 14:10:14 -0800 Subject: [PATCH 03/14] organizing config --- challenge_configuration.yaml | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/challenge_configuration.yaml b/challenge_configuration.yaml index d5425f6776..051a693817 100644 --- a/challenge_configuration.yaml +++ b/challenge_configuration.yaml @@ -1,20 +1,33 @@ +# Challenge details challenge_long_name: EFI-USGS River Chlorophyll Forecasting Challenge +challenge_url: https://projects.ecoforecast.org/usgsrc4cast-ci +github_repo: eco4cast/usgsrc4cast-ci project_id: usgsrc4cast + +# Endpoints endpoint: sdsc.osn.xsede.org noaa_endpoint: s3.flare-forecast.org submissions_endpoint: submit.ecoforecast.org -scores_bucket: bio230014-bucket01/challenges/scores + +# 
Buckets +archive_bucket: bio230014-bucket01/challenges/archive +driver_bucket: bio230014-bucket01/challenges/drivers forecasts_bucket: bio230014-bucket01/challenges/forecasts -summaries_bucket: bio230014-bucket01/challenges/forecasts/summaries -submissions_bucket: submissions inventory_bucket: bio230014-bucket01/challenges/inventory +model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id noaa_forecast_bucket: drivers/noaa/gefs-v12-reprocess/ prov_bucket: bio230014-bucket01/challenges/prov +scores_bucket: bio230014-bucket01/challenges/scores +submissions_bucket: submissions +summaries_bucket: bio230014-bucket01/challenges/forecasts/summaries targets_bucket: bio230014-bucket01/challenges/targets -archive_bucket: bio230014-bucket01/challenges/archive -model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id -model_metadata_gsheet: https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing + +# Misc. files example_model_id: example +model_metadata_gsheet: https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing +targets_file_name: 'river-chl-targets.csv.gz' + +# Forecast sites site_path: 'catalog/sites' site_table: USGS_site_metadata.csv site_thumbnail: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/screencapture-waterdata-usgs-gov-nwis-rt-2018-08-02-13_00_05-01.jpg' @@ -25,21 +38,21 @@ target_metadata_gsheet: https://docs.google.com/spreadsheets/d/10YTX9ae_C1rFdLgE targets_thumbnail: 'https://raw.githubusercontent.com/eco4cast/neon4cast-ci/main/catalog/thumbnail_plots/neon_stream.jpg' targets_thumbnail_title: 'Test Image' targets_path: 'catalog/targets/' -# to here -targets_file_name: 'river-chl-targets.csv.gz' -challenge_url: https://projects.ecoforecast.org/usgsrc4cast-ci -github_repo: eco4cast/usgsrc4cast-ci + target_groups: Aquatics: targets_file: "https://data.ecoforecast.org/neon4cast-targets/aquatics/aquatics-targets.csv.gz" + noaa_forecast_groups: ['Pseudo','Stage1-stats','Stage1','Stage2','Stage3'] noaa_forecast_group_paths: ['pseudo','stage1-stats','stage1','stage2','stage3'] + variable_groups: Aquatics: variable: ['chla'] duration: ['P1D'] thumbnail_link: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg' thumbnail_title: 'USGS Streamgage' + catalog_config: about_string: 'https://projects.ecoforecast.org/usgsrc4cast-docs/' about_title: 'EFI-USGS River Chlorophyll Forecasting Challenge Documentation' From 1319000bf18e4231de270d9e9482482934a97fb6 Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 14:11:12 -0800 Subject: [PATCH 04/14] checkout prod - comment for now --- .github/workflows/drivers_stage1.yaml | 4 +++- .github/workflows/drivers_stage3.yaml | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml index d83eeedadf..3a67b8e99f 100644 --- a/.github/workflows/drivers_stage1.yaml +++ b/.github/workflows/drivers_stage1.yaml @@ -10,7 +10,7 @@ name: gefs_osn jobs: docker: - runs-on: [self-hosted] + runs-on: [self-hosted] # do we have access to this in usgsrc4cast repo? 
env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} OSN_KEY: ${{ secrets.OSN_KEY }} @@ -18,6 +18,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 + # with: + # ref: prod - name: Install shell: Rscript {0} diff --git a/.github/workflows/drivers_stage3.yaml b/.github/workflows/drivers_stage3.yaml index 6eb0582383..41015650cf 100644 --- a/.github/workflows/drivers_stage3.yaml +++ b/.github/workflows/drivers_stage3.yaml @@ -17,6 +17,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 + # with: + # ref: prod - name: Install shell: Rscript {0} From 47852e72cd208aa89b3ee679fd73eed1c6575418 Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 14:23:43 -0800 Subject: [PATCH 05/14] making stage 3 scripts more explicit --- drivers/update_stage3.R | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/update_stage3.R b/drivers/update_stage3.R index 36352a1cff..1849972333 100644 --- a/drivers/update_stage3.R +++ b/drivers/update_stage3.R @@ -1,22 +1,26 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") +source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # should we rely on this branch? site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", - "raw/main/USGS_site_metadata.csv"), + "raw/prod/USGS_site_metadata.csv"), show_col_types = FALSE) |> dplyr::pull(site_id) +Sys.setenv("GEFS_VERSION"="v12") + +config <- yaml::read_yaml("challenge_configuration.yaml") + future::plan("future::multisession", workers = 8) furrr::future_walk(site_list, function(curr_site_id){ print(curr_site_id) - s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) + s3_stage3 <- arrow::s3_bucket(bucket = glue::glue("{config$driver_bucket}/gefs-v12/stage3"), + endpoint_override = config$endpoint, + access_key = Sys.getenv("OSN_KEY"), + secret_key = Sys.getenv("OSN_SECRET")) - stage3_df <- arrow::open_dataset(s3) |> + stage3_df <- arrow::open_dataset(s3_stage3) |> dplyr::filter(site_id == curr_site_id) |> dplyr::collect() @@ -24,8 +28,8 @@ furrr::future_walk(site_list, function(curr_site_id){ dplyr::summarise(max = as.character(lubridate::as_date(max(datetime)))) |> dplyr::pull(max) - s3_pseudo <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/pseudo", - endpoint_override = "sdsc.osn.xsede.org", + s3_pseudo <- arrow::s3_bucket(bucket = glue::glue("{config$driver_bucket}/gefs-v12/pseudo"), + endpoint_override = config$endpoint, access_key= Sys.getenv("OSN_KEY"), secret_key= Sys.getenv("OSN_SECRET")) @@ -33,18 +37,18 @@ furrr::future_walk(site_list, function(curr_site_id){ cut_off <- as.character(lubridate::as_date(max_date) - lubridate::days(3)) - df <- arrow::open_dataset(s3_pseudo) |> + pseudo_df <- arrow::open_dataset(s3_pseudo) |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> dplyr::filter(site_id == curr_site_id, reference_datetime >= cut_off) |> dplyr::collect() - if(nrow(df) > 0){ + if(nrow(psuedo_df) > 0){ - df2 <- df |> + df2 <- psuedo_df |> to_hourly(use_solar_geom = TRUE, psuedo = TRUE) |> - dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> - dplyr::rename(parameter = ensemble) + dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) 
|> + dplyr::rename(parameter = ensemble) stage3_df_update <- stage3_df |> dplyr::filter(datetime < min(df2$datetime)) @@ -52,6 +56,6 @@ furrr::future_walk(site_list, function(curr_site_id){ df2 |> dplyr::bind_rows(stage3_df_update) |> dplyr::arrange(variable, datetime, parameter) |> - arrow::write_dataset(path = s3, partitioning = "site_id") + arrow::write_dataset(path = s3_stage3, partitioning = "site_id") } }) From 34eef4123e697df21d2cf92236a1b6dd717d6b22 Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 11:49:57 -0800 Subject: [PATCH 06/14] public domain disclaimer --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 9dc42e1dd5..ed5bc2c7ff 100644 --- a/README.md +++ b/README.md @@ -19,3 +19,6 @@ Supported by the U.S. National Science Foundation grants (DEB-1926388 and OAC-22 ## Disclaimer Although this software program has been used by the U.S. Geological Survey (USGS), no warranty, expressed or implied, is made by the USGS or the U.S. Government as to the accuracy and functioning of the program and related program material nor shall the fact of distribution constitute any such warranty, and no responsibility is assumed by the USGS in connection therewith. This software is provided “AS IS.” + +## License Disclaimer +As a government employee, the contributions from Jacob Zwart to this repository are in the public domain. From e6bbd82384363692a6afa6b7eb85ec538547aec4 Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 12:02:21 -0800 Subject: [PATCH 07/14] organizing --- challenge_configuration.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/challenge_configuration.yaml b/challenge_configuration.yaml index 051a693817..755a513d4a 100644 --- a/challenge_configuration.yaml +++ b/challenge_configuration.yaml @@ -53,6 +53,7 @@ variable_groups: thumbnail_link: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg' thumbnail_title: 'USGS Streamgage' +# Forecast catalog configuration catalog_config: about_string: 'https://projects.ecoforecast.org/usgsrc4cast-docs/' about_title: 'EFI-USGS River Chlorophyll Forecasting Challenge Documentation' From e7d4ccda78bfa6ad944925691e34f7d5d3bed73f Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 12:54:06 -0800 Subject: [PATCH 08/14] update config --- challenge_configuration.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/challenge_configuration.yaml b/challenge_configuration.yaml index 755a513d4a..ba23250e3e 100644 --- a/challenge_configuration.yaml +++ b/challenge_configuration.yaml @@ -11,7 +11,7 @@ submissions_endpoint: submit.ecoforecast.org # Buckets archive_bucket: bio230014-bucket01/challenges/archive -driver_bucket: bio230014-bucket01/challenges/drivers +driver_bucket: bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa forecasts_bucket: bio230014-bucket01/challenges/forecasts inventory_bucket: bio230014-bucket01/challenges/inventory model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id From d36e6ead63bd9234b4805e2d70c0364ed724ea7d Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 12:54:23 -0800 Subject: [PATCH 09/14] respond to reviews and consistent function use --- drivers/download_stage1_psuedo.R | 16 +++++++++------- drivers/generate_stage2.R | 24 ++++++++---------------- drivers/generate_stage3.R | 5 ++++- drivers/update_stage3.R | 22 +++++++++++++--------- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/drivers/download_stage1_psuedo.R 
b/drivers/download_stage1_psuedo.R index 23c26ab883..e940df9961 100644 --- a/drivers/download_stage1_psuedo.R +++ b/drivers/download_stage1_psuedo.R @@ -6,6 +6,8 @@ gdalcubes::gdalcubes_options(parallel=2*parallel::detectCores()) #gdalcubes::gdalcubes_options(parallel=TRUE) config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") sites <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", "raw/prod/USGS_site_metadata.csv"), @@ -18,23 +20,23 @@ dates_pseudo <- seq(as.Date("2020-09-24"), Sys.Date(), by=1) message("GEFS v12 stage1-stats") bench::bench_time({ # thelio s3 <- gefs4cast::gefs_s3_dir(product = "stage1-stats", - path = "", # should this path be more specific? the noaa bucket in the config is "drivers/noaa/gefs-v12-reprocess/" + path = driver_path, endpoint = config$endpoint, - bucket = config$driver_bucket) + bucket = driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] gefs4cast::gefs_to_parquet(dates = missing_dates, ensemble = c("geavg", "gespr"), path = s3, - sites = sites) # should partitioning also include the project_id ?? + sites = sites) }) message("GEFS v12 pseudo") bench::bench_time({ #32xlarge s3 <- gefs4cast::gefs_s3_dir(product = "pseudo", - path = "", # same questions as above ^ + path = driver_path, endpoint = config$endpoint, - bucket = config$driver_bucket) + bucket = driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates_pseudo[!(as.character(dates_pseudo) %in% have_dates)] gefs4cast:::gefs_pseudo_measures(dates = missing_dates, @@ -45,9 +47,9 @@ bench::bench_time({ #32xlarge message("GEFS v12 stage1") bench::bench_time({ # cirrus ~ 6days for full set s3 <- gefs4cast::gefs_s3_dir(product = "stage1", - path = "", + path = driver_path, endpoint = config$endpoint, - bucket = config$driver_bucket) + bucket = driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] gefs4cast::gefs_to_parquet(dates = missing_dates, diff --git a/drivers/generate_stage2.R b/drivers/generate_stage2.R index 6f2a72ca91..31d77718d0 100644 --- a/drivers/generate_stage2.R +++ b/drivers/generate_stage2.R @@ -1,4 +1,7 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # is this branch stable? why use ci_upgrade ? +## setup +library(gdalcubes) +library(gefs4cast) +source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") Sys.setenv("GEFS_VERSION"="v12") @@ -6,33 +9,22 @@ site_list <- readr::read_csv("USGS_site_metadata.csv", show_col_types = FALSE) config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") -# should this be updated to a usgsrc4cast-drivers path? or are we keeping all drivers in -# neon4cast-drivers? 
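For reference, the two stringr::word() calls above split the configured string into an S3 bucket name and a prefix inside that bucket. A standalone illustration, assuming the driver_bucket value this series settles on (expected results shown in comments):

# driver_bucket: bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa
x <- "bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa"
stringr::word(x, 1, sep = "/")      # "bio230014-bucket01"                   -> bucket argument
stringr::word(x, 2, -1, sep = "/")  # "challenges/drivers/usgsrc4cast/noaa"  -> path argument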
# s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2", # endpoint_override = "sdsc.osn.xsede.org", # access_key= Sys.getenv("OSN_KEY"), # secret_key= Sys.getenv("OSN_SECRET")) s3_stage2 <- gefs4cast::gefs_s3_dir(product = "stage2", - path = "", + path = driver_path, endpoint = config$endpoint, - bucket = config$driver_bucket) + bucket = driver_bucket) df <- arrow::open_dataset(s3_stage2) |> dplyr::distinct(reference_datetime) |> dplyr::collect() -#stage1_s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1", -# endpoint_override = "sdsc.osn.xsede.org", -# anonymous = TRUE) - - -#efi <- duckdbfs::open_dataset("s3://bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1", -# s3_access_key_id="", -# s3_endpoint="sdsc.osn.xsede.org") -#df_stage1 <- arrow::open_dataset(stage1_s3) |> -# dplyr::summarize(max(reference_datetime)) |> -# dplyr::collect() curr_date <- Sys.Date() last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7), diff --git a/drivers/generate_stage3.R b/drivers/generate_stage3.R index 7314144572..6b8b4e899f 100644 --- a/drivers/generate_stage3.R +++ b/drivers/generate_stage3.R @@ -1,5 +1,8 @@ +## setup library(minioclient) -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") +library(gdalcubes) +library(gefs4cast) +source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") #install_mc() mc_alias_set("osn", "sdsc.osn.xsede.org", "", "") diff --git a/drivers/update_stage3.R b/drivers/update_stage3.R index 1849972333..05d86b0fea 100644 --- a/drivers/update_stage3.R +++ b/drivers/update_stage3.R @@ -1,4 +1,6 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # should we rely on this branch? 
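If relying on a moving branch remains a concern even after switching to main, one option is to pin the sourced helper to a specific commit so the raw URL cannot change underneath the workflow. A sketch only; the SHA below is a placeholder, not a real neon4cast commit:

# Placeholder SHA -- substitute the commit of R/to_hourly.R you want to pin to.
to_hourly_ref <- "0123456789abcdef0123456789abcdef01234567"
source(paste0("https://raw.githubusercontent.com/eco4cast/neon4cast/",
              to_hourly_ref, "/R/to_hourly.R"))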
+library(gdalcubes) +library(gefs4cast) +source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", "raw/prod/USGS_site_metadata.csv"), @@ -8,6 +10,8 @@ site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/ Sys.setenv("GEFS_VERSION"="v12") config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") future::plan("future::multisession", workers = 8) @@ -15,10 +19,10 @@ furrr::future_walk(site_list, function(curr_site_id){ print(curr_site_id) - s3_stage3 <- arrow::s3_bucket(bucket = glue::glue("{config$driver_bucket}/gefs-v12/stage3"), - endpoint_override = config$endpoint, - access_key = Sys.getenv("OSN_KEY"), - secret_key = Sys.getenv("OSN_SECRET")) + s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) stage3_df <- arrow::open_dataset(s3_stage3) |> dplyr::filter(site_id == curr_site_id) |> @@ -28,10 +32,10 @@ furrr::future_walk(site_list, function(curr_site_id){ dplyr::summarise(max = as.character(lubridate::as_date(max(datetime)))) |> dplyr::pull(max) - s3_pseudo <- arrow::s3_bucket(bucket = glue::glue("{config$driver_bucket}/gefs-v12/pseudo"), - endpoint_override = config$endpoint, - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) + s3_pseudo <- gefs4cast::gefs_s3_dir(product = "pseudo", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) vars <- names(stage3_df) From 8729bc5c169506310ff24882adf0f6bc923ce4c6 Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 13:01:55 -0800 Subject: [PATCH 10/14] generating stage 3 --- drivers/generate_stage3.R | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/drivers/generate_stage3.R b/drivers/generate_stage3.R index 6b8b4e899f..d70a426f70 100644 --- a/drivers/generate_stage3.R +++ b/drivers/generate_stage3.R @@ -4,33 +4,31 @@ library(gdalcubes) library(gefs4cast) source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") +config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") + +Sys.setenv("GEFS_VERSION"="v12") + #install_mc() mc_alias_set("osn", "sdsc.osn.xsede.org", "", "") -# TODO: update path to usgsrc4cast-drivers? 
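For clarity, the glue() template used in the replacement line below expands as follows, shown as a standalone snippet with the split pieces written out as assumptions (taken from the driver_bucket value in the updated config):

# Assuming driver_bucket = "bio230014-bucket01" and
# driver_path = "challenges/drivers/usgsrc4cast/noaa" after splitting config$driver_bucket:
glue::glue("osn/{driver_bucket}/{driver_path}/gefs-v12/pseudo",
           driver_bucket = "bio230014-bucket01",
           driver_path   = "challenges/drivers/usgsrc4cast/noaa")
# -> "osn/bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa/gefs-v12/pseudo"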
-mc_mirror("osn/bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/pseudo", "pseudo") + +mc_mirror(glue::glue("osn/{driver_bucket}/{driver_path}/gefs-v12/pseudo"), "pseudo") df <- arrow::open_dataset("pseudo") |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", - "raw/main/USGS_site_metadata.csv"), + "raw/prod/USGS_site_metadata.csv"), show_col_types = FALSE) |> dplyr::pull(site_id) -s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) - -s3$CreateDir("stage3") - -s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) -#site_list <- site_list[1:3] +s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) future::plan("future::multisession", workers = 8) @@ -41,15 +39,13 @@ furrr::future_walk(site_list, function(curr_site_id){ dplyr::filter(site_id == curr_site_id) |> dplyr::collect() - s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) + s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) print(curr_site_id) df |> - #dplyr::filter(site_id == curr_site_id) |> - #dplyr::collect() |> to_hourly(use_solar_geom = TRUE, psuedo = TRUE) |> dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> dplyr::rename(parameter = ensemble) |> From 31c4de91df1292ef5bc874dfafd1b18b886996ac Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 13:04:19 -0800 Subject: [PATCH 11/14] deleting submit met forecasts as I dont think its used --- drivers/submit_met_forecast.R | 50 ----------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 drivers/submit_met_forecast.R diff --git a/drivers/submit_met_forecast.R b/drivers/submit_met_forecast.R deleted file mode 100644 index c3d5a54315..0000000000 --- a/drivers/submit_met_forecast.R +++ /dev/null @@ -1,50 +0,0 @@ -submit_met_forecast <- function(model_id){ - - s3 <- arrow::s3_bucket(paste0("bio230121-bucket01/flare/drivers/met/ensemble_forecast/model_id=", model_id), - endpoint_override = "renc.osn.xsede.org", - access_key = Sys.getenv("OSN_KEY"), - secret_key = Sys.getenv("OSN_SECRET")) - - df <- arrow::open_dataset(s3) |> dplyr::filter(site_id == "fcre") |> dplyr::collect() - - max_reference_date <- max(df$reference_date) - - filename <- paste0("drivers/", model_id, "-",max_reference_date,".csv.gz") - df <- df |> dplyr::filter(reference_date == max_reference_date) |> - dplyr::mutate(date = lubridate::as_date(datetime)) |> - dplyr::select(-unit) |> - tidyr::pivot_wider(names_from = variable, values_from = prediction) - - if(model_id == "ecmwf_ifs04"){ - df <- df |> dplyr::summarize(RH_percent_mean = mean(relativehumidity_2m, na.rm = TRUE), - Rain_mm_sum = sum(precipitation, na.rm = TRUE), - WindSpeed_ms_mean = mean(windspeed_10m, na.rm = TRUE), - AirTemp_C_mean = mean(temperature_2m, na.rm = TRUE), - BP_kPa_mean = mean(surface_pressure * 0.1, na.rm = TRUE), - .by = c("date","ensemble")) - 
}else{ - df <- df |> dplyr::summarize(RH_percent_mean = mean(relativehumidity_2m, na.rm = TRUE), - Rain_mm_sum = sum(precipitation, na.rm = TRUE), - WindSpeed_ms_mean = mean(windspeed_10m, na.rm = TRUE), - AirTemp_C_mean = mean(temperature_2m, na.rm = TRUE), - ShortwaveRadiationUp_Wm2_mean = mean(shortwave_radiation, na.rm = TRUE), - BP_kPa_mean = mean(surface_pressure * 0.1, na.rm = TRUE), - .by = c("date","ensemble")) - } - df |> tidyr::pivot_longer(-c(date, ensemble), names_to = "variable", values_to = "prediction") |> - dplyr::mutate(datetime = lubridate::as_datetime(date), - reference_datetime = lubridate::as_datetime(max_reference_date), - site_id = "fcre", - model_id = model_id, - duration = "P1D", - project_id = "vera4cast", - depth_m = NA, - family = "ensemble", - ensemble = as.numeric(ensemble)) |> - dplyr::rename(parameter = ensemble) |> - dplyr::select(c("project_id", "site_id","model_id", "reference_datetime", "datetime","duration", "depth_m","variable", "family", "parameter", "prediction")) |> - readr::write_csv(filename) - - vera4castHelpers::submit(filename, first_submission = FALSE) - -} From 976db6c5b48652419885979e9b3fead31b6fbcff Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 13:26:55 -0800 Subject: [PATCH 12/14] testing --- .github/workflows/drivers_stage1.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml index 3a67b8e99f..1bd420745e 100644 --- a/.github/workflows/drivers_stage1.yaml +++ b/.github/workflows/drivers_stage1.yaml @@ -10,7 +10,7 @@ name: gefs_osn jobs: docker: - runs-on: [self-hosted] # do we have access to this in usgsrc4cast repo? + runs-on: [self-hosted] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} OSN_KEY: ${{ secrets.OSN_KEY }} @@ -18,8 +18,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 - # with: - # ref: prod + with: + ref: generate-drivers - name: Install shell: Rscript {0} From 82275329d6b6c1a808efe6de84e8b46e1184cd28 Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 13:32:31 -0800 Subject: [PATCH 13/14] comment out schedule --- .github/workflows/drivers_stage1.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml index 1bd420745e..a99b1ab0ff 100644 --- a/.github/workflows/drivers_stage1.yaml +++ b/.github/workflows/drivers_stage1.yaml @@ -1,8 +1,8 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - schedule: - - cron: '0 23 * * *' + # schedule: + # - cron: '0 23 * * *' workflow_dispatch: From fe4fafc45105c843c03da20ed62e4094ee069f2b Mon Sep 17 00:00:00 2001 From: Zwart Date: Wed, 24 Jan 2024 10:19:28 -0800 Subject: [PATCH 14/14] ref to prod --- .github/workflows/drivers_stage1.yaml | 7 +++---- .github/workflows/drivers_stage3.yaml | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml index a99b1ab0ff..1a4698d97f 100644 --- a/.github/workflows/drivers_stage1.yaml +++ b/.github/workflows/drivers_stage1.yaml @@ -1,11 +1,10 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: - # schedule: - # - cron: '0 23 * * *' + schedule: + - cron: '0 23 * * *' workflow_dispatch: - name: gefs_osn jobs: @@ -19,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: generate-drivers + ref: prod - name: Install shell: Rscript {0} diff --git a/.github/workflows/drivers_stage3.yaml b/.github/workflows/drivers_stage3.yaml index 41015650cf..f1f6f78399 100644 --- a/.github/workflows/drivers_stage3.yaml +++ b/.github/workflows/drivers_stage3.yaml @@ -1,8 +1,8 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - #schedule: - # - cron: '0 13 * * *' + schedule: + - cron: '0 13 * * *' workflow_dispatch: name: gefs_osn_stage3 @@ -17,8 +17,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 - # with: - # ref: prod + with: + ref: prod - name: Install shell: Rscript {0}
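Once the schedules are re-enabled, a quick way to confirm the drivers are landing where the config points is to list the newest stage2 partitions. A minimal sketch, assuming the OSN bucket allows anonymous reads for this product (the stage1 reads in generate_stage2.R already rely on anonymous access) and that products live under {driver_bucket}/gefs-v12/, matching the paths used in the scripts above:

# Spot-check the newest stage2 reference_datetime partitions after a scheduled run.
config <- yaml::read_yaml("challenge_configuration.yaml")

s3_stage2 <- arrow::s3_bucket(bucket = paste0(config$driver_bucket, "/gefs-v12/stage2"),
                              endpoint_override = config$endpoint,
                              anonymous = TRUE)

arrow::open_dataset(s3_stage2) |>
  dplyr::distinct(reference_datetime) |>
  dplyr::collect() |>
  dplyr::arrange(dplyr::desc(reference_datetime)) |>
  head()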