From e1311652a00da9573c6ab09ada1c6b509638bd7b Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 11:55:30 -0800 Subject: [PATCH 01/14] be more specific in functions --- drivers/download_stage1_psuedo.R | 35 +++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/download_stage1_psuedo.R b/drivers/download_stage1_psuedo.R index 564a9cec9d..4a80a19f65 100644 --- a/drivers/download_stage1_psuedo.R +++ b/drivers/download_stage1_psuedo.R @@ -5,8 +5,10 @@ library(gefs4cast) gdalcubes::gdalcubes_options(parallel=2*parallel::detectCores()) #gdalcubes::gdalcubes_options(parallel=TRUE) +config <- yaml::read_yaml("challenge_configuration.yaml") + sites <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", - "raw/main/USGS_site_metadata.csv"), + "raw/prod/USGS_site_metadata.csv"), col_select = c("site_id", "latitude", "longitude")) Sys.setenv("GEFS_VERSION"="v12") @@ -15,27 +17,40 @@ dates_pseudo <- seq(as.Date("2020-09-24"), Sys.Date(), by=1) message("GEFS v12 stage1-stats") bench::bench_time({ # thelio - s3 <- gefs_s3_dir("stage1-stats") + s3 <- gefs4cast::gefs_s3_dir(product = "stage1-stats", + path = "", # should this path be more specific? the noaa bucket in the config is "drivers/noaa/gefs-v12-reprocess/" + endpoint = config$endpoint, + bucket = config$noaa_forecast_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] - gefs_to_parquet(missing_dates, - ensemble=c("geavg", "gespr"), - path = s3, - sites = sites) + gefs4cast::gefs_to_parquet(dates = missing_dates, + ensemble = c("geavg", "gespr"), + path = s3, + sites = sites) # should partitioning also include the project_id ?? }) message("GEFS v12 pseudo") bench::bench_time({ #32xlarge - s3 <- gefs_s3_dir("pseudo") + s3 <- gefs4cast::gefs_s3_dir(product = "pseudo", + path = "", # same questions as above ^ + endpoint = config$endpoint, + bucket = config$noaa_forecast_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates_pseudo[!(as.character(dates_pseudo) %in% have_dates)] - gefs4cast:::gefs_pseudo_measures(missing_dates, path = s3, sites = sites) + gefs4cast:::gefs_pseudo_measures(dates = missing_dates, + path = s3, + sites = sites) }) message("GEFS v12 stage1") bench::bench_time({ # cirrus ~ 6days for full set - s3 <- gefs_s3_dir("stage1") + s3 <- gefs4cast::gefs_s3_dir(product = "stage1", + path = "", + endpoint = config$endpoint, + bucket = config$noaa_forecast_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] - gefs_to_parquet(missing_dates, path = s3, sites = sites) + gefs4cast::gefs_to_parquet(dates = missing_dates, + path = s3, + sites = sites) }) From efd638550ad4686b83a3bc95206dcd1cd47673d8 Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 14:10:01 -0800 Subject: [PATCH 02/14] make scripts more explicit --- drivers/download_stage1_psuedo.R | 6 ++--- drivers/generate_stage2.R | 41 ++++++++++++++++++++------------ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/drivers/download_stage1_psuedo.R b/drivers/download_stage1_psuedo.R index 4a80a19f65..23c26ab883 100644 --- a/drivers/download_stage1_psuedo.R +++ b/drivers/download_stage1_psuedo.R @@ -20,7 +20,7 @@ bench::bench_time({ # thelio s3 <- gefs4cast::gefs_s3_dir(product = "stage1-stats", path = "", # should this path be more specific? 
the noaa bucket in the config is "drivers/noaa/gefs-v12-reprocess/" endpoint = config$endpoint, - bucket = config$noaa_forecast_bucket) + bucket = config$driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] gefs4cast::gefs_to_parquet(dates = missing_dates, @@ -34,7 +34,7 @@ bench::bench_time({ #32xlarge s3 <- gefs4cast::gefs_s3_dir(product = "pseudo", path = "", # same questions as above ^ endpoint = config$endpoint, - bucket = config$noaa_forecast_bucket) + bucket = config$driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates_pseudo[!(as.character(dates_pseudo) %in% have_dates)] gefs4cast:::gefs_pseudo_measures(dates = missing_dates, @@ -47,7 +47,7 @@ bench::bench_time({ # cirrus ~ 6days for full set s3 <- gefs4cast::gefs_s3_dir(product = "stage1", path = "", endpoint = config$endpoint, - bucket = config$noaa_forecast_bucket) + bucket = config$driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] gefs4cast::gefs_to_parquet(dates = missing_dates, diff --git a/drivers/generate_stage2.R b/drivers/generate_stage2.R index 2655bf60e5..6f2a72ca91 100644 --- a/drivers/generate_stage2.R +++ b/drivers/generate_stage2.R @@ -1,20 +1,27 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # is this branch stable? +source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # is this branch stable? why use ci_upgrade ? + +Sys.setenv("GEFS_VERSION"="v12") site_list <- readr::read_csv("USGS_site_metadata.csv", show_col_types = FALSE) +config <- yaml::read_yaml("challenge_configuration.yaml") + # should this be updated to a usgsrc4cast-drivers path? or are we keeping all drivers in # neon4cast-drivers? 
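One config-driven option for that question, sketched under the assumption that the challenge keeps its own prefix inside driver_bucket (as the later driver_bucket: bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa setting does): split the configured value into a bucket name and an in-bucket prefix, and let gefs_s3_dir() assemble the product path. This is a sketch only, not part of the commit.

# Hedged sketch: build the stage2 location entirely from challenge_configuration.yaml
# so no neon4cast-drivers path is hard-coded in generate_stage2.R.
config <- yaml::read_yaml("challenge_configuration.yaml")

# split "bucket-name/optional/prefix" into the S3 bucket and the prefix inside it
driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/")
driver_path   <- stringr::word(config$driver_bucket, 2, -1, sep = "/")

s3_stage2 <- gefs4cast::gefs_s3_dir(product  = "stage2",
                                    path     = driver_path,
                                    endpoint = config$endpoint,
                                    bucket   = driver_bucket)

With this layout, moving the drivers to a usgsrc4cast-specific tree only requires editing challenge_configuration.yaml, not the scripts.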
-s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) +# s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2", +# endpoint_override = "sdsc.osn.xsede.org", +# access_key= Sys.getenv("OSN_KEY"), +# secret_key= Sys.getenv("OSN_SECRET")) +s3_stage2 <- gefs4cast::gefs_s3_dir(product = "stage2", + path = "", + endpoint = config$endpoint, + bucket = config$driver_bucket) df <- arrow::open_dataset(s3_stage2) |> dplyr::distinct(reference_datetime) |> dplyr::collect() - #stage1_s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1", # endpoint_override = "sdsc.osn.xsede.org", # anonymous = TRUE) @@ -28,9 +35,12 @@ df <- arrow::open_dataset(s3_stage2) |> # dplyr::collect() curr_date <- Sys.Date() -last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7), curr_date - lubridate::days(1), by = "1 day"))) +last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7), + curr_date - lubridate::days(1), + by = "1 day"))) -missing_dates <- dplyr::anti_join(last_week, df, by = "reference_datetime") |> +missing_dates <- dplyr::anti_join(last_week, df, + by = "reference_datetime") |> dplyr::pull(reference_datetime) if(length(missing_dates) > 0){ @@ -38,14 +48,15 @@ if(length(missing_dates) > 0){ print(missing_dates[i]) - bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",missing_dates[i]) + # bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=", + # missing_dates[i]) + bucket <- glue::glue("{config$driver_bucket}/gefs-v12/stage1/reference_datetime={missing_dates[i]}") - endpoint_override <- "https://sdsc.osn.xsede.org" - s3 <- arrow::s3_bucket(paste0(bucket), - endpoint_override = endpoint_override, - anonymous = TRUE) + s3_stage1 <- arrow::s3_bucket(bucket = bucket, + endpoint_override = config$endpoint, + anonymous = TRUE) - site_df <- arrow::open_dataset(s3) |> + site_df <- arrow::open_dataset(s3_stage1) |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> dplyr::filter(site_id %in% site_list$site_id) |> dplyr::collect() |> @@ -58,7 +69,7 @@ if(length(missing_dates) > 0){ reference_datetime = lubridate::as_date(reference_datetime)) |> dplyr::rename(parameter = ensemble) - arrow::write_dataset(hourly_df, + arrow::write_dataset(dataset = hourly_df, path = s3_stage2, partitioning = c("reference_datetime", "site_id")) } From a4c787ebe8f9b786f58ce5db13aeb0aa819d88da Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 14:10:14 -0800 Subject: [PATCH 03/14] organizing config --- challenge_configuration.yaml | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/challenge_configuration.yaml b/challenge_configuration.yaml index d5425f6776..051a693817 100644 --- a/challenge_configuration.yaml +++ b/challenge_configuration.yaml @@ -1,20 +1,33 @@ +# Challenge details challenge_long_name: EFI-USGS River Chlorophyll Forecasting Challenge +challenge_url: https://projects.ecoforecast.org/usgsrc4cast-ci +github_repo: eco4cast/usgsrc4cast-ci project_id: usgsrc4cast + +# Endpoints endpoint: sdsc.osn.xsede.org noaa_endpoint: s3.flare-forecast.org submissions_endpoint: submit.ecoforecast.org -scores_bucket: bio230014-bucket01/challenges/scores + +# 
Buckets +archive_bucket: bio230014-bucket01/challenges/archive +driver_bucket: bio230014-bucket01/challenges/drivers forecasts_bucket: bio230014-bucket01/challenges/forecasts -summaries_bucket: bio230014-bucket01/challenges/forecasts/summaries -submissions_bucket: submissions inventory_bucket: bio230014-bucket01/challenges/inventory +model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id noaa_forecast_bucket: drivers/noaa/gefs-v12-reprocess/ prov_bucket: bio230014-bucket01/challenges/prov +scores_bucket: bio230014-bucket01/challenges/scores +submissions_bucket: submissions +summaries_bucket: bio230014-bucket01/challenges/forecasts/summaries targets_bucket: bio230014-bucket01/challenges/targets -archive_bucket: bio230014-bucket01/challenges/archive -model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id -model_metadata_gsheet: https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing + +# Misc. files example_model_id: example +model_metadata_gsheet: https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing +targets_file_name: 'river-chl-targets.csv.gz' + +# Forecast sites site_path: 'catalog/sites' site_table: USGS_site_metadata.csv site_thumbnail: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/screencapture-waterdata-usgs-gov-nwis-rt-2018-08-02-13_00_05-01.jpg' @@ -25,21 +38,21 @@ target_metadata_gsheet: https://docs.google.com/spreadsheets/d/10YTX9ae_C1rFdLgE targets_thumbnail: 'https://raw.githubusercontent.com/eco4cast/neon4cast-ci/main/catalog/thumbnail_plots/neon_stream.jpg' targets_thumbnail_title: 'Test Image' targets_path: 'catalog/targets/' -# to here -targets_file_name: 'river-chl-targets.csv.gz' -challenge_url: https://projects.ecoforecast.org/usgsrc4cast-ci -github_repo: eco4cast/usgsrc4cast-ci + target_groups: Aquatics: targets_file: "https://data.ecoforecast.org/neon4cast-targets/aquatics/aquatics-targets.csv.gz" + noaa_forecast_groups: ['Pseudo','Stage1-stats','Stage1','Stage2','Stage3'] noaa_forecast_group_paths: ['pseudo','stage1-stats','stage1','stage2','stage3'] + variable_groups: Aquatics: variable: ['chla'] duration: ['P1D'] thumbnail_link: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg' thumbnail_title: 'USGS Streamgage' + catalog_config: about_string: 'https://projects.ecoforecast.org/usgsrc4cast-docs/' about_title: 'EFI-USGS River Chlorophyll Forecasting Challenge Documentation' From 1319000bf18e4231de270d9e9482482934a97fb6 Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 14:11:12 -0800 Subject: [PATCH 04/14] checkout prod - comment for now --- .github/workflows/drivers_stage1.yaml | 4 +++- .github/workflows/drivers_stage3.yaml | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml index d83eeedadf..3a67b8e99f 100644 --- a/.github/workflows/drivers_stage1.yaml +++ b/.github/workflows/drivers_stage1.yaml @@ -10,7 +10,7 @@ name: gefs_osn jobs: docker: - runs-on: [self-hosted] + runs-on: [self-hosted] # do we have access to this in usgsrc4cast repo? 
env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} OSN_KEY: ${{ secrets.OSN_KEY }} @@ -18,6 +18,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 + # with: + # ref: prod - name: Install shell: Rscript {0} diff --git a/.github/workflows/drivers_stage3.yaml b/.github/workflows/drivers_stage3.yaml index 6eb0582383..41015650cf 100644 --- a/.github/workflows/drivers_stage3.yaml +++ b/.github/workflows/drivers_stage3.yaml @@ -17,6 +17,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 + # with: + # ref: prod - name: Install shell: Rscript {0} From 47852e72cd208aa89b3ee679fd73eed1c6575418 Mon Sep 17 00:00:00 2001 From: Zwart Date: Fri, 22 Dec 2023 14:23:43 -0800 Subject: [PATCH 05/14] making stage 3 scripts more explicit --- drivers/update_stage3.R | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/update_stage3.R b/drivers/update_stage3.R index 36352a1cff..1849972333 100644 --- a/drivers/update_stage3.R +++ b/drivers/update_stage3.R @@ -1,22 +1,26 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") +source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # should we rely on this branch? site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", - "raw/main/USGS_site_metadata.csv"), + "raw/prod/USGS_site_metadata.csv"), show_col_types = FALSE) |> dplyr::pull(site_id) +Sys.setenv("GEFS_VERSION"="v12") + +config <- yaml::read_yaml("challenge_configuration.yaml") + future::plan("future::multisession", workers = 8) furrr::future_walk(site_list, function(curr_site_id){ print(curr_site_id) - s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) + s3_stage3 <- arrow::s3_bucket(bucket = glue::glue("{config$driver_bucket}/gefs-v12/stage3"), + endpoint_override = config$endpoint, + access_key = Sys.getenv("OSN_KEY"), + secret_key = Sys.getenv("OSN_SECRET")) - stage3_df <- arrow::open_dataset(s3) |> + stage3_df <- arrow::open_dataset(s3_stage3) |> dplyr::filter(site_id == curr_site_id) |> dplyr::collect() @@ -24,8 +28,8 @@ furrr::future_walk(site_list, function(curr_site_id){ dplyr::summarise(max = as.character(lubridate::as_date(max(datetime)))) |> dplyr::pull(max) - s3_pseudo <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/pseudo", - endpoint_override = "sdsc.osn.xsede.org", + s3_pseudo <- arrow::s3_bucket(bucket = glue::glue("{config$driver_bucket}/gefs-v12/pseudo"), + endpoint_override = config$endpoint, access_key= Sys.getenv("OSN_KEY"), secret_key= Sys.getenv("OSN_SECRET")) @@ -33,18 +37,18 @@ furrr::future_walk(site_list, function(curr_site_id){ cut_off <- as.character(lubridate::as_date(max_date) - lubridate::days(3)) - df <- arrow::open_dataset(s3_pseudo) |> + pseudo_df <- arrow::open_dataset(s3_pseudo) |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> dplyr::filter(site_id == curr_site_id, reference_datetime >= cut_off) |> dplyr::collect() - if(nrow(df) > 0){ + if(nrow(psuedo_df) > 0){ - df2 <- df |> + df2 <- psuedo_df |> to_hourly(use_solar_geom = TRUE, psuedo = TRUE) |> - dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> - dplyr::rename(parameter = ensemble) + dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) 
|> + dplyr::rename(parameter = ensemble) stage3_df_update <- stage3_df |> dplyr::filter(datetime < min(df2$datetime)) @@ -52,6 +56,6 @@ furrr::future_walk(site_list, function(curr_site_id){ df2 |> dplyr::bind_rows(stage3_df_update) |> dplyr::arrange(variable, datetime, parameter) |> - arrow::write_dataset(path = s3, partitioning = "site_id") + arrow::write_dataset(path = s3_stage3, partitioning = "site_id") } }) From 34eef4123e697df21d2cf92236a1b6dd717d6b22 Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 11:49:57 -0800 Subject: [PATCH 06/14] public domain disclaimer --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 9dc42e1dd5..ed5bc2c7ff 100644 --- a/README.md +++ b/README.md @@ -19,3 +19,6 @@ Supported by the U.S. National Science Foundation grants (DEB-1926388 and OAC-22 ## Disclaimer Although this software program has been used by the U.S. Geological Survey (USGS), no warranty, expressed or implied, is made by the USGS or the U.S. Government as to the accuracy and functioning of the program and related program material nor shall the fact of distribution constitute any such warranty, and no responsibility is assumed by the USGS in connection therewith. This software is provided “AS IS.” + +## License Disclaimer +As a government employee, the contributions from Jacob Zwart to this repository are in the public domain. From e6bbd82384363692a6afa6b7eb85ec538547aec4 Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 12:02:21 -0800 Subject: [PATCH 07/14] organizing --- challenge_configuration.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/challenge_configuration.yaml b/challenge_configuration.yaml index 051a693817..755a513d4a 100644 --- a/challenge_configuration.yaml +++ b/challenge_configuration.yaml @@ -53,6 +53,7 @@ variable_groups: thumbnail_link: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg' thumbnail_title: 'USGS Streamgage' +# Forecast catalog configuration catalog_config: about_string: 'https://projects.ecoforecast.org/usgsrc4cast-docs/' about_title: 'EFI-USGS River Chlorophyll Forecasting Challenge Documentation' From e7d4ccda78bfa6ad944925691e34f7d5d3bed73f Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 12:54:06 -0800 Subject: [PATCH 08/14] update config --- challenge_configuration.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/challenge_configuration.yaml b/challenge_configuration.yaml index 755a513d4a..ba23250e3e 100644 --- a/challenge_configuration.yaml +++ b/challenge_configuration.yaml @@ -11,7 +11,7 @@ submissions_endpoint: submit.ecoforecast.org # Buckets archive_bucket: bio230014-bucket01/challenges/archive -driver_bucket: bio230014-bucket01/challenges/drivers +driver_bucket: bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa forecasts_bucket: bio230014-bucket01/challenges/forecasts inventory_bucket: bio230014-bucket01/challenges/inventory model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id From d36e6ead63bd9234b4805e2d70c0364ed724ea7d Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 12:54:23 -0800 Subject: [PATCH 09/14] respond to reviews and consistent function use --- drivers/download_stage1_psuedo.R | 16 +++++++++------- drivers/generate_stage2.R | 24 ++++++++---------------- drivers/generate_stage3.R | 5 ++++- drivers/update_stage3.R | 22 +++++++++++++--------- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/drivers/download_stage1_psuedo.R 
b/drivers/download_stage1_psuedo.R index 23c26ab883..e940df9961 100644 --- a/drivers/download_stage1_psuedo.R +++ b/drivers/download_stage1_psuedo.R @@ -6,6 +6,8 @@ gdalcubes::gdalcubes_options(parallel=2*parallel::detectCores()) #gdalcubes::gdalcubes_options(parallel=TRUE) config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") sites <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", "raw/prod/USGS_site_metadata.csv"), @@ -18,23 +20,23 @@ dates_pseudo <- seq(as.Date("2020-09-24"), Sys.Date(), by=1) message("GEFS v12 stage1-stats") bench::bench_time({ # thelio s3 <- gefs4cast::gefs_s3_dir(product = "stage1-stats", - path = "", # should this path be more specific? the noaa bucket in the config is "drivers/noaa/gefs-v12-reprocess/" + path = driver_path, endpoint = config$endpoint, - bucket = config$driver_bucket) + bucket = driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] gefs4cast::gefs_to_parquet(dates = missing_dates, ensemble = c("geavg", "gespr"), path = s3, - sites = sites) # should partitioning also include the project_id ?? + sites = sites) }) message("GEFS v12 pseudo") bench::bench_time({ #32xlarge s3 <- gefs4cast::gefs_s3_dir(product = "pseudo", - path = "", # same questions as above ^ + path = driver_path, endpoint = config$endpoint, - bucket = config$driver_bucket) + bucket = driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates_pseudo[!(as.character(dates_pseudo) %in% have_dates)] gefs4cast:::gefs_pseudo_measures(dates = missing_dates, @@ -45,9 +47,9 @@ bench::bench_time({ #32xlarge message("GEFS v12 stage1") bench::bench_time({ # cirrus ~ 6days for full set s3 <- gefs4cast::gefs_s3_dir(product = "stage1", - path = "", + path = driver_path, endpoint = config$endpoint, - bucket = config$driver_bucket) + bucket = driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] gefs4cast::gefs_to_parquet(dates = missing_dates, diff --git a/drivers/generate_stage2.R b/drivers/generate_stage2.R index 6f2a72ca91..31d77718d0 100644 --- a/drivers/generate_stage2.R +++ b/drivers/generate_stage2.R @@ -1,4 +1,7 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # is this branch stable? why use ci_upgrade ? +## setup +library(gdalcubes) +library(gefs4cast) +source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") Sys.setenv("GEFS_VERSION"="v12") @@ -6,33 +9,22 @@ site_list <- readr::read_csv("USGS_site_metadata.csv", show_col_types = FALSE) config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") -# should this be updated to a usgsrc4cast-drivers path? or are we keeping all drivers in -# neon4cast-drivers? 
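For reference, the two stringr::word() calls above split the configured string into an S3 bucket name and a prefix inside that bucket. A standalone illustration, assuming the driver_bucket value this series settles on (expected results shown in comments):

# driver_bucket: bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa
x <- "bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa"
stringr::word(x, 1, sep = "/")      # "bio230014-bucket01"                   -> bucket argument
stringr::word(x, 2, -1, sep = "/")  # "challenges/drivers/usgsrc4cast/noaa"  -> path argument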
# s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2", # endpoint_override = "sdsc.osn.xsede.org", # access_key= Sys.getenv("OSN_KEY"), # secret_key= Sys.getenv("OSN_SECRET")) s3_stage2 <- gefs4cast::gefs_s3_dir(product = "stage2", - path = "", + path = driver_path, endpoint = config$endpoint, - bucket = config$driver_bucket) + bucket = driver_bucket) df <- arrow::open_dataset(s3_stage2) |> dplyr::distinct(reference_datetime) |> dplyr::collect() -#stage1_s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1", -# endpoint_override = "sdsc.osn.xsede.org", -# anonymous = TRUE) - - -#efi <- duckdbfs::open_dataset("s3://bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1", -# s3_access_key_id="", -# s3_endpoint="sdsc.osn.xsede.org") -#df_stage1 <- arrow::open_dataset(stage1_s3) |> -# dplyr::summarize(max(reference_datetime)) |> -# dplyr::collect() curr_date <- Sys.Date() last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7), diff --git a/drivers/generate_stage3.R b/drivers/generate_stage3.R index 7314144572..6b8b4e899f 100644 --- a/drivers/generate_stage3.R +++ b/drivers/generate_stage3.R @@ -1,5 +1,8 @@ +## setup library(minioclient) -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") +library(gdalcubes) +library(gefs4cast) +source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") #install_mc() mc_alias_set("osn", "sdsc.osn.xsede.org", "", "") diff --git a/drivers/update_stage3.R b/drivers/update_stage3.R index 1849972333..05d86b0fea 100644 --- a/drivers/update_stage3.R +++ b/drivers/update_stage3.R @@ -1,4 +1,6 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # should we rely on this branch? 
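If relying on a moving branch remains a concern even after switching to main, one option is to pin the sourced helper to a specific commit so the raw URL cannot change underneath the workflow. A sketch only; the SHA below is a placeholder, not a real neon4cast commit:

# Placeholder SHA -- substitute the commit of R/to_hourly.R you want to pin to.
to_hourly_ref <- "0123456789abcdef0123456789abcdef01234567"
source(paste0("https://raw.githubusercontent.com/eco4cast/neon4cast/",
              to_hourly_ref, "/R/to_hourly.R"))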
+library(gdalcubes) +library(gefs4cast) +source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", "raw/prod/USGS_site_metadata.csv"), @@ -8,6 +10,8 @@ site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/ Sys.setenv("GEFS_VERSION"="v12") config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") future::plan("future::multisession", workers = 8) @@ -15,10 +19,10 @@ furrr::future_walk(site_list, function(curr_site_id){ print(curr_site_id) - s3_stage3 <- arrow::s3_bucket(bucket = glue::glue("{config$driver_bucket}/gefs-v12/stage3"), - endpoint_override = config$endpoint, - access_key = Sys.getenv("OSN_KEY"), - secret_key = Sys.getenv("OSN_SECRET")) + s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) stage3_df <- arrow::open_dataset(s3_stage3) |> dplyr::filter(site_id == curr_site_id) |> @@ -28,10 +32,10 @@ furrr::future_walk(site_list, function(curr_site_id){ dplyr::summarise(max = as.character(lubridate::as_date(max(datetime)))) |> dplyr::pull(max) - s3_pseudo <- arrow::s3_bucket(bucket = glue::glue("{config$driver_bucket}/gefs-v12/pseudo"), - endpoint_override = config$endpoint, - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) + s3_pseudo <- gefs4cast::gefs_s3_dir(product = "pseudo", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) vars <- names(stage3_df) From 8729bc5c169506310ff24882adf0f6bc923ce4c6 Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 13:01:55 -0800 Subject: [PATCH 10/14] generating stage 3 --- drivers/generate_stage3.R | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/drivers/generate_stage3.R b/drivers/generate_stage3.R index 6b8b4e899f..d70a426f70 100644 --- a/drivers/generate_stage3.R +++ b/drivers/generate_stage3.R @@ -4,33 +4,31 @@ library(gdalcubes) library(gefs4cast) source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") +config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") + +Sys.setenv("GEFS_VERSION"="v12") + #install_mc() mc_alias_set("osn", "sdsc.osn.xsede.org", "", "") -# TODO: update path to usgsrc4cast-drivers? 
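For clarity, the glue() template used in the replacement line below expands as follows, shown as a standalone snippet with the split pieces written out as assumptions (taken from the driver_bucket value in the updated config):

# Assuming driver_bucket = "bio230014-bucket01" and
# driver_path = "challenges/drivers/usgsrc4cast/noaa" after splitting config$driver_bucket:
glue::glue("osn/{driver_bucket}/{driver_path}/gefs-v12/pseudo",
           driver_bucket = "bio230014-bucket01",
           driver_path   = "challenges/drivers/usgsrc4cast/noaa")
# -> "osn/bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa/gefs-v12/pseudo"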
-mc_mirror("osn/bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/pseudo", "pseudo") + +mc_mirror(glue::glue("osn/{driver_bucket}/{driver_path}/gefs-v12/pseudo"), "pseudo") df <- arrow::open_dataset("pseudo") |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", - "raw/main/USGS_site_metadata.csv"), + "raw/prod/USGS_site_metadata.csv"), show_col_types = FALSE) |> dplyr::pull(site_id) -s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) - -s3$CreateDir("stage3") - -s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) -#site_list <- site_list[1:3] +s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) future::plan("future::multisession", workers = 8) @@ -41,15 +39,13 @@ furrr::future_walk(site_list, function(curr_site_id){ dplyr::filter(site_id == curr_site_id) |> dplyr::collect() - s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) + s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) print(curr_site_id) df |> - #dplyr::filter(site_id == curr_site_id) |> - #dplyr::collect() |> to_hourly(use_solar_geom = TRUE, psuedo = TRUE) |> dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> dplyr::rename(parameter = ensemble) |> From 31c4de91df1292ef5bc874dfafd1b18b886996ac Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 13:04:19 -0800 Subject: [PATCH 11/14] deleting submit met forecasts as I dont think its used --- drivers/submit_met_forecast.R | 50 ----------------------------------- 1 file changed, 50 deletions(-) delete mode 100644 drivers/submit_met_forecast.R diff --git a/drivers/submit_met_forecast.R b/drivers/submit_met_forecast.R deleted file mode 100644 index c3d5a54315..0000000000 --- a/drivers/submit_met_forecast.R +++ /dev/null @@ -1,50 +0,0 @@ -submit_met_forecast <- function(model_id){ - - s3 <- arrow::s3_bucket(paste0("bio230121-bucket01/flare/drivers/met/ensemble_forecast/model_id=", model_id), - endpoint_override = "renc.osn.xsede.org", - access_key = Sys.getenv("OSN_KEY"), - secret_key = Sys.getenv("OSN_SECRET")) - - df <- arrow::open_dataset(s3) |> dplyr::filter(site_id == "fcre") |> dplyr::collect() - - max_reference_date <- max(df$reference_date) - - filename <- paste0("drivers/", model_id, "-",max_reference_date,".csv.gz") - df <- df |> dplyr::filter(reference_date == max_reference_date) |> - dplyr::mutate(date = lubridate::as_date(datetime)) |> - dplyr::select(-unit) |> - tidyr::pivot_wider(names_from = variable, values_from = prediction) - - if(model_id == "ecmwf_ifs04"){ - df <- df |> dplyr::summarize(RH_percent_mean = mean(relativehumidity_2m, na.rm = TRUE), - Rain_mm_sum = sum(precipitation, na.rm = TRUE), - WindSpeed_ms_mean = mean(windspeed_10m, na.rm = TRUE), - AirTemp_C_mean = mean(temperature_2m, na.rm = TRUE), - BP_kPa_mean = mean(surface_pressure * 0.1, na.rm = TRUE), - .by = c("date","ensemble")) - 
}else{ - df <- df |> dplyr::summarize(RH_percent_mean = mean(relativehumidity_2m, na.rm = TRUE), - Rain_mm_sum = sum(precipitation, na.rm = TRUE), - WindSpeed_ms_mean = mean(windspeed_10m, na.rm = TRUE), - AirTemp_C_mean = mean(temperature_2m, na.rm = TRUE), - ShortwaveRadiationUp_Wm2_mean = mean(shortwave_radiation, na.rm = TRUE), - BP_kPa_mean = mean(surface_pressure * 0.1, na.rm = TRUE), - .by = c("date","ensemble")) - } - df |> tidyr::pivot_longer(-c(date, ensemble), names_to = "variable", values_to = "prediction") |> - dplyr::mutate(datetime = lubridate::as_datetime(date), - reference_datetime = lubridate::as_datetime(max_reference_date), - site_id = "fcre", - model_id = model_id, - duration = "P1D", - project_id = "vera4cast", - depth_m = NA, - family = "ensemble", - ensemble = as.numeric(ensemble)) |> - dplyr::rename(parameter = ensemble) |> - dplyr::select(c("project_id", "site_id","model_id", "reference_datetime", "datetime","duration", "depth_m","variable", "family", "parameter", "prediction")) |> - readr::write_csv(filename) - - vera4castHelpers::submit(filename, first_submission = FALSE) - -} From 976db6c5b48652419885979e9b3fead31b6fbcff Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 13:26:55 -0800 Subject: [PATCH 12/14] testing --- .github/workflows/drivers_stage1.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml index 3a67b8e99f..1bd420745e 100644 --- a/.github/workflows/drivers_stage1.yaml +++ b/.github/workflows/drivers_stage1.yaml @@ -10,7 +10,7 @@ name: gefs_osn jobs: docker: - runs-on: [self-hosted] # do we have access to this in usgsrc4cast repo? + runs-on: [self-hosted] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} OSN_KEY: ${{ secrets.OSN_KEY }} @@ -18,8 +18,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 - # with: - # ref: prod + with: + ref: generate-drivers - name: Install shell: Rscript {0} From 82275329d6b6c1a808efe6de84e8b46e1184cd28 Mon Sep 17 00:00:00 2001 From: Zwart Date: Mon, 8 Jan 2024 13:32:31 -0800 Subject: [PATCH 13/14] comment out schedule --- .github/workflows/drivers_stage1.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml index 1bd420745e..a99b1ab0ff 100644 --- a/.github/workflows/drivers_stage1.yaml +++ b/.github/workflows/drivers_stage1.yaml @@ -1,8 +1,8 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - schedule: - - cron: '0 23 * * *' + # schedule: + # - cron: '0 23 * * *' workflow_dispatch: From fe4fafc45105c843c03da20ed62e4094ee069f2b Mon Sep 17 00:00:00 2001 From: Zwart Date: Wed, 24 Jan 2024 10:19:28 -0800 Subject: [PATCH 14/14] ref to prod --- .github/workflows/drivers_stage1.yaml | 7 +++---- .github/workflows/drivers_stage3.yaml | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml index a99b1ab0ff..1a4698d97f 100644 --- a/.github/workflows/drivers_stage1.yaml +++ b/.github/workflows/drivers_stage1.yaml @@ -1,11 +1,10 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: - # schedule: - # - cron: '0 23 * * *' + schedule: + - cron: '0 23 * * *' workflow_dispatch: - name: gefs_osn jobs: @@ -19,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: generate-drivers + ref: prod - name: Install shell: Rscript {0} diff --git a/.github/workflows/drivers_stage3.yaml b/.github/workflows/drivers_stage3.yaml index 41015650cf..f1f6f78399 100644 --- a/.github/workflows/drivers_stage3.yaml +++ b/.github/workflows/drivers_stage3.yaml @@ -1,8 +1,8 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - #schedule: - # - cron: '0 13 * * *' + schedule: + - cron: '0 13 * * *' workflow_dispatch: name: gefs_osn_stage3 @@ -17,8 +17,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 - # with: - # ref: prod + with: + ref: prod - name: Install shell: Rscript {0}
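Once the schedules are re-enabled, a quick way to confirm the drivers are landing where the config points is to list the newest stage2 partitions. A minimal sketch, assuming the OSN bucket allows anonymous reads for this product (the stage1 reads in generate_stage2.R already rely on anonymous access) and that products live under {driver_bucket}/gefs-v12/, matching the paths used in the scripts above:

# Spot-check the newest stage2 reference_datetime partitions after a scheduled run.
config <- yaml::read_yaml("challenge_configuration.yaml")

s3_stage2 <- arrow::s3_bucket(bucket = paste0(config$driver_bucket, "/gefs-v12/stage2"),
                              endpoint_override = config$endpoint,
                              anonymous = TRUE)

arrow::open_dataset(s3_stage2) |>
  dplyr::distinct(reference_datetime) |>
  dplyr::collect() |>
  dplyr::arrange(dplyr::desc(reference_datetime)) |>
  head()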