diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml index d83eeedadf..1a4698d97f 100644 --- a/.github/workflows/drivers_stage1.yaml +++ b/.github/workflows/drivers_stage1.yaml @@ -5,7 +5,6 @@ on: - cron: '0 23 * * *' workflow_dispatch: - name: gefs_osn jobs: @@ -18,6 +17,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 + with: + ref: prod - name: Install shell: Rscript {0} diff --git a/.github/workflows/drivers_stage3.yaml b/.github/workflows/drivers_stage3.yaml index 6eb0582383..f1f6f78399 100644 --- a/.github/workflows/drivers_stage3.yaml +++ b/.github/workflows/drivers_stage3.yaml @@ -1,8 +1,8 @@ # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: - #schedule: - # - cron: '0 13 * * *' + schedule: + - cron: '0 13 * * *' workflow_dispatch: name: gefs_osn_stage3 @@ -17,6 +17,8 @@ jobs: # container: rocker/geospatial steps: - uses: actions/checkout@v3 + with: + ref: prod - name: Install shell: Rscript {0} diff --git a/README.md b/README.md index 9dc42e1dd5..ed5bc2c7ff 100644 --- a/README.md +++ b/README.md @@ -19,3 +19,6 @@ Supported by the U.S. National Science Foundation grants (DEB-1926388 and OAC-22 ## Disclaimer Although this software program has been used by the U.S. Geological Survey (USGS), no warranty, expressed or implied, is made by the USGS or the U.S. Government as to the accuracy and functioning of the program and related program material nor shall the fact of distribution constitute any such warranty, and no responsibility is assumed by the USGS in connection therewith. This software is provided “AS IS.” + +## License Disclaimer +As a government employee, the contributions from Jacob Zwart to this repository are in the public domain. 
diff --git a/challenge_configuration.yaml b/challenge_configuration.yaml index d5425f6776..ba23250e3e 100644 --- a/challenge_configuration.yaml +++ b/challenge_configuration.yaml @@ -1,20 +1,33 @@ +# Challenge details challenge_long_name: EFI-USGS River Chlorophyll Forecasting Challenge +challenge_url: https://projects.ecoforecast.org/usgsrc4cast-ci +github_repo: eco4cast/usgsrc4cast-ci project_id: usgsrc4cast + +# Endpoints endpoint: sdsc.osn.xsede.org noaa_endpoint: s3.flare-forecast.org submissions_endpoint: submit.ecoforecast.org -scores_bucket: bio230014-bucket01/challenges/scores + +# Buckets +archive_bucket: bio230014-bucket01/challenges/archive +driver_bucket: bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa forecasts_bucket: bio230014-bucket01/challenges/forecasts -summaries_bucket: bio230014-bucket01/challenges/forecasts/summaries -submissions_bucket: submissions inventory_bucket: bio230014-bucket01/challenges/inventory +model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id noaa_forecast_bucket: drivers/noaa/gefs-v12-reprocess/ prov_bucket: bio230014-bucket01/challenges/prov +scores_bucket: bio230014-bucket01/challenges/scores +submissions_bucket: submissions +summaries_bucket: bio230014-bucket01/challenges/forecasts/summaries targets_bucket: bio230014-bucket01/challenges/targets -archive_bucket: bio230014-bucket01/challenges/archive -model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id -model_metadata_gsheet: https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing + +# Misc. 
files example_model_id: example +model_metadata_gsheet: https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing +targets_file_name: 'river-chl-targets.csv.gz' + +# Forecast sites site_path: 'catalog/sites' site_table: USGS_site_metadata.csv site_thumbnail: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/screencapture-waterdata-usgs-gov-nwis-rt-2018-08-02-13_00_05-01.jpg' @@ -25,21 +38,22 @@ target_metadata_gsheet: https://docs.google.com/spreadsheets/d/10YTX9ae_C1rFdLgE targets_thumbnail: 'https://raw.githubusercontent.com/eco4cast/neon4cast-ci/main/catalog/thumbnail_plots/neon_stream.jpg' targets_thumbnail_title: 'Test Image' targets_path: 'catalog/targets/' -# to here -targets_file_name: 'river-chl-targets.csv.gz' -challenge_url: https://projects.ecoforecast.org/usgsrc4cast-ci -github_repo: eco4cast/usgsrc4cast-ci + target_groups: Aquatics: targets_file: "https://data.ecoforecast.org/neon4cast-targets/aquatics/aquatics-targets.csv.gz" + noaa_forecast_groups: ['Pseudo','Stage1-stats','Stage1','Stage2','Stage3'] noaa_forecast_group_paths: ['pseudo','stage1-stats','stage1','stage2','stage3'] + variable_groups: Aquatics: variable: ['chla'] duration: ['P1D'] thumbnail_link: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg' thumbnail_title: 'USGS Streamgage' + +# Forecast catalog configuration catalog_config: about_string: 'https://projects.ecoforecast.org/usgsrc4cast-docs/' about_title: 'EFI-USGS River Chlorophyll Forecasting Challenge Documentation' diff --git a/drivers/download_stage1_psuedo.R b/drivers/download_stage1_psuedo.R index 564a9cec9d..e940df9961 100644 --- a/drivers/download_stage1_psuedo.R +++ b/drivers/download_stage1_psuedo.R @@ -5,8 +5,12 @@ library(gefs4cast) gdalcubes::gdalcubes_options(parallel=2*parallel::detectCores()) #gdalcubes::gdalcubes_options(parallel=TRUE) +config <- 
yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") + sites <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", - "raw/main/USGS_site_metadata.csv"), + "raw/prod/USGS_site_metadata.csv"), col_select = c("site_id", "latitude", "longitude")) Sys.setenv("GEFS_VERSION"="v12") @@ -15,27 +19,40 @@ dates_pseudo <- seq(as.Date("2020-09-24"), Sys.Date(), by=1) message("GEFS v12 stage1-stats") bench::bench_time({ # thelio - s3 <- gefs_s3_dir("stage1-stats") + s3 <- gefs4cast::gefs_s3_dir(product = "stage1-stats", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] - gefs_to_parquet(missing_dates, - ensemble=c("geavg", "gespr"), - path = s3, - sites = sites) + gefs4cast::gefs_to_parquet(dates = missing_dates, + ensemble = c("geavg", "gespr"), + path = s3, + sites = sites) }) message("GEFS v12 pseudo") bench::bench_time({ #32xlarge - s3 <- gefs_s3_dir("pseudo") + s3 <- gefs4cast::gefs_s3_dir(product = "pseudo", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates_pseudo[!(as.character(dates_pseudo) %in% have_dates)] - gefs4cast:::gefs_pseudo_measures(missing_dates, path = s3, sites = sites) + gefs4cast:::gefs_pseudo_measures(dates = missing_dates, + path = s3, + sites = sites) }) message("GEFS v12 stage1") bench::bench_time({ # cirrus ~ 6days for full set - s3 <- gefs_s3_dir("stage1") + s3 <- gefs4cast::gefs_s3_dir(product = "stage1", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) have_dates <- gsub("reference_datetime=", "", s3$ls()) missing_dates <- dates[!(as.character(dates) %in% have_dates)] - gefs_to_parquet(missing_dates, path = s3, sites = 
sites) + gefs4cast::gefs_to_parquet(dates = missing_dates, + path = s3, + sites = sites) }) diff --git a/drivers/generate_stage2.R b/drivers/generate_stage2.R index 2655bf60e5..31d77718d0 100644 --- a/drivers/generate_stage2.R +++ b/drivers/generate_stage2.R @@ -1,36 +1,38 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # is this branch stable? +## setup +library(gdalcubes) +library(gefs4cast) +source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") + +Sys.setenv("GEFS_VERSION"="v12") site_list <- readr::read_csv("USGS_site_metadata.csv", show_col_types = FALSE) -# should this be updated to a usgsrc4cast-drivers path? or are we keeping all drivers in -# neon4cast-drivers? -s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) +config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") + +# s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2", +# endpoint_override = "sdsc.osn.xsede.org", +# access_key= Sys.getenv("OSN_KEY"), +# secret_key= Sys.getenv("OSN_SECRET")) +s3_stage2 <- gefs4cast::gefs_s3_dir(product = "stage2", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) df <- arrow::open_dataset(s3_stage2) |> dplyr::distinct(reference_datetime) |> dplyr::collect() -#stage1_s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1", -# endpoint_override = "sdsc.osn.xsede.org", -# anonymous = TRUE) - - -#efi <- duckdbfs::open_dataset("s3://bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1", -# s3_access_key_id="", -# s3_endpoint="sdsc.osn.xsede.org") -#df_stage1 <- arrow::open_dataset(stage1_s3) |> -# 
dplyr::summarize(max(reference_datetime)) |> -# dplyr::collect() - curr_date <- Sys.Date() -last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7), curr_date - lubridate::days(1), by = "1 day"))) +last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7), + curr_date - lubridate::days(1), + by = "1 day"))) -missing_dates <- dplyr::anti_join(last_week, df, by = "reference_datetime") |> +missing_dates <- dplyr::anti_join(last_week, df, + by = "reference_datetime") |> dplyr::pull(reference_datetime) if(length(missing_dates) > 0){ @@ -38,14 +40,15 @@ if(length(missing_dates) > 0){ print(missing_dates[i]) - bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",missing_dates[i]) + # bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=", + # missing_dates[i]) + bucket <- glue::glue("{config$driver_bucket}/gefs-v12/stage1/reference_datetime={missing_dates[i]}") - endpoint_override <- "https://sdsc.osn.xsede.org" - s3 <- arrow::s3_bucket(paste0(bucket), - endpoint_override = endpoint_override, - anonymous = TRUE) + s3_stage1 <- arrow::s3_bucket(bucket = bucket, + endpoint_override = config$endpoint, + anonymous = TRUE) - site_df <- arrow::open_dataset(s3) |> + site_df <- arrow::open_dataset(s3_stage1) |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> dplyr::filter(site_id %in% site_list$site_id) |> dplyr::collect() |> @@ -58,7 +61,7 @@ if(length(missing_dates) > 0){ reference_datetime = lubridate::as_date(reference_datetime)) |> dplyr::rename(parameter = ensemble) - arrow::write_dataset(hourly_df, + arrow::write_dataset(dataset = hourly_df, path = s3_stage2, partitioning = c("reference_datetime", "site_id")) } diff --git a/drivers/generate_stage3.R b/drivers/generate_stage3.R index 7314144572..d70a426f70 100644 --- a/drivers/generate_stage3.R +++ 
b/drivers/generate_stage3.R @@ -1,33 +1,34 @@ +## setup library(minioclient) -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") +library(gdalcubes) +library(gefs4cast) +source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") + +config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") + +Sys.setenv("GEFS_VERSION"="v12") #install_mc() mc_alias_set("osn", "sdsc.osn.xsede.org", "", "") -# TODO: update path to usgsrc4cast-drivers? -mc_mirror("osn/bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/pseudo", "pseudo") + +mc_mirror(glue::glue("osn/{driver_bucket}/{driver_path}/gefs-v12/pseudo"), "pseudo") df <- arrow::open_dataset("pseudo") |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", - "raw/main/USGS_site_metadata.csv"), + "raw/prod/USGS_site_metadata.csv"), show_col_types = FALSE) |> dplyr::pull(site_id) -s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) - -s3$CreateDir("stage3") - -s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) -#site_list <- site_list[1:3] +s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) future::plan("future::multisession", workers = 8) @@ -38,15 +39,13 @@ furrr::future_walk(site_list, function(curr_site_id){ dplyr::filter(site_id == curr_site_id) |> dplyr::collect() - s3 <- 
arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3", - endpoint_override = "sdsc.osn.xsede.org", - access_key= Sys.getenv("OSN_KEY"), - secret_key= Sys.getenv("OSN_SECRET")) + s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3", + path = driver_path, + endpoint = config$endpoint, + bucket = driver_bucket) print(curr_site_id) df |> - #dplyr::filter(site_id == curr_site_id) |> - #dplyr::collect() |> to_hourly(use_solar_geom = TRUE, psuedo = TRUE) |> dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> dplyr::rename(parameter = ensemble) |> diff --git a/drivers/submit_met_forecast.R b/drivers/submit_met_forecast.R deleted file mode 100644 index c3d5a54315..0000000000 --- a/drivers/submit_met_forecast.R +++ /dev/null @@ -1,50 +0,0 @@ -submit_met_forecast <- function(model_id){ - - s3 <- arrow::s3_bucket(paste0("bio230121-bucket01/flare/drivers/met/ensemble_forecast/model_id=", model_id), - endpoint_override = "renc.osn.xsede.org", - access_key = Sys.getenv("OSN_KEY"), - secret_key = Sys.getenv("OSN_SECRET")) - - df <- arrow::open_dataset(s3) |> dplyr::filter(site_id == "fcre") |> dplyr::collect() - - max_reference_date <- max(df$reference_date) - - filename <- paste0("drivers/", model_id, "-",max_reference_date,".csv.gz") - df <- df |> dplyr::filter(reference_date == max_reference_date) |> - dplyr::mutate(date = lubridate::as_date(datetime)) |> - dplyr::select(-unit) |> - tidyr::pivot_wider(names_from = variable, values_from = prediction) - - if(model_id == "ecmwf_ifs04"){ - df <- df |> dplyr::summarize(RH_percent_mean = mean(relativehumidity_2m, na.rm = TRUE), - Rain_mm_sum = sum(precipitation, na.rm = TRUE), - WindSpeed_ms_mean = mean(windspeed_10m, na.rm = TRUE), - AirTemp_C_mean = mean(temperature_2m, na.rm = TRUE), - BP_kPa_mean = mean(surface_pressure * 0.1, na.rm = TRUE), - .by = c("date","ensemble")) - }else{ - df <- df |> dplyr::summarize(RH_percent_mean = mean(relativehumidity_2m, na.rm = TRUE), - 
Rain_mm_sum = sum(precipitation, na.rm = TRUE), - WindSpeed_ms_mean = mean(windspeed_10m, na.rm = TRUE), - AirTemp_C_mean = mean(temperature_2m, na.rm = TRUE), - ShortwaveRadiationUp_Wm2_mean = mean(shortwave_radiation, na.rm = TRUE), - BP_kPa_mean = mean(surface_pressure * 0.1, na.rm = TRUE), - .by = c("date","ensemble")) - } - df |> tidyr::pivot_longer(-c(date, ensemble), names_to = "variable", values_to = "prediction") |> - dplyr::mutate(datetime = lubridate::as_datetime(date), - reference_datetime = lubridate::as_datetime(max_reference_date), - site_id = "fcre", - model_id = model_id, - duration = "P1D", - project_id = "vera4cast", - depth_m = NA, - family = "ensemble", - ensemble = as.numeric(ensemble)) |> - dplyr::rename(parameter = ensemble) |> - dplyr::select(c("project_id", "site_id","model_id", "reference_datetime", "datetime","duration", "depth_m","variable", "family", "parameter", "prediction")) |> - readr::write_csv(filename) - - vera4castHelpers::submit(filename, first_submission = FALSE) - -} diff --git a/drivers/update_stage3.R b/drivers/update_stage3.R index 36352a1cff..05d86b0fea 100644 --- a/drivers/update_stage3.R +++ b/drivers/update_stage3.R @@ -1,22 +1,30 @@ -source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") +library(gdalcubes) +library(gefs4cast) +source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R") site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/", - "raw/main/USGS_site_metadata.csv"), + "raw/prod/USGS_site_metadata.csv"), show_col_types = FALSE) |> dplyr::pull(site_id) +Sys.setenv("GEFS_VERSION"="v12") + +config <- yaml::read_yaml("challenge_configuration.yaml") +driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/") +driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/") + future::plan("future::multisession", workers = 8) furrr::future_walk(site_list, function(curr_site_id){ print(curr_site_id) - s3 <- 
arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3", -                              endpoint_override = "sdsc.osn.xsede.org", -                              access_key= Sys.getenv("OSN_KEY"), -                              secret_key= Sys.getenv("OSN_SECRET")) +  s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3", +                                      path = driver_path, +                                      endpoint = config$endpoint, +                                      bucket = driver_bucket) -  stage3_df <- arrow::open_dataset(s3) |> +  stage3_df <- arrow::open_dataset(s3_stage3) |> dplyr::filter(site_id == curr_site_id) |> dplyr::collect() @@ -24,27 +32,27 @@ furrr::future_walk(site_list, function(curr_site_id){ dplyr::summarise(max = as.character(lubridate::as_date(max(datetime)))) |> dplyr::pull(max) -  s3_pseudo <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/pseudo", -                                endpoint_override = "sdsc.osn.xsede.org", -                                access_key= Sys.getenv("OSN_KEY"), -                                secret_key= Sys.getenv("OSN_SECRET")) +  s3_pseudo <- gefs4cast::gefs_s3_dir(product = "pseudo", +                                      path = driver_path, +                                      endpoint = config$endpoint, +                                      bucket = driver_bucket) vars <- names(stage3_df) cut_off <- as.character(lubridate::as_date(max_date) - lubridate::days(3)) -  df <- arrow::open_dataset(s3_pseudo) |> +  pseudo_df <- arrow::open_dataset(s3_pseudo) |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> dplyr::filter(site_id == curr_site_id, reference_datetime >= cut_off) |> dplyr::collect() -  if(nrow(df) > 0){ +  if(nrow(pseudo_df) > 0){ -    df2 <- df |> +    df2 <- pseudo_df |> to_hourly(use_solar_geom = TRUE, psuedo = TRUE) |> dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> dplyr::rename(parameter = ensemble) +      dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> +      dplyr::rename(parameter = ensemble) stage3_df_update <- stage3_df |> dplyr::filter(datetime < min(df2$datetime)) @@ -52,6 +60,6 @@ furrr::future_walk(site_list, function(curr_site_id){ df2 |> dplyr::bind_rows(stage3_df_update) |> dplyr::arrange(variable, datetime,
parameter) |> - arrow::write_dataset(path = s3, partitioning = "site_id") + arrow::write_dataset(path = s3_stage3, partitioning = "site_id") } })