From b02308826f6191f8fd7304e754ca95b0e1836840 Mon Sep 17 00:00:00 2001 From: Quinn Thomas Date: Tue, 31 Oct 2023 12:34:09 -0400 Subject: [PATCH 01/20] Create to_hourly.R --- R/to_hourly.R | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 R/to_hourly.R diff --git a/R/to_hourly.R b/R/to_hourly.R new file mode 100644 index 0000000..4dd2227 --- /dev/null +++ b/R/to_hourly.R @@ -0,0 +1,132 @@ +to_hourly <- function(df, + use_solar_geom = TRUE, + psuedo = FALSE){ + + if(!psuedo){ + reference_datetime <- lubridate::as_datetime(df$reference_datetime) + }else{ + reference_datetime <- NA + } + + var_order <- names(df) + + ensemble_maxtime <- df |> + dplyr::group_by(site_id, family, ensemble) |> + dplyr::summarise(max_time = max(datetime), .groups = "drop") + + ensembles <- unique(df$ensemble) + datetime <- seq(min(df$datetime), max(df$datetime), by = "1 hour") + variables <- unique(df$variable) + sites <- unique(df$site_id) + + full_time <- expand.grid(sites, ensembles, datetime, variables) |> + dplyr::rename(site_id = Var1, + ensemble = Var2, + datetime = Var3, + variable = Var4) |> + dplyr::mutate(datetime = lubridate::as_datetime(datetime)) |> + dplyr::arrange(site_id, ensemble, variable, datetime) |> + dplyr::left_join(ensemble_maxtime, by = c("site_id","ensemble")) |> + dplyr::filter(datetime <= max_time) |> + dplyr::select(-c("max_time")) |> + dplyr::distinct() + + states <- df |> + dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> + dplyr::filter(horizon != "006") |> + dplyr::select(-horizon) |> + dplyr::group_by(site_id, family, ensemble, variable) |> + dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |> + dplyr::filter(variable %in% c("PRES", "RH", "TMP", "UGRD", "VGRD")) |> + dplyr::arrange(site_id, family, ensemble, datetime) |> + dplyr::mutate(prediction = imputeTS::na_interpolation(prediction, option = "linear")) |> + dplyr::mutate(prediction = ifelse(variable == "TMP", prediction + 273, prediction)) |> + dplyr::mutate(prediction = ifelse(variable == "RH", prediction/100, prediction)) + + fluxes <- df |> + dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> + dplyr::filter(horizon != "003") |> + dplyr::select(-horizon) |> + dplyr::group_by(site_id, family, ensemble, variable) |> + dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |> + dplyr::filter(variable %in% c("APCP","DSWRF","DLWRF")) |> + dplyr::arrange(site_id, family, ensemble, datetime) |> + tidyr::fill(prediction, .direction = "up") |> + dplyr::mutate(prediction = ifelse(variable == "APCP", prediction / (6 * 60 * 60), prediction), + variable = ifelse(variable == "APCP", "PRATE", variable)) + + if(use_solar_geom){ + + site_list <- readr::read_csv(paste0("https://github.com/eco4cast/", + "neon4cast-noaa-download/", + "raw/master/noaa_download_site_list.csv"), + show_col_types = FALSE) |> + dplyr::select(-site_name) + + fluxes <- fluxes |> + dplyr::left_join(site_list, by = "site_id") |> + dplyr::mutate(hour = lubridate::hour(datetime), + date = lubridate::as_date(datetime), + doy = lubridate::yday(datetime) + hour/24, + longitude = ifelse(longitude < 0, 360 + longitude, longitude), + rpot = downscale_solar_geom(doy, longitude, latitude)) |> # hourly sw flux calculated using solar geometry + dplyr::group_by(site_id, family, ensemble, date, variable) |> + dplyr::mutate(avg.rpot = mean(rpot, na.rm = TRUE), + avg.SW = mean(prediction, na.rm = TRUE))|> # daily sw mean from solar geometry + dplyr::ungroup() |> + dplyr::mutate(prediction = ifelse(variable == "DSWRF" & avg.rpot > 0.0, rpot * (avg.SW/avg.rpot),prediction)) |> + dplyr::select(any_of(var_order)) + } + + hourly_df <- dplyr::bind_rows(states, fluxes) |> + dplyr::arrange(site_id, family, ensemble, datetime) |> + dplyr::mutate(variable = ifelse(variable == "TMP", "air_temperature", variable), + variable = ifelse(variable == "PRES", "air_pressure", variable), + variable = ifelse(variable == "RH", "relative_humidity", variable), + variable = ifelse(variable == "DLWRF", "surface_downwelling_longwave_flux_in_air", variable), + variable = ifelse(variable == "DSWRF", "surface_downwelling_shortwave_flux_in_air", variable), + variable = ifelse(variable == "PRATE", "precipitation_flux", variable), + variable = ifelse(variable == "VGRD", "eastward_wind", variable), + variable = ifelse(variable == "UGRD", "northward_wind", variable), + variable = ifelse(variable == "APCP", "precipitation_amount", variable), + reference_datetime = reference_datetime) |> + dplyr::select(any_of(var_order)) + + return(hourly_df) + +} + +cos_solar_zenith_angle <- function(doy, lat, lon, dt, hr) { + et <- equation_of_time(doy) + merid <- floor(lon / 15) * 15 + merid[merid < 0] <- merid[merid < 0] + 15 + lc <- (lon - merid) * -4/60 ## longitude correction + tz <- merid / 360 * 24 ## time zone + midbin <- 0.5 * dt / 86400 * 24 ## shift calc to middle of bin + t0 <- 12 + lc - et - tz - midbin ## solar time + h <- pi/12 * (hr - t0) ## solar hour + dec <- -23.45 * pi / 180 * cos(2 * pi * (doy + 10) / 365) ## declination + cosz <- sin(lat * pi / 180) * sin(dec) + cos(lat * pi / 180) * cos(dec) * cos(h) + cosz[cosz < 0] <- 0 + return(cosz) +} + +equation_of_time <- function(doy) { + stopifnot(doy <= 367) + f <- pi / 180 * (279.5 + 0.9856 * doy) + et <- (-104.7 * sin(f) + 596.2 * sin(2 * f) + 4.3 * + sin(4 * f) - 429.3 * cos(f) - 2 * + cos(2 * f) + 19.3 * cos(3 * f)) / 3600 # equation of time -> eccentricity and obliquity + return(et) +} + +downscale_solar_geom <- function(doy, lon, lat) { + + dt <- median(diff(doy)) * 86400 # average number of seconds in time interval + hr <- (doy - floor(doy)) * 24 # hour of day for each element of doy + + ## calculate potential radiation + cosz <- cos_solar_zenith_angle(doy, lat, lon, dt, hr) + rpot <- 1366 * cosz + return(rpot) +} From 289616b4ff97834a3d9fa925b753ba16e0eaafc2 Mon Sep 17 00:00:00 2001 From: Quinn Thomas Date: Tue, 31 Oct 2023 12:43:01 -0400 Subject: [PATCH 02/20] updating stage1, stage2, and stage3 to use osn --- R/noaa_gefs.R | 134 +++++++++++++++++++++++++++++++------------------- 1 file changed, 83 insertions(+), 51 deletions(-) diff --git a/R/noaa_gefs.R b/R/noaa_gefs.R index ad1b4c5..0438189 100644 --- a/R/noaa_gefs.R +++ b/R/noaa_gefs.R @@ -60,13 +60,34 @@ noaa_stage1 <- function(cycle = 0, version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE, - start_date = "") { - noaa_gefs_stage(file.path("stage1",cycle, start_date), - partitioning = "start_date", - version = version, - endpoint = endpoint, - verbose = verbose, - start_date = start_date) + start_date = "", + site_id = NA) { + + vars <- arrow_env_vars() + + if(is.na(site_id)){ + bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date) + }else{ + bucket <-paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date,"/site_id=",site_id) + } + + endpoint_override <- "https://sdsc.osn.xsede.org" + s3 <- arrow::s3_bucket(paste0(bucket), + endpoint_override = endpoint_override, + anonymous = TRUE) + + site_df <- arrow::open_dataset(s3) |> + dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> + dplyr::collect() |> + mutate(reference_datetime = start_date) + + if(!is.na(site_id)){ + site_df <- site_df |> dplyr::mutate(site_id = site_id) + } + + unset_arrow_vars(vars) + + return(site_df) } #' NOAA GEFS forecasts with EFI stage 2 processing @@ -79,18 +100,41 @@ noaa_stage1 <- function(cycle = 0, #' @export noaa_stage2 <- function(cycle = 0, version = "v12", - endpoint = "data.ecoforecast.org", + endpoint = NA, verbose = TRUE, - start_date = "") { - noaa_gefs_stage(file.path("stage2/parquet",cycle, start_date), - partitioning = "start_date", - version = version, - endpoint = endpoint, - verbose = verbose, - start_date = start_date) + start_date = "", + site_id = NA) { + + vars <- arrow_env_vars() + + if(is.na(site_id)){ + bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date) + }else{ + bucket <-paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date,"/site_id=",site_id) + + } -} + endpoint_override <- "https://sdsc.osn.xsede.org" + s3 <- arrow::s3_bucket(paste0(bucket), + endpoint_override = endpoint_override, + anonymous = TRUE) + site_df <- arrow::open_dataset(s3) |> + dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> + dplyr::collect() |> + mutate(reference_datetime = start_date) + + if(!is.na(site_id)){ + site_df <- site_df |> dplyr::mutate(site_id = site_id) + } + + hourly_df <- to_hourly(site_df, use_solar_geom = TRUE, psuedo = TRUE) + + unset_arrow_vars(vars) + + return(hourly_df) + +} #' NOAA GEFS forecasts with EFI stage 3 processing #' @@ -107,46 +151,34 @@ noaa_stage2 <- function(cycle = 0, #' @export noaa_stage3 <- function(version = "v12", endpoint = "data.ecoforecast.org", - verbose = TRUE) { - noaa_gefs_stage("stage3/parquet", - partitioning = "site_id", - version = version, - endpoint = endpoint, - verbose = verbose, - start_date = NA) -} + verbose = TRUE, + site_id = NA) { + +vars <- arrow_env_vars() + + if(is.na(site_id)){ + bucket <- "bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3" + }else{ + bucket <-paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3/site_id=",site_id) -noaa_gefs_stage <- function(stage = "stage1", - partitioning = c("cycle","start_date"), - cycle = 0, - version = "v12", - endpoint = "data.ecoforecast.org", - verbose = getOption("verbose", TRUE), - start_date = start_date) { - if(verbose) - message(paste("establishing connection to", stage, "at", endpoint, "...")) - s3 <- noaa_gefs(version, endpoint) - if (!is.na(as.Date(start_date))) { - ds <- arrow::open_dataset(s3$path(stage)) |> dplyr::filter(parameter <= 31) - } else { - ds <- arrow::open_dataset(s3$path(stage), partitioning = partitioning) |> dplyr::filter(parameter <= 31) } - if(verbose) - message(paste0("connected! Use dplyr functions to filter and summarise.\n", - "Then, use collect() to read result into R\n")) - ds -} -noaa_gefs <- function(version = "v12", - endpoint = "data.ecoforecast.org") { + endpoint_override <- "https://sdsc.osn.xsede.org" + s3 <- arrow::s3_bucket(bucket, + endpoint_override = endpoint_override, + anonymous = TRUE) - vars <- arrow_env_vars() - gefs <- arrow::s3_bucket(paste0("neon4cast-drivers/noaa/gefs-", version), - endpoint_override = endpoint, - anonymous = TRUE) - on.exit(unset_arrow_vars(vars)) - gefs + site_df <- arrow::open_dataset(s3) |> + dplyr::collect() + if(!is.na(site_id)){ + site_df <- site_df |> dplyr::mutate(site_id = site_id) + } + + unset_arrow_vars(vars) + + return(site_df) + } arrow_env_vars <- function(){ From 903ffc17e5e56b75a401c5627eb4ea570731dabd Mon Sep 17 00:00:00 2001 From: rqthomas Date: Wed, 1 Nov 2023 10:52:05 -0400 Subject: [PATCH 03/20] adding registration check to submit function --- R/submit.R | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/R/submit.R b/R/submit.R index f1a9eaa..83b0e68 100644 --- a/R/submit.R +++ b/R/submit.R @@ -11,7 +11,7 @@ submit <- function(forecast_file, metadata = NULL, ask = interactive(), - s3_region = "data", + s3_region = "submit", s3_endpoint = "ecoforecast.org" ){ if(file.exists("~/.aws")){ @@ -20,6 +20,7 @@ submit <- function(forecast_file, } message("validating that file matches required standard") go <- forecast_output_validator(forecast_file) + if(!go){ warning(paste0("forecasts was not in a valid format and was not submitted\n", @@ -28,6 +29,52 @@ submit <- function(forecast_file, return(NULL) } + googlesheets4::gs4_deauth() + message("Checking if model_id is registered") + registered_model_id <- suppressMessages(googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing", range = "Sheet1!A:A")) + + df <- read4cast::read_forecast(forecast_file) + model_id <- df$model_id[1] + + if(grepl("(example)", model_id)){ + message(paste0("You are submitting a forecast with 'example' in the model_id. As an example forecast, it will be processed but not used in future analyses.\n", + "No registration is required to submit an example forecast.\n", + "If you want your forecast to be retained, please select a different model_id that does not contain `example` and register you model id at https://forms.gle/kg2Vkpho9BoMXSy57\n")) + } + if(!(model_id %in% registered_model_id$model_id) & !grepl("(example)",model_id)){ + + message("Checking if model_id is already used in submissions") + + submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE) + + if(model_id %in% submitted_model_ids$model_id){ + + stop(paste0("Your model_id (",model_id,") has not been registered yet but is already used in other submissions. Please use and register another model_id\n", + " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", + "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) + + }else{ + + stop(paste0("Your model_id (",model_id,") has not been registered\n", + " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", + "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) + + } + } + + if(!grepl("(example)",model_id)){ + if(first_submission & model_id %in% registered_model_id$model_id){ + submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE) + if(model_id %in% submitted_model_ids$model_id){ + stop(paste0("Your model_id (",model_id,") is already used in other submitted forecasts. There are two causes for this error: \n + - If you have previously submitted a forecast, set the argument `first_submission = FALSE` to remove this error\n + - If you have not previously submitted a forecast, this error message means that the model_id has already been registered and used for submissions. Please register and use another model_id at [https://forms.gle/kg2Vkpho9BoMXSy57](https://forms.gle/kg2Vkpho9BoMXSy57)")) + } + } + }else{ + message("Since `example` is in your model_id, you are submitting an example forecast that will be processed but only retained for 30-days") + } + if(go & ask){ go <- utils::askYesNo("Forecast file is valid, ready to submit?") } @@ -35,14 +82,14 @@ submit <- function(forecast_file, #GENERALIZATION: Here are specific AWS INFO exists <- aws.s3::put_object(file = forecast_file, object = basename(forecast_file), - bucket = "neon4cast-submissions", + bucket = "submissions", region= s3_region, base_url = s3_endpoint) if(exists){ - message("Successfully submitted forecast to server") + message("Thank you for submitting!") }else{ - warning("Forecasts was not sucessfully submitted to server") + warning("Forecasts was not sucessfully submitted to server. Try again, then contact the Challenge organizers.") } From 9914adda276a1e6abf691fa643ec66def72f4890 Mon Sep 17 00:00:00 2001 From: rqthomas Date: Fri, 3 Nov 2023 16:28:49 -0400 Subject: [PATCH 04/20] updating --- DESCRIPTION | 5 +- NAMESPACE | 17 ----- R/combine_scores.R | 70 -------------------- R/download_forecast.R | 52 --------------- R/download_scores.R | 43 ------------- R/download_target.R | 31 --------- R/re_exports.R | 28 -------- R/s3_helpers.R | 54 +++++----------- R/submit.R | 134 ++++++++++++--------------------------- man/combined_scores.Rd | 19 ------ man/download_forecast.Rd | 33 ---------- man/download_scores.Rd | 23 ------- man/get_target.Rd | 18 ------ man/noaa_stage1.Rd | 3 +- man/noaa_stage2.Rd | 5 +- man/noaa_stage3.Rd | 7 +- man/reexports.Rd | 23 ------- man/submit.Rd | 2 +- man/theme_statistics.Rd | 17 ----- 19 files changed, 68 insertions(+), 516 deletions(-) delete mode 100644 R/combine_scores.R delete mode 100644 R/download_forecast.R delete mode 100644 R/download_scores.R delete mode 100644 R/download_target.R delete mode 100644 R/re_exports.R delete mode 100644 man/combined_scores.Rd delete mode 100644 man/download_forecast.Rd delete mode 100644 man/download_scores.Rd delete mode 100644 man/get_target.Rd delete mode 100644 man/reexports.Rd delete mode 100644 man/theme_statistics.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 238ea81..7c3dfe8 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,14 +18,12 @@ Imports: aws.s3, usethis, ncdf4, - distributional, dplyr, tidyr, arrow, rlang, tibble, - read4cast (>= 0.0.0.9000), - score4cast (>= 0.0.0.9000) + read4cast (>= 0.0.0.9000) RoxygenNote: 7.2.3 Language: en-US Suggests: @@ -51,5 +49,4 @@ Additional_repositories: https://cboettig.github.io/drat Config/testthat/edition: 3 Remotes: github::eco4cast/read4cast, - github::eco4cast/score4cast, github::cboettig/aws.s3 diff --git a/NAMESPACE b/NAMESPACE index 5c47bfe..68b77f3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,30 +1,13 @@ # Generated by roxygen2: do not edit by hand export(check_submission) -export(combined_scores) -export(crps_logs_score) -export(download_forecast) -export(download_scores) export(efi_format) export(efi_format_ensemble) export(forecast_output_validator) -export(get_target) -export(include_horizon) export(noaa_stage1) export(noaa_stage2) export(noaa_stage3) -export(pivot_forecast) -export(pivot_target) -export(read_forecast) -export(score) export(submit) -export(theme_statistics) importFrom(dplyr,`%>%`) -importFrom(read4cast,read_forecast) importFrom(rlang,.data) importFrom(rlang,`:=`) -importFrom(score4cast,crps_logs_score) -importFrom(score4cast,include_horizon) -importFrom(score4cast,pivot_forecast) -importFrom(score4cast,pivot_target) -importFrom(score4cast,score) diff --git a/R/combine_scores.R b/R/combine_scores.R deleted file mode 100644 index 1e2effb..0000000 --- a/R/combine_scores.R +++ /dev/null @@ -1,70 +0,0 @@ -#' Combining scores from a theme together -#' -#' @param theme theme name -#' @param collect TRUE/FALSE to download results -#' @return a data.frame of scores -#' @export -combined_scores <- function(theme, collect = TRUE){ - - vars <- arrow_env_vars() - - #GENERALIZATION: THIS IS A SPECIFIC ENDPOINT - s3 <- arrow::s3_bucket(bucket = paste0("neon4cast-scores/parquet/", theme), - endpoint_override = "data.ecoforecast.org", - anonymous = TRUE) - ds <- arrow::open_dataset(s3, schema=score4cast::score_schema()) - if (collect) { - ds <- dplyr::collect(ds) - } - on.exit(unset_arrow_vars(vars)) - ds -} - - -#' Calculating forecast challenge submission statistics -#' -#' @param themes theme names -#' @return a data.frame of challenge statistics -#' @export - -theme_statistics <- function(themes){ - -theme_stats <- purrr::map_dfr(themes, function(theme){ - - message(theme) - - theme_scores <- combined_scores(theme = theme, collect = FALSE) - - teams <- theme_scores |> - dplyr::summarise(n = dplyr::n_distinct(model_id)) |> - dplyr::collect() |> - dplyr::pull(n) - - forecasts <- theme_scores |> - dplyr::select(model_id,reference_datetime, variable) |> - dplyr::distinct() |> - dplyr::summarise(n = dplyr::n(), .groups = "drop") |> - dplyr::summarise(total = sum(n)) |> - dplyr::collect() |> - dplyr::pull(total) - - forecast_obs <- theme_scores |> - dplyr::filter(!is.na(crps)) |> - dplyr::summarise(n = n(), .groups = "drop") |> - dplyr::summarise(total = sum(n)) |> - dplyr::collect() |> - dplyr::pull(total) - - output <- tibble::tibble(theme = theme, - n_teams = teams, - n_submissions = forecasts, - n_obs_forecasts_pairs = forecast_obs) - - return(output) - }) - -return(theme_stats) - -} -globalVariables(c("theme", "n", "crps", "total"), "neon4cast") - diff --git a/R/download_forecast.R b/R/download_forecast.R deleted file mode 100644 index 50ea283..0000000 --- a/R/download_forecast.R +++ /dev/null @@ -1,52 +0,0 @@ -# DEPRECATE, mark for removal - - - -## grab data -#project = "phenology" -#dir = "~/efi_neon_challenge/forecasts/" - -#' Download forecasts for NEON sites from the EFI server -#' -#' @param theme string of the theme -#' @param date start date for the forecast -#' @param dir storage location. Use tempdir unless you want to keep this -#' data around on your computer, in which case, `neonstore::neon_dir()` might -#' be a convenient choice. -#' @param s3_region data -#' @param s3_endpoint ecoforecast.org -#' @export -#' @examples -#' download_forecast("phenology") -download_forecast <- function(theme, - date = Sys.Date()-2, - dir = tempdir(), - s3_region = "data", - s3_endpoint = "ecoforecast.org"){ - lapply(theme, download_forecast_, date, dir) - invisible(dir) -} -download_forecast_ <- function(theme, - date = Sys.Date()-2, - dir = tempdir(), - s3_region = "data", - s3_endpoint = "ecoforecast.org"){ - - dir.create(dir, FALSE, TRUE) - parent_theme <- unlist(stringr::str_split(theme, "_"))[1] - prefix <- paste(parent_theme, paste0(theme,"-", date), sep="/") - #GENERALIZATION: Specific AWS info - object <- aws.s3::get_bucket("neon4cast-forecasts", - prefix = prefix, - region = s3_region, - base_url = s3_endpoint) - - #GENERALIZATION: Specific AWS info - for(i in seq_along(object)){ - aws.s3::save_object(object[[i]], - bucket = "neon4cast-forecasts", - file = file.path(dir, object[[i]]$Key), - region = s3_region, - base_url = s3_endpoint) - } -} diff --git a/R/download_scores.R b/R/download_scores.R deleted file mode 100644 index 18610a2..0000000 --- a/R/download_scores.R +++ /dev/null @@ -1,43 +0,0 @@ -# DEPRECATE, mark for removal - - -## grab data -#project = "phenology" -#dir = "~/efi_neon_challenge/forecasts/" - -#' Download score for NEON sites from the EFI server -#' -#' @param theme string of the theme -#' @param date start date for the forecast -#' @param dir storage location. Use tempdir unless you want to keep this -#' data around on your computer, in which case, `neonstore::neon_dir()` might -#' be a convenient choice. -#' @export -#' @examples -#' download_scores("phenology") -download_scores <- function(theme, - date = Sys.Date()-2, - dir = tempdir()){ - lapply(theme, download_forecast_, date, dir) - invisible(dir) -} -download_scores_ <- function(theme, - date = Sys.Date()-2, - dir = tempdir()){ - - dir.create(dir, FALSE, TRUE) - parent_theme <- unlist(stringr::str_split(theme, "_"))[1] - prefix <- paste(parent_theme, paste0("scores-",theme,"-", date), sep="/") - object <- aws.s3::get_bucket("neon4cast-scores", - prefix = prefix, - region = "data", - base_url = "ecoforecast.org") - - for(i in seq_along(object)){ - aws.s3::save_object(object[[i]], - bucket = "neon4cast-scores", - file = file.path(dir, object[[i]]$Key), - region = "data", - base_url = "ecoforecast.org") - } -} diff --git a/R/download_target.R b/R/download_target.R deleted file mode 100644 index 047e7a2..0000000 --- a/R/download_target.R +++ /dev/null @@ -1,31 +0,0 @@ -# DEPRECATE, mark for removal - -#GENERALIZATION: SPECIFC THEMES, TARGET FILES, AND TARGET FILE URL - - - -download_target <- function(theme = c("aquatics", "beetles", - "phenology", "terrestrial_30min", - "terrestrial_daily","ticks"), - download_url = NA){ - - if(is.na(download_url)){ - theme <- match.arg(theme) - - target_file <- switch(theme, - aquatics = "aquatics-targets.csv.gz", - beetles = "beetles-targets.csv.gz", - phenology = "phenology-targets.csv.gz", - terrestrial_daily = "terrestrial_daily-targets.csv.gz", - terrestrial_30min = "terrestrial_30min-targets.csv.gz", - ticks = "ticks-targets.csv.gz" - ) - download_url <- paste0("https://data.ecoforecast.org/neon4cast-targets/", - theme, "/", target_file) - } - - readr::read_csv(download_url, show_col_types = FALSE, - lazy = FALSE, progress = FALSE) - -} - diff --git a/R/re_exports.R b/R/re_exports.R deleted file mode 100644 index 02861ba..0000000 --- a/R/re_exports.R +++ /dev/null @@ -1,28 +0,0 @@ -#' @importFrom read4cast read_forecast -#' @export -read4cast::read_forecast - -#' @importFrom score4cast score -#' @export -score4cast::score - -#' @importFrom score4cast crps_logs_score -#' @export -score4cast::crps_logs_score - -#' @importFrom score4cast include_horizon -#' @export -score4cast::include_horizon - -#' @importFrom score4cast pivot_forecast -#' @export -score4cast::pivot_forecast - -#' @importFrom score4cast pivot_target -#' @export -score4cast::pivot_target - -# @importFrom dplyr `%>%` - -#globalVariables("forecast_start_time", "horizon", "observed", -# "predicted","statistic", "target_url", "theme") \ No newline at end of file diff --git a/R/s3_helpers.R b/R/s3_helpers.R index 297e3f1..6f0c56f 100644 --- a/R/s3_helpers.R +++ b/R/s3_helpers.R @@ -1,42 +1,20 @@ -# DEPRECATE and mark for removal +get_target <- function(variable, duration, project_id = "neon4cast", lazy = FALSE){ -#' Download target data from s3 -#' -#' @param dir full path to working directory -#' @param theme forecast theme -#' @param s3_region s3 region -#' @export -#' -get_target <- function(dir, theme, s3_region = Sys.getenv("AWS_DEFAULT_REGION")){ - download_s3_objects(dir, - bucket = "neon4cast-targets", - prefix = theme) -} + s3_targets <- arrow::s3_bucket("bio230014-bucket01/challenges/targets", endpoint_override = "sdsc.osn.xsede.org") + + target <- arrow::open_csv_dataset(s3_targets, + schema = arrow::schema( + project_id = arrow::string(), + site_id = arrow::string(), + datetime = arrow::timestamp(unit = "ns", timezone = "UTC"), + duration = arrow::string(), + variable = arrow::string(), + observation = arrow::float()), + skip = 1) |> + dplyr::filter(variable %in% variable, + duration == duration, + project_id == project_id) -#### + if(!lazy) target <- target |> dplyr::collect() -download_s3_objects <- function(dir, bucket, prefix, s3_region = Sys.getenv("AWS_DEFAULT_REGION")){ - - files <- aws.s3::get_bucket(bucket = bucket, - prefix = prefix, - region = s3_region, - use_https = as.logical(Sys.getenv("USE_HTTPS")), - max = Inf) - keys <- vapply(files, `[[`, "", "Key", USE.NAMES = FALSE) - empty <- grepl("/$", keys) - keys <- keys[!empty] - files_present <- TRUE - if(length(keys) > 0){ - for(i in 1:length(keys)){ - aws.s3::save_object(object = keys[i], - bucket = bucket, - file = file.path(dir, bucket, keys[i]), - region = s3_region, - use_https = as.logical(Sys.getenv("USE_HTTPS"))) - } - }else{ - message("Requested files are not available on the s3 bucket") - files_present <- FALSE - } - invisible(files_present) } diff --git a/R/submit.R b/R/submit.R index 83b0e68..dfd114e 100644 --- a/R/submit.R +++ b/R/submit.R @@ -1,18 +1,18 @@ ## Technically this could become arrow-based #' submit forecast to EFI -#' +#' #' @inheritParams forecast_output_validator #' @param metadata path to metadata file #' @param ask should we prompt for a go before submission? #' @param s3_region subdomain (leave as is for EFI challenge) #' @param s3_endpoint root domain (leave as is for EFI challenge) #' @export -submit <- function(forecast_file, - metadata = NULL, - ask = interactive(), +submit <- function(forecast_file, + metadata = NULL, + ask = interactive(), s3_region = "submit", - s3_endpoint = "ecoforecast.org" + s3_endpoint = "ecoforecast.org" ){ if(file.exists("~/.aws")){ warning(paste("Detected existing AWS credentials file in ~/.aws,", @@ -20,142 +20,86 @@ submit <- function(forecast_file, } message("validating that file matches required standard") go <- forecast_output_validator(forecast_file) - + if(!go){ - + warning(paste0("forecasts was not in a valid format and was not submitted\n", "First, try read reinstalling neon4cast (remotes::install_github('eco4cast\\neon4cast'), restarting R, and trying again\n", "Second, see https://projects.ecoforecast.org/neon4cast-docs/Submission-Instructions.html for more information on the file format")) return(NULL) } - + googlesheets4::gs4_deauth() message("Checking if model_id is registered") - registered_model_id <- suppressMessages(googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing", range = "Sheet1!A:A")) - + registered_model_id <- suppressMessages(googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing", range = "Sheet1!A:V")) + + registered_project_id <- registered_model_id$`What forecasting challenge are you registering for?` + registered_model_id <- registered_model_id$model_id + + registered_model_project_id <- paste(registered_project_id, registered_model_id, sep = "-") + df <- read4cast::read_forecast(forecast_file) model_id <- df$model_id[1] - + model_project_id <- paste("neon4cast", registered_model_id, sep = "-") + if(grepl("(example)", model_id)){ message(paste0("You are submitting a forecast with 'example' in the model_id. As an example forecast, it will be processed but not used in future analyses.\n", "No registration is required to submit an example forecast.\n", "If you want your forecast to be retained, please select a different model_id that does not contain `example` and register you model id at https://forms.gle/kg2Vkpho9BoMXSy57\n")) } - if(!(model_id %in% registered_model_id$model_id) & !grepl("(example)",model_id)){ - - message("Checking if model_id is already used in submissions") - + + if(!(model_project_id %in% registered_model_project_id) & !grepl("(example)",model_id)){ + + message("Checking if model_id for neon4cast is already used in submissions") + submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE) - - if(model_id %in% submitted_model_ids$model_id){ - + submitted_project_model_id <- paste(submitted_model_ids$project_id, submitted_model_ids$model_id, sep = "-") + + + if(model_project_id %in% submitted_project_model_id){ + stop(paste0("Your model_id (",model_id,") has not been registered yet but is already used in other submissions. Please use and register another model_id\n", " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) - + }else{ - + stop(paste0("Your model_id (",model_id,") has not been registered\n", " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) - + } } - + if(!grepl("(example)",model_id)){ - if(first_submission & model_id %in% registered_model_id$model_id){ + if(first_submission & model_project_id %in% registered_model_project_id){ submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE) - if(model_id %in% submitted_model_ids$model_id){ + submitted_project_model_id <- paste(submitted_model_ids$project_id, submitted_model_ids$model_id, sep = "-") + + if(model_project_id %in% submitted_project_model_id){ stop(paste0("Your model_id (",model_id,") is already used in other submitted forecasts. There are two causes for this error: \n - If you have previously submitted a forecast, set the argument `first_submission = FALSE` to remove this error\n - If you have not previously submitted a forecast, this error message means that the model_id has already been registered and used for submissions. Please register and use another model_id at [https://forms.gle/kg2Vkpho9BoMXSy57](https://forms.gle/kg2Vkpho9BoMXSy57)")) } } }else{ - message("Since `example` is in your model_id, you are submitting an example forecast that will be processed but only retained for 30-days") + message("Since `example` is in your model_id, you are submitting an example forecast that will be processed but not used in future analyses.") } - + if(go & ask){ go <- utils::askYesNo("Forecast file is valid, ready to submit?") } #GENERALIZATION: Here are specific AWS INFO - exists <- aws.s3::put_object(file = forecast_file, + exists <- aws.s3::put_object(file = forecast_file, object = basename(forecast_file), bucket = "submissions", region= s3_region, base_url = s3_endpoint) - + if(exists){ message("Thank you for submitting!") }else{ warning("Forecasts was not sucessfully submitted to server. Try again, then contact the Challenge organizers.") } - - - - - if(!is.null(metadata)){ - if(tools::file_ext(metadata) == "xml"){ - EFIstandards::forecast_validator(metadata) - aws.s3::put_object(file = metadata, - object = basename(metadata), - bucket = "neon4cast-submissions", - region= s3_region, - base_url = s3_endpoint) - }else{ - warning(paste("Metadata file is not an .xml file", - "Did you incorrectly submit the model description yml file instead of an xml file")) - } - } -} - -#' Check that submission was successfully processed -#' -#' @param forecast_file Your forecast csv or nc file -#' @param s3_region subdomain (leave as is for EFI challenge) -#' @param s3_endpoint root domain (leave as is for EFI challenge) -#' @export -check_submission <- function(forecast_file, - s3_region = "data", - s3_endpoint = "ecoforecast.org"){ - - theme <- stringr::str_split_fixed(basename(forecast_file), "-", n = 2) - - #All forecats are converted into a common file format when they are processed. This generates that name. - #if (grepl("[.]nc$", forecast_file)) { - # base_name <- paste0(tools::file_path_sans_ext(basename(forecast_file)), - # ".csv.gz") - #}else if (grepl("[.]csv$", forecast_file)) { - # base_name <- paste0(tools::file_path_sans_ext(basename(forecast_file)), - # ".csv.gz") - #}else if (grepl("[.]csv\\.gz$", forecast_file)) { - # base_name <- basename(forecast_file) - #}else { - # message("File is not a .nc, .cvs, or .csv.gz file") - # base_name <- forecast_file - #} - - base_name <- forecast_file - - exists <- suppressMessages(aws.s3::object_exists(object = file.path("raw", theme[,1], base_name), - bucket = "neon4cast-forecasts", - region= s3_region, - base_url = s3_endpoint)) - - - if(exists){ - message("Submission was successfully processed") - }else{ - not_in_standard <- suppressMessages(aws.s3::object_exists(object = file.path("not_in_standard", basename(forecast_file)), - bucket = "neon4cast-forecasts", - region= s3_region, - base_url = s3_endpoint)) - if(not_in_standard){ - message("Submission is not in required format. Try running neon4cast::forecast_output_validator on your file to see what the issue may be") - }else{ - message("Your forecast is still in queue to be processed by the server. Please check again in a few hours") - } - } - invisible(exists) } diff --git a/man/combined_scores.Rd b/man/combined_scores.Rd deleted file mode 100644 index ee6a3d1..0000000 --- a/man/combined_scores.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/combine_scores.R -\name{combined_scores} -\alias{combined_scores} -\title{Combining scores from a theme together} -\usage{ -combined_scores(theme, collect = TRUE) -} -\arguments{ -\item{theme}{theme name} - -\item{collect}{TRUE/FALSE to download results} -} -\value{ -a data.frame of scores -} -\description{ -Combining scores from a theme together -} diff --git a/man/download_forecast.Rd b/man/download_forecast.Rd deleted file mode 100644 index 1c3eb21..0000000 --- a/man/download_forecast.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/download_forecast.R -\name{download_forecast} -\alias{download_forecast} -\title{Download forecasts for NEON sites from the EFI server} -\usage{ -download_forecast( - theme, - date = Sys.Date() - 2, - dir = tempdir(), - s3_region = "data", - s3_endpoint = "ecoforecast.org" -) -} -\arguments{ -\item{theme}{string of the theme} - -\item{date}{start date for the forecast} - -\item{dir}{storage location. Use tempdir unless you want to keep this -data around on your computer, in which case, \code{neonstore::neon_dir()} might -be a convenient choice.} - -\item{s3_region}{data} - -\item{s3_endpoint}{ecoforecast.org} -} -\description{ -Download forecasts for NEON sites from the EFI server -} -\examples{ -download_forecast("phenology") -} diff --git a/man/download_scores.Rd b/man/download_scores.Rd deleted file mode 100644 index b250899..0000000 --- a/man/download_scores.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/download_scores.R -\name{download_scores} -\alias{download_scores} -\title{Download score for NEON sites from the EFI server} -\usage{ -download_scores(theme, date = Sys.Date() - 2, dir = tempdir()) -} -\arguments{ -\item{theme}{string of the theme} - -\item{date}{start date for the forecast} - -\item{dir}{storage location. Use tempdir unless you want to keep this -data around on your computer, in which case, \code{neonstore::neon_dir()} might -be a convenient choice.} -} -\description{ -Download score for NEON sites from the EFI server -} -\examples{ -download_scores("phenology") -} diff --git a/man/get_target.Rd b/man/get_target.Rd deleted file mode 100644 index f7d5fa2..0000000 --- a/man/get_target.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/s3_helpers.R -\name{get_target} -\alias{get_target} -\title{Download target data from s3} -\usage{ -get_target(dir, theme, s3_region = Sys.getenv("AWS_DEFAULT_REGION")) -} -\arguments{ -\item{dir}{full path to working directory} - -\item{theme}{forecast theme} - -\item{s3_region}{s3 region} -} -\description{ -Download target data from s3 -} diff --git a/man/noaa_stage1.Rd b/man/noaa_stage1.Rd index 42f3ddb..bc1c7e5 100644 --- a/man/noaa_stage1.Rd +++ b/man/noaa_stage1.Rd @@ -9,7 +9,8 @@ noaa_stage1( version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE, - start_date = "" + start_date = "", + site_id = NA ) } \arguments{ diff --git a/man/noaa_stage2.Rd b/man/noaa_stage2.Rd index 1938d5e..91ddba3 100644 --- a/man/noaa_stage2.Rd +++ b/man/noaa_stage2.Rd @@ -13,9 +13,10 @@ Stage2 processing involves the following transforms of the data: noaa_stage2( cycle = 0, version = "v12", - endpoint = "data.ecoforecast.org", + endpoint = NA, verbose = TRUE, - start_date = "" + start_date = "", + site_id = NA ) } \arguments{ diff --git a/man/noaa_stage3.Rd b/man/noaa_stage3.Rd index 41ad9bf..8836024 100644 --- a/man/noaa_stage3.Rd +++ b/man/noaa_stage3.Rd @@ -4,7 +4,12 @@ \alias{noaa_stage3} \title{NOAA GEFS forecasts with EFI stage 3 processing} \usage{ -noaa_stage3(version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE) +noaa_stage3( + version = "v12", + endpoint = "data.ecoforecast.org", + verbose = TRUE, + site_id = NA +) } \arguments{ \item{version}{GEFS forecast version. Prior versions correspond to forecasts diff --git a/man/reexports.Rd b/man/reexports.Rd deleted file mode 100644 index 1107f89..0000000 --- a/man/reexports.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/re_exports.R -\docType{import} -\name{reexports} -\alias{reexports} -\alias{read_forecast} -\alias{score} -\alias{crps_logs_score} -\alias{include_horizon} -\alias{pivot_forecast} -\alias{pivot_target} -\title{Objects exported from other packages} -\keyword{internal} -\description{ -These objects are imported from other packages. Follow the links -below to see their documentation. - -\describe{ - \item{read4cast}{\code{\link[read4cast]{read_forecast}}} - - \item{score4cast}{\code{\link[score4cast]{crps_logs_score}}, \code{\link[score4cast]{include_horizon}}, \code{\link[score4cast]{pivot_forecast}}, \code{\link[score4cast]{pivot_target}}, \code{\link[score4cast]{score}}} -}} - diff --git a/man/submit.Rd b/man/submit.Rd index ce56063..297938f 100644 --- a/man/submit.Rd +++ b/man/submit.Rd @@ -8,7 +8,7 @@ submit( forecast_file, metadata = NULL, ask = interactive(), - s3_region = "data", + s3_region = "submit", s3_endpoint = "ecoforecast.org" ) } diff --git a/man/theme_statistics.Rd b/man/theme_statistics.Rd deleted file mode 100644 index 920edbb..0000000 --- a/man/theme_statistics.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/combine_scores.R -\name{theme_statistics} -\alias{theme_statistics} -\title{Calculating forecast challenge submission statistics} -\usage{ -theme_statistics(themes) -} -\arguments{ -\item{themes}{theme names} -} -\value{ -a data.frame of challenge statistics -} -\description{ -Calculating forecast challenge submission statistics -} From 7930b91707089c78ea2fb01cb6ca891056753baf Mon Sep 17 00:00:00 2001 From: Quinn Thomas Date: Sun, 5 Nov 2023 19:38:35 -0500 Subject: [PATCH 05/20] Update submit.R --- R/submit.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/submit.R b/R/submit.R index dfd114e..bfaed49 100644 --- a/R/submit.R +++ b/R/submit.R @@ -40,7 +40,7 @@ submit <- function(forecast_file, df <- read4cast::read_forecast(forecast_file) model_id <- df$model_id[1] - model_project_id <- paste("neon4cast", registered_model_id, sep = "-") + model_project_id <- paste("neon4cast", model_id, sep = "-") if(grepl("(example)", model_id)){ message(paste0("You are submitting a forecast with 'example' in the model_id. As an example forecast, it will be processed but not used in future analyses.\n", From d6aafdaca59ef7da1b508a7f789a0b3a8e8ab08a Mon Sep 17 00:00:00 2001 From: Quinn Thomas Date: Sun, 5 Nov 2023 19:45:19 -0500 Subject: [PATCH 06/20] Update NAMESPACE --- NAMESPACE | 1 - 1 file changed, 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 68b77f3..9ee553e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(check_submission) export(efi_format) export(efi_format_ensemble) export(forecast_output_validator) From 346e7a1a266084996a2b64a3d2ba0cef2d396790 Mon Sep 17 00:00:00 2001 From: Quinn Thomas Date: Tue, 7 Nov 2023 09:29:45 -0500 Subject: [PATCH 07/20] Update noaa_gefs.R --- R/noaa_gefs.R | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/R/noaa_gefs.R b/R/noaa_gefs.R index 0438189..25c8a1c 100644 --- a/R/noaa_gefs.R +++ b/R/noaa_gefs.R @@ -79,7 +79,7 @@ noaa_stage1 <- function(cycle = 0, site_df <- arrow::open_dataset(s3) |> dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> dplyr::collect() |> - mutate(reference_datetime = start_date) + dplyr::mutate(reference_datetime = start_date) if(!is.na(site_id)){ site_df <- site_df |> dplyr::mutate(site_id = site_id) @@ -107,28 +107,27 @@ noaa_stage2 <- function(cycle = 0, vars <- arrow_env_vars() - if(is.na(site_id)){ - bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date) - }else{ - bucket <-paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date,"/site_id=",site_id) + bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date) - } - endpoint_override <- "https://sdsc.osn.xsede.org" s3 <- arrow::s3_bucket(paste0(bucket), endpoint_override = endpoint_override, anonymous = TRUE) site_df <- arrow::open_dataset(s3) |> - dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> - dplyr::collect() |> - mutate(reference_datetime = start_date) + dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) if(!is.na(site_id)){ - site_df <- site_df |> dplyr::mutate(site_id = site_id) + site_id_list <- site_id + site_df <- site_df |> + filter(site_id %in% site_id_list) } - hourly_df <- to_hourly(site_df, use_solar_geom = TRUE, psuedo = TRUE) + site_df <- site_df |> + dplyr::collect() |> + dplyr::mutate(reference_datetime = start_date) + + hourly_df <- to_hourly(site_df, use_solar_geom = TRUE, psuedo = FALSE) unset_arrow_vars(vars) From 21297a3b7341d9768c95fc0bafbb48b44821cc78 Mon Sep 17 00:00:00 2001 From: Quinn Thomas Date: Tue, 7 Nov 2023 09:31:52 -0500 Subject: [PATCH 08/20] Update noaa_gefs.R --- R/noaa_gefs.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/noaa_gefs.R b/R/noaa_gefs.R index 25c8a1c..98177a4 100644 --- a/R/noaa_gefs.R +++ b/R/noaa_gefs.R @@ -65,11 +65,8 @@ noaa_stage1 <- function(cycle = 0, vars <- arrow_env_vars() - if(is.na(site_id)){ - bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date) - }else{ - bucket <-paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date,"/site_id=",site_id) - } + bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date) + endpoint_override <- "https://sdsc.osn.xsede.org" s3 <- arrow::s3_bucket(paste0(bucket), @@ -77,14 +74,17 @@ noaa_stage1 <- function(cycle = 0, anonymous = TRUE) site_df <- arrow::open_dataset(s3) |> - dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |> - dplyr::collect() |> - dplyr::mutate(reference_datetime = start_date) + dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) if(!is.na(site_id)){ - site_df <- site_df |> dplyr::mutate(site_id = site_id) + site_id_list <- site_id + site_df <- site_df |> + filter(site_id %in% site_id_list) } + site_df <- site_df |> + dplyr::collect() |> + dplyr::mutate(reference_datetime = start_date) unset_arrow_vars(vars) return(site_df) From 02a82311241a73c074923d27070a2a0b52a1de06 Mon Sep 17 00:00:00 2001 From: Quinn Thomas Date: Tue, 7 Nov 2023 09:33:21 -0500 Subject: [PATCH 09/20] Update to_hourly.R --- R/to_hourly.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/to_hourly.R b/R/to_hourly.R index 4dd2227..0b53887 100644 --- a/R/to_hourly.R +++ b/R/to_hourly.R @@ -41,7 +41,8 @@ to_hourly <- function(df, dplyr::arrange(site_id, family, ensemble, datetime) |> dplyr::mutate(prediction = imputeTS::na_interpolation(prediction, option = "linear")) |> dplyr::mutate(prediction = ifelse(variable == "TMP", prediction + 273, prediction)) |> - dplyr::mutate(prediction = ifelse(variable == "RH", prediction/100, prediction)) + dplyr::mutate(prediction = ifelse(variable == "RH", prediction/100, prediction)) |> + dplyr::ungroup() fluxes <- df |> dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> @@ -53,7 +54,8 @@ to_hourly <- function(df, dplyr::arrange(site_id, family, ensemble, datetime) |> tidyr::fill(prediction, .direction = "up") |> dplyr::mutate(prediction = ifelse(variable == "APCP", prediction / (6 * 60 * 60), prediction), - variable = ifelse(variable == "APCP", "PRATE", variable)) + variable = ifelse(variable == "APCP", "PRATE", variable)) |> + dplyr::ungroup() if(use_solar_geom){ From b67fb3dee478b39ab225274b48e03cd6a260b693 Mon Sep 17 00:00:00 2001 From: rqthomas Date: Tue, 7 Nov 2023 10:59:57 -0500 Subject: [PATCH 10/20] fixing noaa stage2 issues --- R/noaa_gefs.R | 12 +++++++++--- R/to_hourly.R | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/R/noaa_gefs.R b/R/noaa_gefs.R index 98177a4..209201e 100644 --- a/R/noaa_gefs.R +++ b/R/noaa_gefs.R @@ -79,7 +79,7 @@ noaa_stage1 <- function(cycle = 0, if(!is.na(site_id)){ site_id_list <- site_id site_df <- site_df |> - filter(site_id %in% site_id_list) + dplyr::filter(site_id %in% site_id_list) } site_df <- site_df |> @@ -120,7 +120,7 @@ noaa_stage2 <- function(cycle = 0, if(!is.na(site_id)){ site_id_list <- site_id site_df <- site_df |> - filter(site_id %in% site_id_list) + dplyr::filter(site_id %in% site_id_list) } site_df <- site_df |> @@ -128,7 +128,13 @@ noaa_stage2 <- function(cycle = 0, dplyr::mutate(reference_datetime = start_date) hourly_df <- to_hourly(site_df, use_solar_geom = TRUE, psuedo = FALSE) - + + hourly_df <- hourly_df |> + dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> + dplyr::rename(parameter = ensemble) + + + unset_arrow_vars(vars) return(hourly_df) diff --git a/R/to_hourly.R b/R/to_hourly.R index 0b53887..7b9ca37 100644 --- a/R/to_hourly.R +++ b/R/to_hourly.R @@ -3,7 +3,7 @@ to_hourly <- function(df, psuedo = FALSE){ if(!psuedo){ - reference_datetime <- lubridate::as_datetime(df$reference_datetime) + reference_datetime <- lubridate::as_datetime(df$reference_datetime)[1] }else{ reference_datetime <- NA } From cf834ffc36f9f944693004e533cc9068c3c018be Mon Sep 17 00:00:00 2001 From: rqthomas Date: Tue, 7 Nov 2023 13:41:09 -0500 Subject: [PATCH 11/20] getting stage1,stage2,and stage3 to match existing output --- R/noaa_gefs.R | 60 +++++++++------------------------------------------ 1 file changed, 10 insertions(+), 50 deletions(-) diff --git a/R/noaa_gefs.R b/R/noaa_gefs.R index 209201e..842347b 100644 --- a/R/noaa_gefs.R +++ b/R/noaa_gefs.R @@ -60,31 +60,19 @@ noaa_stage1 <- function(cycle = 0, version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE, - start_date = "", - site_id = NA) { + start_date = "") { vars <- arrow_env_vars() bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date) - endpoint_override <- "https://sdsc.osn.xsede.org" s3 <- arrow::s3_bucket(paste0(bucket), endpoint_override = endpoint_override, anonymous = TRUE) - site_df <- arrow::open_dataset(s3) |> - dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) - - if(!is.na(site_id)){ - site_id_list <- site_id - site_df <- site_df |> - dplyr::filter(site_id %in% site_id_list) - } - - site_df <- site_df |> - dplyr::collect() |> - dplyr::mutate(reference_datetime = start_date) + site_df <- arrow::open_dataset(s3) + unset_arrow_vars(vars) return(site_df) @@ -102,8 +90,7 @@ noaa_stage2 <- function(cycle = 0, version = "v12", endpoint = NA, verbose = TRUE, - start_date = "", - site_id = NA) { + start_date = "") { vars <- arrow_env_vars() @@ -114,27 +101,8 @@ noaa_stage2 <- function(cycle = 0, endpoint_override = endpoint_override, anonymous = TRUE) - site_df <- arrow::open_dataset(s3) |> - dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) - - if(!is.na(site_id)){ - site_id_list <- site_id - site_df <- site_df |> - dplyr::filter(site_id %in% site_id_list) - } - - site_df <- site_df |> - dplyr::collect() |> - dplyr::mutate(reference_datetime = start_date) - - hourly_df <- to_hourly(site_df, use_solar_geom = TRUE, psuedo = FALSE) - - hourly_df <- hourly_df |> - dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> - dplyr::rename(parameter = ensemble) + site_df <- arrow::open_dataset(s3) - - unset_arrow_vars(vars) return(hourly_df) @@ -156,17 +124,12 @@ noaa_stage2 <- function(cycle = 0, #' @export noaa_stage3 <- function(version = "v12", endpoint = "data.ecoforecast.org", - verbose = TRUE, - site_id = NA) { + verbose = TRUE) { vars <- arrow_env_vars() - if(is.na(site_id)){ - bucket <- "bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3" - }else{ - bucket <-paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3/site_id=",site_id) + bucket <- "bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3" - } endpoint_override <- "https://sdsc.osn.xsede.org" s3 <- arrow::s3_bucket(bucket, @@ -174,12 +137,9 @@ vars <- arrow_env_vars() anonymous = TRUE) site_df <- arrow::open_dataset(s3) |> - dplyr::collect() - - if(!is.na(site_id)){ - site_df <- site_df |> dplyr::mutate(site_id = site_id) - } - + dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |> + dplyr::rename(parameter = ensemble) + unset_arrow_vars(vars) return(site_df) From 35d699447c4428754ba87182cccdffb6fd0431b5 Mon Sep 17 00:00:00 2001 From: rqthomas Date: Tue, 7 Nov 2023 13:47:13 -0500 Subject: [PATCH 12/20] toggle to enforce registration --- R/submit.R | 117 +++++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 57 deletions(-) diff --git a/R/submit.R b/R/submit.R index bfaed49..faed250 100644 --- a/R/submit.R +++ b/R/submit.R @@ -20,83 +20,86 @@ submit <- function(forecast_file, } message("validating that file matches required standard") go <- forecast_output_validator(forecast_file) - + if(!go){ - + warning(paste0("forecasts was not in a valid format and was not submitted\n", "First, try read reinstalling neon4cast (remotes::install_github('eco4cast\\neon4cast'), restarting R, and trying again\n", "Second, see https://projects.ecoforecast.org/neon4cast-docs/Submission-Instructions.html for more information on the file format")) return(NULL) } - - googlesheets4::gs4_deauth() - message("Checking if model_id is registered") - registered_model_id <- suppressMessages(googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing", range = "Sheet1!A:V")) - - registered_project_id <- registered_model_id$`What forecasting challenge are you registering for?` - registered_model_id <- registered_model_id$model_id - - registered_model_project_id <- paste(registered_project_id, registered_model_id, sep = "-") - - df <- read4cast::read_forecast(forecast_file) - model_id <- df$model_id[1] - model_project_id <- paste("neon4cast", model_id, sep = "-") - - if(grepl("(example)", model_id)){ - message(paste0("You are submitting a forecast with 'example' in the model_id. As an example forecast, it will be processed but not used in future analyses.\n", - "No registration is required to submit an example forecast.\n", - "If you want your forecast to be retained, please select a different model_id that does not contain `example` and register you model id at https://forms.gle/kg2Vkpho9BoMXSy57\n")) - } - - if(!(model_project_id %in% registered_model_project_id) & !grepl("(example)",model_id)){ - - message("Checking if model_id for neon4cast is already used in submissions") - - submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE) - submitted_project_model_id <- paste(submitted_model_ids$project_id, submitted_model_ids$model_id, sep = "-") - - - if(model_project_id %in% submitted_project_model_id){ - - stop(paste0("Your model_id (",model_id,") has not been registered yet but is already used in other submissions. Please use and register another model_id\n", - " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", - "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) - - }else{ - - stop(paste0("Your model_id (",model_id,") has not been registered\n", - " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", - "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) - + + check_model_id <- FALSE + if(check_model_id){ + googlesheets4::gs4_deauth() + message("Checking if model_id is registered") + registered_model_id <- suppressMessages(googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing", range = "Sheet1!A:V")) + + registered_project_id <- registered_model_id$`What forecasting challenge are you registering for?` + registered_model_id <- registered_model_id$model_id + + registered_model_project_id <- paste(registered_project_id, registered_model_id, sep = "-") + + df <- read4cast::read_forecast(forecast_file) + model_id <- df$model_id[1] + model_project_id <- paste("neon4cast", model_id, sep = "-") + + if(grepl("(example)", model_id)){ + message(paste0("You are submitting a forecast with 'example' in the model_id. As an example forecast, it will be processed but not used in future analyses.\n", + "No registration is required to submit an example forecast.\n", + "If you want your forecast to be retained, please select a different model_id that does not contain `example` and register you model id at https://forms.gle/kg2Vkpho9BoMXSy57\n")) } - } - - if(!grepl("(example)",model_id)){ - if(first_submission & model_project_id %in% registered_model_project_id){ + + if(!(model_project_id %in% registered_model_project_id) & !grepl("(example)",model_id)){ + + message("Checking if model_id for neon4cast is already used in submissions") + submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE) submitted_project_model_id <- paste(submitted_model_ids$project_id, submitted_model_ids$model_id, sep = "-") - + + if(model_project_id %in% submitted_project_model_id){ - stop(paste0("Your model_id (",model_id,") is already used in other submitted forecasts. There are two causes for this error: \n + + stop(paste0("Your model_id (",model_id,") has not been registered yet but is already used in other submissions. Please use and register another model_id\n", + " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", + "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) + + }else{ + + stop(paste0("Your model_id (",model_id,") has not been registered\n", + " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n", + "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses.")) + + } + } + + if(!grepl("(example)",model_id)){ + if(first_submission & model_project_id %in% registered_model_project_id){ + submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE) + submitted_project_model_id <- paste(submitted_model_ids$project_id, submitted_model_ids$model_id, sep = "-") + + if(model_project_id %in% submitted_project_model_id){ + stop(paste0("Your model_id (",model_id,") is already used in other submitted forecasts. There are two causes for this error: \n - If you have previously submitted a forecast, set the argument `first_submission = FALSE` to remove this error\n - If you have not previously submitted a forecast, this error message means that the model_id has already been registered and used for submissions. Please register and use another model_id at [https://forms.gle/kg2Vkpho9BoMXSy57](https://forms.gle/kg2Vkpho9BoMXSy57)")) + } } + }else{ + message("Since `example` is in your model_id, you are submitting an example forecast that will be processed but not used in future analyses.") } - }else{ - message("Since `example` is in your model_id, you are submitting an example forecast that will be processed but not used in future analyses.") } - + if(go & ask){ go <- utils::askYesNo("Forecast file is valid, ready to submit?") } - + #GENERALIZATION: Here are specific AWS INFO exists <- aws.s3::put_object(file = forecast_file, - object = basename(forecast_file), - bucket = "submissions", - region= s3_region, - base_url = s3_endpoint) - + object = basename(forecast_file), + bucket = "submissions", + region= s3_region, + base_url = s3_endpoint) + if(exists){ message("Thank you for submitting!") }else{ From 430d38f0c234c8f1bbaf762cdfdaee481a24f814 Mon Sep 17 00:00:00 2001 From: rqthomas Date: Tue, 7 Nov 2023 13:50:24 -0500 Subject: [PATCH 13/20] site_df --- R/noaa_gefs.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/noaa_gefs.R b/R/noaa_gefs.R index 842347b..329e932 100644 --- a/R/noaa_gefs.R +++ b/R/noaa_gefs.R @@ -105,7 +105,7 @@ noaa_stage2 <- function(cycle = 0, unset_arrow_vars(vars) - return(hourly_df) + return(site_df) } From a9247c3fb69b45bc2e6c27942996ea58502508f2 Mon Sep 17 00:00:00 2001 From: rqthomas Date: Tue, 7 Nov 2023 14:23:27 -0500 Subject: [PATCH 14/20] updates --- R/noaa_gefs.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/noaa_gefs.R b/R/noaa_gefs.R index 329e932..7d442b5 100644 --- a/R/noaa_gefs.R +++ b/R/noaa_gefs.R @@ -94,7 +94,7 @@ noaa_stage2 <- function(cycle = 0, vars <- arrow_env_vars() - bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",start_date) + bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2/reference_datetime=",start_date) endpoint_override <- "https://sdsc.osn.xsede.org" s3 <- arrow::s3_bucket(paste0(bucket), From 2214c0b88b7ea5edf2484bcfdcb9afc023c586a0 Mon Sep 17 00:00:00 2001 From: Quinn Thomas Date: Tue, 7 Nov 2023 15:58:20 -0500 Subject: [PATCH 15/20] Update to_hourly.R --- R/to_hourly.R | 168 +++++++++++++++++++++++++------------------------- 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/R/to_hourly.R b/R/to_hourly.R index 7b9ca37..c7a8161 100644 --- a/R/to_hourly.R +++ b/R/to_hourly.R @@ -8,91 +8,91 @@ to_hourly <- function(df, reference_datetime <- NA } - var_order <- names(df) - - ensemble_maxtime <- df |> - dplyr::group_by(site_id, family, ensemble) |> - dplyr::summarise(max_time = max(datetime), .groups = "drop") - - ensembles <- unique(df$ensemble) - datetime <- seq(min(df$datetime), max(df$datetime), by = "1 hour") - variables <- unique(df$variable) - sites <- unique(df$site_id) - - full_time <- expand.grid(sites, ensembles, datetime, variables) |> - dplyr::rename(site_id = Var1, - ensemble = Var2, - datetime = Var3, - variable = Var4) |> - dplyr::mutate(datetime = lubridate::as_datetime(datetime)) |> - dplyr::arrange(site_id, ensemble, variable, datetime) |> - dplyr::left_join(ensemble_maxtime, by = c("site_id","ensemble")) |> - dplyr::filter(datetime <= max_time) |> - dplyr::select(-c("max_time")) |> - dplyr::distinct() - - states <- df |> - dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> - dplyr::filter(horizon != "006") |> - dplyr::select(-horizon) |> - dplyr::group_by(site_id, family, ensemble, variable) |> - dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |> - dplyr::filter(variable %in% c("PRES", "RH", "TMP", "UGRD", "VGRD")) |> - dplyr::arrange(site_id, family, ensemble, datetime) |> - dplyr::mutate(prediction = imputeTS::na_interpolation(prediction, option = "linear")) |> - dplyr::mutate(prediction = ifelse(variable == "TMP", prediction + 273, prediction)) |> - dplyr::mutate(prediction = ifelse(variable == "RH", prediction/100, prediction)) |> - dplyr::ungroup() - - fluxes <- df |> - dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> - dplyr::filter(horizon != "003") |> - dplyr::select(-horizon) |> - dplyr::group_by(site_id, family, ensemble, variable) |> - dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |> - dplyr::filter(variable %in% c("APCP","DSWRF","DLWRF")) |> - dplyr::arrange(site_id, family, ensemble, datetime) |> - tidyr::fill(prediction, .direction = "up") |> - dplyr::mutate(prediction = ifelse(variable == "APCP", prediction / (6 * 60 * 60), prediction), - variable = ifelse(variable == "APCP", "PRATE", variable)) |> - dplyr::ungroup() - - if(use_solar_geom){ - - site_list <- readr::read_csv(paste0("https://github.com/eco4cast/", - "neon4cast-noaa-download/", - "raw/master/noaa_download_site_list.csv"), - show_col_types = FALSE) |> - dplyr::select(-site_name) - - fluxes <- fluxes |> - dplyr::left_join(site_list, by = "site_id") |> - dplyr::mutate(hour = lubridate::hour(datetime), - date = lubridate::as_date(datetime), - doy = lubridate::yday(datetime) + hour/24, - longitude = ifelse(longitude < 0, 360 + longitude, longitude), - rpot = downscale_solar_geom(doy, longitude, latitude)) |> # hourly sw flux calculated using solar geometry - dplyr::group_by(site_id, family, ensemble, date, variable) |> - dplyr::mutate(avg.rpot = mean(rpot, na.rm = TRUE), - avg.SW = mean(prediction, na.rm = TRUE))|> # daily sw mean from solar geometry - dplyr::ungroup() |> - dplyr::mutate(prediction = ifelse(variable == "DSWRF" & avg.rpot > 0.0, rpot * (avg.SW/avg.rpot),prediction)) |> - dplyr::select(any_of(var_order)) - } - - hourly_df <- dplyr::bind_rows(states, fluxes) |> - dplyr::arrange(site_id, family, ensemble, datetime) |> - dplyr::mutate(variable = ifelse(variable == "TMP", "air_temperature", variable), - variable = ifelse(variable == "PRES", "air_pressure", variable), - variable = ifelse(variable == "RH", "relative_humidity", variable), - variable = ifelse(variable == "DLWRF", "surface_downwelling_longwave_flux_in_air", variable), - variable = ifelse(variable == "DSWRF", "surface_downwelling_shortwave_flux_in_air", variable), - variable = ifelse(variable == "PRATE", "precipitation_flux", variable), - variable = ifelse(variable == "VGRD", "eastward_wind", variable), - variable = ifelse(variable == "UGRD", "northward_wind", variable), - variable = ifelse(variable == "APCP", "precipitation_amount", variable), - reference_datetime = reference_datetime) |> +var_order <- names(df) + +ensemble_maxtime <- df |> + dplyr::group_by(site_id, family, ensemble) |> + dplyr::summarise(max_time = max(datetime), .groups = "drop") + +ensembles <- unique(df$ensemble) +datetime <- seq(min(df$datetime), max(df$datetime), by = "1 hour") +variables <- unique(df$variable) +sites <- unique(df$site_id) + +full_time <- expand.grid(sites, ensembles, datetime, variables) |> + dplyr::rename(site_id = Var1, + ensemble = Var2, + datetime = Var3, + variable = Var4) |> + dplyr::mutate(datetime = lubridate::as_datetime(datetime)) |> + dplyr::arrange(site_id, ensemble, variable, datetime) |> + dplyr::left_join(ensemble_maxtime, by = c("site_id","ensemble")) |> + dplyr::filter(datetime <= max_time) |> + dplyr::select(-c("max_time")) |> + dplyr::distinct() + +states <- df |> + dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> + dplyr::filter((horizon != "006" & datetime < max(df$datetime)) | (horizon == "006" & datetime == max(df$datetime))) |> + dplyr::select(-horizon) |> + dplyr::group_by(site_id, family, ensemble, variable) |> + dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |> + dplyr::filter(variable %in% c("PRES", "RH", "TMP", "UGRD", "VGRD")) |> + dplyr::arrange(site_id, family, ensemble, datetime) |> + dplyr::mutate(prediction = imputeTS::na_interpolation(prediction, option = "linear")) |> + dplyr::mutate(prediction = ifelse(variable == "TMP", prediction + 273, prediction)) |> + dplyr::mutate(prediction = ifelse(variable == "RH", prediction/100, prediction)) |> + dplyr::ungroup() + +fluxes <- df |> + dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> + dplyr::filter(horizon != "003") |> + dplyr::select(-horizon) |> + dplyr::group_by(site_id, family, ensemble, variable) |> + dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |> + dplyr::filter(variable %in% c("APCP","DSWRF","DLWRF")) |> + dplyr::arrange(site_id, family, ensemble, datetime) |> + tidyr::fill(prediction, .direction = "up") |> + dplyr::mutate(prediction = ifelse(variable == "APCP", prediction / (6 * 60 * 60), prediction), + variable = ifelse(variable == "APCP", "PRATE", variable)) |> + dplyr::ungroup() + +if(use_solar_geom){ + + site_list <- readr::read_csv(paste0("https://github.com/eco4cast/", + "neon4cast-noaa-download/", + "raw/master/noaa_download_site_list.csv"), + show_col_types = FALSE) |> + dplyr::select(-site_name) + + fluxes <- fluxes |> + dplyr::left_join(site_list, by = "site_id") |> + dplyr::mutate(hour = lubridate::hour(datetime), + date = lubridate::as_date(datetime), + doy = lubridate::yday(datetime) + hour/24, + longitude = ifelse(longitude < 0, 360 + longitude, longitude), + rpot = downscale_solar_geom(doy, longitude, latitude)) |> # hourly sw flux calculated using solar geometry + dplyr::group_by(site_id, family, ensemble, date, variable) |> + dplyr::mutate(avg.rpot = mean(rpot, na.rm = TRUE), + avg.SW = mean(prediction, na.rm = TRUE))|> # daily sw mean from solar geometry + dplyr::ungroup() |> + dplyr::mutate(prediction = ifelse(variable == "DSWRF" & avg.rpot > 0.0, rpot * (avg.SW/avg.rpot),prediction)) |> dplyr::select(any_of(var_order)) +} + +hourly_df <- dplyr::bind_rows(states, fluxes) |> + dplyr::arrange(site_id, family, ensemble, datetime) |> + dplyr::mutate(variable = ifelse(variable == "TMP", "air_temperature", variable), + variable = ifelse(variable == "PRES", "air_pressure", variable), + variable = ifelse(variable == "RH", "relative_humidity", variable), + variable = ifelse(variable == "DLWRF", "surface_downwelling_longwave_flux_in_air", variable), + variable = ifelse(variable == "DSWRF", "surface_downwelling_shortwave_flux_in_air", variable), + variable = ifelse(variable == "PRATE", "precipitation_flux", variable), + variable = ifelse(variable == "VGRD", "eastward_wind", variable), + variable = ifelse(variable == "UGRD", "northward_wind", variable), + variable = ifelse(variable == "APCP", "precipitation_amount", variable), + reference_datetime = reference_datetime) |> + dplyr::select(any_of(var_order)) return(hourly_df) From d913750e3b011fa7e9eeb08bec7c715fd749d46d Mon Sep 17 00:00:00 2001 From: rqthomas Date: Wed, 8 Nov 2023 15:09:27 -0500 Subject: [PATCH 16/20] adding in reference_datetime to stage2 --- R/noaa_gefs.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/noaa_gefs.R b/R/noaa_gefs.R index 7d442b5..b5c2b42 100644 --- a/R/noaa_gefs.R +++ b/R/noaa_gefs.R @@ -101,7 +101,8 @@ noaa_stage2 <- function(cycle = 0, endpoint_override = endpoint_override, anonymous = TRUE) - site_df <- arrow::open_dataset(s3) + site_df <- arrow::open_dataset(s3) |> + dplyr::mutate(reference_datetime = lubridate::as_datetime(start_date)) unset_arrow_vars(vars) From 3ca6400408bbbd27c682d4c282a4dad142602a36 Mon Sep 17 00:00:00 2001 From: rqthomas Date: Wed, 8 Nov 2023 19:28:00 -0500 Subject: [PATCH 17/20] fixing to_hourly --- R/to_hourly.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/to_hourly.R b/R/to_hourly.R index c7a8161..f5f0b5c 100644 --- a/R/to_hourly.R +++ b/R/to_hourly.R @@ -33,7 +33,7 @@ full_time <- expand.grid(sites, ensembles, datetime, variables) |> states <- df |> dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |> - dplyr::filter((horizon != "006" & datetime < max(df$datetime)) | (horizon == "006" & datetime == max(df$datetime))) |> + dplyr::filter(!psuedo | (psuedo & horizon != "006") | (psuedo & datetime == max(df$datetime))) |> dplyr::select(-horizon) |> dplyr::group_by(site_id, family, ensemble, variable) |> dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |> From 2582f355eceac27d6e020c1ed7535ad17d7b07f2 Mon Sep 17 00:00:00 2001 From: rqthomas Date: Fri, 1 Dec 2023 13:43:36 -0500 Subject: [PATCH 18/20] updating documentation and validator script --- R/forecast_output_validator.R | 252 +++++-------------------------- man/check_submission.Rd | 22 --- man/forecast_output_validator.Rd | 26 +--- man/noaa_stage1.Rd | 3 +- man/noaa_stage2.Rd | 3 +- man/noaa_stage3.Rd | 7 +- man/submit.Rd | 2 +- 7 files changed, 49 insertions(+), 266 deletions(-) delete mode 100644 man/check_submission.Rd diff --git a/R/forecast_output_validator.R b/R/forecast_output_validator.R index 81ff118..76fa67b 100644 --- a/R/forecast_output_validator.R +++ b/R/forecast_output_validator.R @@ -1,74 +1,31 @@ -#' forecast_output_validator +#' Validate forecast file #' -#' @param forecast_file Your forecast csv or nc file -#' @param target_variables Possible target variables -#' @param theme_names valid EFI theme names +#' @param forecast_file forecast csv or csv.gz file #' @export -#' -#' @examples -#' -#' forecast_file <- system.file("extdata/aquatics-2021-02-01-EFInull.csv.gz", -#' package = "neon4cast") -#' forecast_output_validator(forecast_file) -#' -forecast_output_validator <- function(forecast_file, - target_variables = c("oxygen", - "temperature", - "richness", - "abundance", - "nee", - "le", - "vswc", - "gcc_90", - "rcc_90", - "ixodes_scapularis", - "amblyomma_americanum", - "prediction", - "observed"), - #GENERALIZATION: Specific themes - theme_names = c("aquatics", "beetles", - "phenology", "terrestrial_30min", - "terrestrial_daily","ticks")){ + +forecast_output_validator <- function(forecast_file){ + + file_in <- forecast_file valid <- TRUE message(file_in) - #usethis::ui_todo("Checking validity of file name...") - file_basename <- basename(file_in) - parsed_basename <- unlist(stringr::str_split(file_basename, "-")) - file_name_parsable <- TRUE - - if(!(parsed_basename[1] %in% theme_names)){ - usethis::ui_warn(paste0("first position of file name (before first -) is not one of the following : ", - paste(theme_names, collapse = " "))) - valid <- FALSE - file_name_parsable <- FALSE - } - - date_string <- lubridate::as_date(paste(parsed_basename[2:4], collapse = "-")) - - if(is.na(date_string)){ - usethis::ui_warn("file name does not contain parsable date") - file_name_parsable <- FALSE - valid <- FALSE - } - - if(file_name_parsable){ - usethis::ui_done("file name is correct") - } - - if(any(vapply(c("[.]csv", "[.]csv\\.gz"), grepl, logical(1), file_in))){ + if(any(vapply(c("[.]csv", "[.]csv\\.gz"), grepl, logical(1), file_in))){ # if file is csv zip file out <- readr::read_csv(file_in, guess_max = 1e6, show_col_types = FALSE) + if(lexists(out, c("model_id"))){ + usethis::ui_done("file has model_id column") + }else{ + usethis::ui_warn("file missing model_id column ") + } + + if("variable" %in% names(out) & "prediction" %in% names(out)){ usethis::ui_done("forecasted variables found correct variable + prediction column") - }else if("variable" %in% names(out) & "predicted" %in% names(out)){ - usethis::ui_warn("file as predicted column. change column name to prediction") - valid <- FALSE }else{ usethis::ui_warn("missing the variable and prediction columns") valid <- FALSE @@ -81,199 +38,73 @@ forecast_output_validator <- function(forecast_file, valid <- FALSE }else if(lexists(out, "family")){ - if("normal" %in% unique(out$family)){ - usethis::ui_done("file has normal distribution in family column") - }else if("ensemble" %in% unique(out$family)){ - usethis::ui_done("file has ensemble distribution in family column") - }else{ - usethis::ui_warn("only normal or ensemble distributions in family columns are currently supported") - valid <- FALSE - } - if(lexists(out, "parameter")){ - if("mu" %in% unique(out$parameter) & "sigma" %in% unique(out$parameter)){ - usethis::ui_done("file has parameter and family column with normal distribution") - }else if("ensemble" %in% unique(out$family) | "sample" %in% unique(out$family) ){ - usethis::ui_done("file has parameter and family column with ensemble generated distribution") - }else{ - usethis::ui_warn("file does not have parameter column is not a normal or ensemble distribution") - valid <- FALSE - } + usethis::ui_done("file has correct family and parameter columns") }else{ - usethis::ui_warn("file does not have parameter and family column ") + usethis::ui_warn("file does not have parameter column ") valid <- FALSE } }else{ - usethis::ui_warn("file does not have ensemble or family + parameter column") + usethis::ui_warn("file does not have ensemble or family and/or parameter column") valid <- FALSE } #usethis::ui_todo("Checking that file contains siteID column...") if(lexists(out, c("site_id"))){ usethis::ui_done("file has site_id column") - }else if(lexists(out, c("siteID"))){ - usethis::ui_warn("file siteID column should be named site_id") }else{ usethis::ui_warn("file missing site_id column") } - #usethis::ui_todo("Checking that file contains parsable time column...") if(lexists(out, c("datetime"))){ - usethis::ui_done("file has time column") - if(!stringr::str_detect(out$datetime[1], "-")){ - usethis::ui_done("time column format is not in the correct YYYY-MM-DD format") + usethis::ui_done("file has datetime column") + if(!grepl("-", out$datetime[1])){ + usethis::ui_done("datetime column format is not in the correct YYYY-MM-DD format") valid <- FALSE }else{ if(sum(class(out$datetime) %in% c("Date","POSIXct")) > 0){ - usethis::ui_done("file has correct time column") + usethis::ui_done("file has correct datetime column") }else{ - usethis::ui_done("time column format is not in the correct YYYY-MM-DD format") + usethis::ui_done("datetime column format is not in the correct YYYY-MM-DD format") valid <- FALSE } } - }else if(lexists(out, c("time"))){ - usethis::ui_warn("time dimension should be named datetime. We are converting it during processing but please update your submission format") - valid <- TRUE }else{ - usethis::ui_warn("file missing time column") + usethis::ui_warn("file missing datetime column") valid <- FALSE } + + #if(lexists(out, c("duration"))){ + # usethis::ui_done("file has duration column") + #}else{ + # usethis::ui_warn("file missing duration column (values for the column: daily = P1D, hourly = PT1H)") + # valid <- FALSE + #} + + #if(lexists(out, c("project_id"))){ + # usethis::ui_done("file has project_id column") + #}else{ + # usethis::ui_warn("file missing project_id column (use `vera4cast` as the project_id") + # valid <- FALSE + #} + if(lexists(out, c("reference_datetime"))){ usethis::ui_done("file has reference_datetime column") }else if(lexists(out, c("start_time"))){ usethis::ui_warn("file start_time column should be named reference_datetime. We are converting it during processing but please update your submission format") }else{ usethis::ui_warn("file missing reference_datetime column") - } - - } else if(grepl("[.]nc", file_in)){ #if file is nc - - nc <- ncdf4::nc_open(file_in) - - #usethis::ui_todo("Checking that file contains correct variables...") - - if(lexists(nc$var, target_variables) > 0){ - usethis::ui_done("target variables found") - var_dim <- dim(ncdf4::ncvar_get(nc, varid = names(nc$var[which(names(nc$var) %in% target_variables)][1]))) - }else{ - usethis::ui_warn(paste0("no target variables in found in possible list: ", paste(target_variables, collapse = " "))) valid <- FALSE } - #usethis::ui_todo("Checking that time variable exist and is parseable...") - - if(lexists(nc$dim, c("time", "datetime"))){ - usethis::ui_done("file has time dimension") - if("time" %in% names(nc$dim)){ - usethis::ui_warn("time dimension should be named datetime we are converting it during processing but please update your submission format") - time <- ncdf4::ncvar_get(nc, "time") - time_dim <- length(time) - valid <- TRUE - }else{ - time <- ncdf4::ncvar_get(nc, "datetime") - tustr<-strsplit(ncdf4::ncatt_get(nc, varid = "datetime", "units")$value, " ") - t_string <- strsplit(ncdf4::ncatt_get(nc, varid = "datetime", "units")$value, " ")[[1]][1] - time_dim <- length(time) - time <-lubridate::as_date(time,origin=unlist(tustr)[3]) - if(t_string %in% c("days","seconds")){ - usethis::ui_done("file has correct time dimension") - }else{ - usethis::ui_warn("time dimension is in correct format") - valid <- FALSE - } - } - }else{ - usethis::ui_warn("file missing time dimension") - valid <- FALSE - } - - #usethis::ui_todo("Checking that siteID variable exists...") - #GENERALIZATION: using siteID here - should be site_id - if(lexists(nc$var, c("siteID", "site_id"))){ - usethis::ui_done("file has siteID variable") - }else{ - usethis::ui_warn("file missing siteID variable") - valid <- FALSE - } - - #usethis::ui_todo("Checking that netcdf contains site dimension...") - - if(lexists(nc$dim, c("site")) > 0){ - usethis::ui_done("file has site dimension") - site_dim <- length(ncdf4::ncvar_get(nc, "site")) - - }else{ - usethis::ui_warn("file missing site dimension") - valid <- FALSE - } - - #usethis::ui_todo("Checking that netcdf contains ensemble dimension...") - - if(lexists(nc$dim, "ensemble")){ - usethis::ui_warn("ensemble dimension should be named parameter") - ensemble_dim <- length(ncdf4::ncvar_get(nc, "ensemble")) - valid <- FALSE - }else if(lexists(nc$dim, "parameter")){ - usethis::ui_done("file has parameter dimension") - ensemble_dim <- length(ncdf4::ncvar_get(nc, "parameter")) - }else{ - usethis::ui_warn("file missing parameter dimension") - valid <- FALSE - } - - #usethis::ui_todo("Checking that netcdf dimensions are correct order...") - dim_order <- TRUE - - if(var_dim[1] != time_dim){ - usethis::ui_warn("time is not the first dimension") - valid <- FALSE - dim_order <- FALSE - } - - if(var_dim[2] != site_dim){ - usethis::ui_warn("site is not the second dimension") - valid <- FALSE - dim_order <- FALSE - } - - if(var_dim[3] != ensemble_dim){ - usethis::ui_warn("ensemble is not the third dimension") - valid <- FALSE - dim_order <- FALSE - } - - if(dim_order){ - usethis::ui_done("dimensions are correct order") - } - - ncdf4::nc_close(nc) - - }else if(grepl("[.]xml", file_in)){ #if file is eml - - #usethis::ui_todo("Checking validity of metdata...") - - #out <- EML::read_eml(file_in) - - #valid_metadata <- tryCatch(EFIstandards::forecast_validator(out),error = function(e){ - # message(e) - # return(FALSE) - #}, - #finally = NULL) - - #if(!valid_metadata){ - # usethis::ui_warn("metadata is not correct") - # valid <- FALSE - #}else{ - # usethis::ui_done("metadata is correct") - #} - valid <- TRUE }else{ + usethis::ui_warn("incorrect file extension (csv or csv.gz are accepted)") valid <- FALSE } if(!valid){ - message("Forecast file is not valid. The following link provides information about the format:\nhttps://projects.ecoforecast.org/neon4cast-docs/Submission-Instructions.html") + message("Forecast file is not valid. The following link provides information about the format:\nhttps://projects.ecoforecast.org/neon4cast-ci/instructions.html#forecast-file-format") }else{ message("Forecast format is valid") } @@ -283,5 +114,4 @@ forecast_output_validator <- function(forecast_file, lexists <- function(list,name){ any(!is.na(match(name, names(list)))) -} - +} \ No newline at end of file diff --git a/man/check_submission.Rd b/man/check_submission.Rd deleted file mode 100644 index 614be98..0000000 --- a/man/check_submission.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/submit.R -\name{check_submission} -\alias{check_submission} -\title{Check that submission was successfully processed} -\usage{ -check_submission( - forecast_file, - s3_region = "data", - s3_endpoint = "ecoforecast.org" -) -} -\arguments{ -\item{forecast_file}{Your forecast csv or nc file} - -\item{s3_region}{subdomain (leave as is for EFI challenge)} - -\item{s3_endpoint}{root domain (leave as is for EFI challenge)} -} -\description{ -Check that submission was successfully processed -} diff --git a/man/forecast_output_validator.Rd b/man/forecast_output_validator.Rd index 1e2c55f..e46fbae 100644 --- a/man/forecast_output_validator.Rd +++ b/man/forecast_output_validator.Rd @@ -2,31 +2,13 @@ % Please edit documentation in R/forecast_output_validator.R \name{forecast_output_validator} \alias{forecast_output_validator} -\title{forecast_output_validator} +\title{Validate forecast file} \usage{ -forecast_output_validator( - forecast_file, - target_variables = c("oxygen", "temperature", "richness", "abundance", "nee", "le", - "vswc", "gcc_90", "rcc_90", "ixodes_scapularis", "amblyomma_americanum", - "prediction", "observed"), - theme_names = c("aquatics", "beetles", "phenology", "terrestrial_30min", - "terrestrial_daily", "ticks") -) +forecast_output_validator(forecast_file) } \arguments{ -\item{forecast_file}{Your forecast csv or nc file} - -\item{target_variables}{Possible target variables} - -\item{theme_names}{valid EFI theme names} +\item{forecast_file}{forecast csv or csv.gz file} } \description{ -forecast_output_validator -} -\examples{ - -forecast_file <- system.file("extdata/aquatics-2021-02-01-EFInull.csv.gz", - package = "neon4cast") -forecast_output_validator(forecast_file) - +Validate forecast file } diff --git a/man/noaa_stage1.Rd b/man/noaa_stage1.Rd index bc1c7e5..42f3ddb 100644 --- a/man/noaa_stage1.Rd +++ b/man/noaa_stage1.Rd @@ -9,8 +9,7 @@ noaa_stage1( version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE, - start_date = "", - site_id = NA + start_date = "" ) } \arguments{ diff --git a/man/noaa_stage2.Rd b/man/noaa_stage2.Rd index 91ddba3..d503783 100644 --- a/man/noaa_stage2.Rd +++ b/man/noaa_stage2.Rd @@ -15,8 +15,7 @@ noaa_stage2( version = "v12", endpoint = NA, verbose = TRUE, - start_date = "", - site_id = NA + start_date = "" ) } \arguments{ diff --git a/man/noaa_stage3.Rd b/man/noaa_stage3.Rd index 8836024..41ad9bf 100644 --- a/man/noaa_stage3.Rd +++ b/man/noaa_stage3.Rd @@ -4,12 +4,7 @@ \alias{noaa_stage3} \title{NOAA GEFS forecasts with EFI stage 3 processing} \usage{ -noaa_stage3( - version = "v12", - endpoint = "data.ecoforecast.org", - verbose = TRUE, - site_id = NA -) +noaa_stage3(version = "v12", endpoint = "data.ecoforecast.org", verbose = TRUE) } \arguments{ \item{version}{GEFS forecast version. Prior versions correspond to forecasts diff --git a/man/submit.Rd b/man/submit.Rd index 297938f..608dfe0 100644 --- a/man/submit.Rd +++ b/man/submit.Rd @@ -13,7 +13,7 @@ submit( ) } \arguments{ -\item{forecast_file}{Your forecast csv or nc file} +\item{forecast_file}{forecast csv or csv.gz file} \item{metadata}{path to metadata file} From f5144f79c2479b6cb419dfd410d3a3666881053d Mon Sep 17 00:00:00 2001 From: rqthomas Date: Fri, 1 Dec 2023 13:44:40 -0500 Subject: [PATCH 19/20] checking duration and project_id --- R/forecast_output_validator.R | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/R/forecast_output_validator.R b/R/forecast_output_validator.R index 76fa67b..1a02c09 100644 --- a/R/forecast_output_validator.R +++ b/R/forecast_output_validator.R @@ -76,19 +76,17 @@ forecast_output_validator <- function(forecast_file){ } - #if(lexists(out, c("duration"))){ - # usethis::ui_done("file has duration column") - #}else{ - # usethis::ui_warn("file missing duration column (values for the column: daily = P1D, hourly = PT1H)") - # valid <- FALSE - #} + if(lexists(out, c("duration"))){ + usethis::ui_done("file has duration column") + }else{ + usethis::ui_warn("file missing duration column (values for the column: daily = P1D, hourly = PT1H)") + } - #if(lexists(out, c("project_id"))){ - # usethis::ui_done("file has project_id column") - #}else{ - # usethis::ui_warn("file missing project_id column (use `vera4cast` as the project_id") - # valid <- FALSE - #} + if(lexists(out, c("project_id"))){ + usethis::ui_done("file has project_id column") + }else{ + usethis::ui_warn("file missing project_id column (use `vera4cast` as the project_id") + } if(lexists(out, c("reference_datetime"))){ usethis::ui_done("file has reference_datetime column") From 136d692976fc70ac05216b985d386b8b8b79fcac Mon Sep 17 00:00:00 2001 From: rqthomas Date: Fri, 1 Dec 2023 13:45:12 -0500 Subject: [PATCH 20/20] minor --- R/forecast_output_validator.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/forecast_output_validator.R b/R/forecast_output_validator.R index 1a02c09..b90165c 100644 --- a/R/forecast_output_validator.R +++ b/R/forecast_output_validator.R @@ -31,8 +31,6 @@ forecast_output_validator <- function(forecast_file){ valid <- FALSE } - #usethis::ui_todo("Checking that file contains either ensemble or statistic column...") - if(lexists(out, "ensemble")){ usethis::ui_warn("ensemble dimension should be named parameter") valid <- FALSE