Skip to content

Commit

Permalink
Merge pull request #17 from eco4cast/generate-drivers
Browse files Browse the repository at this point in the history
Generate drivers for USGS sites
  • Loading branch information
jzwart authored Jan 24, 2024
2 parents e037a04 + fe4fafc commit 0f8c43e
Show file tree
Hide file tree
Showing 9 changed files with 137 additions and 140 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/drivers_stage1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ on:
- cron: '0 23 * * *'
workflow_dispatch:


name: gefs_osn

jobs:
Expand All @@ -18,6 +17,8 @@ jobs:
# container: rocker/geospatial
steps:
- uses: actions/checkout@v3
with:
ref: prod

- name: Install
shell: Rscript {0}
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/drivers_stage3.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
#schedule:
# - cron: '0 13 * * *'
schedule:
- cron: '0 13 * * *'
workflow_dispatch:

name: gefs_osn_stage3
Expand All @@ -17,6 +17,8 @@ jobs:
# container: rocker/geospatial
steps:
- uses: actions/checkout@v3
with:
ref: prod

- name: Install
shell: Rscript {0}
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@ Supported by the U.S. National Science Foundation grants (DEB-1926388 and OAC-22
## Disclaimer
Although this software program has been used by the U.S. Geological Survey (USGS), no warranty, expressed or implied, is made by the USGS or the U.S. Government as to the accuracy and functioning of the program and related program material nor shall the fact of distribution constitute any such warranty, and no responsibility is assumed by the USGS in connection therewith.
This software is provided “AS IS.”

## License Disclaimer
The contributions from Jacob Zwart to this repository were made as a government employee and are in the public domain.
34 changes: 24 additions & 10 deletions challenge_configuration.yaml
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
# Challenge details
challenge_long_name: EFI-USGS River Chlorophyll Forecasting Challenge
challenge_url: https://projects.ecoforecast.org/usgsrc4cast-ci
github_repo: eco4cast/usgsrc4cast-ci
project_id: usgsrc4cast

# Endpoints
endpoint: sdsc.osn.xsede.org
noaa_endpoint: s3.flare-forecast.org
submissions_endpoint: submit.ecoforecast.org
scores_bucket: bio230014-bucket01/challenges/scores

# Buckets
archive_bucket: bio230014-bucket01/challenges/archive
driver_bucket: bio230014-bucket01/challenges/drivers/usgsrc4cast/noaa
forecasts_bucket: bio230014-bucket01/challenges/forecasts
summaries_bucket: bio230014-bucket01/challenges/forecasts/summaries
submissions_bucket: submissions
inventory_bucket: bio230014-bucket01/challenges/inventory
model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id
noaa_forecast_bucket: drivers/noaa/gefs-v12-reprocess/
prov_bucket: bio230014-bucket01/challenges/prov
scores_bucket: bio230014-bucket01/challenges/scores
submissions_bucket: submissions
summaries_bucket: bio230014-bucket01/challenges/forecasts/summaries
targets_bucket: bio230014-bucket01/challenges/targets
archive_bucket: bio230014-bucket01/challenges/archive
model_metadata_bucket: bio230014-bucket01/challenges/metadata/model_id
model_metadata_gsheet: https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing

# Misc. files
example_model_id: example
model_metadata_gsheet: https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing
targets_file_name: 'river-chl-targets.csv.gz'

# Forecast sites
site_path: 'catalog/sites'
site_table: USGS_site_metadata.csv
site_thumbnail: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/screencapture-waterdata-usgs-gov-nwis-rt-2018-08-02-13_00_05-01.jpg'
Expand All @@ -25,21 +38,22 @@ target_metadata_gsheet: https://docs.google.com/spreadsheets/d/10YTX9ae_C1rFdLgE
targets_thumbnail: 'https://raw.githubusercontent.com/eco4cast/neon4cast-ci/main/catalog/thumbnail_plots/neon_stream.jpg'
targets_thumbnail_title: 'Test Image'
targets_path: 'catalog/targets/'
# to here
targets_file_name: 'river-chl-targets.csv.gz'
challenge_url: https://projects.ecoforecast.org/usgsrc4cast-ci
github_repo: eco4cast/usgsrc4cast-ci

target_groups:
Aquatics:
targets_file: "https://data.ecoforecast.org/neon4cast-targets/aquatics/aquatics-targets.csv.gz"

noaa_forecast_groups: ['Pseudo','Stage1-stats','Stage1','Stage2','Stage3']
noaa_forecast_group_paths: ['pseudo','stage1-stats','stage1','stage2','stage3']

variable_groups:
Aquatics:
variable: ['chla']
duration: ['P1D']
thumbnail_link: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg'
thumbnail_title: 'USGS Streamgage'

# Forecast catalog configuration
catalog_config:
about_string: 'https://projects.ecoforecast.org/usgsrc4cast-docs/'
about_title: 'EFI-USGS River Chlorophyll Forecasting Challenge Documentation'
Expand Down
37 changes: 27 additions & 10 deletions drivers/download_stage1_psuedo.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@ library(gefs4cast)
gdalcubes::gdalcubes_options(parallel=2*parallel::detectCores())
#gdalcubes::gdalcubes_options(parallel=TRUE)

config <- yaml::read_yaml("challenge_configuration.yaml")
driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/")
driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/")

sites <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/",
"raw/main/USGS_site_metadata.csv"),
"raw/prod/USGS_site_metadata.csv"),
col_select = c("site_id", "latitude", "longitude"))

Sys.setenv("GEFS_VERSION"="v12")
Expand All @@ -15,27 +19,40 @@ dates_pseudo <- seq(as.Date("2020-09-24"), Sys.Date(), by=1)

message("GEFS v12 stage1-stats")
bench::bench_time({ # thelio
s3 <- gefs_s3_dir("stage1-stats")
s3 <- gefs4cast::gefs_s3_dir(product = "stage1-stats",
path = driver_path,
endpoint = config$endpoint,
bucket = driver_bucket)
have_dates <- gsub("reference_datetime=", "", s3$ls())
missing_dates <- dates[!(as.character(dates) %in% have_dates)]
gefs_to_parquet(missing_dates,
ensemble=c("geavg", "gespr"),
path = s3,
sites = sites)
gefs4cast::gefs_to_parquet(dates = missing_dates,
ensemble = c("geavg", "gespr"),
path = s3,
sites = sites)
})

message("GEFS v12 pseudo")
bench::bench_time({ #32xlarge
s3 <- gefs_s3_dir("pseudo")
s3 <- gefs4cast::gefs_s3_dir(product = "pseudo",
path = driver_path,
endpoint = config$endpoint,
bucket = driver_bucket)
have_dates <- gsub("reference_datetime=", "", s3$ls())
missing_dates <- dates_pseudo[!(as.character(dates_pseudo) %in% have_dates)]
gefs4cast:::gefs_pseudo_measures(missing_dates, path = s3, sites = sites)
gefs4cast:::gefs_pseudo_measures(dates = missing_dates,
path = s3,
sites = sites)
})

message("GEFS v12 stage1")
bench::bench_time({ # cirrus ~ 6days for full set
s3 <- gefs_s3_dir("stage1")
s3 <- gefs4cast::gefs_s3_dir(product = "stage1",
path = driver_path,
endpoint = config$endpoint,
bucket = driver_bucket)
have_dates <- gsub("reference_datetime=", "", s3$ls())
missing_dates <- dates[!(as.character(dates) %in% have_dates)]
gefs_to_parquet(missing_dates, path = s3, sites = sites)
gefs4cast::gefs_to_parquet(dates = missing_dates,
path = s3,
sites = sites)
})
59 changes: 31 additions & 28 deletions drivers/generate_stage2.R
Original file line number Diff line number Diff line change
@@ -1,51 +1,54 @@
source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R") # is this branch stable?
## setup
library(gdalcubes)
library(gefs4cast)
source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R")

Sys.setenv("GEFS_VERSION"="v12")

site_list <- readr::read_csv("USGS_site_metadata.csv",
show_col_types = FALSE)

# TODO: decide whether this bucket path should be updated to a usgsrc4cast-drivers path,
# or whether all drivers are being kept in neon4cast-drivers.
s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2",
endpoint_override = "sdsc.osn.xsede.org",
access_key= Sys.getenv("OSN_KEY"),
secret_key= Sys.getenv("OSN_SECRET"))
config <- yaml::read_yaml("challenge_configuration.yaml")
driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/")
driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/")

# s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2",
# endpoint_override = "sdsc.osn.xsede.org",
# access_key= Sys.getenv("OSN_KEY"),
# secret_key= Sys.getenv("OSN_SECRET"))
s3_stage2 <- gefs4cast::gefs_s3_dir(product = "stage2",
path = driver_path,
endpoint = config$endpoint,
bucket = driver_bucket)

df <- arrow::open_dataset(s3_stage2) |>
dplyr::distinct(reference_datetime) |>
dplyr::collect()


#stage1_s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1",
# endpoint_override = "sdsc.osn.xsede.org",
# anonymous = TRUE)


#efi <- duckdbfs::open_dataset("s3://bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1",
# s3_access_key_id="",
# s3_endpoint="sdsc.osn.xsede.org")
#df_stage1 <- arrow::open_dataset(stage1_s3) |>
# dplyr::summarize(max(reference_datetime)) |>
# dplyr::collect()

curr_date <- Sys.Date()
last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7), curr_date - lubridate::days(1), by = "1 day")))
last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7),
curr_date - lubridate::days(1),
by = "1 day")))

missing_dates <- dplyr::anti_join(last_week, df, by = "reference_datetime") |>
missing_dates <- dplyr::anti_join(last_week, df,
by = "reference_datetime") |>
dplyr::pull(reference_datetime)

if(length(missing_dates) > 0){
for(i in 1:length(missing_dates)){

print(missing_dates[i])

bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",missing_dates[i])
# bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",
# missing_dates[i])
bucket <- glue::glue("{config$driver_bucket}/gefs-v12/stage1/reference_datetime={missing_dates[i]}")

endpoint_override <- "https://sdsc.osn.xsede.org"
s3 <- arrow::s3_bucket(paste0(bucket),
endpoint_override = endpoint_override,
anonymous = TRUE)
s3_stage1 <- arrow::s3_bucket(bucket = bucket,
endpoint_override = config$endpoint,
anonymous = TRUE)

site_df <- arrow::open_dataset(s3) |>
site_df <- arrow::open_dataset(s3_stage1) |>
dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |>
dplyr::filter(site_id %in% site_list$site_id) |>
dplyr::collect() |>
Expand All @@ -58,7 +61,7 @@ if(length(missing_dates) > 0){
reference_datetime = lubridate::as_date(reference_datetime)) |>
dplyr::rename(parameter = ensemble)

arrow::write_dataset(hourly_df,
arrow::write_dataset(dataset = hourly_df,
path = s3_stage2,
partitioning = c("reference_datetime", "site_id"))
}
Expand Down
43 changes: 21 additions & 22 deletions drivers/generate_stage3.R
Original file line number Diff line number Diff line change
@@ -1,33 +1,34 @@
## setup
library(minioclient)
source("https://raw.githubusercontent.com/eco4cast/neon4cast/ci_upgrade/R/to_hourly.R")
library(gdalcubes)
library(gefs4cast)
source("https://raw.githubusercontent.com/eco4cast/neon4cast/main/R/to_hourly.R")

config <- yaml::read_yaml("challenge_configuration.yaml")
driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/")
driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/")

Sys.setenv("GEFS_VERSION"="v12")

#install_mc()
mc_alias_set("osn", "sdsc.osn.xsede.org", "", "")
# TODO: update path to usgsrc4cast-drivers?
mc_mirror("osn/bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/pseudo", "pseudo")

mc_mirror(glue::glue("osn/{driver_bucket}/{driver_path}/gefs-v12/pseudo"), "pseudo")

df <- arrow::open_dataset("pseudo") |>
dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF"))


site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/",
"raw/main/USGS_site_metadata.csv"),
"raw/prod/USGS_site_metadata.csv"),
show_col_types = FALSE) |>
dplyr::pull(site_id)

s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12",
endpoint_override = "sdsc.osn.xsede.org",
access_key= Sys.getenv("OSN_KEY"),
secret_key= Sys.getenv("OSN_SECRET"))

s3$CreateDir("stage3")

s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3",
endpoint_override = "sdsc.osn.xsede.org",
access_key= Sys.getenv("OSN_KEY"),
secret_key= Sys.getenv("OSN_SECRET"))

#site_list <- site_list[1:3]
s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3",
path = driver_path,
endpoint = config$endpoint,
bucket = driver_bucket)

future::plan("future::multisession", workers = 8)

Expand All @@ -38,15 +39,13 @@ furrr::future_walk(site_list, function(curr_site_id){
dplyr::filter(site_id == curr_site_id) |>
dplyr::collect()

s3 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage3",
endpoint_override = "sdsc.osn.xsede.org",
access_key= Sys.getenv("OSN_KEY"),
secret_key= Sys.getenv("OSN_SECRET"))
s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3",
path = driver_path,
endpoint = config$endpoint,
bucket = driver_bucket)

print(curr_site_id)
df |>
#dplyr::filter(site_id == curr_site_id) |>
#dplyr::collect() |>
to_hourly(use_solar_geom = TRUE, psuedo = TRUE) |>
dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |>
dplyr::rename(parameter = ensemble) |>
Expand Down
50 changes: 0 additions & 50 deletions drivers/submit_met_forecast.R

This file was deleted.

Loading

0 comments on commit 0f8c43e

Please sign in to comment.