diff --git a/.Rbuildignore b/.Rbuildignore
deleted file mode 100644
index 4ff7437070..0000000000
--- a/.Rbuildignore
+++ /dev/null
@@ -1,5 +0,0 @@
-^renv$
-^renv\.lock$
-^\.github$
-^.*\.Rproj$
-^\.Rproj\.user$
diff --git a/.github/.gitignore b/.github/.gitignore
deleted file mode 100644
index 2d19fc766d..0000000000
--- a/.github/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*.html
diff --git a/.github/workflows/baselines_daily.yaml b/.github/workflows/baselines_daily.yaml
deleted file mode 100644
index a497311206..0000000000
--- a/.github/workflows/baselines_daily.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
-# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
-on:
- schedule:
- - cron: '0 20 * * *'
- workflow_dispatch:
-
-
-name: baseline-daily-forecasts
-
-jobs:
- aquatics:
- runs-on: ubuntu-latest
- container: eco4cast/rocker-neon4cast:latest
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
- AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- steps:
- - uses: actions/checkout@v3
- with:
- ref: prod
-
- - name: Generate forecasts
- shell: Rscript {0}
- run: |
- source("baseline_models/run_aquatics_baselines.R")
diff --git a/.github/workflows/catalog.yaml b/.github/workflows/catalog.yaml
deleted file mode 100644
index 2012bed79d..0000000000
--- a/.github/workflows/catalog.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-on:
- #schedule:
- # - cron: '0 23 * * *'
- workflow_dispatch:
-
-name: catalog
-
-jobs:
- docker:
- runs-on: ubuntu-latest
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- OSN_KEY: ${{ secrets.OSN_KEY }}
- OSN_SECRET: ${{ secrets.OSN_SECRET }}
- #container: rocker/geospatial:latest
- container: eco4cast/rocker-neon4cast:latest
- steps:
- - run: git config --system --add safe.directory '*'
-
- - uses: actions/checkout@v3
- with:
- fetch-depth: 0
- set-safe-directory: '*'
-
- - name: install validator
- run: |
- pip install stac-validator
-
- - name: Render
- shell: Rscript {0}
- run: source("catalog/update_stac.R")
-
- - name: Commit and Push
- run: |
- git config user.name github-actions
- git config user.email github-actions@github.com
- git add catalog/* .
- git commit -a -m "update catalog" || echo "nothing to commit"
- git push https://${GITHUB_PAT}:${GITHUB_PAT}@github.com/${GITHUB_REPOSITORY}
-
diff --git a/.github/workflows/combined.yaml b/.github/workflows/combined.yaml
deleted file mode 100644
index d25c2b887f..0000000000
--- a/.github/workflows/combined.yaml
+++ /dev/null
@@ -1,131 +0,0 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
-# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
-on:
- schedule:
- - cron: '0 5 * * *'
- workflow_dispatch:
-
-name: submissions-score-dashboard-catalog
-
-jobs:
- submissions:
- runs-on: ubuntu-latest
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- AWS_ACCESS_KEY_SUBMISSIONS: ${{ secrets.AWS_ACCESS_KEY_SUBMISSIONS }}
- AWS_SECRET_ACCESS_KEY_SUBMISSIONS: ${{ secrets.AWS_SECRET_ACCESS_KEY_SUBMISSIONS }}
- OSN_KEY: ${{ secrets.OSN_KEY }}
- OSN_SECRET: ${{ secrets.OSN_SECRET }}
- container: eco4cast/rocker-neon4cast:latest
- steps:
- - uses: actions/checkout@v3
- with:
- ref: catalog-dashboard
-
- - name: Process submissions
- shell: Rscript {0}
- run: |
- source("submission_processing/process_submissions.R")
-
-
- scores:
- needs: submissions
- runs-on: ubuntu-latest
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- OSN_KEY: ${{ secrets.OSN_KEY }}
- OSN_SECRET: ${{ secrets.OSN_SECRET }}
- container: eco4cast/rocker-neon4cast:latest
- steps:
- - uses: actions/checkout@v3
- with:
- ref: catalog-dashboard
-
- - name: Generate scores
- shell: Rscript {0}
- run: |
- source("scoring/scoring.R")
-
- - name: Update inventory
- shell: Rscript {0}
- run: |
- source("scoring/build_score_inventory.R")
-
- catalog:
- needs: scores
- runs-on: ubuntu-latest
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- OSN_KEY: ${{ secrets.OSN_KEY }}
- OSN_SECRET: ${{ secrets.OSN_SECRET }}
- #container: rocker/geospatial:latest
- container: eco4cast/rocker-neon4cast:latest
- steps:
- - run: git config --system --add safe.directory '*'
-
- - uses: actions/checkout@v3
- with:
- ref: catalog-dashboard
- fetch-depth: 0
- set-safe-directory: '*'
-
- - name: install validator
- run: |
- pip install stac-validator
-
- - name: Render
- shell: Rscript {0}
- run: source("catalog/update_stac.R")
-
- - name: Commit and Push
- run: |
- git config user.name github-actions
- git config user.email github-actions@github.com
- git pull
- git add catalog/* .
- git commit -a -m "update catalog" || echo "nothing to commit"
- git push https://${GITHUB_PAT}:${GITHUB_PAT}@github.com/${GITHUB_REPOSITORY}
- dashboard:
- needs: catalog
- runs-on: ubuntu-latest
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- container: eco4cast/rocker-neon4cast:latest
- steps:
- - run: git config --system --add safe.directory '*'
-
- - uses: actions/checkout@v3
- with:
- ref: catalog-dashboard
- fetch-depth: 0
- set-safe-directory: '*'
-
- - uses: quarto-dev/quarto-actions/setup@v2
- with:
- version: 1.4.146
-
- - name: install deps
- shell: Rscript {0}
- run: remotes::install_deps(".", dep=TRUE)
-
- - name: pull scores cache
- shell: Rscript {0}
- run: source("dashboard/cache.R")
-
- - name: Render
- run: |
- quarto render dashboard
-
- - name: Build site map
- shell: Rscript {0}
- run: source("dashboard/build_site_map.R")
-
- - name: Publish
- run: |
- git config user.name github-actions
- git config user.email github-actions@github.com
- git checkout gh-pages
- cp -r dashboard/docs/* .
- git add .
- git commit -a -m "update pages" || echo "nothing to commit"
- git push https://${GITHUB_PAT}:${GITHUB_PAT}@github.com/${GITHUB_REPOSITORY}
diff --git a/.github/workflows/dashboard.yaml b/.github/workflows/dashboard.yaml
deleted file mode 100644
index aa7acc398c..0000000000
--- a/.github/workflows/dashboard.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-on:
- #schedule:
- # - cron: '0 1 * * *'
- #push:
- # branches:
- # - main
- workflow_dispatch:
-
-name: dashboard
-
-jobs:
- docker:
- runs-on: ubuntu-latest
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- container: eco4cast/rocker-neon4cast:latest
- steps:
- - run: git config --system --add safe.directory '*'
-
- - uses: actions/checkout@v3
- with:
- fetch-depth: 0
- set-safe-directory: '*'
-
- - name: install deps
- shell: Rscript {0}
- run: remotes::install_deps(".", dep=TRUE)
-
- - uses: quarto-dev/quarto-actions/setup@v2
- with:
- version: 1.4.146
-
- - name: pull scores cache
- shell: Rscript {0}
- run: source("dashboard/cache.R")
-
- #- name: Cache scores
- # id: cache-scores
- # uses: actions/cache@v3
- # with:
- # path: cache
- # key: ${{ runner.os }}-cache
-
- - name: Render
- run: |
- quarto render dashboard
-
- - name: Build site map
- shell: Rscript {0}
- run: source("dashboard/build_site_map.R")
-
- - name: Publish
- run: |
- git config user.name github-actions
- git config user.email github-actions@github.com
- git checkout gh-pages
- cp -r dashboard/docs/* .
- git add .
- git commit -a -m "update pages" || echo "nothing to commit"
- git push https://${GITHUB_PAT}:${GITHUB_PAT}@github.com/${GITHUB_REPOSITORY}
-
diff --git a/.github/workflows/docker_targets.yaml b/.github/workflows/docker_targets.yaml
deleted file mode 100644
index a76089d02f..0000000000
--- a/.github/workflows/docker_targets.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: build targets docker container
-
-on:
- workflow_dispatch:
- schedule:
- - cron: '0 0 * * *'
- #push:
- # branches:
- # - 'main'
-jobs:
- docker:
- runs-on: ubuntu-latest
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- ref: prod
- - name: Set up QEMU
- uses: docker/setup-qemu-action@v2
- - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v2
- - name: Login to DockerHub
- uses: docker/login-action@v2
- with:
- username: ${{ secrets.DOCKERHUB_USERNAME }}
- password: ${{ secrets.DOCKERHUB_TOKEN }}
- - name: Build and push
- uses: docker/build-push-action@v3
- with:
- push: true
- tags: eco4cast/usgsrc4cast-targets:latest
- build-args: GITHUB_PAT=${{ secrets.PAT }}
- file: targets/Dockerfile
diff --git a/.github/workflows/drivers_stage1.yaml b/.github/workflows/drivers_stage1.yaml
deleted file mode 100644
index 0302ead383..0000000000
--- a/.github/workflows/drivers_stage1.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
-# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
-on:
- schedule:
- - cron: '0 23 * * *'
- workflow_dispatch:
-
-name: gefs_osn
-
-jobs:
- docker:
- timeout-minutes: 2880
- runs-on: [self-hosted]
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- OSN_KEY: ${{ secrets.OSN_KEY }}
- OSN_SECRET: ${{ secrets.OSN_SECRET }}
-# container: rocker/geospatial
- steps:
- - uses: actions/checkout@v3
- with:
- ref: prod
-
- - name: Install
- shell: Rscript {0}
- run: |
- #devtools::install(dependencies=TRUE, upgrade="never")
- install.packages("remotes")
- remotes::install_github("eco4cast/gefs4cast")
-
- - name: Update GEFS
- shell: Rscript {0}
- run: |
- source("drivers/download_stage1_pseudo.R")
-
- - name: Generate stage 2
- shell: Rscript {0}
- run: |
- source("drivers/generate_stage2.R")
-
-
- - name: Update Stage3
- shell: Rscript {0}
- run: |
- source("drivers/update_stage3.R")
-
diff --git a/.github/workflows/drivers_stage3.yaml b/.github/workflows/drivers_stage3.yaml
deleted file mode 100644
index 94c6542176..0000000000
--- a/.github/workflows/drivers_stage3.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
-# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
-on:
- schedule:
- - cron: '0 13 * * *'
- workflow_dispatch:
-
-name: gefs_osn_stage3
-
-jobs:
- docker:
- timeout-minutes: 2880
- runs-on: [self-hosted]
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- OSN_KEY: ${{ secrets.OSN_KEY }}
- OSN_SECRET: ${{ secrets.OSN_SECRET }}
-# container: rocker/geospatial
- steps:
- - uses: actions/checkout@v3
- with:
- ref: prod
-
- - name: Install
- shell: Rscript {0}
- run: |
- #devtools::install(dependencies=TRUE, upgrade="never")
- install.packages("remotes")
- remotes::install_github("eco4cast/gefs4cast")
-
- - name: Update Stage3
- shell: Rscript {0}
- run: |
- source("drivers/update_stage3.R")
diff --git a/.github/workflows/scoring.yaml b/.github/workflows/scoring.yaml
deleted file mode 100644
index 765d56a80b..0000000000
--- a/.github/workflows/scoring.yaml
+++ /dev/null
@@ -1,38 +0,0 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
-# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
-on:
- schedule:
- - cron: '0 0 */3 * *'
- workflow_dispatch:
-
-name: scoring
-
-jobs:
- docker:
- #runs-on: ubuntu-latest
- runs-on: self-hosted
- timeout-minutes: 1440
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- OSN_KEY: ${{ secrets.OSN_KEY }}
- OSN_SECRET: ${{ secrets.OSN_SECRET }}
- #container: eco4cast/rocker-neon4cast:latest
- steps:
- - uses: actions/checkout@v3
- with:
- ref: prod
-
- - name: Install
- shell: Rscript {0}
- run: |
- remotes::install_deps(".", dep=TRUE)
-
- - name: Generate scores
- shell: Rscript {0}
- run: |
- source("scoring/scoring.R")
-
- - name: Update inventory
- shell: Rscript {0}
- run: |
- source("scoring/build_score_inventory.R")
diff --git a/.github/workflows/submissions.yaml b/.github/workflows/submissions.yaml
deleted file mode 100644
index 659484af44..0000000000
--- a/.github/workflows/submissions.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
-# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
-on:
- schedule:
- - cron: '0 0 */3 * *'
- workflow_dispatch:
-
-
-name: process-submissions
-
-jobs:
- docker:
- runs-on: ubuntu-latest
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- AWS_ACCESS_KEY_SUBMISSIONS: ${{ secrets.AWS_ACCESS_KEY_SUBMISSIONS }}
- AWS_SECRET_ACCESS_KEY_SUBMISSIONS: ${{ secrets.AWS_SECRET_ACCESS_KEY_SUBMISSIONS }}
- OSN_KEY: ${{ secrets.OSN_KEY }}
- OSN_SECRET: ${{ secrets.OSN_SECRET }}
- container: eco4cast/rocker-neon4cast:latest
- steps:
- - uses: actions/checkout@v3
- with:
- ref: prod
-
- - name: Process submissions
- shell: Rscript {0}
- run: |
- source("submission_processing/process_submissions.R")
diff --git a/.github/workflows/targets.yaml b/.github/workflows/targets.yaml
deleted file mode 100644
index fdb5435c15..0000000000
--- a/.github/workflows/targets.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
-# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
-on:
- schedule:
- - cron: '0 4 * * *'
- workflow_dispatch:
-
-name: target-generation
-
-jobs:
- chl-targets:
- runs-on: ubuntu-latest
- env:
- GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
- AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_SUBMISSIONS }}
- AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_SUBMISSIONS }}
- OSN_KEY: ${{ secrets.OSN_KEY }}
- OSN_SECRET: ${{ secrets.OSN_SECRET }}
- container: eco4cast/usgsrc4cast-targets:latest
- steps:
- - uses: actions/checkout@v3
- with:
- ref: prod
- - name: Generate targets
- shell: Rscript {0}
- run: |
- setwd("targets")
- targets::tar_make()
-
diff --git a/DESCRIPTION b/DESCRIPTION
deleted file mode 100644
index 55d671bf7e..0000000000
--- a/DESCRIPTION
+++ /dev/null
@@ -1,43 +0,0 @@
-Package: usgsrc4cast-ci
-Title: Cyberinfrastructure for the EFI-USGS River Chlorophyll Forecasting Challenge
-Version: 0.1
-Authors@R: c(person(
- "Quinn", "Thomas",
- email = "rqthomas@vt.edu",
- role = c("aut", "cre"),
- comment = c(ORCID = "0000-0003-1282-7825")),
- person(
- "Jacob", "Zwart",
- email = "jzwart@usgs.gov",
- role = c("aut", "cre"),
- comment = c(ORCID = "0000-0002-3870-405X")))
-Description: Cyberinfrastructure that processes submissions, scores forecasts, generates the catalog, and generates dashboards
-License: MIT + file LICENSE
-Encoding: UTF-8
-Language: en-US
-Imports:
- score4cast,
- minioclient,
- rmarkdown,
- glue,
- dplyr,
- ggplot2,
- arrow,
- bslib,
- bsicons,
- ggiraph,
- patchwork,
- pak,
- jsonlite,
- stac4cast,
- reticulate,
- duckdbfs,
- furrr,
- future,
- gsheet,
- readr
-Remotes:
- github::eco4cast/score4cast,
- github::cboettig/minioclient,
- github::eco4cast/neon4cast,
- github::eco4cast/stac4cast
diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 8b4fbcbab5..0000000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2023 Ecological Forecasting Initiative
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/R/access_model_metadata.R b/R/access_model_metadata.R
deleted file mode 100644
index 8460bd7e2a..0000000000
--- a/R/access_model_metadata.R
+++ /dev/null
@@ -1,14 +0,0 @@
-s3 <- arrow::s3_bucket(bucket = "bio230121-bucket01/vera4cast/metadata/model_id/", # update from vera4cast?
- endpoint_override = "renc.osn.xsede.org", anonymous = TRUE)
-
-d1 <- arrow::open_dataset(s3, format = "json") |> dplyr::collect()
-
-model_type <- tidyr::unnest(d1[[3]], cols = names(d1[[3]]))$type
-
-model_type[which(stringr::str_detect(model_type, "mpirical"))] <- "Empirical" # match "Empirical"/"empirical"
-
-tibble::tibble(model_type = model_type) |>
-  ggplot2::ggplot(ggplot2::aes(x = model_type)) +
-  ggplot2::geom_bar() +
-  ggplot2::labs(x = "Model Type", y = "Number submitting forecasts") +
-  ggplot2::theme_bw()
diff --git a/R/archive_challenge.R b/R/archive_challenge.R
deleted file mode 100644
index 0d19a190fc..0000000000
--- a/R/archive_challenge.R
+++ /dev/null
@@ -1,97 +0,0 @@
-library(arrow)
-library(tidyverse)
-
-start_date <- "2023-01-01"
-end_date <- "2024-01-01"
-archive_format <- "parquet"
-
-curr_dir <- here::here()
-
-config <- yaml::read_yaml("challenge_configuration.yaml")
-
-fs::dir_create(file.path(curr_dir, "archive/scores"), recurse = TRUE)
-fs::dir_create(file.path(curr_dir, "archive/forecasts"), recurse = TRUE)
-fs::dir_create(file.path(curr_dir, "archive/targets"), recurse = TRUE)
-fs::dir_create(file.path(curr_dir, "archive/catalog"), recurse = TRUE)
-#######
-message("Archiving forecasts")
-
-s3_forecasts <- arrow::s3_bucket(file.path(config$forecasts_bucket,"parquet"),
- endpoint_override = config$endpoint,
- anonymous = TRUE)
-
-
-df <- open_dataset(s3_forecasts) |>
- filter(datetime >= lubridate::as_datetime(start_date), datetime < lubridate::as_datetime(end_date))
-
-if(archive_format == "parquet"){
- write_dataset(df, path = file.path(curr_dir, "archive/forecasts"),
- hive_style = TRUE,
- partitioning = c("project_id", "duration","variable", "model_id"))
-}else if(archive_format == "csv"){
- write_csv_arrow(df, sink = file.path("archive/forecasts.csv.gz"))
-}
-
-#######
-message("Archiving scores")
-
-s3_scores <- arrow::s3_bucket(config$scores_bucket,
- endpoint_override = config$endpoint,
- anonymous = TRUE)
-
-df_scores <- open_dataset(s3_scores) |>
- filter(datetime >= lubridate::as_datetime(start_date), datetime < lubridate::as_datetime(end_date))
-
-if(archive_format == "parquet"){
- write_dataset(df_scores, path = file.path("archive/scores"),
- hive_style = TRUE,
- partitioning = c("project_id", "duration","variable","model_id"))
-}else if(archive_format == "csv"){
-  write_csv_arrow(df_scores, sink = file.path("archive/scores.csv.gz"))
-}
-
-#######
-message("Archiving targets")
-
-minioclient::mc_alias_set("s3_store",
- config$endpoint,
- Sys.getenv("OSN_KEY"),
- Sys.getenv("OSN_SECRET"))
-
-minioclient::mc_mirror(from = paste0("s3_store/",config$targets_bucket), to = "archive/targets")
-
-######
-message("Archive catalog and metadata")
-
-setwd(here::here())
-jsons <- fs::dir_ls(path ="catalog", glob="*.json", recurse=TRUE)
-
-for(i in 1:length(jsons)){
- dir.create(file.path(curr_dir, "archive/catalog",dirname(jsons[i])),recursive = TRUE, showWarnings = FALSE)
- fs::file_copy(file.path(curr_dir, jsons[i]), dirname(file.path(curr_dir, "archive/catalog",jsons[i])), overwrite = TRUE)
-}
-
-# Archive variable descriptions
-googlesheets4::gs4_deauth()
-target_metadata <- googlesheets4::read_sheet(config$target_metadata_gsheet)
-
-if(archive_format == "parquet"){
- readr::write_csv(target_metadata, file.path(curr_dir, "archive/catalog/target_metadata.csv"))
-}else if(archive_format == "csv"){
- readr::write_csv(target_metadata, file.path(curr_dir, "archive/target_metadata.csv"))
-}
-
-###
-setwd(file.path(curr_dir, "archive"))
-files2zip <- fs::dir_ls(recurse = TRUE)
-file_name <- paste0("archive_", Sys.Date())
-files2zip <- files2zip[stringr::str_detect(files2zip, pattern = "DS_Store", negate = TRUE)][-1]
-utils::zip(zipfile = file.path(curr_dir, file_name), files = files2zip)
-
-# TO DO
-
-# generate EDI EML
-
-### Copy archive to bucket
-minioclient::mc_cp(from = file.path(curr_dir, paste0(file_name,".zip")),
- to = paste0("s3_store/",config$archive_bucket,"/",paste0(file_name,".zip")))
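
The archive written above can be re-opened locally with the same hive-style partitioning; a minimal sketch (the model_id is hypothetical, used only for illustration):

arrow::open_dataset(file.path(curr_dir, "archive/forecasts")) |>
  dplyr::filter(model_id == "climatology") |>  # hypothetical model_id
  dplyr::collect()
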
diff --git a/R/build_summeries.R b/R/build_summeries.R
deleted file mode 100644
index 677e92cd9f..0000000000
--- a/R/build_summeries.R
+++ /dev/null
@@ -1,41 +0,0 @@
-config <- yaml::read_yaml("../challenge_configuration.yaml")
-
-s3_inventory <- arrow::s3_bucket(paste0(config$inventory_bucket,"/catalog/forecasts/project_id=", config$project_id),
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-inventory_df <- arrow::open_dataset(s3_inventory) |> dplyr::collect()
-
-
-df <- inventory_df |> dplyr::distinct(duration, model_id, variable, project_id, path, endpoint)
-
-
-s3 <- arrow::s3_bucket(config$forecasts_bucket,
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-
-for(i in 1:nrow(df)){
-
- print(i)
-
- arrow::open_dataset(paste0("s3://anonymous@",df$path[i],"/model_id=",df$model_id[i],"?endpoint_override=",df$endpoint[i])) |>
- dplyr::mutate(model_id = df$model_id[i],
- variable = df$variable[i],
- duration = df$duration[i],
- project_id = df$project_id[i]) |>
- dplyr::collect() |>
- dplyr::summarise(prediction = mean(prediction), .by = dplyr::any_of(c("site_id", "datetime", "reference_datetime", "family", "duration", "model_id",
- "parameter", "pub_datetime", "reference_date", "variable", "project_id"))) |>
- score4cast::summarize_forecast(extra_groups = c("duration", "project_id")) |>
- dplyr::mutate(reference_date = lubridate::as_date(reference_datetime)) |>
- arrow::write_dataset(s3$path("summaries"), format = 'parquet',
- partitioning = c("project_id",
- "duration",
- "variable",
- "model_id",
- "reference_date"))
-
-}
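
For reference, a hedged sketch of reading the summaries written above back out; anonymous reads mirror the forecast-bucket access used elsewhere in this repo, and "chla" is an assumed variable name:

s3_summaries <- arrow::s3_bucket(paste0(config$forecasts_bucket, "/summaries"),
                                 endpoint_override = config$endpoint,
                                 anonymous = TRUE)
arrow::open_dataset(s3_summaries) |>
  dplyr::filter(variable == "chla") |>  # assumed variable name
  dplyr::collect()
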
diff --git a/R/delete_bucket.R b/R/delete_bucket.R
deleted file mode 100644
index aab8e6564e..0000000000
--- a/R/delete_bucket.R
+++ /dev/null
@@ -1,27 +0,0 @@
-for(i in 1:nrow(df)){ # assumes df holds a listing from a prior aws.s3::get_bucket_df() call (see below)
-
- aws.s3::delete_object(object = df$Key[i],
- bucket = "bio230121-bucket01",
- region = "renc",
- base_url = "osn.xsede.org",
- key = Sys.getenv("OSN_KEY"),
- secret = Sys.getenv("OSN_SECRET"))
-}
-
-df <- aws.s3::get_bucket_df(bucket = "bio230121-bucket01",
- prefix = "vera4cast/prov/",
- region = "renc",
- base_url = "osn.xsede.org",
- key = Sys.getenv("OSN_KEY"),
- secret = Sys.getenv("OSN_SECRET"))
-
-for(i in 1:nrow(df)){
-
- aws.s3::delete_object(object = df$Key[i],
- bucket = "bio230121-bucket01",
- region = "renc",
- base_url = "osn.xsede.org",
- key = Sys.getenv("OSN_KEY"),
- secret = Sys.getenv("OSN_SECRET"))
-}
-
diff --git a/R/eco4cast-helpers/fable_helpers.R b/R/eco4cast-helpers/fable_helpers.R
deleted file mode 100644
index 188c4adfe8..0000000000
--- a/R/eco4cast-helpers/fable_helpers.R
+++ /dev/null
@@ -1,124 +0,0 @@
-
-
-#' Format a fable fbl_ts forecast in EFI standard
-#'
-#' @param df a fbl_ts forecast
-#' @param times number of times to draw from distribution for ensemble method,
-#' ignored if distributions are normal.
-#' @return A data.frame (`[tsibble]`) in the EFI format
-#' @examples
-#'
-#' \dontrun{
-#' library(dplyr)
-#' library(readr)
-#' library(fable)
-#' aquatic <-
-#' read_csv("https://data.ecoforecast.org/neon4cast-targets/aquatics/aquatics-targets.csv.gz") %>%
-#' pivot_wider(names_from = "variable", values_from = "observation") %>%
-#' as_tsibble(index = datetime, key = site_id)
-#' oxygen_fc <- aquatic %>%
-#' model(null = MEAN(oxygen)) %>%
-#' forecast(h = "35 days") %>%
-#' efi_format()
-#' }
-#' @export
-#' @importFrom rlang .data `:=`
-#' @importFrom dplyr `%>%`
-efi_format <- function(df, times = 10){
-
- df <- drop_degenerate(df)
- var <- attributes(df)$dist
- family <- unique(stats::family(df[[var]]))
-
- if(length(unique(family)) > 1)
- stop("Multiple distributions detected. Please provide only one distribution type at a time.")
-
- if(family %in% c("normal"))
- efi_format_statistic(df, family)
- else
- efi_format_ensemble(df, times)
-
-}
-
-## Helper functions to turn a fable timeseries, which uses a special "distribution" column,
-## into a flat-file format. efi_statistic_format uses a 'statistic' column (indicating either mean or sd),
-## while efi_ensemble_format uses an 'ensemble' column, drawing `n` times from the distribution.
-efi_format_statistic <- function(df, family = NULL){
-
- stopifnot(inherits(df, "fbl_ts"))
- ## determine variable name
- var <- attributes(df)$dist
-
- if(is.null(family))
- family <- unique(stats::family(df[[var]]))
-
-
- df %>%
- dplyr::mutate(family = family) %>%
- dplyr::rename(model_id = .model) %>%
- dplyr::select(-.mean) %>%
- tidyr::pivot_longer(-dplyr::starts_with(standard_vars),
- names_to = "variable",
- values_to = "dist") %>%
- dplyr::mutate(pars = distributional::parameters(dist)) %>%
- tidyr::unnest(pars) %>%
- dplyr::select(-dist) %>%
- tidyr::pivot_longer(-dplyr::starts_with(standard_vars),
- names_to = "parameter", values_to = "prediction") %>%
- dplyr::as_tibble()
-
-}
-
-
-standard_vars <- c("site_id", "datetime", "parameter", "family",
- "reference_datetime", "prediction", "observation",
- "model_id", "model_name", "latitude", "longitude",
- "variable")
-
-utils::globalVariables(c("sd", ".model", "n", "dist", "pars",
- ".mean", "ensemble", standard_vars),
- # TODO: update to generic package
- "neon4cast")
-
-
-#' Format as EFI using ensemble draws. Supports unrecognized
-#' distributions by drawing samples.
-#' @inheritParams efi_format
-#' @param times number of ensemble members to draw
-#' @export
-efi_format_ensemble <- function(df, times = 10) {
-
- stopifnot(inherits(df, "fbl_ts"))
-
- var <- attributes(df)$dist
- df <- drop_degenerate(df)
-
- ## determine variable name
- n_groups <- nrow(df)
- ## Draw `times` samples from distribution using
- suppressWarnings({
- expand <- df %>%
- dplyr::mutate(sample = distributional::generate( .data[[var]], times) )
- })
- expand %>%
- tidyr::unnest(sample) %>%
- dplyr::mutate(parameter = as.character( rep(1:times, n_groups))) %>%
- dplyr::select(datetime, site_id, parameter,
- {{var}} := sample, model_id = .model) %>%
- dplyr::as_tibble() %>%
- dplyr::mutate(family = "ensemble") %>%
- tidyr::pivot_longer(-dplyr::starts_with(standard_vars),
- names_to = "variable", values_to = "prediction")
-}
-
-drop_degenerate <- function(df) {
- var <- attributes(df)$dist
- family <- family(df[[var]])
- if("degenerate" %in% family) {
- warning("dropping degenerate distributions")
- df <- df %>% dplyr::filter(family != "degenerate")
-
- family <- family(df[[var]])
- }
- df
-}
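
Mirroring the roxygen example in efi_format() above, a hedged sketch of calling efi_format_ensemble() directly; ETS() is illustrative (not part of this file) and `aquatic` is the tsibble from that example:

oxygen_ens <- aquatic %>%
  fabletools::model(ets = fable::ETS(oxygen)) %>%
  fabletools::forecast(h = "35 days") %>%
  efi_format_ensemble(times = 31)  # 31 draws per site/datetime
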
diff --git a/R/eco4cast-helpers/forecast_output_validator.R b/R/eco4cast-helpers/forecast_output_validator.R
deleted file mode 100644
index fde02dd61e..0000000000
--- a/R/eco4cast-helpers/forecast_output_validator.R
+++ /dev/null
@@ -1,114 +0,0 @@
-#' Validate forecast file
-#'
-#' @param forecast_file forecast csv or csv.gz file
-#' @export
-
-forecast_output_validator <- function(forecast_file){
-
-
- file_in <- forecast_file
-
- valid <- TRUE
-
- message(file_in)
-
- if(any(vapply(c("[.]csv", "[.]csv\\.gz"), grepl, logical(1), file_in))){
-
- # if file is csv zip file
- out <- readr::read_csv(file_in, guess_max = 1e6, show_col_types = FALSE)
-
- if(lexists(out, c("model_id"))){
- usethis::ui_done("file has model_id column")
- }else{
- usethis::ui_warn("file missing model_id column ")
- }
-
-
- if("variable" %in% names(out) & "prediction" %in% names(out)){
- usethis::ui_done("forecasted variables found correct variable + prediction column")
- }else{
- usethis::ui_warn("missing the variable and prediction columns")
- valid <- FALSE
- }
-
- if(lexists(out, "ensemble")){
- usethis::ui_warn("ensemble dimension should be named parameter")
- valid <- FALSE
- }else if(lexists(out, "family")){
-
- if(lexists(out, "parameter")){
- usethis::ui_done("file has correct family and parameter columns")
- }else{
- usethis::ui_warn("file does not have parameter column ")
- valid <- FALSE
- }
-
- }else{
- usethis::ui_warn("file does not have ensemble or family and/or parameter column")
- valid <- FALSE
- }
-
- #usethis::ui_todo("Checking that file contains siteID column...")
- if(lexists(out, c("site_id"))){
- usethis::ui_done("file has site_id column")
- }else{
- usethis::ui_warn("file missing site_id column")
- }
-
- if(lexists(out, c("datetime"))){
- usethis::ui_done("file has datetime column")
- if(!grepl("-", out$datetime[1])){
- usethis::ui_done("datetime column format is not in the correct YYYY-MM-DD format")
- valid <- FALSE
- }else{
- if(sum(class(out$datetime) %in% c("Date","POSIXct")) > 0){
- usethis::ui_done("file has correct datetime column")
- }else{
- usethis::ui_done("datetime column format is not in the correct YYYY-MM-DD format")
- valid <- FALSE
- }
- }
- }else{
- usethis::ui_warn("file missing datetime column")
- valid <- FALSE
- }
-
-
- if(lexists(out, c("duration"))){
- usethis::ui_done("file has duration column")
- }else{
- usethis::ui_warn("file missing duration column (values for the column: daily = P1D, 30min = PT30M)")
- }
-
- if(lexists(out, c("project_id"))){
- usethis::ui_done("file has project_id column")
- }else{
- usethis::ui_warn("file missing project_id column (use the challenge you're submitting to as the project_id")
- }
-
- if(lexists(out, c("reference_datetime"))){
- usethis::ui_done("file has reference_datetime column")
- }else if(lexists(out, c("start_time"))){
- usethis::ui_warn("file start_time column should be named reference_datetime. We are converting it during processing but please update your submission format")
- }else{
- usethis::ui_warn("file missing reference_datetime column")
- valid <- FALSE
- }
-
- }else{
- usethis::ui_warn("incorrect file extension (csv or csv.gz are accepted)")
- valid <- FALSE
- }
- if(!valid){
- ## TODO: could update this warning message
- message("Forecast file is not valid. The following link provides information about the format:\nhttps://projects.ecoforecast.org/neon4cast-ci/instructions.html#forecast-file-format")
- }else{
- message("Forecast format is valid")
- }
- return(valid)
-}
-
-
-lexists <- function(list,name){
- any(!is.na(match(name, names(list))))
-}
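
A minimal forecast that passes every check above; all identifier values are placeholders, not real registrations:

df <- data.frame(project_id = "usgsrc4cast",        # placeholder project_id
                 model_id = "example_persistence",  # placeholder model_id
                 reference_datetime = as.Date("2024-01-01"),
                 datetime = as.Date("2024-01-01") + 0:2,
                 duration = "P1D",
                 site_id = "USGS-XXXXXXXX",         # placeholder site_id
                 family = "ensemble",
                 parameter = "1",
                 variable = "chla",
                 prediction = 1.5)
readr::write_csv(df, "example-forecast.csv.gz")
forecast_output_validator("example-forecast.csv.gz")  # should return TRUE
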
diff --git a/R/eco4cast-helpers/noaa_gefs.R b/R/eco4cast-helpers/noaa_gefs.R
deleted file mode 100644
index b7373646dc..0000000000
--- a/R/eco4cast-helpers/noaa_gefs.R
+++ /dev/null
@@ -1,210 +0,0 @@
-#' NOAA GEFS tables
-#'
-#' Access NOAA Global Ensemble Forecast System (GEFS) forecast predictions
-#' at ecological forecast sites. The GEFS is NOAA's longest horizon forecast, extending up
-#' to 30 days at present, issued at 0.5 degree spatial resolution.
-#' EFI downsamples these forecasts at the coordinates of all NEON sites and
-#' provides efficient access to archives of these forecasts in a simple tabular
-#' format for a subset of variables of interest.
-#'
-#' WARNING: This combined dataset contains billions of rows. Filtering
-#' to a forecast issued on specific `start_date`s or other subsets before
-#' `collect()`ing data into R is essential. Be patient, especially on slow
-#' network connections, and handle this data with care. See examples.
-#'
-#' At each site, 31 ensemble member forecasts are provided
-#' at 3 hr intervals for the first 10 days, and 6 hr intervals for up to 30 days
-#' (840 hr) horizon. Forecasts include the following variables:
-#' - TMP - temperature (K)
-#' - RH - Relative humidity (%)
-#' - PRES - Atmospheric pressure (Pa)
-#' - UGRD - U-component of wind speed (m/s)
-#' - VGRD - V-component of wind speed (m/s)
-#' - APCP - Total precipitation in interval (kg/m^2)
-#'   - DSWRF - Downward shortwave radiation flux in interval (W/m^2)
-#'   - DLWRF - Downward longwave radiation flux in interval (W/m^2)
-#'
-#' GEFS forecasts are issued four times a day, as indicated by the `start_date`
-#' and `cycle`. Only forecasts at midnight, `cycle = "00"` extend for the full
-#' 840 hour horizon. The other cycles (06, 12, 18) are provided only 6 hrs
-#' ahead, being mostly of interest for short-term forecasts. (Though users
-#' should note that other NOAA products provide much more accurate and
-#' higher-resolution short-term forecasts than GEFS.)
-#'
-#'
-#' All variables are given at a height of 2 m above ground, as indicated by the height field.
-#' See https://www.nco.ncep.noaa.gov/pmb/products/gens/ for more details on
-#' GEFS variables and intervals.
-#'
-#' @references https://www.nco.ncep.noaa.gov/pmb/products/gens/
-#' @param cycle Hour at which forecast was made, as character string
-#' (`"00"`, `"06"`, `"12"` or `"18"`). Only `"00"` (default) has 30 days horizon.
-#' @param version GEFS forecast version. Prior versions correspond to forecasts
-#' issued before 2020-09-25 which have different ensemble number and horizon,
-#' among other changes, and are not made available here. Leave as default.
-#' @param endpoint the EFI host address (leave as default)
-#' @param verbose logical, displays or hides messages
-#' @param project_id the forecast challenge project_id you want to pull GEFS from
-#' @param start_date forecast start date yyyy-mm-dd format
-#' @export
-#' @examplesIf interactive()
-#'
-#' weather <- noaa_stage1()
-#' # 5.7M rows of data:
-#' weather |>
-#' dplyr::filter(start_date == "2022-04-01") |>
-#' dplyr::collect()
-#'
-#'
-#'
-noaa_stage1 <- function(cycle = 0,
- version = "v12",
- endpoint = "data.ecoforecast.org", # TODO: why is this still default?
- verbose = TRUE,
- project_id,
- start_date = "") {
-
- vars <- arrow_env_vars()
-
- # TODO: check if project_id is valid
- check_project_id <- FALSE
- if(check_project_id){
- submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE)
- all_project_ids <- unique(submitted_model_ids$project_id)
- if(!project_id %in% all_project_ids){
- stop(sprintf("The project_id you supplied, %s, is not in the list of current forecast challenge project_ids [%s]",
- project_id, paste(all_project_ids, collapse = ", ")))
- }
- }
- bucket <- sprintf("bio230014-bucket01/challenges/drivers/%s/noaa/gefs-v12/stage1/reference_datetime=%s",
- project_id,
- start_date)
-
- endpoint_override <- "https://sdsc.osn.xsede.org"
- s3 <- arrow::s3_bucket(paste0(bucket),
- endpoint_override = endpoint_override,
- anonymous = TRUE)
-
- site_df <- arrow::open_dataset(s3)
-
- unset_arrow_vars(vars)
-
- return(site_df)
-}
-
-#' NOAA GEFS forecasts with EFI stage 2 processing
-#' Stage 2 processing involves the following transforms of the data:
-#' - Fluxes are standardized to per second rates
-#' - Variables are renamed to match CF conventions
-#' - Fluxes and states are interpolated to 1 hour intervals
-#'
-#' @inheritParams noaa_stage1
-#' @param cycle Hour at which forecast was made, as character string
-#' (`"00"`, `"06"`, `"12"` or `"18"`). Only `"00"` (default) has 30 days horizon.
-#' @param version GEFS forecast version. Prior versions correspond to forecasts
-#' issued before 2020-09-25 which have different ensemble number and horizon,
-#' among other changes, and are not made available here. Leave as default.
-#' @param endpoint the EFI host address (leave as default)
-#' @param verbose logical, displays or hides messages
-#' @param project_id the forecast challenge project_id you want to pull GEFS from
-#' @param start_date forecast start date yyyy-mm-dd format
-#' @export
-noaa_stage2 <- function(cycle = 0,
- version = "v12",
- endpoint = NA,
- verbose = TRUE,
- project_id,
- start_date = "") {
-
- vars <- arrow_env_vars()
-
- # TODO: check if project_id is valid
- check_project_id <- FALSE
- if(check_project_id){
- submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE)
- all_project_ids <- unique(submitted_model_ids$project_id)
- if(!project_id %in% all_project_ids){
- stop(sprintf("The project_id you supplied, %s, is not in the list of current forecast challenge project_ids [%s]",
- project_id, paste(all_project_ids, collapse = ", ")))
- }
- }
- bucket <- sprintf("bio230014-bucket01/challenges/drivers/%s/noaa/gefs-v12/stage2/reference_datetime=%s",
- project_id,
- start_date)
-
- endpoint_override <- "https://sdsc.osn.xsede.org"
- s3 <- arrow::s3_bucket(paste0(bucket),
- endpoint_override = endpoint_override,
- anonymous = TRUE)
-
- site_df <- arrow::open_dataset(s3) |>
- dplyr::mutate(reference_datetime = lubridate::as_datetime(start_date))
-
- unset_arrow_vars(vars)
-
- return(site_df)
-
-}
-
-#' NOAA GEFS forecasts with EFI stage 3 processing
-#'
-#' Stage 3 processing presents a 'nowcast' product by combining the most
-#' recent predictions from each available cycle. Product uses CF variable
-#' names and 1 hr interval
-#' @param cycle Hour at which forecast was made, as character string
-#' (`"00"`, `"06"`, `"12"` or `"18"`). Only `"00"` (default) has 30 days horizon.
-#' @param version GEFS forecast version. Prior versions correspond to forecasts
-#' issued before 2020-09-25 which have different ensemble number and horizon,
-#' among other changes, and are not made available here. Leave as default.
-#' @param endpoint the EFI host address (leave as default)
-#' @param verbose logical, displays or hides messages
-#' @param project_id the forecast challenge project_id you want to pull GEFS from
-#' @export
-noaa_stage3 <- function(version = "v12",
- endpoint = "data.ecoforecast.org",
- verbose = TRUE,
- project_id) {
-
- vars <- arrow_env_vars()
-
- # TODO: check if project_id is valid
- check_project_id <- FALSE
- if(check_project_id){
- submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE)
- all_project_ids <- unique(submitted_model_ids$project_id)
- if(!project_id %in% all_project_ids){
- stop(sprintf("The project_id you supplied, %s, is not in the list of current forecast challenge project_ids [%s]",
- project_id, paste(all_project_ids, collapse = ", ")))
- }
- }
- bucket <- sprintf("bio230014-bucket01/challenges/drivers/%s/noaa/gefs-v12/stage3",
- project_id)
-
- endpoint_override <- "https://sdsc.osn.xsede.org"
- s3 <- arrow::s3_bucket(bucket,
- endpoint_override = endpoint_override,
- anonymous = TRUE)
-
- site_df <- arrow::open_dataset(s3)
-
- unset_arrow_vars(vars)
-
- return(site_df)
-
-}
-
-arrow_env_vars <- function(){
- user_region <- Sys.getenv("AWS_DEFAULT_REGION")
- user_meta <- Sys.getenv("AWS_EC2_METADATA_DISABLED")
- Sys.unsetenv("AWS_DEFAULT_REGION")
- Sys.setenv(AWS_EC2_METADATA_DISABLED="TRUE")
-
- list(user_region=user_region, user_meta = user_meta)
-}
-
-unset_arrow_vars <- function(vars) {
- Sys.setenv("AWS_DEFAULT_REGION" = vars$user_region)
- if (vars$user_meta != "") {
- Sys.setenv(AWS_EC2_METADATA_DISABLED = vars$user_meta)
- }
-}
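
A hedged usage sketch of the filter-before-collect pattern the WARNING above calls for; the project_id and site_id are assumptions, not values defined in this file:

weather <- noaa_stage2(project_id = "usgsrc4cast",  # assumed project_id
                       start_date = "2023-06-01")
weather |>
  dplyr::filter(site_id == "USGS-XXXXXXXX",         # placeholder site_id
                variable == "air_temperature") |>
  dplyr::collect()
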
diff --git a/R/eco4cast-helpers/s3_helpers.R b/R/eco4cast-helpers/s3_helpers.R
deleted file mode 100644
index 7f986c97ee..0000000000
--- a/R/eco4cast-helpers/s3_helpers.R
+++ /dev/null
@@ -1,25 +0,0 @@
-# Helper to read challenge targets from the shared S3 bucket (original note: not sure what this is used for)
-get_target <- function(variable,
-                       duration,
-                       project_id = "neon4cast",
-                       lazy = FALSE){
-
-  # copy arguments so they are not masked by the data columns of the same name
-  var <- variable; dur <- duration; proj <- project_id
-
-  s3_targets <- arrow::s3_bucket("bio230014-bucket01/challenges/targets",
-                                 endpoint_override = "sdsc.osn.xsede.org")
-  target <- arrow::open_csv_dataset(s3_targets,
-                                    schema = arrow::schema(
-                                      project_id = arrow::string(),
-                                      site_id = arrow::string(),
-                                      datetime = arrow::timestamp(unit = "ns", timezone = "UTC"),
-                                      duration = arrow::string(),
-                                      variable = arrow::string(),
-                                      observation = arrow::float()),
-                                    skip = 1) |>
-    dplyr::filter(variable %in% var, duration == dur, project_id == proj)
-
-  if(!lazy) target <- target |> dplyr::collect()
-  target
-}
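
Hypothetical usage of get_target(); "chla" and "P1D" mirror values used elsewhere in this repository but are assumptions here:

chla_daily <- get_target(variable = "chla",
                         duration = "P1D",
                         project_id = "usgsrc4cast")  # assumed project_id
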
diff --git a/R/eco4cast-helpers/submit.R b/R/eco4cast-helpers/submit.R
deleted file mode 100644
index 7a6a3bc959..0000000000
--- a/R/eco4cast-helpers/submit.R
+++ /dev/null
@@ -1,124 +0,0 @@
-## Technically this could become arrow-based
-
-#' submit forecast to EFI forecast challenge
-#'
-#' @inheritParams forecast_output_validator
-#' @param forecast_file the path to the forecast file to submit
-#' @param project_id the forecast challenge project_id to submit to
-#' @param metadata path to metadata file
-#' @param ask should we prompt for a go before submission?
-#' @param s3_region subdomain (leave as is for EFI challenge)
-#' @param s3_endpoint root domain (leave as is for EFI challenge)
-#' @export
-submit <- function(forecast_file,
- project_id,
- metadata = NULL,
- ask = interactive(),
- s3_region = "submit",
- s3_endpoint = "ecoforecast.org"
-){
- if(file.exists("~/.aws")){
- warning(paste("Detected existing AWS credentials file in ~/.aws,",
- "Consider renaming these so that automated upload will work"))
- }
- message("validating that file matches required standard")
- go <- forecast_output_validator(forecast_file)
-
- if(!go){
-
- # TODO: update this warning to point to non-neon4cast docs
- warning(paste0("forecasts was not in a valid format and was not submitted\n",
- "First, try read reinstalling neon4cast (remotes::install_github('eco4cast\\neon4cast'), restarting R, and trying again\n",
- "Second, see https://projects.ecoforecast.org/neon4cast-docs/Submission-Instructions.html for more information on the file format"))
- return(NULL)
- }
-
- check_model_id <- FALSE
- # TODO: see if we want to have a more robust check on the model id submission
- if(check_model_id){
- googlesheets4::gs4_deauth()
- message("Checking if model_id is registered")
- registered_model_id <- suppressMessages(googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1f177dpaxLzc4UuQ4_SJV9JWIbQPlilVnEztyvZE6aSU/edit?usp=sharing", range = "Sheet1!A:V"))
-
- registered_project_id <- registered_model_id$`What forecasting challenge are you registering for?`
- registered_model_id <- registered_model_id$model_id
-
- registered_model_project_id <- paste(registered_project_id, registered_model_id, sep = "-")
-
- df <- read4cast::read_forecast(forecast_file)
- model_id <- df$model_id[1]
- model_project_id <- paste("neon4cast", model_id, sep = "-")
-
- if(grepl("(example)", model_id)){
- message(paste0("You are submitting a forecast with 'example' in the model_id. As an example forecast, it will be processed but not used in future analyses.\n",
- "No registration is required to submit an example forecast.\n",
- "If you want your forecast to be retained, please select a different model_id that does not contain `example` and register you model id at https://forms.gle/kg2Vkpho9BoMXSy57\n"))
- }
-
- if(!(model_project_id %in% registered_model_project_id) & !grepl("(example)",model_id)){
-
- message("Checking if model_id for neon4cast is already used in submissions")
-
- submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE)
- submitted_project_model_id <- paste(submitted_model_ids$project_id, submitted_model_ids$model_id, sep = "-")
-
-
- if(model_project_id %in% submitted_project_model_id){
-
- stop(paste0("Your model_id (",model_id,") has not been registered yet but is already used in other submissions. Please use and register another model_id\n",
- " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n",
- "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses."))
-
- }else{
-
- stop(paste0("Your model_id (",model_id,") has not been registered\n",
- " Register at https://forms.gle/kg2Vkpho9BoMXSy57\n",
- "If you want to submit without registering, include the word 'example' in your model_id. As an example forecast, it will be processed but not used in future analyses."))
-
- }
- }
-
- if(!grepl("(example)",model_id)){
- if(first_submission & model_project_id %in% registered_model_project_id){
- submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE)
- submitted_project_model_id <- paste(submitted_model_ids$project_id, submitted_model_ids$model_id, sep = "-")
-
- if(model_project_id %in% submitted_project_model_id){
- stop(paste0("Your model_id (",model_id,") is already used in other submitted forecasts. There are two causes for this error: \n
- - If you have previously submitted a forecast, set the argument `first_submission = FALSE` to remove this error\n
- - If you have not previously submitted a forecast, this error message means that the model_id has already been registered and used for submissions. Please register and use another model_id at [https://forms.gle/kg2Vkpho9BoMXSy57](https://forms.gle/kg2Vkpho9BoMXSy57)"))
- }
- }
- }else{
- message("Since `example` is in your model_id, you are submitting an example forecast that will be processed but not used in future analyses.")
- }
- }
-
- if(go & ask){
- go <- utils::askYesNo("Forecast file is valid, ready to submit?")
- }
-
- # check if project_id is valid
- check_project_id <- FALSE
- if(check_project_id){
- submitted_model_ids <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/inventory/model_id/model_id-project_id-inventory.csv", show_col_types = FALSE)
- all_project_ids <- unique(submitted_model_ids$project_id)
- if(!project_id %in% all_project_ids){
- stop(sprintf("The project_id you supplied, %s, is not in the list of current forecast challenge project_ids [%s]",
- project_id, paste(all_project_ids, collapse = ", ")))
- }
- }
-
- #GENERALIZATION: Here are specific AWS INFO
- exists <- aws.s3::put_object(file = forecast_file,
- object = basename(forecast_file),
- bucket = fs::path("submissions", project_id),
- region= s3_region,
- base_url = s3_endpoint)
-
- if(exists){
- message("Thank you for submitting!")
- }else{
- warning("Forecasts was not sucessfully submitted to server. Try again, then contact the Challenge organizers.")
- }
-}
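
Example call, assuming a file that passes forecast_output_validator(); the project_id is an assumption based on this repository's name:

submit(forecast_file = "example-forecast.csv.gz",
       project_id = "usgsrc4cast",  # assumed project_id
       ask = FALSE)
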
diff --git a/R/eco4cast-helpers/to_hourly.R b/R/eco4cast-helpers/to_hourly.R
deleted file mode 100644
index 53442dbbdc..0000000000
--- a/R/eco4cast-helpers/to_hourly.R
+++ /dev/null
@@ -1,136 +0,0 @@
-
-#' Convert stage 1 GEFS forecasts to an hourly time step
-#' @param df dataframe of stage 1 GEFS forecasts for the sites being forecast
-#' @param site_list a dataframe of the latitude and longitude for all site_ids in df
-#' @param use_solar_geom logical for using solar geometry to downscale daily mean shortwave to hourly
-#' @param pseudo logical; TRUE when df is a pseudo-forecast assembled across cycles (reference_datetime is then set to NA)
-to_hourly <- function(df,
- site_list,
- use_solar_geom = TRUE,
- pseudo = FALSE){
-
- if(!pseudo){
- reference_datetime <- lubridate::as_datetime(df$reference_datetime)[1]
- }else{
- reference_datetime <- NA
- }
-
- var_order <- names(df)
-
- ensemble_maxtime <- df |>
- dplyr::group_by(site_id, family, ensemble) |>
- dplyr::summarise(max_time = max(datetime), .groups = "drop")
-
- ensembles <- unique(df$ensemble)
- datetime <- seq(min(df$datetime), max(df$datetime), by = "1 hour")
- variables <- unique(df$variable)
- sites <- unique(df$site_id)
-
- full_time <- expand.grid(sites, ensembles, datetime, variables) |>
- dplyr::rename(site_id = Var1,
- ensemble = Var2,
- datetime = Var3,
- variable = Var4) |>
- dplyr::mutate(datetime = lubridate::as_datetime(datetime)) |>
- dplyr::arrange(site_id, ensemble, variable, datetime) |>
- dplyr::left_join(ensemble_maxtime, by = c("site_id","ensemble")) |>
- dplyr::filter(datetime <= max_time) |>
- dplyr::select(-c("max_time")) |>
- dplyr::distinct()
-
- states <- df |>
- dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |>
- dplyr::filter(!pseudo | (pseudo & horizon != "006") | (pseudo & datetime == max(df$datetime))) |>
- dplyr::select(-horizon) |>
- dplyr::group_by(site_id, family, ensemble, variable) |>
- dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |>
- dplyr::filter(variable %in% c("PRES", "RH", "TMP", "UGRD", "VGRD")) |>
- dplyr::arrange(site_id, family, ensemble, datetime) |>
- dplyr::mutate(prediction = imputeTS::na_interpolation(prediction, option = "linear")) |>
- dplyr::mutate(prediction = ifelse(variable == "TMP", prediction + 273, prediction)) |>
- dplyr::mutate(prediction = ifelse(variable == "RH", prediction/100, prediction)) |>
- dplyr::ungroup()
-
- fluxes <- df |>
- dplyr::select(site_id, family, horizon, ensemble, datetime, variable, prediction) |>
- dplyr::filter(horizon != "003") |>
- dplyr::select(-horizon) |>
- dplyr::group_by(site_id, family, ensemble, variable) |>
- dplyr::right_join(full_time, by = c("site_id", "ensemble", "datetime", "family", "variable")) |>
- dplyr::filter(variable %in% c("APCP","DSWRF","DLWRF")) |>
- dplyr::arrange(site_id, family, ensemble, datetime) |>
- tidyr::fill(prediction, .direction = "up") |>
- dplyr::mutate(prediction = ifelse(variable == "APCP", prediction / (6 * 60 * 60), prediction),
- variable = ifelse(variable == "APCP", "PRATE", variable)) |>
- dplyr::ungroup()
-
- if(use_solar_geom){
-
- fluxes <- fluxes |>
- dplyr::left_join(site_list, by = "site_id") |>
- dplyr::mutate(hour = lubridate::hour(datetime),
- date = lubridate::as_date(datetime),
- doy = lubridate::yday(datetime) + hour/24,
- longitude = ifelse(longitude < 0, 360 + longitude, longitude),
- rpot = downscale_solar_geom(doy, longitude, latitude)) |> # hourly sw flux calculated using solar geometry
- dplyr::group_by(site_id, family, ensemble, date, variable) |>
- dplyr::mutate(avg.rpot = mean(rpot, na.rm = TRUE),
- avg.SW = mean(prediction, na.rm = TRUE))|> # daily sw mean from solar geometry
- dplyr::ungroup() |>
- dplyr::mutate(prediction = ifelse(variable == "DSWRF" & avg.rpot > 0.0, rpot * (avg.SW/avg.rpot),prediction)) |>
- dplyr::select(any_of(var_order))
- }
-
- hourly_df <- dplyr::bind_rows(states, fluxes) |>
- dplyr::arrange(site_id, family, ensemble, datetime) |>
- dplyr::mutate(variable = ifelse(variable == "TMP", "air_temperature", variable),
- variable = ifelse(variable == "PRES", "air_pressure", variable),
- variable = ifelse(variable == "RH", "relative_humidity", variable),
- variable = ifelse(variable == "DLWRF", "surface_downwelling_longwave_flux_in_air", variable),
- variable = ifelse(variable == "DSWRF", "surface_downwelling_shortwave_flux_in_air", variable),
- variable = ifelse(variable == "PRATE", "precipitation_flux", variable),
- variable = ifelse(variable == "VGRD", "eastward_wind", variable),
- variable = ifelse(variable == "UGRD", "northward_wind", variable),
- variable = ifelse(variable == "APCP", "precipitation_amount", variable),
- reference_datetime = reference_datetime) |>
- dplyr::select(any_of(var_order))
-
- return(hourly_df)
-
-}
-
-cos_solar_zenith_angle <- function(doy, lat, lon, dt, hr) {
- et <- equation_of_time(doy)
- merid <- floor(lon / 15) * 15
- merid[merid < 0] <- merid[merid < 0] + 15
- lc <- (lon - merid) * -4/60 ## longitude correction
- tz <- merid / 360 * 24 ## time zone
- midbin <- 0.5 * dt / 86400 * 24 ## shift calc to middle of bin
- t0 <- 12 + lc - et - tz - midbin ## solar time
- h <- pi/12 * (hr - t0) ## solar hour
- dec <- -23.45 * pi / 180 * cos(2 * pi * (doy + 10) / 365) ## declination
- cosz <- sin(lat * pi / 180) * sin(dec) + cos(lat * pi / 180) * cos(dec) * cos(h)
- cosz[cosz < 0] <- 0
- return(cosz)
-}
-
-equation_of_time <- function(doy) {
- stopifnot(doy <= 367)
- f <- pi / 180 * (279.5 + 0.9856 * doy)
- et <- (-104.7 * sin(f) + 596.2 * sin(2 * f) + 4.3 *
- sin(4 * f) - 429.3 * cos(f) - 2 *
- cos(2 * f) + 19.3 * cos(3 * f)) / 3600 # equation of time -> eccentricity and obliquity
- return(et)
-}
-
-downscale_solar_geom <- function(doy, lon, lat) {
-
-  dt <- median(diff(doy)) * 86400 # seconds in the median time interval
- hr <- (doy - floor(doy)) * 24 # hour of day for each element of doy
-
- ## calculate potential radiation
- cosz <- cos_solar_zenith_angle(doy, lat, lon, dt, hr)
- rpot <- 1366 * cosz
- return(rpot)
-}
-
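
A quick sketch of the solar-geometry downscaling above: potential shortwave radiation (W m^-2) at hourly resolution for one site and day; the coordinates are arbitrary:

doy <- 172 + (0:23) / 24                  # day of year with fractional hours
rpot <- downscale_solar_geom(doy,
                             lon = 271.4, # arbitrary site, lon in [0, 360)
                             lat = 41.3)
round(rpot)                               # 0 at night, peaks near solar noon
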
diff --git a/R/fablePersistenceModelFunction.R b/R/fablePersistenceModelFunction.R
deleted file mode 100644
index fbd6e69985..0000000000
--- a/R/fablePersistenceModelFunction.R
+++ /dev/null
@@ -1,78 +0,0 @@
-
-# Function to carry out a random walk forecast
-RW_daily_forecast <- function(site, var, h,
- bootstrap = FALSE, boot_number = 200,
- transformation = 'none', verbose = TRUE,...) {
- # Work out when the forecast should start
- forecast_starts <- targets %>%
- dplyr::filter(!is.na(observation) & site_id == site & variable == var) %>%
- # Start the day after the most recent non-NA value
- dplyr::summarise(start_date = max(datetime) + lubridate::days(1)) %>% # Date
- dplyr::mutate(h = (Sys.Date() - start_date) + h) %>% # Horizon value
- dplyr::ungroup()
-
-  if (verbose) {
- message(
- site,
- ' ',
- var,
- ' forecast with transformation = ',
- transformation,
- ' and bootstrap = ',
- bootstrap
- )
- }
-
- # filter the targets data set to the site_var pair
- targets_use <- targets %>%
- dplyr::filter(site_id == site,
- variable == var) %>%
- tsibble::as_tsibble(key = c('variable', 'site_id'), index = 'datetime') %>%
- # add NA values up to today (index)
- tsibble::fill_gaps(.end = Sys.Date()) %>%
- # Remove the NA's put at the end, so that the forecast starts from the last day with an observation,
- # rather than today
- dplyr::filter(datetime < forecast_starts$start_date)
-
- if (nrow(targets_use) == 0) {
- message('no targets available, no forecast run')
- empty_df <- data.frame('variable' = character(),
- 'site_id' = character(),
- '.model' = character(),
- 'datetime' = lubridate::ymd(),
- '.rep' = character(),
- '.sim' = numeric())
-
- return(empty_df)
-
- } else {
- if (transformation == 'log') {
- RW_model <- targets_use %>%
- fabletools::model(RW = fable::RW(log(observation)))
- }
- if (transformation == 'log1p') {
- RW_model <- targets_use %>%
- fabletools::model(RW = fable::RW(log1p(observation)))
- }
- if (transformation == 'none') {
- RW_model <- targets_use %>%
- fabletools::model(RW = fable::RW(observation))
- }
- if (transformation == 'sqrt') {
- RW_model <- targets_use %>%
- fabletools::model(RW = fable::RW(sqrt(observation)))
- }
-
-    if (bootstrap) {
- forecast <- RW_model %>% fabletools::generate(
- h = as.numeric(forecast_starts$h),
-        bootstrap = TRUE,
- times = boot_number
- )
- } else
- forecast <- RW_model %>% fabletools::forecast(h = as.numeric(forecast_starts$h))
- message('forecast finished')
- return(forecast)
- }
-
-}
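
A hedged usage sketch: `targets` must already exist in the calling environment, since RW_daily_forecast() reads it directly (as written above); the site and variable are placeholders:

fc <- RW_daily_forecast(site = "USGS-XXXXXXXX",  # placeholder site_id
                        var = "chla",            # placeholder variable
                        h = 30,
                        bootstrap = TRUE, boot_number = 200,
                        transformation = 'log1p')
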
diff --git a/R/model_metadata_conversion.R b/R/model_metadata_conversion.R
deleted file mode 100644
index b6c5309018..0000000000
--- a/R/model_metadata_conversion.R
+++ /dev/null
@@ -1,55 +0,0 @@
-library(tidyverse)
-library(arrow)
-library(stac4cast)
-library(reticulate)
-library(rlang)
-library(RCurl)
-
-googlesheets4::gs4_deauth()
-
-registered_model_id <- googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/1-OsDaOoMZwPfQnz5U5aV-T9_vhTmyg92Ff5ARRunYhY/edit?usp=sharing")
-
-binary_cols <- c(8, 26, 25, 9, 10, 11, 13, 15, 17) # (Yes, No, Not sure)
-
-multi_choice_cols <- c(7, 18) # Just use value provided
-
-# contact info
-contact_df <- registered_model_id |>
- select(model_id, contact_name = `Contact name (or course instructor's name)`, contact_email = `Contact email (or course instructor's email)`, Institution)
-
-# binary values
-metadata_binary <- registered_model_id[,binary_cols]
-metadata_binary[metadata_binary == 'Yes'] = '1'
-metadata_binary[metadata_binary == 'No'] = '0'
-metadata_binary[metadata_binary == 'Not sure'] = NA
-binary_var_names <- c('dynamic_model', 'workshop_or_tutorial', 'instructor_contact', 'initial_conditions', 'time_varying_met_driver', 'drivers',
- 'process_error', 'multi_model_forecast_output', 'parameters')
-names(metadata_binary) <- binary_var_names
-
-# multiple choice values
-metadata_multi_choice <- registered_model_id[,multi_choice_cols]
-mc_var_names <- c('modeling_approach', 'data_assimilation_method')
-names(metadata_multi_choice) <- mc_var_names
-
-
-# extract specific model information from inventory bucket
-config <- yaml::read_yaml('challenge_configuration.yaml')
-
-inventory_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog"),
- s3_endpoint = "renc.osn.xsede.org", anonymous=TRUE) |>
- collect()
-
-inventory_metadata <- inventory_df |>
- group_by(model_id) |>
- mutate(min_date = min(date)) |>
- mutate(max_date = max(date)) |>
- ungroup() |>
- distinct(model_id, .keep_all = TRUE) |>
- select(model_id, min_date, max_date)
-
-
-# put it all together
-metadata_convert <- cbind(contact_df,metadata_binary, metadata_multi_choice) |>
- right_join(inventory_metadata, by = c('model_id'))
-
-write.csv(metadata_convert, 'model_metadata.csv', row.names = FALSE)
diff --git a/R/rebuild_inventory.R b/R/rebuild_inventory.R
deleted file mode 100644
index 2898012327..0000000000
--- a/R/rebuild_inventory.R
+++ /dev/null
@@ -1,39 +0,0 @@
-library(tidyverse)
-config <- yaml::read_yaml("challenge_configuration.yaml")
-
-s3 <- arrow::s3_bucket(paste0(config$forecasts_bucket, "/parquet"), endpoint_override = config$endpoint, anonymous = TRUE)
-
-bucket <- config$forecasts_bucket
-inventory_df <- arrow::open_dataset(s3) |>
- mutate(reference_date = lubridate::as_date(reference_datetime),
- date = lubridate::as_date(datetime),
- pub_date = lubridate::as_date(pub_datetime)) |>
- distinct(duration, model_id, site_id, reference_date, variable, date, project_id, pub_date) |>
- collect() |>
- mutate(path = glue::glue("{bucket}/parquet/project_id={project_id}/duration={duration}/variable={variable}"),
- path_full = glue::glue("{bucket}/parquet/project_id={project_id}/duration={duration}/variable={variable}/model_id={model_id}/reference_date={reference_date}/part-0.parquet"),
- path_summaries = glue::glue("{bucket}/summaries/project_id={project_id}/duration={duration}/variable={variable}/model_id={model_id}/reference_date={reference_date}/part-0.parquet"),
- endpoint = config$endpoint)
-
-sites <- readr::read_csv(config$site_table,show_col_types = FALSE) |>
- select(field_site_id, latitude, longitude) |>
- rename(site_id = field_site_id)
-
-inventory_df <- dplyr::left_join(inventory_df, sites, by = "site_id")
-
-s3_inventory <- arrow::s3_bucket(config$inventory_bucket,
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-arrow::write_dataset(inventory_df, path = s3_inventory$path(glue::glue("catalog/forecasts/project_id={config$project_id}")))
-
-s3_inventory <- arrow::s3_bucket(config$inventory_bucket,
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-inventory_df |> distinct(model_id, project_id) |>
- arrow::write_csv_arrow(s3_inventory$path("model_id/model_id-project_id-inventory.csv"))
-
-
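-# Illustrative read-back sketch (an example, not part of the rebuild itself):
-# consumers can query the rebuilt inventory anonymously using the same bucket
-# and endpoint from the config. Guarded with if (FALSE) so sourcing is a no-op.
-if (FALSE) {
-  s3_read <- arrow::s3_bucket(config$inventory_bucket,
-                              endpoint_override = config$endpoint,
-                              anonymous = TRUE)
-  arrow::open_dataset(s3_read$path(glue::glue("catalog/forecasts/project_id={config$project_id}"))) |>
-    dplyr::distinct(model_id, variable) |>
-    dplyr::collect()
-}
-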
diff --git a/R/repartition_bucket.R b/R/repartition_bucket.R
deleted file mode 100644
index 8384164bd3..0000000000
--- a/R/repartition_bucket.R
+++ /dev/null
@@ -1,64 +0,0 @@
-s3 <- arrow::s3_bucket(bucket = "bio230121-bucket01/vera4cast/forecasts/parquet",
- endpoint_override = "renc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-arrow::open_dataset(s3) |>
- arrow::write_dataset(path = ".")
-
-df <- aws.s3::get_bucket_df(bucket = "bio230121-bucket01",
- prefix = "vera4cast/forecasts/parquet",
- region = "renc",
- base_url = "osn.xsede.org",
- key = Sys.getenv("OSN_KEY"),
- secret = Sys.getenv("OSN_SECRET"))
-
-for (i in seq_len(nrow(df))) {
-
- aws.s3::delete_object(object = df$Key[i],
- bucket = "bio230121-bucket01",
- region = "renc",
- base_url = "osn.xsede.org",
- key = Sys.getenv("OSN_KEY"),
- secret = Sys.getenv("OSN_SECRET"))
-}
-
-arrow::open_dataset("part-0.parquet") |>
- collect() |>
- dplyr::mutate(depth_m = ifelse(depth_m == 1.5 & site_id == "fcre", 1.6, depth_m)) |>
- arrow::write_dataset(s3, partitioning = c("project_id", "duration","variable","model_id","reference_date"))
-
-###
-
-s3 <- arrow::s3_bucket(bucket = "bio230121-bucket01/vera4cast/scores/parquet",
- endpoint_override = "renc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-arrow::open_dataset(s3) |> arrow::write_dataset(path = ".")
-
-df <- aws.s3::get_bucket_df(bucket = "bio230121-bucket01",
- prefix = "vera4cast/scores/parquet",
- region = "renc",
- base_url = "osn.xsede.org",
- key = Sys.getenv("OSN_KEY"),
- secret = Sys.getenv("OSN_SECRET"))
-
-for (i in seq_len(nrow(df))) {
-
- aws.s3::delete_object(object = df$Key[i],
- bucket = "bio230121-bucket01",
- region = "renc",
- base_url = "osn.xsede.org",
- key = Sys.getenv("OSN_KEY"),
- secret = Sys.getenv("OSN_SECRET"))
-}
-
-arrow::open_dataset("part-0.parquet") |>
- collect() |>
- mutate(reference_datetime = stringr::str_sub(reference_datetime, start = 1, end = 10),
- reference_datetime = lubridate::as_datetime(reference_datetime)) |>
- arrow::write_dataset(s3, partitioning = c("duration", "variable","model_id","date"))
-
-
-
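-# Optional sanity check (an illustrative sketch; it assumes the local
-# "part-0.parquet" snapshot still exists): confirm the rewritten bucket holds
-# the same number of rows as the local copy before discarding it.
-if (FALSE) {
-  n_local <- arrow::open_dataset("part-0.parquet") |>
-    dplyr::summarise(n = dplyr::n()) |>
-    dplyr::collect()
-  n_remote <- arrow::open_dataset(s3) |>
-    dplyr::summarise(n = dplyr::n()) |>
-    dplyr::collect()
-  stopifnot(n_local$n == n_remote$n)
-}
-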
diff --git a/R/repartition_forecasts_scores.R b/R/repartition_forecasts_scores.R
deleted file mode 100644
index d5530754a3..0000000000
--- a/R/repartition_forecasts_scores.R
+++ /dev/null
@@ -1,121 +0,0 @@
-minioclient::mc_alias_set("s3_store",
- "renc.osn.xsede.org",
- Sys.getenv("OSN_KEY"),
- Sys.getenv("OSN_SECRET"))
-
-minioclient::mc_mirror("s3_store/bio230121-bucket01/vera4cast/forecasts/parquet", "temp_forecasts")
-
-
-
-s3 <- arrow::s3_bucket("bio230121-bucket01/vera4cast/forecasts/parquet",
- endpoint_override = "renc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-s <- arrow::schema(
- project_id = arrow::string(),
- site_id = arrow::string(),
- reference_datetime = arrow::timestamp(unit = "us"),
- datetime = arrow::timestamp(unit = "us"),
- depth_m = arrow::float(),
- family = arrow::string(),
- parameter = arrow::string(),
- prediction = arrow::float(),
- pub_datetime = arrow::timestamp(unit = "us"),
- duration = arrow::string(),
- variable = arrow::string(),
- model_id = arrow::string(),
- reference_date = arrow::string()
-)
-
-d <- arrow::open_dataset(s3,schema = s)
-
-
-s3_2 <- arrow::s3_bucket("bio230121-bucket01/vera4cast/forecasts",
- endpoint_override = "renc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-s3_2 <- arrow::s3_bucket("bio230121-bucket01/vera4cast/forecasts/parquet",
- endpoint_override = "renc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-d <- arrow::open_dataset("temp_forecasts",schema = s)
-
-d |> arrow::write_dataset(s3_2, format = 'parquet',
- partitioning = c("project_id",
- "duration",
- "variable",
- "model_id",
- "reference_date"))
-
-s3_2 <- arrow::s3_bucket("bio230121-bucket01/vera4cast/forecasts/parquet2",
- endpoint_override = "renc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-d <- arrow::open_dataset(s3_2)
-
-minioclient::mc_rm("s3_store/bio230121-bucket01/vera4cast/forecasts/parquet", recursive = TRUE)
-
-minioclient::mc_mv("s3_store/bio230121-bucket01/vera4cast/forecasts/parquet2", "s3_store/bio230121-bucket01/vera4cast/forecasts/parquet", recursive = TRUE)
-
-
-
-#####
-
-minioclient::mc_alias_set("s3_store",
- "renc.osn.xsede.org",
- Sys.getenv("OSN_KEY"),
- Sys.getenv("OSN_SECRET"))
-
-minioclient::mc_mirror("s3_store/bio230121-bucket01/vera4cast/scores/parquet", "temp_scores")
-
-minioclient::mc_rm("s3_store/bio230121-bucket01/vera4cast/scores/parquet", recursive = TRUE)
-
-s3 <- arrow::s3_bucket("temp_scores",
- endpoint_override = "renc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-s <- arrow::schema(
- project_id = arrow::string(),
- site_id = arrow::string(),
- reference_datetime = arrow::timestamp(unit = "us"),
- datetime = arrow::timestamp(unit = "us"),
- depth_m = arrow::float(),
- family = arrow::string(),
- parameter = arrow::string(),
- prediction = arrow::float(),
- pub_datetime = arrow::timestamp(unit = "us"),
- duration = arrow::string(),
- variable = arrow::string(),
- model_id = arrow::string(),
- reference_date = arrow::string()
-)
-
-d <- arrow::open_dataset("temp_scores") |>
- dplyr::mutate(pub_datetime = lubridate::as_datetime(pub_datetime),
- project_id = "vera4cast")
-
-s3 <- arrow::s3_bucket("bio230121-bucket01/vera4cast/scores",
- endpoint_override = "renc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-s3$CreateDir("parquet")
-
-s3 <- arrow::s3_bucket("bio230121-bucket01/vera4cast/scores/parquet",
- endpoint_override = "renc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-
-d |> dplyr::filter(variable == "WindSpeed_ms_mean") |>
- arrow::write_dataset(s3, format = 'parquet',
- partitioning = c("project_id",
- "duration",
- "variable",
- "model_id",
- "date"))
diff --git a/R/spatial_subset_example.R b/R/spatial_subset_example.R
deleted file mode 100644
index 09e4190e47..0000000000
--- a/R/spatial_subset_example.R
+++ /dev/null
@@ -1,30 +0,0 @@
-uri <- "s3://anonymous@bio230014-bucket01/challenges/inventory/catalog/forecasts/project_id=neon4cast?endpoint_override=sdsc.osn.xsede.org"
-
-library(duckdbfs)
-library(dplyr)
-library(sf)
-library(ggplot2)   # needed for the plotting at the end of this script
-library(lubridate) # needed for as_date()
-load_spatial()
-
-library(spData)
-ca <- us_states |>
- filter(NAME == "California") |>
- pull(geometry) |>
- sf::st_as_text()
-
-
-paths <- open_dataset(uri) |>
- mutate(geometry = ST_Point(longitude, latitude)) |>
- filter(st_within(geometry, ST_GeomFromText({ca}))) |>
- filter(date == as_date("2023-11-01"), variable == "gcc_90") |>
- to_sf() |>
- collect()
-
-
-local_sites <- unique(paths$site_id)
-
-open_dataset(paste0("s3://", paths$path_full), s3_endpoint = "sdsc.osn.xsede.org") |>
- filter(family == "ensemble", site_id %in% local_sites) |>
- collect() |>
- ggplot(aes(x = prediction)) +
- geom_histogram() +
- facet_wrap(~site_id, scale = "free")
diff --git a/README.md b/README.md
deleted file mode 100644
index ed5bc2c7ff..0000000000
--- a/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# Ecological Forecasting Initiative (EFI) and U.S. Geological Survey (USGS) River Chlorophyll Forecasting Challenge
-
-This repo includes the following elements of a forecasting challenge workflow; GitHub Actions automate these tasks.
-
-1) Generation of targets
-2) Processing of submissions
-3) Download of weather drivers for forecasts
-4) Generation of baseline forecasts
-5) Evaluation (scoring) of submitted forecasts
-6) Generation of dashboard
-7) Generation of catalog
-8) Creation of Docker containers with supporting software
-9) Creation of archive/snapshots of challenge submissions and targets.
-
-See [https://doi.org/10.1002/fee.2616](https://doi.org/10.1002/fee.2616) for more information on the structure of a forecasting challenge, using the NEON forecasting challenge as the example.
-
-Supported by U.S. National Science Foundation grants DEB-1926388 and OAC-2209866.
-
-## Disclaimer
-Although this software program has been used by the U.S. Geological Survey (USGS), no warranty, expressed or implied, is made by the USGS or the U.S. Government as to the accuracy and functioning of the program and related program material nor shall the fact of distribution constitute any such warranty, and no responsibility is assumed by the USGS in connection therewith.
-This software is provided “AS IS.”
-
-## License Disclaimer
-As a government employee, the contributions from Jacob Zwart to this repository are in the public domain.
diff --git a/USGS_site_metadata.csv b/USGS_site_metadata.csv
deleted file mode 100644
index bbf584d711..0000000000
--- a/USGS_site_metadata.csv
+++ /dev/null
@@ -1,11 +0,0 @@
-site_id,project_id,agency_cd,site_no,station_nm,site_tp_cd,latitude,longitude,site_url,colocated,queryTime
-USGS-14211720,usgsrc4cast,USGS,14211720,"WILLAMETTE RIVER AT PORTLAND, OR",ST-TS,45.5175,-122.6691667,https://waterdata.usgs.gov/monitoring-location/14211720,FALSE,2023-12-20T22:59:24Z
-USGS-14211010,usgsrc4cast,USGS,14211010,"CLACKAMAS RIVER NEAR OREGON CITY, OR",ST,45.3792874,-122.5773134,https://waterdata.usgs.gov/monitoring-location/14211010,FALSE,2023-12-20T22:59:24Z
-USGS-14181500,usgsrc4cast,USGS,14181500,"NORTH SANTIAM RIVER AT NIAGARA, OR",ST,44.75377778,-122.2974444,https://waterdata.usgs.gov/monitoring-location/14181500,FALSE,2023-12-20T22:59:24Z
-USGS-05586300,usgsrc4cast,USGS,05586300,"ILLINOIS RIVER AT FLORENCE, IL",ST,39.63275,-90.6076667,https://waterdata.usgs.gov/monitoring-location/05586300,FALSE,2023-12-20T22:59:24Z
-USGS-05558300,usgsrc4cast,USGS,05558300,"ILLINOIS RIVER AT HENRY, IL",ST,41.10727778,-89.3562222,https://waterdata.usgs.gov/monitoring-location/05558300,FALSE,2023-12-20T22:59:24Z
-USGS-05553700,usgsrc4cast,USGS,05553700,"ILLINOIS RIVER AT STARVED ROCK, IL",ST,41.3247564,-88.9839693,https://waterdata.usgs.gov/monitoring-location/05553700,FALSE,2023-12-20T22:59:24Z
-USGS-05543010,usgsrc4cast,USGS,05543010,"ILLINOIS RIVER AT SENECA, IL",ST,41.29988889,-88.6141944,https://waterdata.usgs.gov/monitoring-location/05543010,FALSE,2023-12-20T22:59:24Z
-USGS-05549500,usgsrc4cast,USGS,05549500,"FOX RIVER NEAR MCHENRY, IL",ST,42.3100222,-88.2514745,https://waterdata.usgs.gov/monitoring-location/05549500,FALSE,2023-12-20T22:59:24Z
-USGS-01427510,usgsrc4cast,USGS,01427510,DELAWARE RIVER AT CALLICOON NY,ST,41.75675,-75.0574167,https://waterdata.usgs.gov/monitoring-location/01427510,FALSE,2023-12-20T22:59:24Z
-USGS-01463500,usgsrc4cast,USGS,01463500,Delaware River at Trenton NJ,ST,40.22166667,-74.7780556,https://waterdata.usgs.gov/monitoring-location/01463500,FALSE,2023-12-20T22:59:24Z
diff --git a/baseline_models/R/fablePersistenceModelFunction.R b/baseline_models/R/fablePersistenceModelFunction.R
deleted file mode 100644
index fbd6e69985..0000000000
--- a/baseline_models/R/fablePersistenceModelFunction.R
+++ /dev/null
@@ -1,78 +0,0 @@
-
-# Function to carry out a random walk forecast
-RW_daily_forecast <- function(site, var, h,
- bootstrap = FALSE, boot_number = 200,
- transformation = 'none', verbose = TRUE, ...) {
- # Work out when the forecast should start
- forecast_starts <- targets %>%
- dplyr::filter(!is.na(observation) & site_id == site & variable == var) %>%
- # Start the day after the most recent non-NA value
- dplyr::summarise(start_date = max(datetime) + lubridate::days(1)) %>% # Date
- dplyr::mutate(h = (Sys.Date() - start_date) + h) %>% # Horizon value
- dplyr::ungroup()
-
- if (verbose) {
- message(
- site,
- ' ',
- var,
- ' forecast with transformation = ',
- transformation,
- ' and bootstrap = ',
- bootstrap
- )
- }
-
- # filter the targets data set to the site_var pair
- targets_use <- targets %>%
- dplyr::filter(site_id == site,
- variable == var) %>%
- tsibble::as_tsibble(key = c('variable', 'site_id'), index = 'datetime') %>%
- # add NA values up to today (index)
- tsibble::fill_gaps(.end = Sys.Date()) %>%
- # Remove the NAs appended at the end, so that the forecast starts from the last day with an observation,
- # rather than today
- dplyr::filter(datetime < forecast_starts$start_date)
-
- if (nrow(targets_use) == 0) {
- message('no targets available, no forecast run')
- empty_df <- data.frame('variable' = character(),
- 'site_id' = character(),
- '.model' = character(),
- 'datetime' = lubridate::ymd(),
- '.rep' = character(),
- '.sim' = numeric())
-
- return(empty_df)
-
- } else {
- if (transformation == 'log') {
- RW_model <- targets_use %>%
- fabletools::model(RW = fable::RW(log(observation)))
- }
- if (transformation == 'log1p') {
- RW_model <- targets_use %>%
- fabletools::model(RW = fable::RW(log1p(observation)))
- }
- if (transformation == 'none') {
- RW_model <- targets_use %>%
- fabletools::model(RW = fable::RW(observation))
- }
- if (transformation == 'sqrt') {
- RW_model <- targets_use %>%
- fabletools::model(RW = fable::RW(sqrt(observation)))
- }
-
- if (bootstrap) {
- forecast <- RW_model %>% fabletools::generate(
- h = as.numeric(forecast_starts$h),
- bootstrap = TRUE,
- times = boot_number
- )
- } else {
- forecast <- RW_model %>% fabletools::forecast(h = as.numeric(forecast_starts$h))
- }
- message('forecast finished')
- return(forecast)
- }
-
-}
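-
-# Illustrative call (a sketch, not part of the automated pipeline). Note that
-# RW_daily_forecast() reads a `targets` data frame from the calling
-# environment, so one with datetime/site_id/variable/observation columns must
-# exist first. The site and settings below mirror the baseline scripts.
-if (FALSE) {
-  fc <- RW_daily_forecast(site = "USGS-14211720", var = "chla", h = 35,
-                          bootstrap = TRUE, boot_number = 200,
-                          transformation = "log", verbose = TRUE)
-}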
diff --git a/baseline_models/R/randomWalkNullModelFunction.R b/baseline_models/R/randomWalkNullModelFunction.R
deleted file mode 100644
index ae52629bf5..0000000000
--- a/baseline_models/R/randomWalkNullModelFunction.R
+++ /dev/null
@@ -1,51 +0,0 @@
-###Random Walk Null Model
-##' Creates a random walk phenology forecast model based on PhenoCam and MODIS data
-##'
-##' @param data The data as a list; the JAGS model below expects data$p (observations), data$p.prec (observation precisions), and data$N (the number of timesteps)
-##' @param nchain The desired number of chains in the MCMC
-##' @param priorCal If a calibration period has been performed, enter the priors here as a list (if not, do not include)
-##' @export
-##' @import rjags
-##' @import coda
-randomWalkPhenoModel <- function(data,nchain,priorCal=FALSE){
- ##Set priors
- if(typeof(priorCal)==typeof(FALSE)){ ##Done when there was not a calibration period performed separately (or if this is the calibration)
- data$s1.proc <- 1262.626
- data$s2.proc <- 50.50505
- data$x1.a <- 1 #Done to keep distribution close to 0 (over 75% of the data <0.05)
- data$x1.b <- 30
- #data$s1.PC <- 1262.626 ## Very roughly based off of what I think are reasonable and uninformed priors
- #data$s2.PC <- 50.50505 ##From mean <- 1/(0.2**2) and var = (mean-1/((0.4/1.96)**2))/2
- }
-
- ###JAGS model
- RandomWalk <- "
- model{
-
- #### Data Models
- for(i in 1:N){
- p[i] ~ dnorm(x[i],p.prec[i])
- }
-
- #### Process Model
- for(i in 2:N){
- xl[i]~dnorm(x[i-1],p.proc)
- x[i] <- max(0, min(1,xl[i]))
- }
-
- #### Priors
- x[1] ~ dbeta(x1.a,x1.b)
- #p.PC ~ dgamma(s1.PC,s2.PC)
- p.proc ~ dgamma(s1.proc,s2.proc)
-
- }
- "
-
- ###Create the JAGS model using the basic RandomWalk Model
-
- j.model <- jags.model(file = textConnection(RandomWalk),
- data = data,
- n.chains = nchain)
- return(j.model)
-}
-
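-# Illustrative usage (a sketch; the `gcc_mean`/`gcc_sd` inputs are placeholders,
-# and the list fields follow the data model in the JAGS code above):
-if (FALSE) {
-  data <- list(p = gcc_mean, p.prec = 1 / gcc_sd^2, N = length(gcc_mean))
-  j.model <- randomWalkPhenoModel(data = data, nchain = 3)
-  samples <- rjags::coda.samples(j.model, variable.names = c("x", "p.proc"),
-                                 n.iter = 5000)
-}
-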
diff --git a/baseline_models/models/aquatics_climatology.R b/baseline_models/models/aquatics_climatology.R
deleted file mode 100644
index cec60aae09..0000000000
--- a/baseline_models/models/aquatics_climatology.R
+++ /dev/null
@@ -1,128 +0,0 @@
-#'# Ecological Forecasting Initiative Null Model
-
-#'## Set-up
-source("R/eco4cast-helpers/submit.R")
-source("R/eco4cast-helpers/forecast_output_validator.R")
-
-#' Required packages.
-#' EFIstandards is at remotes::install_github("eco4cast/EFIstandards")
-library(tidyverse)
-library(lubridate)
-library(aws.s3)
-library(jsonlite)
-library(imputeTS)
-#' set the random seed for reproducibility
-set.seed(329)
-
-config <- yaml::read_yaml("challenge_configuration.yaml")
-
-#'Team name code
-team_name <- "climatology"
-
-#'Read in target file.
-targets <- readr::read_csv(config$target_groups$aquatics$targets_file,
- show_col_types = FALSE)
-
-sites <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/",
- "raw/prod/USGS_site_metadata.csv"),
- show_col_types = FALSE)
-
-# calculate a day-of-year (doy) mean and standard deviation for each target variable at each site
-target_clim <- targets %>%
- mutate(doy = yday(datetime),
- year = year(datetime)) %>%
- filter(year < year(Sys.Date())) |>
- group_by(doy, site_id, variable) %>%
- summarise(mean = mean(observation, na.rm = TRUE),
- sd = sd(observation, na.rm = TRUE),
- .groups = "drop") %>%
- mutate(mean = ifelse(is.nan(mean), NA, mean))
-
-curr_month <- month(Sys.Date())
-if(curr_month < 10){
- curr_month <- paste0("0", curr_month)
-}
-
-
-curr_year <- year(Sys.Date())
-start_date <- Sys.Date() + days(1)
-
-forecast_dates <- seq(start_date, as_date(start_date + days(34)), "1 day")
-forecast_doy <- yday(forecast_dates)
-
-forecast_dates_df <- tibble(datetime = forecast_dates,
- doy = forecast_doy)
-
-forecast <- target_clim %>%
- mutate(doy = as.integer(doy)) %>%
- filter(doy %in% forecast_doy) %>%
- full_join(forecast_dates_df, by = 'doy') %>%
- arrange(site_id, datetime)
-
-subseted_site_names <- unique(forecast$site_id)
-site_vector <- rep(subseted_site_names, each = length(forecast_dates))
-
-forecast_tibble <- tibble(datetime = rep(forecast_dates, length(subseted_site_names)),
- site_id = site_vector,
- variable = "chla")
-
-forecast <- right_join(forecast, forecast_tibble, by = c('datetime', 'site_id', 'variable'))
-
-forecast |>
- ggplot(aes(x = datetime, y = mean)) +
- geom_point() +
- facet_grid(site_id ~ variable, scale = "free")
-
-combined <- forecast %>%
- select(datetime, site_id, variable, mean, sd) %>%
- group_by(site_id, variable) %>%
- # remove rows where all in group are NA
- filter(all(!is.na(mean))) %>%
- # retain rows where group size >= 2, to allow interpolation
- filter(n() >= 2) %>%
- mutate(mu = imputeTS::na_interpolation(mean),
- sigma = median(sd, na.rm = TRUE)) %>%
- pivot_longer(c("mu", "sigma"),names_to = "parameter", values_to = "prediction") |>
- mutate(family = "normal") |>
- ungroup() |>
- mutate(reference_datetime = lubridate::as_date(min(datetime)) - lubridate::days(1),
- model_id = "climatology",
- project_id = "usgsrc4cast",
- duration = "P1D") |>
- select(project_id, model_id, datetime, reference_datetime, duration, site_id, family, parameter, variable, prediction)
-
-combined |>
- filter(parameter == "mu") |>
- ggplot(aes(x = datetime, y = prediction)) +
- geom_point() +
- facet_grid(site_id ~ variable, scale = "free")
-
-
-# plot the forecasts
-combined %>%
- select(datetime, prediction, parameter, variable, site_id) %>%
- pivot_wider(names_from = parameter, values_from = prediction) %>%
- ggplot(aes(x = datetime)) +
- geom_ribbon(aes(ymin=mu - sigma*1.96, ymax=mu + sigma*1.96), alpha = 0.1) +
- geom_line(aes(y = mu)) +
- facet_wrap(~site_id, scales = "free") +
- theme_bw()
-
-file_date <- combined$reference_datetime[1]
-
-forecast_file <- paste("usgsrc4cast", file_date, "climatology.csv.gz", sep = "-")
-
-write_csv(combined, forecast_file)
-
-# using function in R/eco4cast-helpers/ to submit to sub-folder in submit bucket
-submit(forecast_file = forecast_file,
- project_id = "usgsrc4cast",
- ask = FALSE)
-
-unlink(forecast_file)
-
-
diff --git a/baseline_models/models/aquatics_persistenceRW.R b/baseline_models/models/aquatics_persistenceRW.R
deleted file mode 100644
index a0ec35d4b5..0000000000
--- a/baseline_models/models/aquatics_persistenceRW.R
+++ /dev/null
@@ -1,68 +0,0 @@
-library(tidyverse)
-library(tsibble)
-library(fable)
-source('R/fablePersistenceModelFunction.R')
-source("R/eco4cast-helpers/submit.R")
-source("R/eco4cast-helpers/forecast_output_validator.R")
-
-
-# 1.Read in the targets data
-config <- yaml::read_yaml("challenge_configuration.yaml")
-#'Read in target file.
-targets <- readr::read_csv(config$target_groups$aquatics$targets_file,
- show_col_types = FALSE) %>%
- mutate(observation = ifelse(observation == 0 & variable == "chla", 0.00001, observation))
-
-# 2. Make the targets into a tsibble with explicit gaps
-targets_ts <- targets %>%
- as_tsibble(key = c('variable', 'site_id'), index = 'datetime') %>%
- # add NA values up to today (index)
- fill_gaps(.end = Sys.Date())
-
-# 3. Run through each site-variable combination via purrr::pmap
-# Requires a dataframe with one column per argument of the RW_daily_forecast function
-site_var_combinations <- expand.grid(site = unique(targets$site_id),
- var = unique(targets$variable)) %>%
- # assign the transformation depending on the variable. chla and oxygen get a log(x) transformation
- mutate(transformation = ifelse(var %in% c('chla', 'oxygen'), 'log', 'none')) %>%
- mutate(boot_number = 200,
- h = 35,
- bootstrap = TRUE,
- verbose = TRUE)
-
-# runs the RW forecast for each combination of variable and site_id
-RW_forecasts <- purrr::pmap_dfr(site_var_combinations, RW_daily_forecast)
-
-# convert the output into EFI standard
-RW_forecasts_EFI <- RW_forecasts %>%
- rename(parameter = .rep,
- prediction = .sim) %>%
- # For the EFI challenge we only want forecasts for future dates
- filter(datetime > Sys.Date()) %>%
- group_by(site_id, variable) %>%
- mutate(reference_datetime = min(datetime) - lubridate::days(1),
- family = "ensemble",
- model_id = "persistenceRW",
- project_id = "usgsrc4cast",
- duration = "P1D") %>%
- select(project_id, model_id, datetime, reference_datetime, duration, site_id, family, parameter, variable, prediction)
-
-# 4. Write forecast file
-
-file_date <- RW_forecasts_EFI$reference_datetime[1]
-
-## TODO: does this need to be renamed?
-forecast_file <- paste("usgsrc4cast", file_date, "persistenceRW.csv.gz", sep = "-")
-
-RW_forecasts_EFI <- RW_forecasts_EFI |>
- filter(variable != "ch")
-
-write_csv(RW_forecasts_EFI, forecast_file)
-
-# using function in R/eco4cast-helpers/ to submit to sub-folder in submit bucket
-submit(forecast_file = forecast_file,
- project_id = "usgsrc4cast",
- metadata = NULL,
- ask = FALSE)
-
-unlink(forecast_file)
diff --git a/baseline_models/run_aquatics_baselines.R b/baseline_models/run_aquatics_baselines.R
deleted file mode 100644
index 8a540a0af1..0000000000
--- a/baseline_models/run_aquatics_baselines.R
+++ /dev/null
@@ -1,15 +0,0 @@
-#renv::restore()
-
-print(paste0("Running Creating Aquatics baselines at ", Sys.time()))
-
-
-print(paste0("Running daily climatology at ", Sys.time()))
-source("baseline_models/models/aquatics_climatology.R")
-print(paste0("Completed daily climatology ", Sys.time()))
-
-print(paste0("Running daily persistence at ", Sys.time()))
-source("baseline_models/models/aquatics_persistenceRW.R")
-print(paste0("Completed daily persistence ", Sys.time()))
-
-# TODO: not sure what this is here for
-# RCurl::url.exists("https://hc-ping.com/a848914e-9abf-45e4-bcf3-27f570cc3623")
diff --git a/catalog/R/catalog-common.R b/catalog/R/catalog-common.R
deleted file mode 100644
index 8ec479c84d..0000000000
--- a/catalog/R/catalog-common.R
+++ /dev/null
@@ -1,4 +0,0 @@
-conformsTo = list(
- "https=//api.stacspec.org/v1.0.0-rc.1/collections",
- "https=//api.stacspec.org/v1.0.0-rc.1/core"
-)
diff --git a/catalog/R/sites.R b/catalog/R/sites.R
deleted file mode 100644
index ccbe7b0932..0000000000
--- a/catalog/R/sites.R
+++ /dev/null
@@ -1,53 +0,0 @@
-
-source('R/stac_functions.R')
-config <- yaml::read_yaml('challenge_configuration.yaml')
-catalog_config <- config$catalog_config
-
-## READ S3 INVENTORY FOR DATES
-# s3_inventory <- arrow::s3_bucket("neon4cast-inventory",
-# endpoint_override = "data.ecoforecast.org",
-# anonymous = TRUE)
-
-# s3_inventory <- arrow::s3_bucket(config$inventory_bucket,
-# endpoint_override = config$endpoint,
-# anonymous = TRUE)
-
-s3_df <- duckdbfs::open_dataset(glue::glue("s3://anonymous@{config$inventory_bucket}/catalog?endpoint_override=",config$endpoint)) |>
- #dplyr::filter(...1 == "parquet", ...2 == {theme}) |>
- dplyr::select(model_id, reference_date, date) |>
- # dplyr::mutate(model_id = gsub("model_id=", "", model_id),
- # reference_datetime =
- # gsub("reference_datetime=", "", reference_datetime),
- # date = gsub("date=", "", date)) |>
- dplyr::collect()
-
-#s3_df <- stac4cast::get_grouping(config$inventory_bucket, "aquatics")
-
-s3_df <- s3_df |> filter(model_id != 'null')
-
-theme_max_date <- max(s3_df$date)
-theme_min_date <- min(s3_df$date)
-
-theme_sites <- read_csv("https://raw.githubusercontent.com/eco4cast/neon4cast-targets/main/NEON_Field_Site_Metadata_20220412.csv", col_types = cols())
-theme_sites$site_lat_lon <- lapply(1:nrow(theme_sites), function(i) c(theme_sites$field_longitude[i], theme_sites$field_latitude[i]))
-
-build_description <- "This collection contains information to describe the NEON sites included in the forecasting challenge"
-
-build_site_item(theme_id = 'sites',
- start_date = '2000-01-01',
- end_date = Sys.Date(),
- destination_path = 'stac/sites',
- theme_title = 'NEON Sites',
- collection_name = 'NEON Sites',
- #thumbnail_link = 'https://projects.ecoforecast.org/neon4cast-catalog/img/BONA_Twr.jpg',
- thumbnail_link = 'https://www.neonscience.org/sites/default/files/styles/max_2600x2600/public/2021-04/2021_04_graphic_Domain_Map_no-Titles-png.png?itok=7MsHPigZ',
- site_coords = theme_sites$site_lat_lon)
-
-# build_site_theme(start_date = '2000-01-01',
-# end_date = Sys.Date(),
-# id_value = 'efi-sites',
-# theme_description = build_description,
-# theme_title = 'NEON Sites',
-# destination_path = "stac/sites/",
-# thumbnail_link = "https://projects.ecoforecast.org/neon4cast-catalog/img/BONA_Twr.jpg",
-# thumbnail_title = 'NEON Sites')
diff --git a/catalog/R/stac_functions.R b/catalog/R/stac_functions.R
deleted file mode 100644
index e8d0fb15f9..0000000000
--- a/catalog/R/stac_functions.R
+++ /dev/null
@@ -1,900 +0,0 @@
-## MODEL level functions
-
-generate_authors <- function(metadata_table, index){
-
- x <- list(list('url' = 'pending',
- 'name' = 'pending',
- 'roles' = list("producer",
- "processor",
- "licensor"))
- )
-}
-
-generate_model_assets <- function(m_vars, m_duration, aws_path){
-
- metadata_json_asset <- list(
- "1"= list(
- 'type'= 'application/json',
- 'title' = 'Model Metadata',
- 'href' = paste0("https://", config$endpoint,"/",
- config$model_metadata_bucket,"/",
- "project_id=",config$project_id, "/",
- m,".json"),
- 'description' = paste0("Use `jsonlite::fromJSON()` to download the model metadata JSON file. This R code will return metadata provided during the model registration.
- \n\n### R\n\n```{r}\n# Use code below\n\nmodel_metadata <- jsonlite::fromJSON(",paste0('"','https://', config$endpoint,'/', config$model_metadata_bucket,'/', 'project_id=', config$project_id, '/', m,'.json"'),")\n```\n\n")
- )
- )
-
- iterator_list <- seq_along(m_vars)
-
- model_data_assets <- purrr::map(iterator_list, function(i)
- list(
- 'type'= 'application/x-parquet',
- 'title' = paste0('Database Access for ',m_vars[i],' ', m_duration[i]),
- 'href' = paste0("s3://anonymous@",
- aws_path,
- "/parquet/duration=P1D/variable=", m_vars[i],
- "/model_id=", m,
- "?endpoint_override=",config$endpoint),
- 'description' = paste0("Use `arrow` for remote access to the database. This R code will return results for this variable and model combination.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(",paste0("s3://anonymous@",
- aws_path,
- "/parquet/duration=P1D/variable=", m_vars[i],
- "/model_id=", m,
- "?endpoint_override=",config$endpoint),")\ndf <- all_results |> dplyr::collect()\n\n```
- \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n")
- )
- )
-
- model_assets <- c(metadata_json_asset, model_data_assets)
-
- return(model_assets)
-}
-
-
-build_model <- function(model_id,
- theme_id,
- team_name,
- model_description,
- start_date,
- end_date,
- use_metadata,
- var_values,
- duration_names,
- site_values,
- model_documentation,
- destination_path,
- description_path,
- aws_download_path,
- theme_title,
- collection_name,
- thumbnail_image_name,
- table_schema,
- table_description) {
-
-
- preset_keywords <- list("Forecasting", config$project_id)
- variables_reformat <- paste(var_values, collapse = ", ")
- site_reformat <- paste(site_values, collapse = ", ")
-
- aws_asset_link <- paste0("s3://anonymous@",
- aws_download_path,
- "/model_id=", model_id,
- "?endpoint_override=",config$endpoint)
-
- aws_asset_description <- paste0("Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(",aws_asset_link,")\ndf <- all_results |> dplyr::collect()\n\n```
- \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n")
-
- meta <- list(
- "stac_version"= "1.0.0",
- "stac_extensions"= list('https://stac-extensions.github.io/table/v1.2.0/schema.json'),
- "type"= "Feature",
- "id"= model_id,
- "bbox"=
- list(list(as.numeric(catalog_config$bbox$min_lon),
- as.numeric(catalog_config$bbox$min_lat),
- as.numeric(catalog_config$bbox$max_lon),
- as.numeric(catalog_config$bbox$max_lat))),
- "geometry"= list(
- "type"= catalog_config$site_type,
- "coordinates"= get_site_coords(sites = site_values)
- ),
- "properties"= list(
- #'description' = model_description,
- "description" = glue::glue('
-
- model info: {model_description}
-
- Sites: {site_reformat}
-
- Variables: {variables_reformat}
-'),
- "start_datetime" = start_date,
- "end_datetime" = end_date,
- "providers"= c(generate_authors(metadata_table = model_documentation),list(
- list(
- "url"= catalog_config$host_url,
- "name"= catalog_config$host_name,
- "roles"= list(
- "host"
- )
- )
- )
- ),
- "license"= "CC0-1.0",
- "keywords"= c(preset_keywords, variables_reformat),
- #"table:columns" = stac4cast::build_table_columns_full_bucket(table_schema, table_description)
- "table:columns" = build_table_columns_full_bucket(table_schema, table_description)
- ),
- "collection"= collection_name,
- "links"= list(
- list(
- "rel"= "collection",
- 'href' = '../collection.json',
- "type"= "application/json",
- "title"= theme_title
- ),
- list(
- "rel"= "root",
- 'href' = '../../../catalog.json',
- "type"= "application/json",
- "title"= "Forecast Catalog"
- ),
- list(
- "rel"= "parent",
- 'href' = '../collection.json',
- "type"= "application/json",
- "title"= theme_title
- ),
- list(
- "rel"= "self",
- "href" = paste0(model_id,'.json'),
- "type"= "application/json",
- "title"= "Model Forecast"
- )),
- "assets"= generate_model_assets(var_values, duration_names, aws_download_path)#,
- #pull_images(theme_id,model_id,thumbnail_image_name)
- )
-
-
- dest <- destination_path
- json <- file.path(dest, paste0(model_id, ".json"))
-
- jsonlite::write_json(meta,
- json,
- pretty=TRUE,
- auto_unbox=TRUE)
- stac4cast::stac_validate(json)
-
- rm(meta)
-}
-
-get_grouping <- function(inv_bucket,
- theme,
- collapse=TRUE) {
-
- groups <- duckdbfs::open_dataset(glue::glue("s3://anonymous@{inv_bucket}/catalog?endpoint_override=",config$endpoint)) |>
- dplyr::filter(...1 == "parquet", ...2 == {theme}) |>
- dplyr::select(model_id = ...3, reference_datetime = ...4, date = ...5) |>
- dplyr::mutate(model_id = gsub("model_id=", "", model_id),
- reference_datetime =
- gsub("reference_datetime=", "", reference_datetime),
- date = gsub("date=", "", date)) |>
- dplyr::collect()
-
-}
-
-# DON'T USE THIS FUNCTION ANYMORE -- WAS USED FOR ORIGINAL NEON4CAST STAC CODE (KEEPING THIS FOR REFERENCE BUT DELETE EVENTUALLY)
-# generate_vars_sites <- function(m_id, theme){
-#
-# # if (m_id %in% c('GLEON_JRabaey_temp_physics','GLEON_lm_lag_1day','GLEON_physics','USGSHABs1','air2waterSat_2','fARIMA')){
-# # output_info <- c('pending','pending')
-# # } else{
-#
-# # do this for each theme / model
-# # info_df <- arrow::open_dataset(info_extract$path(glue::glue("{theme}/model_id={m_id}/"))) |>
-# # #filter(reference_datetime == "2023-06-18")|> #just grab one EM to limit processing
-# # collect()
-# #
-# info_df <- duckdbfs::open_dataset(glue::glue("s3://anonymous@neon4cast-scores/parquet/{theme}/
-# model_id={model_id}/reference_datetime={reference_datetime}?endpoint_override=sdsc.osn.xsede.org")) |>
-# collect()
-#
-# if ('siteID' %in% names(info_df)){
-# info_df <- info_df |>
-# rename(site_id = siteID)
-# }
-#
-# vars_vector <- sort(unique(info_df$variable))
-# sites_vector <- sort(unique(info_df$site_id))
-#
-# vars_list <- as.list(sort(unique(info_df$variable)))
-# sites_list <- as.list(sort(unique(info_df$site_id)))
-#
-# # output_vectors <- c(paste(vars_vector, collapse = ', '),
-# # paste(sites_vector, collapse = ', '))
-#
-# output_list <- list(vars_list,sites_list)
-#
-# full_object <- list(vars_vector, sites_vector, output_list)
-#
-# return(full_object)
-# }
-
-
-## FORECAST LEVEL FUNCTIONS
-generate_model_items <- function(model_list){
-
- x <- purrr::map(model_list, function(i)
- list(
- "rel" = 'item',
- 'type'= 'application/json',
- 'href' = paste0('model_items/',i,'.json'))
- )
-
- return(x)
-}
-
-pull_images <- function(theme, m_id, image_name){
-
- info_df <- arrow::open_dataset(info_extract$path(glue::glue("{theme}/model_id={m_id}/"))) |>
- collect()
-
- sites_vector <- sort(unique(info_df$site_id))
-
- base_path <- catalog_config$base_image_path
-
- image_assets <- purrr::map(sites_vector, function(i)
- #url_validator <- Rcurl::url.exists(file.path(base_path,theme,m_id,i,image_name))
- list(
- "href"= file.path(base_path,theme,m_id,i,image_name),
- "type"= "image/png",
- "title"= paste0('Latest Results for ', i),
- "description"= 'Image from s3 storage',
- "roles" = list('thumbnail')
- )
- )
-
- ## check if image rendered successfully on bucket. If not remove from assets
- item_remove <- c()
-
- if (image_name == 'latest_scores.png'){
- for (item in seq_along(image_assets)){
- url_validator <- RCurl::url.exists(image_assets[[item]]$href)
- if(url_validator == FALSE){
- print(paste0('Removing ', image_assets[[item]]$title))
- item_remove <- append(item_remove,item)
- }
- }
- if (length(item_remove) > 0){
- image_assets <- image_assets[-item_remove]
- }
- }
-
- return(image_assets)
-
-}
-
-
-get_site_coords <- function(site_metadata, sites){
-
- site_df <- read_csv(site_metadata)
-
- # site_df <- data.frame(site_id = c('fcre', 'bvre', 'ccre'),
- # site_lon = c(-79.837217, -79.815936, -79.95856),
- # site_lat = c(37.303153, 37.312909, 37.370259))
-
- site_lat_lon <- lapply(sites, function(i) c(site_df$latitude[which(site_df[,2] == i)], site_df$longitude[which(site_df[,2] == i)]))
-
- return(site_lat_lon)
-}
-
-
-generate_group_values <- function(group_values){
-
- x <- purrr::map(group_values, function(i)
- list(
- "rel" = "child",
- "type" = "application/json",
- "href" = paste0(i,"/collection.json"),
- "title" = i)
- )
-
- return(x)
-}
-
-
-build_forecast_scores <- function(table_schema,
- theme_id,
- table_description,
- start_date,
- end_date,
- id_value,
- description_string,
- about_string,
- about_title,
- theme_title,
- model_documentation,
- destination_path,
- aws_download_path,
- link_items,
- thumbnail_link,
- thumbnail_title
-){
-
- aws_asset_link <- paste0("s3://anonymous@",
- aws_download_path,
- #"/model_id=", model_id,
- "?endpoint_override=",config$endpoint)
-
- aws_asset_description <- paste0("Use `arrow` for remote access to the database. This R code will return results for the EFI-USGS River Chlorophyll Forecasting Challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(",aws_asset_link,")\ndf <- all_results |> dplyr::collect()\n\n```
- \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n")
- forecast_score <- list(
- "id" = id_value,
- "description" = description_string,
- "stac_version"= "1.0.0",
- "license"= "CC0-1.0",
- "stac_extensions"= list("https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"),
- 'type' = 'Collection',
- 'links' = c(link_items, #generate_model_items()
- list(
- list(
- "rel" = "child",
- "type" = "application/json",
- "href" = "models/collection.json",
- "title" = "group item"
- ),
- list(
- "rel" = "parent",
- "type"= "application/json",
- "href" = '../catalog.json'
- ),
- list(
- "rel" = "root",
- "type" = "application/json",
- "href" = '../catalog.json'
- ),
- list(
- "rel" = "self",
- "type" = "application/json",
- "href" = 'collection.json'
- ),
- list(
- "rel" = "cite-as",
- "href" = catalog_config$citation_doi
- ),
- list(
- "rel" = "about",
- "href" = about_string,
- "type" = "text/html",
- "title" = about_title
- ),
- list(
- "rel" = "describedby",
- "href" = catalog_config$dashboard_url,
- "title" = catalog_config$dashboard_title,
- "type" = "text/html"
- )
- )),
- "title" = theme_title,
- "extent" = list(
- "spatial" = list(
- 'bbox' = list(list(as.numeric(catalog_config$bbox$min_lon),
- as.numeric(catalog_config$bbox$min_lat),
- as.numeric(catalog_config$bbox$max_lon),
- as.numeric(catalog_config$bbox$max_lat)))),
- "temporal" = list(
- 'interval' = list(list(
- paste0(start_date,"T00:00:00Z"),
- paste0(end_date,"T00:00:00Z"))
- ))
- ),
- #"table:columns" = stac4cast::build_table_columns_full_bucket(table_schema, table_description),
- "table:columns" = build_table_columns_full_bucket(table_schema, table_description),
-
- 'assets' = list(
- # 'data' = list(
- # "href"= model_documentation,
- # "type"= "text/csv",
- # "roles" = list('data'),
- # "title"= "NEON Field Site Metadata",
- # "description"= readr::read_file(model_metadata_path)
- # ),
- 'data' = list(
- "href" = aws_asset_link,
- "type"= "application/x-parquet",
- "title"= 'Database Access',
- "roles" = list('data'),
- "description"= aws_asset_description
- ),
- 'thumbnail' = list(
- "href"= thumbnail_link,
- "type"= "image/JPEG",
- "roles" = list('thumbnail'),
- "title"= thumbnail_title
- )
- )
- )
-
-
- dest <- destination_path
- json <- file.path(dest, "collection.json")
-
- jsonlite::write_json(forecast_score,
- json,
- pretty=TRUE,
- auto_unbox=TRUE)
- stac4cast::stac_validate(json)
-}
-
-
-generate_group_variable_items <- function(variables){
-
-
- var_values <- variables
-
- x <- purrr::map(var_values, function(i)
- list(
- "rel" = 'child',
- 'type'= 'application/json',
- 'href' = paste0(i,'/collection.json'))
- )
-
- return(x)
-}
-
-generate_variable_model_items <- function(model_list){
-
-
- #var_values <- variables
-
- x <- purrr::map(model_list, function(i)
- list(
- "rel" = 'item',
- 'type'= 'application/json',
- 'href' = paste0('../../models/model_items/',i,'.json'))
- )
-
- return(x)
-}
-
-build_group_variables <- function(table_schema,
- theme_id,
- table_description,
- start_date,
- end_date,
- id_value,
- description_string,
- about_string,
- about_title,
- theme_title,
- destination_path,
- aws_download_path,
- group_var_items
-){
-
- aws_asset_link <- paste0("s3://anonymous@",
- aws_download_path,
- #"/model_id=", model_id,
- "?endpoint_override=",config$endpoint)
-
- aws_asset_description <- paste0("Use `arrow` for remote access to the database. This R code will return results for the EFI-USGS River Chlorophyll Forecasting Challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(",aws_asset_link,")\ndf <- all_results |> dplyr::collect()\n\n```
- \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n")
- forecast_score <- list(
- "id" = id_value,
- "description" = description_string,
- "stac_version"= "1.0.0",
- "license"= "CC0-1.0",
- "stac_extensions"= list("https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"),
- 'type' = 'Collection',
- 'links' = c(group_var_items,#generate_group_variable_items(variables = group_var_values)
- list(
- list(
- "rel" = "parent",
- "type"= "application/json",
- "href" = '../collection.json'
- ),
- list(
- "rel" = "root",
- "type" = "application/json",
- "href" = '../collection.json'
- ),
- list(
- "rel" = "self",
- "type" = "application/json",
- "href" = 'collection.json'
- ),
- list(
- "rel" = "cite-as",
- "href" = "https://doi.org/10.1002/fee.2616"
- ),
- list(
- "rel" = "about",
- "href" = about_string,
- "type" = "text/html",
- "title" = about_title
- ),
- list(
- "rel" = "describedby",
- "href" = "https://ltreb-reservoirs.github.io/vera4cast/", # TODO: update this to something?
- "title" = "EFI-USGS Forecast Challenge Dashboard",
- "type" = "text/html"
- )
- )),
- "title" = theme_title,
- "extent" = list(
- "spatial" = list(
- 'bbox' = list(list(as.numeric(catalog_config$bbox$min_lon),
- as.numeric(catalog_config$bbox$min_lat),
- as.numeric(catalog_config$bbox$max_lon),
- as.numeric(catalog_config$bbox$max_lat)))),
- "temporal" = list(
- 'interval' = list(list(
- paste0(start_date,"T00:00:00Z"),
- paste0(end_date,"T00:00:00Z"))
- ))
- ),
- #"table:columns" = stac4cast::build_table_columns_full_bucket(table_schema, table_description),
- "table:columns" = build_table_columns_full_bucket(table_schema, table_description),
- 'assets' = list(
- 'data' = list(
- "href" = aws_asset_link,
- "type"= "application/x-parquet",
- "title"= 'Database Access',
- "roles" = list('data'),
- "description"= aws_asset_description
- )
- )
- )
-
-
- dest <- destination_path
- json <- file.path(dest, 'collection.json')
-
- jsonlite::write_json(forecast_score,
- json,
- pretty=TRUE,
- auto_unbox=TRUE)
- stac4cast::stac_validate(json)
-}
-
-# build_theme <- function(start_date,end_date, id_value, theme_description, theme_title, destination_path, thumbnail_link, thumbnail_title){
-#
-# theme <- list(
-# "id" = id_value,
-# "type" = "Collection",
-# "links" = list(
-# list(
-# "rel" = "child",
-# "type" = "application/json",
-# "href" = 'forecasts/collection.json',
-# "title" = 'forecast item'
-# ),
-# list(
-# "rel" = "child",
-# "type" = "application/json",
-# "href" = 'scores/collection.json',
-# "title" = 'scores item'
-# ),
-# list(
-# "rel"= "parent",
-# "type"= "application/json",
-# "href"= "../catalog.json",
-# "title" = 'parent'
-# ),
-# list(
-# "rel"= "root",
-# "type"= "application/json",
-# "href"= "../catalog.json",
-# "title" = 'root'
-# ),
-# list(
-# "rel"= "self",
-# "type"= "application/json",
-# "href" = 'collection.json',
-# "title" = 'self'
-# ),
-# list(
-# "rel" ="cite-as",
-# "href"= catalog_config$citation_link,
-# "title" = "citation"
-# ),
-# list(
-# "rel"= "about",
-# "href"= catalog_config$about_string,
-# "type"= "text/html",
-# "title"= catalog_config$about_title
-# ),
-# list(
-# "rel"= "describedby",
-# "href"= catalog_config$about_string,
-# "title"= catalog_config$about_title,
-# "type"= "text/html"
-# )
-# ),
-# "title"= theme_title,
-# 'assets' = list(
-# 'thumbnail' = list(
-# "href"= thumbnail_link,
-# "type"= "image/JPEG",
-# "roles" = list('thumbnail'),
-# "title"= thumbnail_title
-# )
-# ),
-# "extent" = list(
-# "spatial" = list(
-# 'bbox' = list(list(as.numeric(catalog_config$bbox$min_lon),
-# as.numeric(catalog_config$bbox$min_lat),
-# as.numeric(catalog_config$bbox$max_lon),
-# as.numeric(catalog_config$bbox$max_lat)))
-# ),
-# "temporal" = list(
-# 'interval' = list(list(
-# paste0(start_date,'T00:00:00Z'),
-# paste0(end_date,'T00:00:00Z'))
-# ))
-# ),
-# "license" = "CC0-1.0",
-# "keywords" = list(
-# "Forecasting",
-# "Data",
-# "Ecology"
-# ),
-# "providers" = list(
-# list(
-# "url"= catalog_config$host_url,
-# "name"= catalog_config$host_name,
-# "roles" = list(
-# "producer",
-# "processor",
-# "licensor"
-# )
-# ),
-# list(
-# "url"= catalog_config$host_url,
-# "name"= catalog_config$host_name,
-# "roles" = list('host')
-# )
-# ),
-# "description" = theme_description,
-# "stac_version" = "1.0.0",
-# "stac_extensions" = list(
-# "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
-# "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
-# "https://stac-extensions.github.io/table/v1.2.0/schema.json"
-# ),
-# "publications" = list(
-# "doi" = catalog_config$citation_doi,
-# "citation"= catalog_config$citation_text
-# )
-# )
-#
-#
-# dest <- destination_path
-# json <- file.path(dest, "collection.json")
-#
-# jsonlite::write_json(theme,
-# json,
-# pretty=TRUE,
-# auto_unbox=TRUE)
-# stac4cast::stac_validate(json)
-# }
-
-
-
-
-## ADD PLACEHOLDER FUNCTION FOR STAC4CAST TABLE BUILD
-build_table_columns_full_bucket <- function(data_object,description_df){
-
- full_string_list <- strsplit(data_object$ToString(),'\n')[[1]]
-
- #create initial empty list
- init_list = vector(mode="list", length = data_object$num_cols)
-
- ## loop through parquet df and description information to build the list
- for (i in seq.int(1,data_object$num_cols)){
- list_items <- strsplit(full_string_list[i],': ')[[1]]
- col_list <- list(name = list_items[1],
- type = list_items[2],
- description = description_df[1,list_items[1]])
-
- init_list[[i]] <- col_list
-
- }
- return(init_list)
-}
-
-## WE DON'T USE THE FOLLOWING TWO FUNCTIONS ANYMORE. KEEPING THEM FOR REFERENCE BUT DELETE EVENTUALLY
-#' build_site_item <- function(theme_id,
-#' start_date,
-#' end_date,
-#' destination_path,
-#' theme_title,
-#' collection_name,
-#' thumbnail_link,
-#' site_coords) {
-#'
-#'
-#' preset_keywords <- list("Forecasting", "NEON")
-#'
-#' meta <- list(
-#' "stac_version"= "1.0.0",
-#' "stac_extensions"= list('https://stac-extensions.github.io/table/v1.2.0/schema.json'),
-#' "type"= "Feature",
-#' "id"= collection_name,
-#' "bbox"=
-#' list(-156.6194, 17.9696, -66.7987, 71.2824),
-#' "geometry"= list(
-#' "type"= "MultiPoint",
-#' "coordinates"= site_coords
-#' ),
-#' "properties"= list(
-#' #'description' = model_description,
-#' "description" = 'NEON Site Information',
-#' "start_datetime" = start_date,
-#' "end_datetime" = end_date,
-#' "providers"= list(
-#' list(
-#' "url"= "https://ecoforecast.org",
-#' "name"= "Ecoforecast Challenge",
-#' "roles"= list(
-#' "host"
-#' )
-#' )
-#' ),
-#' "license"= "CC0-1.0",
-#' "keywords"= c(preset_keywords),
-#' "table:columns" = build_site_metadata()
-#' ),
-#' "collection"= collection_name,
-#' "links"= list(
-#' list(
-#' "rel"= "catalog",
-#' 'href' = '../catalog.json',
-#' "type"= "application/json",
-#' "title"= theme_title
-#' ),
-#' list(
-#' "rel"= "root",
-#' 'href' = '../catalog.json',
-#' "type"= "application/json",
-#' "title"= "EFI Forecast Catalog"
-#' ),
-#' list(
-#' "rel"= "parent",
-#' 'href' = '../catalog.json',
-#' "type"= "application/json",
-#' "title"= theme_title
-#' ),
-#' list(
-#' "rel"= "self",
-#' "href" = 'collection.json',
-#' "type"= "application/json",
-#' "title"= "Raw JSON Text"
-#' ),
-#' list(
-#' "rel" ="cite-as",
-#' "href"= "https://doi.org/10.1002/fee.2616",
-#' "title" = "citation"
-#' ),
-#' list(
-#' "rel"= "about",
-#' "href"= "https://projects.ecoforecast.org/neon4cast-docs/",
-#' "type"= "text/html",
-#' "title"= "NEON Forecast Challenge Documentation"
-#' ),
-#' list(
-#' "rel"= "describedby",
-#' "href"= "https://www.neonscience.org/field-sites/explore-field-sites",
-#' "title"= "Explore the NEON Field Sites",
-#' "type"= "text/html"
-#' )),
-#' "assets"= list(
-#' 'data' = list(
-#' "href" = "https://raw.githubusercontent.com/eco4cast/neon4cast-targets/main/NEON_Field_Site_Metadata_20220412.csv",
-#' "type"= "text/plain",
-#' "title"= 'NEON Sites Table',
-#' "roles" = list('data'),
-#' "description"= 'Table that includes information for all NEON sites'
-#' ),
-#' "thumbnail" = list(
-#' "href"= thumbnail_link,
-#' "type"= "image/png",
-#' "title"= 'NEON Sites Image',
-#' "description"= 'Image describing the NEON sites',
-#' "roles" = list('thumbnail')
-#' )
-#' )
-#' )
-#'
-#'
-#' dest <- destination_path
-#' json <- file.path(dest, "collection.json")
-#'
-#' jsonlite::write_json(meta,
-#' json,
-#' pretty=TRUE,
-#' auto_unbox=TRUE)
-#' stac4cast::stac_validate(json)
-#'
-#' rm(meta)
-#' }
-
-#
-# build_site_metadata <- function(){
-# site_test <- read_csv("https://raw.githubusercontent.com/eco4cast/neon4cast-targets/main/NEON_Field_Site_Metadata_20220412.csv", col_types = cols())
-#
-# schema_info <- sapply(site_test, class)
-#
-# description_create <- data.frame(field_domain_id = 'domain identifier',
-# field_site_id = 'site identifier',
-# field_site_name = 'site name',
-# terrestrial = 'terrestrial theme indicator for site',
-# aquatics = 'aquatics theme indicator for site',
-# phenology = 'phenology theme indicator for site',
-# ticks = 'ticks theme indicator for site',
-# beetles = 'beetles theme indicator for site',
-# phenocam_code = 'code for phenocam',
-# phenocam_roi = 'phenocam region of interest',
-# phenocam_vegetation = 'phenocam vegetation identifier',
-# field_site_type = 'site theme type',
-# field_site_subtype = 'site theme subtype',
-# field_colocated_site = 'colocated field site',
-# field_site_host = 'site host organization',
-# field_site_url = 'site host organization URL',
-# field_nonneon_research_allowed = 'indicate whether non-NEON research is allowed at this site',
-# field_access_details = 'details for accessing the field site',
-# field_neon_field_operations_office = 'NEON field operations office',
-# field_latitude = 'field site latitude',
-# field_longitude = 'field site longitude',
-# field_geodetic_datum = 'geodetic datum for the field site',
-# field_utm_northing = 'northing UTM coordinates',
-# field_utm_easting = 'easting UTM coordinates',
-# field_utm_zone = 'UTM zone for field site',
-# field_site_county = 'county where field site is located',
-# field_site_state = 'state where field site is located',
-# field_site_country = 'country where field site is located',
-# field_mean_elevation_m = 'mean elevation of field site in meters',
-# field_minimum_elevation_m = 'minimum elevation of field site in meters',
-# field_maximum_elevation_m = 'maximum elevation of field site in meters',
-# field_mean_annual_temperature_C = 'mean annual temperature of field site in degC',
-# field_mean_annual_precipitation_mm= 'mean annual precipitation of field site in mm',
-# field_dominant_wind_direction = 'the dominant wind direction at the field site',
-# field_mean_canopy_height_m = 'mean canopy height at the field site in meters',
-# field_dominant_nlcd_classes = 'National Land Cover Database Class for field site',
-# field_dominant_plant_species = 'dominant plant species at field site',
-# field_usgs_huc = 'USGS Hydrologic Unit Code for the field site',
-# field_watershed_name = 'watershed name for the field site',
-# field_watershed_size_km2 = 'watershed size of field site in square kilometers',
-# field_lake_depth_mean_m = 'mean lake depth of field site in meters',
-# field_lake_depth_max_m = 'max lake depth of field site in meters',
-# field_tower_height_m = 'height of tower at field site in meters',
-# field_usgs_geology_unit = 'USGS geology unit for field site',
-# field_megapit_soil_family = 'megapit soil family for field site',
-# field_soil_subgroup = 'soil subgroup of field site',
-# field_avg_number_of_green_days = 'average number of green days at field site',
-# field_avg_green_increase_doy = 'day of year for average green increase at field site',
-# field_avg_green_max_doy = 'average day of year with maximum green at field site',
-# field_avg_green_decrease_doy = 'average day of year of green decrease at field site',
-# field_avg_green_min_doy = 'average day of year with minimum green at field site',
-# field_phenocams = 'details about phenocams located at each field site',
-# field_number_tower_levels = 'number of tower levels at field site',
-# neon_url = 'NEON URL for field site')
-#
-#
-#
-#
-#
-# x <- purrr::map(seq.int(1:ncol(site_test)), function(i)
-# list(
-# "name" = names(site_test)[i],
-# 'description'= description_create[,i],
-# 'type' = schema_info[[i]]
-# )
-# )
-#
-# return(x)
-# }
diff --git a/catalog/catalog.R b/catalog/catalog.R
deleted file mode 100644
index 6941e8b938..0000000000
--- a/catalog/catalog.R
+++ /dev/null
@@ -1,79 +0,0 @@
-source("catalog/R/catalog-common.R")
-source('catalog/R/stac_functions.R')
-
-config <- yaml::read_yaml('challenge_configuration.yaml')
-
-build_catalog <- function(){
- catalog <- list(
- "type"= "Catalog",
- "id"= paste0(config$project_id, "-stac"),
- "title"= paste0(config$challenge_long_name," Catalog"),
- "description"= paste0("A STAC (Spatiotemporal Asset Catalog) describing forecasts and forecast scores for the ",config$project_id," Forecasting Challenge"),
- "stac_version"= "1.0.0",
- "conformsTo"= 'conformsTo',
- "links"= list(
- list(
- "rel"= "self",
- "type"= "application/json",
- "href" = 'catalog.json'
- ),
- list(
- "rel"= "root",
- "type"= "application/json",
- "href" = 'catalog.json'
- ),
- list(
- "rel"= "child",
- "type"= "application/json",
- "title"= "Forecasts",
- "href" = 'forecasts/collection.json'),
- list(
- "rel"= "child",
- "type"= "application/json",
- "title"= "Scores",
- "href" = 'scores/collection.json'),
- list(
- "rel"= "child",
- "type"= "application/json",
- "title"= "Inventory",
- "href" = 'inventory/collection.json'),
- list(
- "rel"= "child",
- "type"= "application/json",
- "title"= "NOAA Forecasts",
- "href" = 'noaa_forecasts/collection.json'
- ),
- list(
- "rel"= "child",
- "type"= "application/json",
- "title"= "Targets",
- "href" = 'targets/collection.json'
- ),
- list(
- "rel"= "child",
- "type"= "application/json",
- "title"= "Forecast Summaries",
- "href" = 'summaries/collection.json'
- ),
- list(
- "rel"= "child",
- "type"= "application/json",
- "title"= "Sites",
- "href" = 'sites/collection.json'
- ),
- list(
- "rel" = "parent",
- "type" = "application/json",
- "href" = "https://raw.githubusercontent.com/eco4cast/challenge-catalogs/main/catalog.json"
- )
- )
- )
-
- dest <- "catalog/"
- jsonlite::write_json(catalog, file.path(dest, "catalog.json"),
- pretty=TRUE, auto_unbox=TRUE)
- stac4cast::stac_validate(file.path(dest, "catalog.json"))
-
-}
-
-build_catalog()
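`build_catalog()` writes `catalog/catalog.json` and validates it with `stac4cast::stac_validate()`. For a quick manual check of the result, the file can be read back and its child collections listed; a small sketch using only the output path from the script above:

```r
# Read the generated catalog back and list the child collection links.
catalog <- jsonlite::read_json("catalog/catalog.json")
children <- Filter(function(lnk) identical(lnk$rel, "child"), catalog$links)
vapply(children, function(lnk) lnk$href, character(1))
```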
diff --git a/catalog/catalog.json b/catalog/catalog.json
deleted file mode 100644
index b06b062dec..0000000000
--- a/catalog/catalog.json
+++ /dev/null
@@ -1,67 +0,0 @@
-{
- "type": "Catalog",
- "id": "usgsrc4cast-stac",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Catalog",
- "description": "A STAC (Spatiotemporal Asset Catalog) describing forecasts and forecast scores for the usgsrc4cast Forecasting Challenge",
- "stac_version": "1.0.0",
- "conformsTo": "conformsTo",
- "links": [
- {
- "rel": "self",
- "type": "application/json",
- "href": "catalog.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "catalog.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "title": "Forecasts",
- "href": "forecasts/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "title": "Scores",
- "href": "scores/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "title": "Inventory",
- "href": "inventory/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "title": "NOAA Forecasts",
- "href": "noaa_forecasts/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "title": "Targets",
- "href": "targets/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "title": "Forecast Summaries",
- "href": "summaries/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "title": "Sites",
- "href": "sites/collection.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "https://raw.githubusercontent.com/eco4cast/challenge-catalogs/main/catalog.json"
- }
- ]
-}
diff --git a/catalog/forecasts/Aquatics/Daily_Chlorophyll_a/collection.json b/catalog/forecasts/Aquatics/Daily_Chlorophyll_a/collection.json
deleted file mode 100644
index e03b113078..0000000000
--- a/catalog/forecasts/Aquatics/Daily_Chlorophyll_a/collection.json
+++ /dev/null
@@ -1,272 +0,0 @@
-{
- "id": "Daily_Chlorophyll_a",
- "description": "This page includes all models for the Daily_Chlorophyll_a variable.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/USGSHABs1.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/cb_prophet.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/climatology.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/persistenceRW.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procBlanchardMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procBlanchardSteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procCTMIMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procCTMISteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procEppleyNorbergMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procEppleyNorbergSteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procHinshelwoodSteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_arima.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_bag_mlp.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_ets.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_humidity_lm.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_lasso.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_lasso_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_precip_lm.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_precip_lm_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_randfor.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_tbats.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_temp_lm_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_humidity_lm_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_temp_lm.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procHinshelwoodMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_auto_adam.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "type": "text/html",
- "title": "NEON Ecological Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "title": "NEON Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Daily_Chlorophyll_a",
- "extent": {
- "spatial": {
- "bbox": [
- [-149.6106, 29.676, -82.0084, 68.6307]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2023-01-01T00:00:00Z",
- "2024-01-21T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "parameter",
- "type": "string",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/project_id=neon4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org\"",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model .\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/project_id=neon4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "pending",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "pending"
- }
- }
-}
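The `data` asset above carries its access snippet as a single escaped string. Written out, the same workflow looks like this; the `model_id` filter is illustrative (climatology is one of the models listed in this collection):

```r
library(arrow)
library(dplyr)

# Open the remote parquet partition for daily chlorophyll-a forecasts.
ds <- open_dataset("s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/project_id=neon4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org")

# Filter remotely before collect() so only the needed rows are downloaded.
df <- ds |>
  filter(model_id == "climatology") |>
  collect()
```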
diff --git a/catalog/forecasts/Aquatics/collection.json b/catalog/forecasts/Aquatics/collection.json
deleted file mode 100644
index 3fb590969b..0000000000
--- a/catalog/forecasts/Aquatics/collection.json
+++ /dev/null
@@ -1,157 +0,0 @@
-{
- "id": "Aquatics",
- "description": "This page includes variables for the Aquatics group.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Dissolved_oxygen/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Water_temperature/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Chlorophyll_a/collection.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "type": "text/html",
- "title": "NEON Ecological Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "title": "NEON Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Aquatics",
- "extent": {
- "spatial": {
- "bbox": [
- [-149.6106, 18.1135, -66.7987, 68.6698]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2023-01-01T00:00:00Z",
- "2024-12-09T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "parameter",
- "type": "string",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org\"",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for the NEON Ecological Forecasting Aquatics theme.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |>\n dplyr::filter(variable %in% c(\"oxygen\", \"temperature\", \"chla\")) |>\n dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://projects.ecoforecast.org/neon4cast-catalog/img/neon_buoy.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "NEON Aquatics Buoy"
- }
- }
-}
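Aggregations can also run against the remote dataset before anything is pulled locally. A sketch that counts forecast rows per model for the aquatics variables, using the path from the asset above:

```r
library(arrow)
library(dplyr)

ds <- open_dataset("s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org")

# Group and count on the remote dataset; only the small summary is collected.
counts <- ds |>
  filter(variable %in% c("oxygen", "temperature", "chla")) |>
  group_by(model_id, variable) |>
  summarise(n = n()) |>
  collect()
```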
diff --git a/catalog/forecasts/aquatics/Daily_Chlorophyll_a/collection.json b/catalog/forecasts/aquatics/Daily_Chlorophyll_a/collection.json
deleted file mode 100644
index c180808ec7..0000000000
--- a/catalog/forecasts/aquatics/Daily_Chlorophyll_a/collection.json
+++ /dev/null
@@ -1,152 +0,0 @@
-{
- "id": "Daily_Chlorophyll_a",
- "description": "This page includes all models for the Daily_Chlorophyll_a variable.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/climatology.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/persistenceRW.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Daily_Chlorophyll_a",
- "extent": {
- "spatial": {
- "bbox": [
- ["Inf", "Inf", "-Inf", "-Inf"]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-03-14T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "parameter",
- "type": "string",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/project_id=usgsrc4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model .\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/project_id=usgsrc4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "pending",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "pending"
- }
- }
-}
diff --git a/catalog/forecasts/aquatics/collection.json b/catalog/forecasts/aquatics/collection.json
deleted file mode 100644
index 57b5da1262..0000000000
--- a/catalog/forecasts/aquatics/collection.json
+++ /dev/null
@@ -1,147 +0,0 @@
-{
- "id": "aquatics",
- "description": "This page includes variables for the aquatics group.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Chlorophyll_a/collection.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "aquatics",
- "extent": {
- "spatial": {
- "bbox": [
- ["Inf", "Inf", "-Inf", "-Inf"]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-03-14T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "parameter",
- "type": "string",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for the NEON Ecological Forecasting Aquatics theme.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |>\n dplyr::filter(variable %in% c(\"chla\")) |>\n dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Streamgage"
- }
- }
-}
diff --git a/catalog/forecasts/collection.json b/catalog/forecasts/collection.json
deleted file mode 100644
index 4dc66e61a6..0000000000
--- a/catalog/forecasts/collection.json
+++ /dev/null
@@ -1,159 +0,0 @@
-{
- "id": "daily-forecasts",
- "description": "Forecasts are the raw forecasts that includes all ensemble members or distribution parameters. Due to the size of the raw forecasts, we recommend accessing the scores (summaries of the forecasts) to analyze forecasts (unless you need the individual ensemble members). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "aquatics/collection.json",
- "title": "aquatics"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "models/collection.json",
- "title": "group item"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Forecasts",
- "extent": {
- "spatial": {
- "bbox": [
- [
- -122.6692,
- 39.6327,
- -74.7781,
- 45.5175
- ]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-03-14T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "parameter",
- "type": "string",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for the Forecasting Challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Streamgage"
- }
- }
-}
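As the description above recommends, filtering before `dplyr::collect()` keeps full-dataset queries manageable. A sketch against the top-level path from the asset; the variable and date values are illustrative, taken from this collection's extent:

```r
library(arrow)
library(dplyr)

ds <- open_dataset("s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org")

# Restrict to one variable and one forecast start date before downloading.
df <- ds |>
  filter(variable == "chla",
         reference_date == "2024-02-07") |>
  collect()
```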
diff --git a/catalog/forecasts/forecast_models.R b/catalog/forecasts/forecast_models.R
deleted file mode 100644
index 7c964342df..0000000000
--- a/catalog/forecasts/forecast_models.R
+++ /dev/null
@@ -1,309 +0,0 @@
-library(arrow)
-library(dplyr)
-library(gsheet)
-library(readr)
-
-#source('catalog/R/stac_functions.R')
-config <- yaml::read_yaml('challenge_configuration.yaml')
-catalog_config <- config$catalog_config
-
-# file.sources = list.files(c("../stac4cast/R"), full.names=TRUE,
-# ignore.case=TRUE)
-# sapply(file.sources,source,.GlobalEnv)
-
-## CREATE table for column descriptions
-forecast_description_create <- data.frame(datetime = 'datetime of the forecasted value (ISO 8601)',
- site_id = 'For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat)',
- family = 'For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”',
- parameter = 'ensemble member or distribution parameter',
- variable = 'name of forecasted variable',
- prediction = 'predicted value for variable',
- pub_datetime = 'datetime that forecast was submitted',
- reference_datetime = 'datetime that the forecast was initiated (horizon = 0)',
- model_id = 'unique model identifier',
- reference_date = 'date that the forecast was initiated',
- project_id = 'unique identifier for the forecast project',
- depth_m = 'depth (meters) in water column of prediction',
- duration = 'temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention')
-
-
-## CHANGE THE WAY TO READ THE SCHEMA
-## just read in example forecast to extract schema information -- ask about better ways of doing this
-# theme <- 'daily'
-# reference_datetime <- '2023-09-01'
-# site_id <- 'fcre'
-# model_id <- 'climatology'
-
-print('FIND FORECAST TABLE SCHEMA')
-forecast_theme_df <- arrow::open_dataset(arrow::s3_bucket(config$forecasts_bucket,
- endpoint_override = config$endpoint, anonymous = TRUE)) #|>
- #filter(model_id == model_id, site_id = site_id, reference_datetime = reference_datetime)
-# NOTE IF NOT USING FILTER -- THE stac4cast::build_table_columns() NEEDS TO BE UPDATED
- #(USE strsplit(forecast_theme_df$ToString(), "\n") INSTEAD OF strsplit(forecast_theme_df[[1]]$ToString(), "\n"))
-
-## identify model ids from bucket -- used in generate model items function
-# forecast_data_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog/forecasts/project_id={config$project_id}"),
-# s3_endpoint = config$endpoint, anonymous=TRUE) |>
-# collect()
-
-print('FIND INVENTORY BUCKET')
-forecast_s3 <- arrow::s3_bucket(glue::glue("{config$inventory_bucket}/catalog/forecasts/project_id={config$project_id}"),
- endpoint_override = "sdsc.osn.xsede.org",
- anonymous=TRUE)
-
-# forecast_data_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog/forecasts"),
-# s3_endpoint = config$endpoint, anonymous=TRUE) |>
-# collect()
-
-print('OPEN INVENTORY BUCKET')
-forecast_data_df <- arrow::open_dataset(forecast_s3) |>
- filter(project_id == config$project_id) |>
- collect()
-
-theme_models <- forecast_data_df |>
- distinct(model_id)
-
-forecast_date_range <- forecast_data_df |> dplyr::summarise(min(date),max(date))
-forecast_min_date <- forecast_date_range$`min(date)`
-forecast_max_date <- forecast_date_range$`max(date)`
-
-build_description <- paste0("Forecasts are the raw forecasts that include all ensemble members or distribution parameters. Due to the size of the raw forecasts, we recommend accessing the scores (summaries of the forecasts) to analyze forecasts (unless you need the individual ensemble members). You can access the forecasts at the top level of the dataset where all models, variables, and dates on which forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.")
-
-stac4cast::build_forecast_scores(table_schema = forecast_theme_df,
- #theme_id = 'Forecasts',
- table_description = forecast_description_create,
- start_date = forecast_min_date,
- end_date = forecast_max_date,
- id_value = "daily-forecasts",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- theme_title = "Forecasts",
- destination_path = catalog_config$forecast_path,
- aws_download_path = catalog_config$aws_download_path_forecasts,
- link_items = stac4cast::generate_group_values(group_values = names(config$variable_groups)),
- thumbnail_link = catalog_config$forecasts_thumbnail,
- thumbnail_title = catalog_config$forecasts_thumbnail_title,
- model_child = TRUE)
-
-## create separate JSON for model landing page
-if (!dir.exists(paste0(catalog_config$forecast_path,"models"))){
- dir.create(paste0(catalog_config$forecast_path,"models"))
-}
-
-stac4cast::build_group_variables(table_schema = forecast_theme_df,
- table_description = forecast_description_create,
- start_date = forecast_min_date,
- end_date = forecast_max_date,
- id_value = "models",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- dashboard_string = catalog_config$dashboard_url,
- dashboard_title = catalog_config$dashboard_title,
- theme_title = "Models",
- destination_path = paste0(catalog_config$forecast_path,"models"),
- aws_download_path = catalog_config$aws_download_path_forecasts,
- group_var_items = stac4cast::generate_model_items(model_list = theme_models$model_id),
- thumbnail_link = 'pending',
- thumbnail_title = 'pending',
- group_var_vector = NULL,
- group_sites = NULL)
-
-## CREATE MODELS
-variable_gsheet <- gsheet2tbl(config$target_metadata_gsheet)
-
-## READ IN MODEL METADATA
-# googlesheets4::gs4_deauth()
-#
-# registered_model_id <- googlesheets4::read_sheet(config$model_metadata_gsheet)
-
-# read in model metadata and filter for the relevant project
-registered_model_id <- gsheet2tbl(config$model_metadata_gsheet) |>
- filter(`What forecasting challenge are you registering for?` == config$project_id)
-
-forecast_sites <- c()
-
-## LOOP OVER MODEL IDS AND CREATE JSONS
-for (m in theme_models$model_id){
-
- # make model items directory
- if (!dir.exists(paste0(catalog_config$forecast_path,"models/model_items"))){
- dir.create(paste0(catalog_config$forecast_path,"models/model_items"))
- }
-
- print(m)
- model_date_range <- forecast_data_df |> filter(model_id == m) |> dplyr::summarise(min(date),max(date))
- model_min_date <- model_date_range$`min(date)`
- model_max_date <- model_date_range$`max(date)`
-
- model_var_duration_df <- forecast_data_df |> filter(model_id == m) |> distinct(variable,duration) |>
- mutate(duration_name = ifelse(duration == 'P1D', 'Daily', duration)) |>
- mutate(duration_name = ifelse(duration == 'PT1H', 'Hourly', duration_name)) |>
- mutate(duration_name = ifelse(duration == 'PT30M', '30min', duration_name)) |>
- mutate(duration_name = ifelse(duration == 'P1W', 'Weekly', duration_name))
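The chained `ifelse()` calls above re-test `duration` once per label; `dplyr::case_when()` expresses the same recoding in a single pass. A behavior-equivalent sketch of this statement inside the loop:

```r
# Same duration recoding as the ifelse() chain, in one case_when().
model_var_duration_df <- forecast_data_df |>
  filter(model_id == m) |>
  distinct(variable, duration) |>
  mutate(duration_name = case_when(
    duration == "P1D"   ~ "Daily",
    duration == "PT1H"  ~ "Hourly",
    duration == "PT30M" ~ "30min",
    duration == "P1W"   ~ "Weekly",
    TRUE                ~ duration  # fall back to the raw ISO 8601 code
  ))
```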
-
- model_var_full_name <- model_var_duration_df |>
- left_join((variable_gsheet |>
- select(variable = `"official" targets name`, full_name = `Variable name`) |>
- distinct(variable, .keep_all = TRUE)), by = c('variable'))
-
- model_sites <- forecast_data_df |> filter(model_id == m) |> distinct(site_id)
-
- model_vars <- forecast_data_df |> filter(model_id == m) |> distinct(variable) |> left_join(model_var_full_name, by = 'variable')
- model_vars$var_duration_name <- paste0(model_vars$duration_name, " ", model_vars$full_name)
-
- forecast_sites <- append(forecast_sites, stac4cast::get_site_coords(site_metadata = catalog_config$site_metadata_url,
- sites = model_sites$site_id))
-
- idx = which(registered_model_id$model_id == m)
-
- stac4cast::build_model(model_id = m,
- team_name = registered_model_id$`Long name of the model (can include spaces)`[idx],
- model_description = registered_model_id[idx,"Describe your modeling approach in your own words."][[1]],
- start_date = model_min_date,
- end_date = model_max_date,
- var_values = model_vars$var_duration_name,
- duration_names = model_var_duration_df$duration,
- site_values = model_sites$site_id,
- site_table = catalog_config$site_metadata_url,
- model_documentation = registered_model_id,
- destination_path = paste0(catalog_config$forecast_path,"models/model_items"),
- aws_download_path = config$forecasts_bucket, # CHANGE THIS BUCKET NAME
- collection_name = 'forecasts',
- thumbnail_image_name = NULL,
- table_schema = forecast_theme_df,
- table_description = forecast_description_create,
- full_var_df = model_vars,
- #code_web_link = registered_model_id$`Web link to model code`[idx],
- code_web_link = 'pending')
-}
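One fragility in the loop above: `which(registered_model_id$model_id == m)` returns `integer(0)` for a model that never registered, so the later `[idx]` subsets silently produce empty metadata. A hedged guard that could sit at the top of the loop body:

```r
# Skip unregistered models rather than passing empty strings to build_model().
idx <- which(registered_model_id$model_id == m)
if (length(idx) == 0) {
  warning("model_id '", m, "' not found in the registration sheet; skipping")
  next
}
```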
-
-
-## BUILD VARIABLE GROUPS
-#variable_gsheet <- gsheet2tbl(config$target_metadata_gsheet)
-
-for (i in seq_along(config$variable_groups)){ ## organize variable groups
- print(names(config$variable_groups)[i])
-
- # check data and skip if no data found
- var_group_data_check <- forecast_data_df |>
- filter(variable %in% config$variable_groups[[i]]$variable)
-
- if (nrow(var_group_data_check) == 0){
- print('No data available for group')
- next
- }
-
- ## REMOVE STALE OR UNUSED DIRECTORIES
- current_var_path <- paste0(catalog_config$forecast_path,names(config$variable_groups[i]))
- current_var_dirs <- list.dirs(current_var_path, recursive = FALSE, full.names = TRUE)
- unlink(current_var_dirs, recursive = TRUE)
-
- if (!dir.exists(paste0(catalog_config$forecast_path,names(config$variable_groups[i])))){
- dir.create(paste0(catalog_config$forecast_path,names(config$variable_groups[i])))
- }
-
- for(j in seq_along(config$variable_groups[[i]]$variable)){ # FOR EACH VARIABLE WITHIN A MODEL GROUP
-
- ## restructure variable names
- var_values <- config$variable_groups[[i]]$variable
- var_name <- config$variable_groups[[i]]$variable[j]
- print(var_name)
-
- # check data and skip if no data found
- var_data_check <- forecast_data_df |>
- filter(variable == var_name)
-
- if (nrow(var_data_check) == 0){
- print('No data available for variable')
- next
- }
-
- duration_name <- config$variable_groups[[i]]$duration[j]
-
- # match variable with full name in gsheet
- #var_name_full <- variable_gsheet[which(variable_gsheet$`"official" targets name` == var_values),1][[1]]
- var_name_full <- variable_gsheet[which(variable_gsheet$`"official" targets name` %in% var_values),1][[1]]
-
-
-
- ## create new vector to store duration names
- duration_values <- config$variable_groups[[i]]$duration
- duration_values[which(duration_values == 'P1D')] <- 'Daily'
- duration_values[which(duration_values == 'PT1H')] <- 'Hourly'
- duration_values[which(duration_values == 'PT30M')] <- '30min'
- duration_values[which(duration_values == 'P1W')] <- 'Weekly'
-
- #var_name_combined_list <- paste0(var_values, '_',duration_values)
- var_name_combined_list <- paste0(duration_values,'_',var_name_full)
-
- ## CREATE VARIABLE GROUP JSONS
- group_description <- paste0('This page includes variables for the ',names(config$variable_groups[i]),' group.')
-
- ## find group sites
- find_group_sites <- forecast_data_df |>
- filter(variable %in% var_values) |>
- distinct(site_id)
-
- stac4cast::build_group_variables(table_schema = forecast_theme_df,
- #theme_id = names(config$variable_groups[i]),
- table_description = forecast_description_create,
- start_date = forecast_min_date,
- end_date = forecast_max_date,
- id_value = names(config$variable_groups[i]),
- description_string = group_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- dashboard_string = catalog_config$dashboard_url,
- dashboard_title = catalog_config$dashboard_title,
- theme_title = names(config$variable_groups[i]),
- destination_path = paste0(catalog_config$forecast_path,names(config$variable_groups[i])),
- aws_download_path = catalog_config$aws_download_path_forecasts,
- group_var_items = stac4cast::generate_group_variable_items(variables = var_name_combined_list),
- thumbnail_link = config$variable_groups[[i]]$thumbnail_link,
- thumbnail_title = config$variable_groups[[i]]$thumbnail_title,
- group_var_vector = unique(var_values),
- group_sites = find_group_sites$site_id)
-
- if (!dir.exists(paste0(catalog_config$forecast_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))){
- dir.create(paste0(catalog_config$forecast_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))
- }
-
- var_data <- forecast_data_df |>
- filter(variable == var_name,
- duration == duration_name)
-
- var_date_range <- var_data |> dplyr::summarise(min(date),max(date))
- var_min_date <- var_date_range$`min(date)`
- var_max_date <- var_date_range$`max(date)`
-
- var_models <- var_data |> distinct(model_id)
-
- find_var_sites <- forecast_data_df |>
- filter(variable == var_name) |>
- distinct(site_id)
-
- var_description <- paste0('This page includes all models for the ',var_name_combined_list[j],' variable.')
-
- stac4cast::build_group_variables(table_schema = forecast_theme_df,
- table_description = forecast_description_create,
- start_date = var_min_date,
- end_date = var_max_date,
- id_value = var_name_combined_list[j],
- description_string = var_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- dashboard_string = catalog_config$dashboard_url,
- dashboard_title = catalog_config$dashboard_title,
- theme_title = var_name_combined_list[j],
- destination_path = file.path(catalog_config$forecast_path,names(config$variable_groups)[i],var_name_combined_list[j]),
- aws_download_path = var_data$path[1],
- group_var_items = stac4cast::generate_variable_model_items(model_list = var_models$model_id),
- thumbnail_link = 'pending',
- thumbnail_title = 'pending',
- group_var_vector = NULL,
- group_sites = find_var_sites$site_id)
-
- }
-}
diff --git a/catalog/forecasts/models/collection.json b/catalog/forecasts/models/collection.json
deleted file mode 100644
index 61c0eac9f7..0000000000
--- a/catalog/forecasts/models/collection.json
+++ /dev/null
@@ -1,152 +0,0 @@
-{
- "id": "models",
- "description": "Forecasts are the raw forecasts that includes all ensemble members or distribution parameters. Due to the size of the raw forecasts, we recommend accessing the scores (summaries of the forecasts) to analyze forecasts (unless you need the individual ensemble members). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "item",
- "type": "application/json",
- "href": "model_items/climatology.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "model_items/persistenceRW.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Models",
- "extent": {
- "spatial": {
- "bbox": [
- [-122.6692, 39.6328, -74.7781, 45.5175]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-03-14T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "parameter",
- "type": "string",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model .\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "pending",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "pending"
- }
- }
-}
diff --git a/catalog/forecasts/models/model_items/.empty b/catalog/forecasts/models/model_items/.empty
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/catalog/forecasts/models/model_items/climatology.json b/catalog/forecasts/models/model_items/climatology.json
deleted file mode 100644
index 1dfb280dc3..0000000000
--- a/catalog/forecasts/models/model_items/climatology.json
+++ /dev/null
@@ -1,174 +0,0 @@
-{
- "stac_version": "1.0.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Feature",
- "id": "climatology",
- "bbox": [
- [
- -122.6692,
- 45.5175,
- -74.7781,
- 45.5175
- ]
- ],
- "geometry": {
- "type": "MultiPoint",
- "coordinates": [
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- []
- ]
- },
- "properties": {
- "description": "\nmodel info: Forecasts stream chlorophyll-a based on the historic average and standard deviation for that given site and day-of-year.\n\nSites: USGS-01427510, USGS-01463500, USGS-05543010, USGS-05553700, USGS-05558300, USGS-05586300, USGS-14181500, USGS-14211010, USGS-14211720\n\nVariables: Daily Chlorophyll_a",
- "start_datetime": "2024-02-07",
- "end_datetime": "2024-03-14",
- "providers": [
- {
- "url": "pending",
- "name": "pending",
- "roles": [
- "producer",
- "processor",
- "licensor"
- ]
- },
- {
- "url": "https://www.ecoforecastprojectvt.org",
- "name": "Ecoforecast Challenge",
- "roles": [
- "host"
- ]
- }
- ],
- "license": "CC0-1.0",
- "keywords": [
- "Forecasting",
- "usgsrc4cast",
- "Daily Chlorophyll_a"
- ],
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "parameter",
- "type": "string",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ]
- },
- "collection": "forecasts",
- "links": [
- {
- "rel": "collection",
- "href": "../collection.json",
- "type": "application/json",
- "title": "climatology"
- },
- {
- "rel": "root",
- "href": "../../../catalog.json",
- "type": "application/json",
- "title": "Forecast Catalog"
- },
- {
- "rel": "parent",
- "href": "../collection.json",
- "type": "application/json",
- "title": "climatology"
- },
- {
- "rel": "self",
- "href": "climatology.json",
- "type": "application/json",
- "title": "Model Forecast"
- },
- {
- "rel": "item",
- "href": "pending",
- "type": "text/html",
- "title": "Link for Model Code"
- }
- ],
- "assets": {
- "1": {
- "type": "application/json",
- "title": "Model Metadata",
- "href": "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/climatology.json",
- "description": "Use `jsonlite::fromJSON()` to download the model metadata JSON file. This R code will return metadata provided during the model registration.\n \n\n### R\n\n```{r}\n# Use code below\n\nmodel_metadata <- jsonlite::fromJSON(\"https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/climatology.json\")\n\n"
- },
- "2": {
- "type": "text/html",
- "title": "Link for Model Code",
- "href": "pending",
- "description": "The link to the model code provided by the model submission team"
- },
- "3": {
- "type": "application/x-parquet",
- "title": "Database Access for Daily Chlorophyll_a",
- "href": "s3://anonymous@bio230014-bucket01/challenges/forecastsproject_id=/duration=P1D/variable=chla/model_id=climatology?endpoint_override=sdsc.osn.xsede.org",
- "description": "Use `arrow` for remote access to the database. This R code will return results for this variable and model combination.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecastsproject_id=/duration=P1D/variable=chla/model_id=climatology?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- }
- }
-}
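The `Model Metadata` asset above documents a one-call download of the registration record. Spelled out, with the URL taken from the asset itself:

```r
# Fetch and inspect the registration metadata for the climatology model.
model_metadata <- jsonlite::fromJSON(
  "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/climatology.json"
)
str(model_metadata)
```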
diff --git a/catalog/forecasts/models/model_items/persistenceRW.json b/catalog/forecasts/models/model_items/persistenceRW.json
deleted file mode 100644
index 8374faa30a..0000000000
--- a/catalog/forecasts/models/model_items/persistenceRW.json
+++ /dev/null
@@ -1,175 +0,0 @@
-{
- "stac_version": "1.0.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Feature",
- "id": "persistenceRW",
- "bbox": [
- [
- -122.6692,
- 45.5175,
- -74.7781,
- 45.5175
- ]
- ],
- "geometry": {
- "type": "MultiPoint",
- "coordinates": [
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- []
- ]
- },
- "properties": {
- "description": "\nmodel info: Random walk model based on most recent stream chl-a observations using the fable::RW() model.\n\nSites: USGS-01427510, USGS-01463500, USGS-05543010, USGS-05549500, USGS-05553700, USGS-05558300, USGS-05586300, USGS-14181500, USGS-14211010, USGS-14211720\n\nVariables: Daily Chlorophyll_a",
- "start_datetime": "2024-02-07",
- "end_datetime": "2024-03-13",
- "providers": [
- {
- "url": "pending",
- "name": "pending",
- "roles": [
- "producer",
- "processor",
- "licensor"
- ]
- },
- {
- "url": "https://www.ecoforecastprojectvt.org",
- "name": "Ecoforecast Challenge",
- "roles": [
- "host"
- ]
- }
- ],
- "license": "CC0-1.0",
- "keywords": [
- "Forecasting",
- "usgsrc4cast",
- "Daily Chlorophyll_a"
- ],
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "family",
- "type": "string",
-      "description": "For ensembles: “ensemble.” For probability distributions: name of the statistical distribution associated with the reported statistics (default value if unspecified). The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”"
- },
- {
- "name": "parameter",
- "type": "string",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ]
- },
- "collection": "forecasts",
- "links": [
- {
- "rel": "collection",
- "href": "../collection.json",
- "type": "application/json",
- "title": "persistenceRW"
- },
- {
- "rel": "root",
- "href": "../../../catalog.json",
- "type": "application/json",
- "title": "Forecast Catalog"
- },
- {
- "rel": "parent",
- "href": "../collection.json",
- "type": "application/json",
- "title": "persistenceRW"
- },
- {
- "rel": "self",
- "href": "persistenceRW.json",
- "type": "application/json",
- "title": "Model Forecast"
- },
- {
- "rel": "item",
- "href": "pending",
- "type": "text/html",
- "title": "Link for Model Code"
- }
- ],
- "assets": {
- "1": {
- "type": "application/json",
- "title": "Model Metadata",
- "href": "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/persistenceRW.json",
-      "description": "Use `jsonlite::fromJSON()` to download the model metadata JSON file. This R code will return metadata provided during the model registration.\n \n\n### R\n\n```{r}\n# Use code below\n\nmodel_metadata <- jsonlite::fromJSON(\"https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/persistenceRW.json\")\n\n```\n"
- },
- "2": {
- "type": "text/html",
- "title": "Link for Model Code",
- "href": "pending",
- "description": "The link to the model code provided by the model submission team"
- },
- "3": {
- "type": "application/x-parquet",
- "title": "Database Access for Daily Chlorophyll_a",
-      "href": "s3://anonymous@bio230014-bucket01/challenges/forecasts/project_id=/duration=P1D/variable=chla/model_id=persistenceRW?endpoint_override=sdsc.osn.xsede.org",
-      "description": "Use `arrow` for remote access to the database. This R code will return results for this variable and model combination.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/project_id=/duration=P1D/variable=chla/model_id=persistenceRW?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n"
- }
- }
-}
diff --git a/catalog/inventory/collection.json b/catalog/inventory/collection.json
deleted file mode 100644
index daae5522bc..0000000000
--- a/catalog/inventory/collection.json
+++ /dev/null
@@ -1,166 +0,0 @@
-{
- "id": "inventory",
- "description": "The catalog contains forecasts for the EFI-USGS River Chlorophyll Forecasting Challenge. The forecasts are the raw forecasts that include all ensemble members (if a forecast represents uncertainty using an ensemble). Due to the size of the raw forecasts, we recommend accessing the scores (summaries of the forecasts) to analyze forecasts (unless you need the individual ensemble members). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Inventory",
- "extent": {
- "spatial": {
- "bbox": [
- [
- -122.6692,
- 39.6327,
- -74.7781,
- 45.5175
- ]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-03-14T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "duration",
- "type": "string",
- "description": "sample duration code for variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "unique site identifier"
- },
- {
- "name": "reference_date",
- "type": "date32[day]",
- "description": "date that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "forecast variable"
- },
- {
- "name": "date",
- "type": "date32[day]",
- "description": "date of the predicted value"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique project identifier"
- },
- {
- "name": "pub_date",
- "type": "date32[day]",
-      "description": "date that the forecast was submitted"
- },
- {
- "name": "path",
- "type": null,
- "description": "storage path for forecast data"
- },
- {
- "name": "path_full",
- "type": null,
-      "description": "full storage path for forecast data"
- },
- {
- "name": "path_summaries",
- "type": null,
-      "description": "storage path for forecast summary data"
- },
- {
- "name": "endpoint",
- "type": "string",
- "description": "storage location for forecast data"
- },
- {
- "name": "latitude",
- "type": "double",
-      "description": "forecast site latitude"
- },
- {
- "name": "longitude",
- "type": "double",
-      "description": "forecast site longitude"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/inventory/catalog/forecasts/project_id=usgsrc4cast?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Forecast Inventory Access",
- "roles": [
- "data"
- ],
-      "description": "Use `arrow` for remote access to the database. This R code will return results for the forecast challenge inventory bucket.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/inventory/catalog/forecasts/project_id=usgsrc4cast?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n"
- },
- "data.1": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/inventory/catalog/scores/project_id=usgsrc4cast?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Scores Inventory Access",
- "roles": [
- "data"
- ],
-      "description": "Use `arrow` for remote access to the database. This R code will return results for the forecast challenge inventory bucket.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/inventory/catalog/scores/project_id=usgsrc4cast?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Streamgaging%20Basics%20photo%20showing%20Acoustic%20Doppler%20Current%20Profiler2.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Image"
- }
- }
-}
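
Because the inventory is far smaller than the forecast archive it summarizes, it is the natural first query: discover which models and dates exist, then open only the forecast partitions you need. A minimal sketch against the forecast-inventory asset above, mirroring the `summarise(min(date), max(date))` idiom used by the build script that follows:

```r
library(arrow)
library(dplyr)

# Sketch: per-model date coverage for chlorophyll-a forecasts,
# computed remotely from the (small) inventory dataset.
inv <- arrow::open_dataset("s3://anonymous@bio230014-bucket01/challenges/inventory/catalog/forecasts/project_id=usgsrc4cast?endpoint_override=sdsc.osn.xsede.org")

coverage <- inv |>
  dplyr::filter(variable == "chla") |>
  dplyr::group_by(model_id) |>
  dplyr::summarise(first_forecast = min(date),
                   last_forecast = max(date)) |>
  dplyr::collect()
```
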
diff --git a/catalog/inventory/create_inventory_page.R b/catalog/inventory/create_inventory_page.R
deleted file mode 100644
index 5e1f461f2a..0000000000
--- a/catalog/inventory/create_inventory_page.R
+++ /dev/null
@@ -1,64 +0,0 @@
-library(arrow)
-library(dplyr)
-library(gsheet)
-library(readr)
-
-#source('catalog/R/stac_functions.R')
-config <- yaml::read_yaml('challenge_configuration.yaml')
-catalog_config <- config$catalog_config
-
-# file.sources = list.files(c("../stac4cast/R"), full.names=TRUE,
-# ignore.case=TRUE)
-# sapply(file.sources,source,.GlobalEnv)
-
-## CREATE table for column descriptions
-inventory_description_create <- data.frame(duration = 'sample duration code for variable',
-                                           model_id = 'unique model identifier',
-                                           site_id = 'unique site identifier',
-                                           reference_date = 'date that the forecast was initiated (horizon = 0)',
-                                           variable = 'forecast variable',
-                                           date = 'date of the predicted value',
-                                           project_id = 'unique project identifier',
-                                           pub_date = 'date that the forecast was submitted',
-                                           path = 'storage path for forecast data',
-                                           path_full = 'full storage path for forecast data',
-                                           path_summaries = 'storage path for forecast summary data',
-                                           endpoint = 'storage location for forecast data',
-                                           latitude = 'forecast site latitude',
-                                           longitude = 'forecast site longitude')
-
-#inventory_theme_df <- arrow::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog/forecasts/project_id={config$project_id}"), endpoint_override = config$endpoint, anonymous = TRUE) #|>
-
-inventory_theme_df <- arrow::open_dataset(arrow::s3_bucket(config$inventory_bucket, endpoint_override = config$endpoint, anonymous = TRUE))
-
-# inventory_data_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog"),
-# s3_endpoint = config$endpoint, anonymous=TRUE) |>
-# inventory_data_df <- arrow::open_dataset(arrow::s3_bucket(config$inventory_bucket, endpoint_override = config$endpoint, anonymous = TRUE)) |>
-# collect()
-
-inventory_data_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog/forecasts"),
- s3_endpoint = config$endpoint, anonymous=TRUE) |>
- collect() |>
- dplyr::filter(project_id == config$project_id)
-
-theme_models <- inventory_data_df |>
- distinct(model_id)
-
-inventory_date_range <- inventory_data_df |> dplyr::summarise(min(date),max(date))
-inventory_min_date <- inventory_date_range$`min(date)`
-inventory_max_date <- inventory_date_range$`max(date)`
-
-build_description <- paste0("The catalog contains forecasts for the ", config$challenge_long_name,". The forecasts are the raw forecasts that include all ensemble members (if a forecast represents uncertainty using an ensemble). Due to the size of the raw forecasts, we recommend accessing the scores (summaries of the forecasts) to analyze forecasts (unless you need the individual ensemble members). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.")
-
-
-stac4cast::build_inventory(table_schema = inventory_theme_df,
- table_description = inventory_description_create,
- start_date = inventory_min_date,
- end_date = inventory_max_date,
- id_value = "inventory",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- theme_title = "Inventory",
- destination_path = catalog_config$inventory_path,
- aws_download_path = config$inventory_bucket,
- #link_items = stac4cast::generate_group_values(group_values = names(config$variable_groups)),
- link_items = NULL,
- thumbnail_link = catalog_config$inventory_thumbnail,
- thumbnail_title = catalog_config$inventory_thumbnail_title,
- project_identifier = config$project_id)
diff --git a/catalog/model_metadata.R b/catalog/model_metadata.R
deleted file mode 100644
index cce5444df6..0000000000
--- a/catalog/model_metadata.R
+++ /dev/null
@@ -1,185 +0,0 @@
-Sys.setenv(AWS_ACCESS_KEY_ID=Sys.getenv("OSN_KEY"),
- AWS_SECRET_ACCESS_KEY=Sys.getenv("OSN_SECRET"))
-
-config <- yaml::read_yaml("challenge_configuration.yaml")
-
-endpoint <- config$endpoint
-
-minioclient::install_mc()
-
-minioclient::mc_alias_set("osn",
- config$endpoint,
- Sys.getenv("OSN_KEY"),
- Sys.getenv("OSN_SECRET"))
-
-#googlesheets4::gs4_deauth()
-# registered_models <- googlesheets4::read_sheet(config$model_metadata_gsheet) |>
-# dplyr::filter(`What forecasting challenge are you registering for?` == config$project_id,
-# !is.na(registered_models$`Which category best matches your modeling approach?`))
-
-registered_models <- gsheet::gsheet2tbl(config$model_metadata_gsheet) |>
- dplyr::filter(`What forecasting challenge are you registering for?` == config$project_id,
- !is.na(`Which category best matches your modeling approach?`))
-
-for(i in seq_len(nrow(registered_models))){
-
- #Need to get from forecast output
- progagates_method <- "Infer from family column in archived forecasts"
-
- metadata <- list()
-
- metadata$creator$individual_name <- "Pending"
- metadata$creator$electronicMailAddress <- "Pending"
- metadata$creator$organizationName <- "Pending"
- metadata$model_id <- registered_models$model_id[i]
- metadata$model_description$intellectualRights <- "https://creativecommons.org/licenses/by/4.0/"
- metadata$model_description$name <- registered_models$`Long name of the model`[i]
- metadata$model_description$type <- registered_models$`Which category best matches your modeling approach?`[i]
- metadata$model_description$repository <- registered_models$`Web link to model code`[i]
-
- # Initial Conditions
-
- if(registered_models$`Do your forecasts include uncertainty from initial conditions?`[i] == "Yes and they were estimated from data"){
- metadata$uncertainty$initial_conditions$present <- TRUE
- metadata$uncertainty$initial_conditions$data_driven <- TRUE
- metadata$uncertainty$initial_conditions$progagates$type <- progagates_method
- }else if(registered_models$`Do your forecasts include uncertainty from initial conditions?`[i] == "Yes and they were not estimated from data (e.g., assumed initial conditions were the model equilibrium)"){
- metadata$uncertainty$initial_conditions$present <- TRUE
- metadata$uncertainty$initial_conditions$data_driven <- FALSE
- metadata$uncertainty$initial_conditions$progagates$type <- progagates_method
- }else if(registered_models$`Do your forecasts include uncertainty from initial conditions?`[i] == "No"){
- if(registered_models$`Is your forecast model dynamic? (i.e. is tomorrow’s forecast dependent on today’s forecast)?`[i] == "Yes"){
- metadata$uncertainty$initial_conditions$present <- TRUE
- metadata$uncertainty$initial_conditions$data_driven <- FALSE
- }else{
- metadata$uncertainty$initial_conditions$present <- FALSE
- }
- }else{
- metadata$uncertainty$initial_conditions$present <- "Unknown"
- }
-
- if(registered_models$`Do you update your initial conditions or parameters between forecast submissions using newly available data (i.e., data assimilation)?`[i] %in%
- c("Initial conditions", "Both initial conditions and parameters")){
- metadata$uncertainty$initial_conditions$assimilation$type = registered_models$`What method did you use if you updated your initial conditions or parameters using data assimilation?`[i]
- }
-
- #Parameters
-
- if(registered_models$`Does your forecast include uncertainty from the model parameters?`[i] == "Yes and at least one is estimated from data"){
- metadata$uncertainty$parameters$present <- TRUE
- metadata$uncertainty$parameters$data_driven <- TRUE
- metadata$uncertainty$parameters$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from the model parameters?`[i] == "Yes and they are not estimated from data"){
- metadata$uncertainty$parameters$present <- TRUE
- metadata$uncertainty$parameters$data_driven <- FALSE
- metadata$uncertainty$parameters$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from the model parameters?`[i] == "No"){
- if(registered_models$`Does your model include parameters?`[i] == "Yes"){
- metadata$uncertainty$parameters$present <- TRUE
- metadata$uncertainty$parameters$data_driven <- FALSE
- }else{
- metadata$uncertainty$parameters$present <- FALSE
- }
- }else{
- metadata$uncertainty$parameters$present <- "Unknown"
- }
-
- if(registered_models$`Do you update your initial conditions or parameters between forecast submissions using newly available data (i.e., data assimilation)?`[i] %in%
- c("Parameter", "Both initial conditions and parameters")){
- metadata$uncertainty$parameters$assimilation$type = registered_models$`What method did you use if you updated your initial conditions or parameters using data assimilation?`[i]
- }
-
- # Drivers
-
- if(registered_models$`Does your forecast include uncertainty from drivers (i.e., ensemble weather forecasts)?`[i] == "Yes"){
- metadata$uncertainty$drivers$present <- TRUE
- metadata$uncertainty$drivers$data_driven <- TRUE
- metadata$uncertainty$drivers$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from drivers (i.e., ensemble weather forecasts)?`[i] == "No"){
- if(registered_models$`Does the forecast use drivers?`[i] == "Yes"){
- metadata$uncertainty$drivers$present <- TRUE
- metadata$uncertainty$drivers$data_driven <- TRUE
- }else{
- metadata$uncertainty$drivers$present <- FALSE
- }
- }else{
- metadata$uncertainty$drivers$present <- "Unknown"
- }
-
- #Process model
-
- if(registered_models$`Does your forecast include uncertainty from the model (process uncertainty)?`[i] == "Yes and the uncertainty was estimated from data"){
- metadata$uncertainty$process_error$present <- TRUE
- metadata$uncertainty$process_error$data_driven <- TRUE
- metadata$uncertainty$process_error$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from the model (process uncertainty)?`[i] == "Yes and the uncertainty was not estimated from data"){
- metadata$uncertainty$process_error$present <- TRUE
- metadata$uncertainty$process_error$data_driven <- FALSE
- metadata$uncertainty$process_error$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from the model (process uncertainty)?`[i] == "No"){
- metadata$uncertainty$process_error$present <- FALSE
- }else{
-    metadata$uncertainty$process_error$present <- "Unknown"
- }
-
- # Measurement error
-
- if(registered_models$`Does your forecast include uncertainty from measurement noise?`[i] == "Yes and the noise was estimated from data"){
- metadata$uncertainty$obs_error$present <- TRUE
- metadata$uncertainty$obs_error$data_driven <- TRUE
- metadata$uncertainty$obs_error$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from measurement noise?`[i] == "Yes and the noise was not estimated from data"){
- metadata$uncertainty$obs_error$present <- TRUE
- metadata$uncertainty$obs_error$data_driven <- FALSE
- metadata$uncertainty$obs_error$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from measurement noise?`[i] == "No"){
- metadata$uncertainty$obs_error$present <- FALSE
- }else{
- metadata$uncertainty$obs_error$present <- "Unknown"
- }
-
- #Structural uncertainty
-
-  # It is unclear how structural error would be "data driven", so data_driven is set to FALSE below
-
- if(registered_models$`Does your forecast include uncertainty from using different models?`[i] == "Yes"){
- metadata$uncertainty$structural_error$present <- TRUE
- metadata$uncertainty$structural_error$data_driven <- FALSE
- metadata$uncertainty$structural_error$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from using different models?`[i] == "No"){
- metadata$uncertainty$structural_error$present <- FALSE
- }else{
- metadata$uncertainty$structural_error$present <- "Unknown"
- }
-
- # Random effects
-
- if(registered_models$`Does your forecast include uncertainty from parameter random effects?`[i] == "Yes and the uncertainty was estimated from data"){
- metadata$uncertainty$random_effects$present <- TRUE
- metadata$uncertainty$random_effects$data_driven <- TRUE
- metadata$uncertainty$random_effects$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from parameter random effects?`[i] == "Yes and the uncertainty was not estimated from data (uncommon)"){
- metadata$uncertainty$random_effects$present <- TRUE
- metadata$uncertainty$random_effects$data_driven <- FALSE
- metadata$uncertainty$random_effects$progagates$type <- progagates_method
- }else if(registered_models$`Does your forecast include uncertainty from parameter random effects?`[i] == "No"){
- metadata$uncertainty$random_effects$present <- FALSE
- }else{
- metadata$uncertainty$random_effects$present <- "Unknown"
- }
-
- file_name <- paste0(metadata$model_id, ".json")
- jsonlite::write_json(metadata, path = file.path("catalog",file_name), pretty = TRUE)
-
- minioclient::mc_cp(file.path("catalog",file_name),
- file.path("osn",
- config$model_metadata_bucket,
- paste0("project_id=", config$project_id),
- file_name))
-
- unlink(file.path("catalog",file_name))
-}
-
-
-
-
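
A hedged round-trip check for the upload loop above: read one published metadata file back and inspect its uncertainty block. The URL follows the Model Metadata asset hrefs earlier in this catalog (`.../challenges/metadata/model_id/<model_id>.json`); whether the `project_id=` prefix used by `mc_cp()` above resolves to that public layout is an assumption.

```r
library(jsonlite)

# Sketch: fetch one uploaded metadata record and summarize it.
# URL pattern taken from the catalog's Model Metadata assets.
m <- jsonlite::fromJSON("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/climatology.json")
str(m$uncertainty, max.level = 2)
```
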
diff --git a/catalog/noaa_forecasts/Pseudo/collection.json b/catalog/noaa_forecasts/Pseudo/collection.json
deleted file mode 100644
index c96b7a837e..0000000000
--- a/catalog/noaa_forecasts/Pseudo/collection.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "id": "Pseudo",
-  "description": "The catalog contains NOAA forecasts used for the EFI-USGS River Chlorophyll Forecasting Challenge. The forecasts are the raw forecasts that include all ensemble members (if a forecast represents uncertainty using an ensemble). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a site or datetime, we also provide the code to access the data at the site_id and datetime level as an asset for each forecast.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Pseudo",
- "extent": {
- "spatial": {
- "bbox": [
- [-122.6692, 39.6328, -74.7781, 45.5175]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-01-29T00:00:00Z",
- "2024-03-13T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "site_id",
- "type": "string",
-      "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat)"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "height",
- "type": "string",
- "description": "variable height"
- },
- {
- "name": "horizon",
- "type": "double",
- "description": "number of days in forecast"
- },
- {
- "name": "parameter",
- "type": "int32",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "family",
- "type": "string",
-      "description": "For ensembles: “ensemble.” For probability distributions: name of the statistical distribution associated with the reported statistics (default value if unspecified). The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”"
- },
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "forecast_valid",
- "type": "string",
- "description": "date when forecast is valid"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "longitude",
- "type": "double",
- "description": "forecast site longitude"
- },
- {
- "name": "latitude",
- "type": "double",
- "description": "forecast site latitude"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@drivers/noaa/gefs-v12-reprocess/pseudo/parquet?endpoint_override=s3.flare-forecast.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
-      "description": "Use `arrow` for remote access to the database. This R code will return results for NOAA forecasts associated with the forecasting challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@drivers/noaa/gefs-v12-reprocess/pseudo/parquet?endpoint_override=s3.flare-forecast.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/DSC_0001.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Image"
- }
- }
-}
diff --git a/catalog/noaa_forecasts/Stage1-stats/collection.json b/catalog/noaa_forecasts/Stage1-stats/collection.json
deleted file mode 100644
index 957d2aa360..0000000000
--- a/catalog/noaa_forecasts/Stage1-stats/collection.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "id": "Stage1-stats",
-  "description": "The catalog contains NOAA forecasts used for the EFI-USGS River Chlorophyll Forecasting Challenge. The forecasts are the raw forecasts that include all ensemble members (if a forecast represents uncertainty using an ensemble). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a site or datetime, we also provide the code to access the data at the site_id and datetime level as an asset for each forecast.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Stage1-stats",
- "extent": {
- "spatial": {
- "bbox": [
- [-122.6692, 39.6328, -74.7781, 45.5175]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-01-29T00:00:00Z",
- "2024-03-13T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "site_id",
- "type": "string",
-      "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat)"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "height",
- "type": "string",
- "description": "variable height"
- },
- {
- "name": "horizon",
- "type": "double",
- "description": "number of days in forecast"
- },
- {
- "name": "parameter",
- "type": "int32",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "family",
- "type": "string",
-      "description": "For ensembles: “ensemble.” For probability distributions: name of the statistical distribution associated with the reported statistics (default value if unspecified). The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”"
- },
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "forecast_valid",
- "type": "string",
- "description": "date when forecast is valid"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "longitude",
- "type": "double",
- "description": "forecast site longitude"
- },
- {
- "name": "latitude",
- "type": "double",
- "description": "forecast site latitude"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@drivers/noaa/gefs-v12-reprocess/stage1-stats/parquet?endpoint_override=s3.flare-forecast.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
-      "description": "Use `arrow` for remote access to the database. This R code will return results for NOAA forecasts associated with the forecasting challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@drivers/noaa/gefs-v12-reprocess/stage1-stats/parquet?endpoint_override=s3.flare-forecast.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/DSC_0001.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Image"
- }
- }
-}
diff --git a/catalog/noaa_forecasts/Stage1/collection.json b/catalog/noaa_forecasts/Stage1/collection.json
deleted file mode 100644
index f7d612e3f2..0000000000
--- a/catalog/noaa_forecasts/Stage1/collection.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "id": "Stage1",
-  "description": "The catalog contains NOAA forecasts used for the EFI-USGS River Chlorophyll Forecasting Challenge. The forecasts are the raw forecasts that include all ensemble members (if a forecast represents uncertainty using an ensemble). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a site or datetime, we also provide the code to access the data at the site_id and datetime level as an asset for each forecast.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Stage1",
- "extent": {
- "spatial": {
- "bbox": [
- [-122.6692, 39.6328, -74.7781, 45.5175]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-01-29T00:00:00Z",
- "2024-03-13T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "site_id",
- "type": "string",
-      "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat)"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "height",
- "type": "string",
- "description": "variable height"
- },
- {
- "name": "horizon",
- "type": "double",
- "description": "number of days in forecast"
- },
- {
- "name": "parameter",
- "type": "int32",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "family",
- "type": "string",
-      "description": "For ensembles: “ensemble.” For probability distributions: name of the statistical distribution associated with the reported statistics (default value if unspecified). The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”"
- },
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "forecast_valid",
- "type": "string",
- "description": "date when forecast is valid"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "longitude",
- "type": "double",
- "description": "forecast site longitude"
- },
- {
- "name": "latitude",
- "type": "double",
- "description": "forecast site latitude"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@drivers/noaa/gefs-v12-reprocess/stage1/parquet?endpoint_override=s3.flare-forecast.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
-      "description": "Use `arrow` for remote access to the database. This R code will return results for NOAA forecasts associated with the forecasting challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@drivers/noaa/gefs-v12-reprocess/stage1/parquet?endpoint_override=s3.flare-forecast.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/DSC_0001.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Image"
- }
- }
-}
diff --git a/catalog/noaa_forecasts/Stage2/collection.json b/catalog/noaa_forecasts/Stage2/collection.json
deleted file mode 100644
index 0557c9890d..0000000000
--- a/catalog/noaa_forecasts/Stage2/collection.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "id": "Stage2",
-  "description": "The catalog contains NOAA forecasts used for the EFI-USGS River Chlorophyll Forecasting Challenge. The forecasts are the raw forecasts that include all ensemble members (if a forecast represents uncertainty using an ensemble). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a site or datetime, we also provide the code to access the data at the site_id and datetime level as an asset for each forecast.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Stage2",
- "extent": {
- "spatial": {
- "bbox": [
- [-122.6692, 39.6328, -74.7781, 45.5175]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-01-29T00:00:00Z",
- "2024-03-13T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "site_id",
- "type": "string",
-      "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat)"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "height",
- "type": "string",
- "description": "variable height"
- },
- {
- "name": "horizon",
- "type": "double",
- "description": "number of days in forecast"
- },
- {
- "name": "parameter",
- "type": "int32",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "family",
- "type": "string",
-      "description": "For ensembles: “ensemble.” For probability distributions: name of the statistical distribution associated with the reported statistics (default value if unspecified). The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”"
- },
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "forecast_valid",
- "type": "string",
- "description": "date when forecast is valid"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "longitude",
- "type": "double",
- "description": "forecast site longitude"
- },
- {
- "name": "latitude",
- "type": "double",
- "description": "forecast site latitude"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@drivers/noaa/gefs-v12-reprocess/stage2/parquet?endpoint_override=s3.flare-forecast.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
-      "description": "Use `arrow` for remote access to the database. This R code will return results for NOAA forecasts associated with the forecasting challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@drivers/noaa/gefs-v12-reprocess/stage2/parquet?endpoint_override=s3.flare-forecast.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/DSC_0001.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Image"
- }
- }
-}
diff --git a/catalog/noaa_forecasts/Stage3/collection.json b/catalog/noaa_forecasts/Stage3/collection.json
deleted file mode 100644
index 9946239a4c..0000000000
--- a/catalog/noaa_forecasts/Stage3/collection.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "id": "Stage3",
-  "description": "The catalog contains NOAA forecasts used for the EFI-USGS River Chlorophyll Forecasting Challenge. The forecasts are the raw forecasts that include all ensemble members (if a forecast represents uncertainty using an ensemble). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a site or datetime, we also provide the code to access the data at the site_id and datetime level as an asset for each forecast.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Stage3",
- "extent": {
- "spatial": {
- "bbox": [
- [-122.6692, 39.6328, -74.7781, 45.5175]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-01-29T00:00:00Z",
- "2024-03-13T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "site_id",
- "type": "string",
-      "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat)"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "height",
- "type": "string",
- "description": "variable height"
- },
- {
- "name": "horizon",
- "type": "double",
- "description": "number of days in forecast"
- },
- {
- "name": "parameter",
- "type": "int32",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "family",
- "type": "string",
-      "description": "For ensembles: “ensemble.” For probability distributions: name of the statistical distribution associated with the reported statistics (default value if unspecified). The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”"
- },
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "forecast_valid",
- "type": "string",
- "description": "date when forecast is valid"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "longitude",
- "type": "double",
- "description": "forecast site longitude"
- },
- {
- "name": "latitude",
- "type": "double",
- "description": "forecast site latitude"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@drivers/noaa/gefs-v12-reprocess/stage3/parquet?endpoint_override=s3.flare-forecast.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
-      "description": "Use `arrow` for remote access to the database. This R code will return results for NOAA forecasts associated with the forecasting challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@drivers/noaa/gefs-v12-reprocess/stage3/parquet?endpoint_override=s3.flare-forecast.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/DSC_0001.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Image"
- }
- }
-}
diff --git a/catalog/noaa_forecasts/collection.json b/catalog/noaa_forecasts/collection.json
deleted file mode 100644
index cb74bce851..0000000000
--- a/catalog/noaa_forecasts/collection.json
+++ /dev/null
@@ -1,177 +0,0 @@
-{
- "id": "noaa-forecasts",
-  "description": "The catalog contains NOAA forecasts used for the EFI-USGS River Chlorophyll Forecasting Challenge. The forecasts are the raw forecasts that include all ensemble members (if a forecast represents uncertainty using an ensemble). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a site or datetime, we also provide the code to access the data at the site_id and datetime level as an asset for each forecast.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "Pseudo/collection.json",
- "title": "Pseudo"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Stage1-stats/collection.json",
- "title": "Stage1-stats"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Stage1/collection.json",
- "title": "Stage1"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Stage2/collection.json",
- "title": "Stage2"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Stage3/collection.json",
- "title": "Stage3"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "NOAA-Forecasts",
- "extent": {
- "spatial": {
- "bbox": [
- [
- -122.6692,
- 39.6327,
- -74.7781,
- 45.5175
- ]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-01-29T00:00:00Z",
- "2024-03-13T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "site_id",
- "type": "string",
-      "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat)"
- },
- {
- "name": "prediction",
- "type": "double",
- "description": "predicted value for variable"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "height",
- "type": "string",
- "description": "variable height"
- },
- {
- "name": "horizon",
- "type": "double",
- "description": "number of days in forecast"
- },
- {
- "name": "parameter",
- "type": "int32",
- "description": "ensemble member or distribution parameter"
- },
- {
- "name": "family",
- "type": "string",
-      "description": "For ensembles: “ensemble.” For probability distributions: name of the statistical distribution associated with the reported statistics (default value if unspecified). The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”"
- },
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "forecast_valid",
- "type": "string",
- "description": "date when forecast is valid"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "longitude",
- "type": "double",
- "description": "forecast site longitude"
- },
- {
- "name": "latitude",
- "type": "double",
- "description": "forecast site latitude"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@drivers/noaa/gefs-v12-reprocess/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
-      "description": "Use `arrow` for remote access to the database. This R code will return results for the Forecasting Challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@drivers/noaa/gefs-v12-reprocess/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the download and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/DSC_0001.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Image"
- }
- }
-}
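
The collection descriptions above promise quicker, site- and datetime-level access. A minimal sketch against the Stage2 asset path, assuming `air_temperature` as the variable name (the `table:columns` schema fixes the column names, but not the variable vocabulary):

```r
library(arrow)
library(dplyr)

# Sketch: pull one site's recent Stage2 ensemble members for a single
# NOAA variable. "air_temperature" is an assumed variable name.
noaa <- arrow::open_dataset("s3://anonymous@drivers/noaa/gefs-v12-reprocess/stage2/parquet?endpoint_override=s3.flare-forecast.org")

met <- noaa |>
  dplyr::filter(site_id == "USGS-05553700",
                variable == "air_temperature",
                datetime >= as.POSIXct("2024-02-01", tz = "UTC")) |>
  dplyr::select(datetime, parameter, prediction) |>
  dplyr::collect()
```
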
diff --git a/catalog/noaa_forecasts/noaa_forecasts.R b/catalog/noaa_forecasts/noaa_forecasts.R
deleted file mode 100644
index 6fbdeab7f3..0000000000
--- a/catalog/noaa_forecasts/noaa_forecasts.R
+++ /dev/null
@@ -1,111 +0,0 @@
-library(arrow)
-library(dplyr)
-library(gsheet)
-library(readr)
-
-#source('catalog/R/stac_functions.R')
-config <- yaml::read_yaml('challenge_configuration.yaml')
-catalog_config <- config$catalog_config
-
-# file.sources = list.files(c("../stac4cast/R"), full.names=TRUE,
-# ignore.case=TRUE)
-# sapply(file.sources,source,.GlobalEnv)
-
-## CREATE table for column descriptions
-noaa_description_create <- data.frame(site_id = 'For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat)',
- prediction = 'predicted value for variable',
- variable = 'name of forecasted variable',
- height = 'variable height',
- horizon = 'number of days in forecast',
- parameter = 'ensemble member or distribution parameter',
-                              family = 'For ensembles: “ensemble.” For probability distributions: name of the statistical distribution associated with the reported statistics (default value if unspecified). The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”',
- reference_datetime = 'datetime that the forecast was initiated (horizon = 0)',
- forecast_valid = 'date when forecast is valid',
- datetime = 'datetime of the forecasted value (ISO 8601)',
- longitude = 'forecast site longitude',
- latitude = 'forecast site latitude')
-
-
-noaa_theme_df <- arrow::open_dataset(arrow::s3_bucket(paste0(config$noaa_forecast_bucket,"/stage2/parquet/0/2023-08-01/feea"), endpoint_override = config$noaa_endpoint, anonymous = TRUE))
-
-
-noaa_theme_dates <- arrow::open_dataset(arrow::s3_bucket(paste0(config$driver_bucket,"/gefs-v12/stage2"),
- endpoint_override = config$endpoint,
- anonymous = TRUE)) |>
- dplyr::summarise(min(datetime),max(datetime)) |>
- collect()
-noaa_min_date <- noaa_theme_dates$`min(datetime)`
-noaa_max_date <- noaa_theme_dates$`max(datetime)`
-
-#filter(model_id == model_id, site_id = site_id, reference_datetime = reference_datetime)
-# NOTE IF NOT USING FILTER -- THE stac4cast::build_table_columns() NEEDS TO BE UPDATED
-#(USE strsplit(forecast_theme_df$ToString(), "\n") INSTEAD OF strsplit(forecast_theme_df[[1]]$ToString(), "\n"))
-
-## identify model ids from bucket -- used in generate model items function
-# noaa_data_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog"),
-# s3_endpoint = config$endpoint, anonymous=TRUE) |>
-# collect()
-
-# theme_models <- forecast_data_df |>
-# distinct(model_id)
-
-### SET TO PENDING FOR NOW
-# noaa_date_range <- noaa_data_df |> dplyr::summarise(min(datetime),max(datetime))
-# noaa_min_date <- noaa_date_range$`min(datetime)`
-# noaa_max_date <- noaa_date_range$`max(datetime)`
-
-build_description <- paste0("The catalog contains NOAA forecasts used for the ", config$challenge_long_name,". The forecasts are the raw forecasts that include all ensemble members (if a forecast represents uncertainty using an ensemble). You can access the forecasts at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a site or datetime, we also provide the code to access the data at the site_id and datetime level as an asset for each forecast")
-
-stac4cast::build_forecast_scores(table_schema = noaa_theme_df,
- #theme_id = 'Forecasts',
- table_description = noaa_description_create,
- start_date = noaa_min_date,
- end_date = noaa_max_date,
- id_value = "noaa-forecasts",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- theme_title = "NOAA-Forecasts",
- destination_path = catalog_config$noaa_path,
- aws_download_path = config$noaa_forecast_bucket,
- link_items = stac4cast::generate_group_values(group_values = config$noaa_forecast_groups),
- thumbnail_link = catalog_config$noaa_thumbnail,
- thumbnail_title = catalog_config$noaa_thumbnail_title,
- model_child = FALSE)
-
-
-## BUILD VARIABLE GROUPS
-## find group sites
-find_noaa_sites <- read_csv(config$site_table) |>
- distinct(site_id)
-
-for (i in seq_along(config$noaa_forecast_groups)) { ## organize variable groups
- print(config$noaa_forecast_groups[i])
-
-
- if (!dir.exists(paste0(catalog_config$noaa_path,config$noaa_forecast_groups[i]))){
- dir.create(paste0(catalog_config$noaa_path,config$noaa_forecast_groups[i]))
- }
-
-
- ## CREATE NOAA GROUP JSONS
- group_description <- paste0('This page includes information for the NOAA forecast group ', config$noaa_forecast_groups[i])
-
- stac4cast::build_noaa_forecast(table_schema = noaa_theme_df,
- table_description = noaa_description_create,
- start_date = noaa_min_date,
- end_date = noaa_max_date,
- id_value = config$noaa_forecast_groups[i],
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- theme_title = config$noaa_forecast_groups[i],
- destination_path = paste0(catalog_config$noaa_path, config$noaa_forecast_groups[i]),
- aws_download_path = config$noaa_forecast_bucket,
- link_items = NULL,
- thumbnail_link = catalog_config$noaa_thumbnail,
- thumbnail_title = catalog_config$noaa_thumbnail_title,
- group_sites = find_noaa_sites$site_id,
- path_item = config$noaa_forecast_group_paths[i])
-
-}
diff --git a/catalog/scores/Aquatics/Daily_Chlorophyll_a/collection.json b/catalog/scores/Aquatics/Daily_Chlorophyll_a/collection.json
deleted file mode 100644
index 6ca03f75d9..0000000000
--- a/catalog/scores/Aquatics/Daily_Chlorophyll_a/collection.json
+++ /dev/null
@@ -1,312 +0,0 @@
-{
- "id": "Daily_Chlorophyll_a",
- "description": "This page includes all models for the Daily_Chlorophyll_a variable.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/USGSHABs1.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/cb_prophet.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/climatology.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/persistenceRW.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procBlanchardMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procBlanchardSteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procCTMIMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procCTMISteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procEppleyNorbergMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procEppleyNorbergSteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procHinshelwoodMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procHinshelwoodSteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_arima.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_auto_adam.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_bag_mlp.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_ets.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_humidity_lm.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_humidity_lm_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_lasso.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_lasso_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_precip_lm.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_precip_lm_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_randfor.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_tbats.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_temp_lm.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_temp_lm_all_sites.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "type": "text/html",
- "title": "NEON Ecological Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "title": "NEON Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Daily_Chlorophyll_a",
- "extent": {
- "spatial": {
- "bbox": [
- [-149.6106, 29.676, -82.0084, 68.6307]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2023-01-01T00:00:00Z",
- "2023-11-30T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat); however in netCDF this could be handled by the CF Discrete Sampling Geometry data model."
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified For probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”If this dimension does not vary, it is permissible to specify family as a variable attribute if the file format being used supports this (e.g.,netCDF)."
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "observation",
- "type": "double",
- "description": "observed value for variable"
- },
- {
- "name": "crps",
- "type": "double",
- "description": "crps forecast score"
- },
- {
- "name": "logs",
- "type": "double",
- "description": "logs forecast score"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique project identifier"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "date",
- "type": "string",
- "description": "ISO 8601 (ISO 2019) date of the predicted value; follows CF convention http://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate. This variable was called time before v0.5of the EFI convention. For time-integrated variables (e.g., cumulative net primary productivity), one should specify the start_datetime and end_datetime as two variables, instead of the single datetime. If this is not provided the datetime is assumed to be the MIDPOINT of the integration period."
- }
- ],
- "assets": {
- "data": {
- "href": "\"s3://anonymous@bio230014-bucket01/challenges/scores/parquet/project_id=neon4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org\"",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model .\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/scores/parquet/project_id=neon4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "pending",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "pending"
- }
- }
-}
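
The `variable=chla` segment of the asset href above is a hive-style partition, so opening the dataset at that path prunes every other variable before any data moves. A minimal sketch of narrowing further to a single model, reusing the URL from the asset and one of the `model_id` values linked in this collection:

```{r}
library(arrow)
library(dplyr)

chla_scores <- arrow::open_dataset(
  "s3://anonymous@bio230014-bucket01/challenges/scores/parquet/project_id=neon4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org")

# Filter remotely before collecting; "climatology" is one of the model items
# linked from this collection.
df <- chla_scores |>
  filter(model_id == "climatology") |>
  collect()
```
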
diff --git a/catalog/scores/Aquatics/collection.json b/catalog/scores/Aquatics/collection.json
deleted file mode 100644
index 7352bcf1b6..0000000000
--- a/catalog/scores/Aquatics/collection.json
+++ /dev/null
@@ -1,197 +0,0 @@
-{
- "id": "Aquatics",
- "description": "This page includes variables for the Aquatics group.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Dissolved_oxygen/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Water_temperature/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Chlorophyll_a/collection.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "type": "text/html",
- "title": "NEON Ecological Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "title": "NEON Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Aquatics",
- "extent": {
- "spatial": {
- "bbox": [
- [-149.6106, 18.1135, -66.7987, 68.6698]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2023-01-01T00:00:00Z",
- "2023-12-15T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat); however in netCDF this could be handled by the CF Discrete Sampling Geometry data model."
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified For probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”If this dimension does not vary, it is permissible to specify family as a variable attribute if the file format being used supports this (e.g.,netCDF)."
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "observation",
- "type": "double",
- "description": "observed value for variable"
- },
- {
- "name": "crps",
- "type": "double",
- "description": "crps forecast score"
- },
- {
- "name": "logs",
- "type": "double",
- "description": "logs forecast score"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique project identifier"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "date",
- "type": "string",
- "description": "ISO 8601 (ISO 2019) date of the predicted value; follows CF convention http://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate. This variable was called time before v0.5of the EFI convention. For time-integrated variables (e.g., cumulative net primary productivity), one should specify the start_datetime and end_datetime as two variables, instead of the single datetime. If this is not provided the datetime is assumed to be the MIDPOINT of the integration period."
- }
- ],
- "assets": {
- "data": {
- "href": "\"s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org\"",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for the NEON Ecological Forecasting Aquatics theme.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |>\n dplyr::filter(variable %in% c(\"oxygen\", \"temperature\", \"chla\")) |>\n dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://projects.ecoforecast.org/neon4cast-catalog/img/neon_buoy.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "NEON Aquatics Buoy"
- }
- }
-}
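
The description above advises applying dplyr verbs before `dplyr::collect()`. A sketch of what that looks like for this theme, aggregating CRPS by model so only a small summary table crosses the network (column names follow the `table:columns` schema above):

```{r}
library(arrow)
library(dplyr)

scores <- arrow::open_dataset(
  "s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org")

# Aggregate on the remote dataset; only the per-model summary is downloaded.
leaderboard <- scores |>
  filter(variable %in% c("oxygen", "temperature", "chla")) |>
  group_by(model_id, variable) |>
  summarise(mean_crps = mean(crps, na.rm = TRUE)) |>
  collect()
```
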
diff --git a/catalog/scores/aquatics/Daily_Chlorophyll_a/collection.json b/catalog/scores/aquatics/Daily_Chlorophyll_a/collection.json
deleted file mode 100644
index 21dd28a1d7..0000000000
--- a/catalog/scores/aquatics/Daily_Chlorophyll_a/collection.json
+++ /dev/null
@@ -1,192 +0,0 @@
-{
- "id": "Daily_Chlorophyll_a",
- "description": "This page includes all models for the Daily_Chlorophyll_a variable.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/persistenceRW.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/climatology.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Daily_Chlorophyll_a",
- "extent": {
- "spatial": {
- "bbox": [
- ["Inf", "Inf", "-Inf", "-Inf"]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-02-09T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat); however in netCDF this could be handled by the CF Discrete Sampling Geometry data model."
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified For probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”If this dimension does not vary, it is permissible to specify family as a variable attribute if the file format being used supports this (e.g.,netCDF)."
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "observation",
- "type": "double",
- "description": "observed value for variable"
- },
- {
- "name": "crps",
- "type": "double",
- "description": "crps forecast score"
- },
- {
- "name": "logs",
- "type": "double",
- "description": "logs forecast score"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique project identifier"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "date",
- "type": "string",
- "description": "ISO 8601 (ISO 2019) date of the predicted value; follows CF convention http://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate. This variable was called time before v0.5of the EFI convention. For time-integrated variables (e.g., cumulative net primary productivity), one should specify the start_datetime and end_datetime as two variables, instead of the single datetime. If this is not provided the datetime is assumed to be the MIDPOINT of the integration period."
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/scores/parquet/project_id=usgsrc4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model .\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/scores/parquet/project_id=usgsrc4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "pending",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "pending"
- }
- }
-}
diff --git a/catalog/scores/aquatics/collection.json b/catalog/scores/aquatics/collection.json
deleted file mode 100644
index 3b37c864e9..0000000000
--- a/catalog/scores/aquatics/collection.json
+++ /dev/null
@@ -1,187 +0,0 @@
-{
- "id": "aquatics",
- "description": "This page includes variables for the aquatics group.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Chlorophyll_a/collection.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "aquatics",
- "extent": {
- "spatial": {
- "bbox": [
- ["Inf", "Inf", "-Inf", "-Inf"]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-02-09T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat); however in netCDF this could be handled by the CF Discrete Sampling Geometry data model."
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified For probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”If this dimension does not vary, it is permissible to specify family as a variable attribute if the file format being used supports this (e.g.,netCDF)."
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "observation",
- "type": "double",
- "description": "observed value for variable"
- },
- {
- "name": "crps",
- "type": "double",
- "description": "crps forecast score"
- },
- {
- "name": "logs",
- "type": "double",
- "description": "logs forecast score"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique project identifier"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "date",
- "type": "string",
- "description": "ISO 8601 (ISO 2019) date of the predicted value; follows CF convention http://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate. This variable was called time before v0.5of the EFI convention. For time-integrated variables (e.g., cumulative net primary productivity), one should specify the start_datetime and end_datetime as two variables, instead of the single datetime. If this is not provided the datetime is assumed to be the MIDPOINT of the integration period."
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for the NEON Ecological Forecasting Aquatics theme.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |>\n dplyr::filter(variable %in% c(\"chla\")) |>\n dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Streamgage"
- }
- }
-}
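
Because parquet is columnar, a `select()` before `collect()` avoids downloading unneeded columns entirely. A sketch using the asset URL above, keeping only the columns a scores plot would need:

```{r}
library(arrow)
library(dplyr)

scores <- arrow::open_dataset(
  "s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org")

# Drop rows and columns remotely; untouched parquet columns are never read.
df <- scores |>
  filter(variable == "chla") |>
  select(site_id, datetime, model_id, observation, mean, crps) |>
  collect()
```
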
diff --git a/catalog/scores/collection.json b/catalog/scores/collection.json
deleted file mode 100644
index ca551f7e4b..0000000000
--- a/catalog/scores/collection.json
+++ /dev/null
@@ -1,199 +0,0 @@
-{
- "id": "daily-scores",
- "description": "The catalog contains scores for the EFI-USGS River Chlorophyll Forecasting Challenge. The scores are summaries of the forecasts (i.e., mean, median, confidence intervals), matched observations (if available), and scores (metrics of how well the model distribution compares to observations). You can access the scores at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the scores catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the scores for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "aquatics/collection.json",
- "title": "aquatics"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "models/collection.json",
- "title": "group item"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Scores",
- "extent": {
- "spatial": {
- "bbox": [
- [
- -122.6692,
- 39.6327,
- -74.7781,
- 45.5175
- ]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-02-09T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat); however in netCDF this could be handled by the CF Discrete Sampling Geometry data model."
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified For probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”If this dimension does not vary, it is permissible to specify family as a variable attribute if the file format being used supports this (e.g.,netCDF)."
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "observation",
- "type": "double",
- "description": "observed value for variable"
- },
- {
- "name": "crps",
- "type": "double",
- "description": "crps forecast score"
- },
- {
- "name": "logs",
- "type": "double",
- "description": "logs forecast score"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique project identifier"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "date",
- "type": "string",
- "description": "ISO 8601 (ISO 2019) date of the predicted value; follows CF convention http://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate. This variable was called time before v0.5of the EFI convention. For time-integrated variables (e.g., cumulative net primary productivity), one should specify the start_datetime and end_datetime as two variables, instead of the single datetime. If this is not provided the datetime is assumed to be the MIDPOINT of the integration period."
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for the Forecasting Challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Streamgage"
- }
- }
-}
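
The `crps` and `logs` columns above are proper scoring rules that compare the forecast distribution to the observation (lower is better for both). The challenge's scoring pipeline is not part of this file; as an illustration of what the columns mean, this sketch assumes the scoringRules package and made-up numbers:

```{r}
library(scoringRules)

# A hypothetical normal-family forecast (mean = 20, sd = 4) scored against an
# observation of 23.
crps <- crps_norm(y = 23, mean = 20, sd = 4)
logs <- logs_norm(y = 23, mean = 20, sd = 4)

# Ensemble ("sample") forecasts are scored against the member draws instead.
members <- rnorm(200, mean = 20, sd = 4)
crps_ens <- crps_sample(y = 23, dat = members)
```
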
diff --git a/catalog/scores/models/collection.json b/catalog/scores/models/collection.json
deleted file mode 100644
index 7cda4dea03..0000000000
--- a/catalog/scores/models/collection.json
+++ /dev/null
@@ -1,192 +0,0 @@
-{
- "id": "models",
- "description": "The catalog contains scores for the EFI-USGS River Chlorophyll Forecasting Challenge. The scores are summaries of the forecasts (i.e., mean, median, confidence intervals), matched observations (if available), and scores (metrics of how well the model distribution compares to observations). You can access the scores at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the scores catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the scores for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "item",
- "type": "application/json",
- "href": "model_items/persistenceRW.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "model_items/climatology.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Models",
- "extent": {
- "spatial": {
- "bbox": [
- [-122.6692, 39.6328, -74.7781, 45.5175]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-02-09T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat); however in netCDF this could be handled by the CF Discrete Sampling Geometry data model."
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified For probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”If this dimension does not vary, it is permissible to specify family as a variable attribute if the file format being used supports this (e.g.,netCDF)."
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "observation",
- "type": "double",
- "description": "observed value for variable"
- },
- {
- "name": "crps",
- "type": "double",
- "description": "crps forecast score"
- },
- {
- "name": "logs",
- "type": "double",
- "description": "logs forecast score"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique project identifier"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "date",
- "type": "string",
- "description": "ISO 8601 (ISO 2019) date of the predicted value; follows CF convention http://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate. This variable was called time before v0.5of the EFI convention. For time-integrated variables (e.g., cumulative net primary productivity), one should specify the start_datetime and end_datetime as two variables, instead of the single datetime. If this is not provided the datetime is assumed to be the MIDPOINT of the integration period."
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model .\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/scores/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "pending",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "pending"
- }
- }
-}
diff --git a/catalog/scores/models/model_items/.empty b/catalog/scores/models/model_items/.empty
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/catalog/scores/models/model_items/climatology.json b/catalog/scores/models/model_items/climatology.json
deleted file mode 100644
index df69da2ac9..0000000000
--- a/catalog/scores/models/model_items/climatology.json
+++ /dev/null
@@ -1,214 +0,0 @@
-{
- "stac_version": "1.0.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Feature",
- "id": "climatology",
- "bbox": [
- [
- -122.6692,
- 45.5175,
- -74.7781,
- 45.5175
- ]
- ],
- "geometry": {
- "type": "MultiPoint",
- "coordinates": [
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- []
- ]
- },
- "properties": {
- "description": "\nmodel info: Forecasts stream chlorophyll-a based on the historic average and standard deviation for that given site and day-of-year.\n\nSites: USGS-01427510, USGS-01463500, USGS-05543010, USGS-05553700, USGS-05558300, USGS-05586300, USGS-14181500, USGS-14211010, USGS-14211720\n\nVariables: Daily Chlorophyll_a",
- "start_datetime": "2024-02-07",
- "end_datetime": "2024-02-09",
- "providers": [
- {
- "url": "pending",
- "name": "pending",
- "roles": [
- "producer",
- "processor",
- "licensor"
- ]
- },
- {
- "url": "https://www.ecoforecastprojectvt.org",
- "name": "Ecoforecast Challenge",
- "roles": [
- "host"
- ]
- }
- ],
- "license": "CC0-1.0",
- "keywords": [
- "Forecasting",
- "usgsrc4cast",
- "Daily Chlorophyll_a"
- ],
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat); however in netCDF this could be handled by the CF Discrete Sampling Geometry data model."
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified For probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”If this dimension does not vary, it is permissible to specify family as a variable attribute if the file format being used supports this (e.g.,netCDF)."
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "observation",
- "type": "double",
- "description": "observed value for variable"
- },
- {
- "name": "crps",
- "type": "double",
- "description": "crps forecast score"
- },
- {
- "name": "logs",
- "type": "double",
- "description": "logs forecast score"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique project identifier"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "date",
- "type": "string",
- "description": "ISO 8601 (ISO 2019) date of the predicted value; follows CF convention http://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate. This variable was called time before v0.5of the EFI convention. For time-integrated variables (e.g., cumulative net primary productivity), one should specify the start_datetime and end_datetime as two variables, instead of the single datetime. If this is not provided the datetime is assumed to be the MIDPOINT of the integration period."
- }
- ]
- },
- "collection": "scores",
- "links": [
- {
- "rel": "collection",
- "href": "../collection.json",
- "type": "application/json",
- "title": "climatology"
- },
- {
- "rel": "root",
- "href": "../../../catalog.json",
- "type": "application/json",
- "title": "Forecast Catalog"
- },
- {
- "rel": "parent",
- "href": "../collection.json",
- "type": "application/json",
- "title": "climatology"
- },
- {
- "rel": "self",
- "href": "climatology.json",
- "type": "application/json",
- "title": "Model Forecast"
- },
- {
- "rel": "item",
- "href": "pending",
- "type": "text/html",
- "title": "Link for Model Code"
- }
- ],
- "assets": {
- "1": {
- "type": "application/json",
- "title": "Model Metadata",
- "href": "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/climatology.json",
- "description": "Use `jsonlite::fromJSON()` to download the model metadata JSON file. This R code will return metadata provided during the model registration.\n \n\n### R\n\n```{r}\n# Use code below\n\nmodel_metadata <- jsonlite::fromJSON(\"https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/climatology.json\")\n\n"
- },
- "2": {
- "type": "text/html",
- "title": "Link for Model Code",
- "href": "pending",
- "description": "The link to the model code provided by the model submission team"
- },
- "3": {
- "type": "application/x-parquet",
- "title": "Database Access for Daily Chlorophyll_a",
- "href": "s3://anonymous@bio230014-bucket01/challenges/scoresproject_id=/duration=P1D/variable=chla/model_id=climatology?endpoint_override=sdsc.osn.xsede.org",
- "description": "Use `arrow` for remote access to the database. This R code will return results for this variable and model combination.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/scoresproject_id=/duration=P1D/variable=chla/model_id=climatology?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- }
- }
-}
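
The item description says the climatology model forecasts chlorophyll-a from the historical mean and standard deviation for each site and day of year. A minimal sketch of that idea; the targets file and its column names are assumptions for illustration, not part of this catalog:

```{r}
library(dplyr)
library(lubridate)

# Hypothetical historical observations with columns site_id, datetime, observation
targets <- readr::read_csv("chla_targets.csv")

# Historical mean and sd for each site and day of year
doy_clim <- targets |>
  mutate(doy = yday(datetime)) |>
  group_by(site_id, doy) |>
  summarise(mu = mean(observation, na.rm = TRUE),
            sigma = sd(observation, na.rm = TRUE),
            .groups = "drop")

# A normal-family forecast for the next 35 days: each forecast date picks up
# the climatology of every site that shares its day of year.
fdates <- seq(Sys.Date() + 1, by = "1 day", length.out = 35)
fc <- tibble(datetime = fdates, doy = yday(fdates)) |>
  left_join(doy_clim, by = "doy")
```
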
diff --git a/catalog/scores/models/model_items/persistenceRW.json b/catalog/scores/models/model_items/persistenceRW.json
deleted file mode 100644
index de73b6f9b6..0000000000
--- a/catalog/scores/models/model_items/persistenceRW.json
+++ /dev/null
@@ -1,215 +0,0 @@
-{
- "stac_version": "1.0.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Feature",
- "id": "persistenceRW",
- "bbox": [
- [
- -122.6692,
- 45.5175,
- -74.7781,
- 45.5175
- ]
- ],
- "geometry": {
- "type": "MultiPoint",
- "coordinates": [
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- []
- ]
- },
- "properties": {
- "description": "\nmodel info: Random walk model based on most recent stream chl-a observations using the fable::RW() model.\n\nSites: USGS-01427510, USGS-01463500, USGS-05543010, USGS-05549500, USGS-05553700, USGS-05558300, USGS-05586300, USGS-14181500, USGS-14211010, USGS-14211720\n\nVariables: Daily Chlorophyll_a",
- "start_datetime": "2024-02-07",
- "end_datetime": "2024-02-09",
- "providers": [
- {
- "url": "pending",
- "name": "pending",
- "roles": [
- "producer",
- "processor",
- "licensor"
- ]
- },
- {
- "url": "https://www.ecoforecastprojectvt.org",
- "name": "Ecoforecast Challenge",
- "roles": [
- "host"
- ]
- }
- ],
- "license": "CC0-1.0",
- "keywords": [
- "Forecasting",
- "usgsrc4cast",
- "Daily Chlorophyll_a"
- ],
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat); however in netCDF this could be handled by the CF Discrete Sampling Geometry data model."
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified For probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.”If this dimension does not vary, it is permissible to specify family as a variable attribute if the file format being used supports this (e.g.,netCDF)."
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "observation",
- "type": "double",
- "description": "observed value for variable"
- },
- {
- "name": "crps",
- "type": "double",
- "description": "crps forecast score"
- },
- {
- "name": "logs",
- "type": "double",
- "description": "logs forecast score"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique project identifier"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "date",
- "type": "string",
- "description": "ISO 8601 (ISO 2019) date of the predicted value; follows CF convention http://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate. This variable was called time before v0.5of the EFI convention. For time-integrated variables (e.g., cumulative net primary productivity), one should specify the start_datetime and end_datetime as two variables, instead of the single datetime. If this is not provided the datetime is assumed to be the MIDPOINT of the integration period."
- }
- ]
- },
- "collection": "scores",
- "links": [
- {
- "rel": "collection",
- "href": "../collection.json",
- "type": "application/json",
- "title": "persistenceRW"
- },
- {
- "rel": "root",
- "href": "../../../catalog.json",
- "type": "application/json",
- "title": "Forecast Catalog"
- },
- {
- "rel": "parent",
- "href": "../collection.json",
- "type": "application/json",
- "title": "persistenceRW"
- },
- {
- "rel": "self",
- "href": "persistenceRW.json",
- "type": "application/json",
- "title": "Model Forecast"
- },
- {
- "rel": "item",
- "href": "pending",
- "type": "text/html",
- "title": "Link for Model Code"
- }
- ],
- "assets": {
- "1": {
- "type": "application/json",
- "title": "Model Metadata",
- "href": "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/persistenceRW.json",
- "description": "Use `jsonlite::fromJSON()` to download the model metadata JSON file. This R code will return metadata provided during the model registration.\n \n\n### R\n\n```{r}\n# Use code below\n\nmodel_metadata <- jsonlite::fromJSON(\"https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/persistenceRW.json\")\n\n"
- },
- "2": {
- "type": "text/html",
- "title": "Link for Model Code",
- "href": "pending",
- "description": "The link to the model code provided by the model submission team"
- },
- "3": {
- "type": "application/x-parquet",
- "title": "Database Access for Daily Chlorophyll_a",
- "href": "s3://anonymous@bio230014-bucket01/challenges/scoresproject_id=/duration=P1D/variable=chla/model_id=persistenceRW?endpoint_override=sdsc.osn.xsede.org",
- "description": "Use `arrow` for remote access to the database. This R code will return results for this variable and model combination.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/scoresproject_id=/duration=P1D/variable=chla/model_id=persistenceRW?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- }
- }
-}
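The asset descriptions above note that `dplyr` verbs can be applied before `dplyr::collect()`, but never show one. A minimal sketch of such a filtered query against the scores asset (URL taken from the asset above; the site and date filters are illustrative placeholders, though `USGS-05553700` does appear in this catalog's site list):

```r
library(arrow)
library(dplyr)

# open the persistenceRW scores remotely (same URL as asset "3" above)
scores <- open_dataset("s3://anonymous@bio230014-bucket01/challenges/scores/project_id=usgsrc4cast/duration=P1D/variable=chla/model_id=persistenceRW?endpoint_override=sdsc.osn.xsede.org")

# filter and select remotely, then pull only the reduced result into memory
df <- scores |>
  filter(site_id == "USGS-05553700",                         # illustrative site
         datetime >= as.POSIXct("2024-02-07", tz = "UTC")) | # illustrative date
  select(reference_datetime, datetime, mean, sd, crps, logs) |>
  collect()
```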
diff --git a/catalog/scores/scores_models.R b/catalog/scores/scores_models.R
deleted file mode 100644
index 2cffa8430d..0000000000
--- a/catalog/scores/scores_models.R
+++ /dev/null
@@ -1,316 +0,0 @@
-library(arrow)
-library(dplyr)
-library(gsheet)
-library(readr)
-
-#source('catalog/R/stac_functions.R')
-config <- yaml::read_yaml('challenge_configuration.yaml')
-catalog_config <- config$catalog_config
-
-# names(config$variable_groups)
-# variable_groups <- names(config$variable_groups)
-# variable_list <- config$variable_groups
-
-
-## CREATE table for column descriptions
-scores_description_create <- data.frame(reference_datetime ='datetime that the forecast was initiated (horizon = 0)',
-                                        site_id = 'For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat); however, in netCDF this could be handled by the CF Discrete Sampling Geometry data model.',
- datetime = 'datetime of the forecasted value (ISO 8601)',
-                                        family = 'For ensembles: “ensemble.” Default value if unspecified. For probability distributions: name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.” For summary statistics: “summary.” If this dimension does not vary, it is permissible to specify family as a variable attribute if the file format being used supports this (e.g., netCDF).',
- variable = 'name of forecasted variable',
- observation = 'observed value for variable',
-                                        crps = 'continuous ranked probability score (CRPS) of the forecast',
-                                        logs = 'log score (logs) of the forecast',
- mean = 'mean forecast prediction',
- median = 'median forecast prediction',
-                                        sd = 'standard deviation of the forecast predictions',
-                                        quantile97.5 = '97.5th percentile value of the forecast',
-                                        quantile02.5 = '2.5th percentile value of the forecast',
-                                        quantile90 = '90th percentile value of the forecast',
-                                        quantile10 = '10th percentile value of the forecast',
- duration = 'temporal duration of forecast (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention',
- depth_m = 'depth (meters) in water column of prediction',
- model_id = 'unique model identifier',
-                                        date = 'ISO 8601 (ISO 2019) date of the predicted value; follows CF convention http://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate. This variable was called time before v0.5 of the EFI convention. For time-integrated variables (e.g., cumulative net primary productivity), one should specify the start_datetime and end_datetime as two variables, instead of the single datetime. If this is not provided, the datetime is assumed to be the MIDPOINT of the integration period.',
- pub_datetime = 'datetime that forecast was submitted',
- project_id = 'unique project identifier')
-
-
-## just read in example forecast to extract schema information -- ask about better ways of doing this
-# theme <- 'daily'
-# reference_datetime <- '2023-09-01'
-# site_id <- 'fcre'
-# model_id <- 'climatology'
-
-print('FIND SCORES TABLE SCHEMA')
-scores_theme_df <- arrow::open_dataset(arrow::s3_bucket(config$scores_bucket, endpoint_override = config$endpoint, anonymous = TRUE)) #|>
- #filter(model_id == model_id, site_id = site_id, reference_datetime = reference_datetime)
-
-## identify model ids from bucket -- used in generate model items function
-# scores_data_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog/forecasts/project_id={config$project_id}"),
-# s3_endpoint = config$endpoint, anonymous=TRUE) |>
-# collect()
-
-print('FIND INVENTORY BUCKET')
-scores_s3 <- arrow::s3_bucket(glue::glue("{config$inventory_bucket}/catalog/scores/project_id={config$project_id}"),
- endpoint_override = "sdsc.osn.xsede.org",
- anonymous=TRUE)
-
-print('OPEN INVENTORY BUCKET')
-scores_data_df <- arrow::open_dataset(scores_s3) |>
- filter(project_id == config$project_id) |>
- collect()
-
-theme_models <- scores_data_df |>
- distinct(model_id)
-
-scores_date_range <- scores_data_df |> dplyr::summarise(min(date),max(date))
-scores_min_date <- scores_date_range$`min(date)`
-scores_max_date <- scores_date_range$`max(date)`
-
-build_description <- paste0("The catalog contains scores for the ", config$challenge_long_name,". The scores are summaries of the forecasts (i.e., mean, median, confidence intervals), matched observations (if available), and scores (metrics of how well the model distribution compares to observations). You can access the scores at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the scores catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the scores for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.")
-
-#variable_group <- c('test_daily')
-
-
-stac4cast::build_forecast_scores(table_schema = scores_theme_df,
- #theme_id = 'Scores',
- table_description = scores_description_create,
- start_date = scores_min_date,
- end_date = scores_max_date,
- id_value = "daily-scores",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- theme_title = "Scores",
- destination_path = catalog_config$scores_path,
- aws_download_path = catalog_config$aws_download_path_scores,
- link_items = stac4cast::generate_group_values(group_values = names(config$variable_groups)),
- thumbnail_link = catalog_config$scores_thumbnail,
- thumbnail_title = catalog_config$scores_thumbnail_title,
- model_child = TRUE)
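For readers unfamiliar with `stac4cast`, the pairing of `table_schema` (an `arrow` dataset) with `table_description` (the data frame built above) is what produces the `"table:columns"` entries seen in the deleted JSON files. A rough sketch of that mapping, purely illustrative and not `stac4cast`'s actual implementation:

```r
# illustrative only -- not stac4cast's code: merge the arrow schema with the
# hand-written description table into "table:columns"-shaped entries
schema <- scores_theme_df$schema
table_columns <- lapply(schema$names, function(nm) {
  list(name = nm,
       type = schema$GetFieldByName(nm)$type$ToString(),
       description = if (nm %in% names(scores_description_create))
         scores_description_create[[nm]][1] else "")
})
```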
-
-## create separate JSON for model landing page
-if (!dir.exists(paste0(catalog_config$scores_path,"models"))){
- dir.create(paste0(catalog_config$scores_path,"models"))
-}
-
-stac4cast::build_group_variables(table_schema = scores_theme_df,
- #theme_id = 'models',
- table_description = scores_description_create,
- start_date = scores_min_date,
- end_date = scores_max_date,
- id_value = "models",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- dashboard_string = catalog_config$dashboard_url,
- dashboard_title = catalog_config$dashboard_title,
- theme_title = "Models",
- destination_path = paste0(catalog_config$scores_path,"models"),
- aws_download_path = catalog_config$aws_download_path_scores,
- group_var_items = stac4cast::generate_model_items(model_list = theme_models$model_id),
- thumbnail_link = 'pending',
- thumbnail_title = 'pending',
- group_var_vector = NULL,
- group_sites = NULL)
-
-## CREATE MODELS
-
-## READ IN MODEL METADATA
-variable_gsheet <- gsheet2tbl(config$target_metadata_gsheet)
-
-#registered_model_id <- gsheet2tbl(config$model_metadata_gsheet)
-
-# read in model metadata and filter for the relevant project
-registered_model_id <- gsheet2tbl(config$model_metadata_gsheet) |>
- filter(`What forecasting challenge are you registering for?` == config$project_id)
-
-scores_sites <- c()
-
-## loop over model ids and extract components if present in metadata table
-for (m in theme_models$model_id){
-
-  # make model items directory (under the scores path, where build_model() writes below)
-  if (!dir.exists(paste0(catalog_config$scores_path,"models/model_items"))){
-    dir.create(paste0(catalog_config$scores_path,"models/model_items"))
- }
-
- print(m)
- model_date_range <- scores_data_df |> filter(model_id == m) |> dplyr::summarise(min(date),max(date))
- model_min_date <- model_date_range$`min(date)`
- model_max_date <- model_date_range$`max(date)`
-
- model_sites <- scores_data_df |> filter(model_id == m) |> distinct(site_id)
- model_vars <- scores_data_df |> filter(model_id == m) |> distinct(variable)
-
- model_var_duration_df <- scores_data_df |> filter(model_id == m) |> distinct(variable,duration) |>
- mutate(duration_name = ifelse(duration == 'P1D', 'Daily', duration)) |>
- mutate(duration_name = ifelse(duration == 'PT1H', 'Hourly', duration_name)) |>
- mutate(duration_name = ifelse(duration == 'PT30M', '30min', duration_name)) |>
- mutate(duration_name = ifelse(duration == 'P1W', 'Weekly', duration_name))
-
- model_var_full_name <- model_var_duration_df |>
- left_join((variable_gsheet |>
- select(variable = `"official" targets name`, full_name = `Variable name`) |>
- distinct(variable, .keep_all = TRUE)), by = c('variable'))
-
-
- model_vars <- scores_data_df |> filter(model_id == m) |> distinct(variable) |> left_join(model_var_full_name, by = 'variable')
- model_vars$var_duration_name <- paste0(model_vars$duration_name, " ", model_vars$full_name)
-
- #model_var_duration_df$full_variable_name <- paste0(model_var_duration_df$variable, "_", model_var_duration_df$duration_name)
-
- scores_sites <- append(scores_sites, stac4cast::get_site_coords(site_metadata = catalog_config$site_metadata_url,
- sites = model_sites$site_id))
-
- idx = which(registered_model_id$model_id == m)
-
- stac4cast::build_model(model_id = m,
- team_name = registered_model_id$`Long name of the model (can include spaces)`[idx],
- model_description = registered_model_id[idx,"Describe your modeling approach in your own words."][[1]],
- start_date = model_min_date,
- end_date = model_max_date,
- var_values = model_vars$var_duration_name,
- duration_names = model_var_duration_df$duration,
- site_values = model_sites$site_id,
- site_table = catalog_config$site_metadata_url,
- model_documentation = registered_model_id,
- destination_path = paste0(catalog_config$scores_path,"models/model_items"),
- aws_download_path = config$scores_bucket, # CHANGE THIS BUCKET NAME
- collection_name = 'scores',
- thumbnail_image_name = NULL,
- table_schema = scores_theme_df,
- table_description = scores_description_create,
- full_var_df = model_vars,
- #code_web_link = registered_model_id$`Web link to model code`[idx],
- code_web_link = 'pending')
-}
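One fragility in the loop above: `idx` comes from `which()` against the registration sheet, so an unregistered `model_id` yields a zero-length `idx`, and the `registered_model_id$...[idx]` subsets silently return empty values. A hedged sketch of a guard that could sit immediately after `idx` is computed:

```r
# hypothetical guard, placed right after idx is computed inside the loop
if (length(idx) == 0) {
  warning("model_id '", m, "' not found in the registration sheet; skipping")
  next
}
```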
-
-
-## BUILD VARIABLE GROUPS
-
-for (i in 1:length(config$variable_groups)){
- print(names(config$variable_groups)[i])
-
- # check data and skip if no data found
- var_group_data_check <- scores_data_df |>
- filter(variable %in% config$variable_groups[[i]]$variable)
-
- if (nrow(var_group_data_check) == 0){
- print('No data available for group')
- next
- }
-
-
-
- if (!dir.exists(paste0(catalog_config$scores_path,names(config$variable_groups[i])))){
- dir.create(paste0(catalog_config$scores_path,names(config$variable_groups[i])))
- }
-
- for(j in 1:length(config$variable_groups[[i]]$variable)){ # FOR EACH VARIABLE WITHIN A MODEL GROUP
-
- ## restructure variable names
- var_values <- config$variable_groups[[i]]$variable
- var_name <- config$variable_groups[[i]]$variable[j]
- print(var_name)
-
- # check data and skip if no data found
- var_data_check <- scores_data_df |>
- filter(variable == var_name)
-
- if (nrow(var_data_check) == 0){
- print('No data available for variable')
- next
- }
-
- duration_name <- config$variable_groups[[i]]$duration[j]
-
- # match variable with full name in gsheet
- var_name_full <- variable_gsheet[which(variable_gsheet$`"official" targets name` %in% var_values),1][[1]]
-
- ## create new vector to store duration names
- duration_values <- config$variable_groups[[i]]$duration
- duration_values[which(duration_values == 'P1D')] <- 'Daily'
- duration_values[which(duration_values == 'PT1H')] <- 'Hourly'
- duration_values[which(duration_values == 'PT30M')] <- '30min'
- duration_values[which(duration_values == 'P1W')] <- 'Weekly'
-
- var_name_combined_list <- paste0(duration_values,'_',var_name_full)
-
- ## CREATE VARIABLE GROUP JSONS
- group_description <- paste0('This page includes variables for the ',names(config$variable_groups[i]),' group.')
-
- ## find group sites
- find_group_sites <- scores_data_df |>
- filter(variable %in% var_values) |>
- distinct(site_id)
-
- stac4cast::build_group_variables(table_schema = scores_theme_df,
- #theme_id = names(config$variable_groups[i]),
- table_description = scores_description_create,
- start_date = scores_min_date,
- end_date = scores_max_date,
- id_value = names(config$variable_groups[i]),
- description_string = group_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- dashboard_string = catalog_config$dashboard_url,
- dashboard_title = catalog_config$dashboard_title,
- theme_title = names(config$variable_groups[i]),
- destination_path = paste0(catalog_config$scores_path,names(config$variable_groups[i])),
- aws_download_path = catalog_config$aws_download_path_scores,
- group_var_items = stac4cast::generate_group_variable_items(variables = var_name_combined_list),
- thumbnail_link = config$variable_groups[[i]]$thumbnail_link,
- thumbnail_title = config$variable_groups[[i]]$thumbnail_title,
- group_var_vector = unique(var_values),
- group_sites = find_group_sites$site_id)
-
- if (!dir.exists(paste0(catalog_config$scores_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))){
- dir.create(paste0(catalog_config$scores_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))
- }
-
- var_data <- scores_data_df |>
- filter(variable == var_name,
- duration == duration_name)
-
- var_date_range <- var_data |> dplyr::summarise(min(date),max(date))
- var_min_date <- var_date_range$`min(date)`
- var_max_date <- var_date_range$`max(date)`
-
- var_models <- var_data |> distinct(model_id)
-
- find_var_sites <- scores_data_df |>
- filter(variable == var_name) |>
- distinct(site_id)
-
- var_description <- paste0('This page includes all models for the ',var_name_combined_list[j],' variable.')
-
- var_path <- gsub('forecasts','scores',var_data$path[1])
-
- stac4cast::build_group_variables(table_schema = scores_theme_df,
- #theme_id = var_name_combined_list[j],
- table_description = scores_description_create,
- start_date = var_min_date,
- end_date = var_max_date,
- id_value = var_name_combined_list[j],
- description_string = var_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- dashboard_string = catalog_config$dashboard_url,
- dashboard_title = catalog_config$dashboard_title,
- theme_title = var_name_combined_list[j],
- destination_path = file.path(catalog_config$scores_path,names(config$variable_groups)[i],var_name_combined_list[j]),
- aws_download_path = var_path,
- group_var_items = stac4cast::generate_variable_model_items(model_list = var_models$model_id),
- thumbnail_link = 'pending',
- thumbnail_title = 'pending',
- group_var_vector = NULL,
- group_sites = find_var_sites$site_id)
-
- }
-
-
-}
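After a run, a quick way to confirm the items landed where the collection links expect them (paths assembled from the same `catalog_config` values used above; `persistenceRW` is one of the model ids in this catalog):

```r
# read back one generated model item and check its basic STAC shape
item_path <- file.path(catalog_config$scores_path, "models", "model_items", "persistenceRW.json")
item <- jsonlite::read_json(item_path)
stopifnot(item$type == "Feature", item$id == "persistenceRW")
```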
diff --git a/catalog/sites/build_sites_page.R b/catalog/sites/build_sites_page.R
deleted file mode 100644
index 0159b8d9d9..0000000000
--- a/catalog/sites/build_sites_page.R
+++ /dev/null
@@ -1,57 +0,0 @@
-library(arrow)
-library(dplyr)
-library(gsheet)
-library(readr)
-
-#source('catalog/R/stac_functions.R')
-config <- yaml::read_yaml('challenge_configuration.yaml')
-catalog_config <- config$catalog_config
-
-## CREATE table for column descriptions
-site_description_create <- data.frame(site_id = 'site identifier',
- project_id = 'forecast challenge identifier',
- agency_cd = 'organization / agency responsible for site monitoring',
- site_no = 'National Water Information System stream gage identifier',
- station_nm = 'National Water Information System station long name',
- site_tp_cd = 'National Water Information System site type code; https://maps.waterdata.usgs.gov/mapper/help/sitetype.html',
- latitude = 'site latitude',
- longitude = 'site longitude',
- site_url = 'National Water Information System URL for monitoring site',
- colocated = '', # TODO: what is colocated?
- queryTime = 'timestamp when site metadata was retrieved')
-
-#inventory_theme_df <- arrow::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog/forecasts/project_id={config$project_id}"), endpoint_override = config$endpoint, anonymous = TRUE) #|>
-
-#target_url <- "https://renc.osn.xsede.org/bio230121-bucket01/vera4cast/targets/project_id=vera4cast/duration=P1D/daily-insitu-targets.csv.gz"
-site_df <- read_csv(config$site_table, show_col_types = FALSE)
-
-# inventory_theme_df <- arrow::open_dataset(arrow::s3_bucket(config$inventory_bucket, endpoint_override = config$endpoint, anonymous = TRUE))
-#
-# inventory_data_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog"),
-# s3_endpoint = config$endpoint, anonymous=TRUE) |>
-# collect()
-#
-# theme_models <- inventory_data_df |>
-# distinct(model_id)
-
-# target_date_range <- targets |> dplyr::summarise(min(datetime),max(datetime))
-# target_min_date <- as.Date(target_date_range$`min(datetime)`)
-# target_max_date <- as.Date(target_date_range$`max(datetime)`)
-
-build_description <- paste0("The catalog contains site metadata for the ", config$challenge_long_name)
-
-
-stac4cast::build_sites(table_schema = site_df,
- table_description = site_description_create,
- # start_date = target_min_date,
- # end_date = target_max_date,
- id_value = "sites",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- theme_title = "Site Metadata",
- destination_path = config$site_path,
- #link_items = stac4cast::generate_group_values(group_values = names(config$variable_groups)),
- link_items = NULL,
- thumbnail_link = config$site_thumbnail,
- thumbnail_title = config$site_thumbnail_title)
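Because `build_sites` pairs the site table with the hand-written description table, a small consistency check helps keep the two in sync when columns are added or renamed (a sketch, using only the objects defined above):

```r
# columns in the data without a description, and descriptions without a column
missing_desc <- setdiff(names(site_df), names(site_description_create))
stale_desc   <- setdiff(names(site_description_create), names(site_df))
if (length(missing_desc) > 0) warning("undescribed columns: ", toString(missing_desc))
if (length(stale_desc) > 0)   warning("descriptions without columns: ", toString(stale_desc))
```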
diff --git a/catalog/sites/collection.json b/catalog/sites/collection.json
deleted file mode 100644
index 87b0bc4a30..0000000000
--- a/catalog/sites/collection.json
+++ /dev/null
@@ -1,142 +0,0 @@
-{
- "id": "sites",
- "description": "The catalog contains site metadata for the EFI-USGS River Chlorophyll Forecasting Challenge",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Site Metadata",
- "extent": {
- "spatial": {
- "bbox": [
- [
- -122.6692,
- 39.6327,
- -74.7781,
- 45.5175
- ]
- ]
- },
- "temporal": {
- "interval": [
- [
- "pendingT00:00:00Z",
- "pendingT00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "site_id",
- "type": "character",
- "description": "site identifier"
- },
- {
- "name": "project_id",
- "type": "character",
- "description": "forecast challenge identifier"
- },
- {
- "name": "agency_cd",
- "type": "character",
- "description": "organization / agency responsible for site monitoring"
- },
- {
- "name": "site_no",
- "type": "character",
- "description": "National Water Information System stream gage identifier"
- },
- {
- "name": "station_nm",
- "type": "character",
- "description": "National Water Information System station long name"
- },
- {
- "name": "site_tp_cd",
- "type": "character",
- "description": "National Water Information System site type code; https://maps.waterdata.usgs.gov/mapper/help/sitetype.html"
- },
- {
- "name": "latitude",
- "type": "numeric",
- "description": "site latitude"
- },
- {
- "name": "longitude",
- "type": "numeric",
- "description": "site longitude"
- },
- {
- "name": "site_url",
- "type": "character",
- "description": "National Water Information System URL for monitoring site"
- },
- {
- "name": "colocated",
- "type": "logical",
- "description": ""
- },
- {
- "name": "queryTime",
- "type": ["POSIXct", "POSIXt"],
- "description": "timestamp when site metadata was retrieved"
- }
- ],
- "assets": {
- "data": {
- "href": "https://raw.githubusercontent.com/eco4cast/usgsrc4cast-ci/main/USGS_site_metadata.csv",
- "type": "application/x-parquet",
- "title": "Site Metadata Access",
- "roles": [
- "data"
- ],
- "description": "This R code will return results for the site metadata.\n\n### R\n\n```{r}\n# Use code below\n\nurl <- https://raw.githubusercontent.com/eco4cast/usgsrc4cast-ci/main/USGS_site_metadata.csv\nsites <- readr::read_csv(url, show_col_types = FALSE)\n```"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/screencapture-waterdata-usgs-gov-nwis-rt-2018-08-02-13_00_05-01.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Sites Map"
- }
- }
-}
diff --git a/catalog/summaries/Aquatics/Daily_Chlorophyll_a/collection.json b/catalog/summaries/Aquatics/Daily_Chlorophyll_a/collection.json
deleted file mode 100644
index a6fa09fdab..0000000000
--- a/catalog/summaries/Aquatics/Daily_Chlorophyll_a/collection.json
+++ /dev/null
@@ -1,297 +0,0 @@
-{
- "id": "Daily_Chlorophyll_a",
- "description": "This page includes all models for the Daily_Chlorophyll_a variable.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/USGSHABs1.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/cb_prophet.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/climatology.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/persistenceRW.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procBlanchardMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procBlanchardSteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procCTMIMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procCTMISteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procEppleyNorbergMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procEppleyNorbergSteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procHinshelwoodSteele.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_arima.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_bag_mlp.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_ets.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_humidity_lm.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_lasso.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_lasso_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_precip_lm.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_precip_lm_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_randfor.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_tbats.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_temp_lm_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_humidity_lm_all_sites.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_temp_lm.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/procHinshelwoodMonod.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/tg_auto_adam.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "type": "text/html",
- "title": "NEON Ecological Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "title": "NEON Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Daily_Chlorophyll_a",
- "extent": {
- "spatial": {
- "bbox": [
- [-149.6106, 29.676, -82.0084, 68.6307]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2023-01-01T00:00:00Z",
- "2024-01-21T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
-    {
-      "name": "sd",
-      "type": "double",
-      "description": "standard deviation of the forecast predictions"
-    },
-    {
-      "name": "quantile97.5",
-      "type": "double",
-      "description": "97.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile02.5",
-      "type": "double",
-      "description": "2.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile90",
-      "type": "double",
-      "description": "90th percentile value of the forecast"
-    },
-    {
-      "name": "quantile10",
-      "type": "double",
-      "description": "10th percentile value of the forecast"
-    },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/project_id=neon4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org\"",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model .\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/project_id=neon4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "pending",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "pending"
- }
- }
-}
diff --git a/catalog/summaries/Aquatics/collection.json b/catalog/summaries/Aquatics/collection.json
deleted file mode 100644
index ad783369a0..0000000000
--- a/catalog/summaries/Aquatics/collection.json
+++ /dev/null
@@ -1,182 +0,0 @@
-{
- "id": "Aquatics",
- "description": "This page includes variables for the Aquatics group.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Dissolved_oxygen/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Water_temperature/collection.json"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Chlorophyll_a/collection.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "type": "text/html",
- "title": "NEON Ecological Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/neon4cast-docs/",
- "title": "NEON Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Aquatics",
- "extent": {
- "spatial": {
- "bbox": [
- [-149.6106, 18.1135, -66.7987, 68.6698]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2023-01-01T00:00:00Z",
- "2024-12-09T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
-    {
-      "name": "sd",
-      "type": "double",
-      "description": "standard deviation of the forecast predictions"
-    },
-    {
-      "name": "quantile97.5",
-      "type": "double",
-      "description": "97.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile02.5",
-      "type": "double",
-      "description": "2.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile90",
-      "type": "double",
-      "description": "90th percentile value of the forecast"
-    },
-    {
-      "name": "quantile10",
-      "type": "double",
-      "description": "10th percentile value of the forecast"
-    },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "\"s3://anonymous@bio230014-bucket01/vera4cast/forecasts/summaries/parquet/?endpoint_override=sdsc.osn.xsede.org\"",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for the NEON Ecological Forecasting Aquatics theme.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/vera4cast/forecasts/summaries/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |>\n dplyr::filter(variable %in% c(\"oxygen\", \"temperature\", \"chla\")) |>\n dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://projects.ecoforecast.org/neon4cast-catalog/img/neon_buoy.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "NEON Aquatics Buoy"
- }
- }
-}
diff --git a/catalog/summaries/aquatics/Daily_Chlorophyll_a/collection.json b/catalog/summaries/aquatics/Daily_Chlorophyll_a/collection.json
deleted file mode 100644
index ff12ff56e7..0000000000
--- a/catalog/summaries/aquatics/Daily_Chlorophyll_a/collection.json
+++ /dev/null
@@ -1,177 +0,0 @@
-{
- "id": "Daily_Chlorophyll_a",
- "description": "This page includes all models for the Daily_Chlorophyll_a variable.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/climatology.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "../../models/model_items/persistenceRW.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Daily_Chlorophyll_a",
- "extent": {
- "spatial": {
- "bbox": [
- ["Inf", "Inf", "-Inf", "-Inf"]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-03-14T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
-    {
-      "name": "sd",
-      "type": "double",
-      "description": "standard deviation of the forecast predictions"
-    },
-    {
-      "name": "quantile97.5",
-      "type": "double",
-      "description": "97.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile02.5",
-      "type": "double",
-      "description": "2.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile90",
-      "type": "double",
-      "description": "90th percentile value of the forecast"
-    },
-    {
-      "name": "quantile10",
-      "type": "double",
-      "description": "10th percentile value of the forecast"
-    },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/project_id=usgsrc4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model .\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/parquet/project_id=usgsrc4cast/duration=P1D/variable=chla?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "pending",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "pending"
- }
- }
-}
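The `"Inf"`/`"-Inf"` bounding box above is a recognizable R artifact: it is what appears when `min()`/`max()` run over an empty coordinate vector (presumably no site coordinates were matched when this collection was generated) and `jsonlite` then serializes the infinities as strings. A two-line demonstration:

```r
lons <- numeric(0)                          # no sites matched
jsonlite::toJSON(c(min(lons), max(lons)))   # ["Inf","-Inf"], as in the bbox above
```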
diff --git a/catalog/summaries/aquatics/collection.json b/catalog/summaries/aquatics/collection.json
deleted file mode 100644
index c2361a337e..0000000000
--- a/catalog/summaries/aquatics/collection.json
+++ /dev/null
@@ -1,172 +0,0 @@
-{
- "id": "aquatics",
- "description": "This page includes variables for the aquatics group.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "Daily_Chlorophyll_a/collection.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "aquatics",
- "extent": {
- "spatial": {
- "bbox": [
- ["Inf", "Inf", "-Inf", "-Inf"]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-03-14T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
-    {
-      "name": "sd",
-      "type": "double",
-      "description": "standard deviation of the forecast predictions"
-    },
-    {
-      "name": "quantile97.5",
-      "type": "double",
-      "description": "97.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile02.5",
-      "type": "double",
-      "description": "2.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile90",
-      "type": "double",
-      "description": "90th percentile value of the forecast"
-    },
-    {
-      "name": "quantile10",
-      "type": "double",
-      "description": "10th percentile value of the forecast"
-    },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/vera4cast/forecasts/summaries/parquet/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for the NEON Ecological Forecasting Aquatics theme.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/vera4cast/forecasts/summaries/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |>\n dplyr::filter(variable %in% c(\"chla\")) |>\n dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Streamgage"
- }
- }
-}
diff --git a/catalog/summaries/collection.json b/catalog/summaries/collection.json
deleted file mode 100644
index 332caa159a..0000000000
--- a/catalog/summaries/collection.json
+++ /dev/null
@@ -1,184 +0,0 @@
-{
- "id": "summaries",
- "description": "Summaries are the forecasts statistics of the raw forecasts (i.e., mean, median, confidence intervals). You can access the summaries at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "child",
- "type": "application/json",
- "href": "aquatics/collection.json",
- "title": "aquatics"
- },
- {
- "rel": "child",
- "type": "application/json",
- "href": "models/collection.json",
- "title": "group item"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Forecast Summaries",
- "extent": {
- "spatial": {
- "bbox": [
- [
- -122.6692,
- 39.6327,
- -74.7781,
- 45.5175
- ]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-03-14T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
-    {
-      "name": "sd",
-      "type": "double",
-      "description": "standard deviation of the forecast predictions"
-    },
-    {
-      "name": "quantile97.5",
-      "type": "double",
-      "description": "97.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile02.5",
-      "type": "double",
-      "description": "2.5th percentile value of the forecast"
-    },
-    {
-      "name": "quantile90",
-      "type": "double",
-      "description": "90th percentile value of the forecast"
-    },
-    {
-      "name": "quantile10",
-      "type": "double",
-      "description": "10th percentile value of the forecast"
-    },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/vera4cast/forecasts/summaries/parquet/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for the Forecasting Challenge.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/vera4cast/forecasts/summaries/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Manual%20measurement%20streamgage.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "USGS Image"
- }
- }
-}
diff --git a/catalog/summaries/models/collection.json b/catalog/summaries/models/collection.json
deleted file mode 100644
index a601aa832b..0000000000
--- a/catalog/summaries/models/collection.json
+++ /dev/null
@@ -1,177 +0,0 @@
-{
- "id": "models",
- "description": "Summaries are the forecasts statistics of the raw forecasts (i.e., mean, median, confidence intervals). You can access the summaries at the top level of the dataset where all models, variables, and dates that forecasts were produced (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full dataset level. For quicker access to the forecasts for a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "item",
- "type": "application/json",
- "href": "model_items/climatology.json"
- },
- {
- "rel": "item",
- "type": "application/json",
- "href": "model_items/persistenceRW.json"
- },
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../collection.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Models",
- "extent": {
- "spatial": {
- "bbox": [
- [-122.6692, 39.6328, -74.7781, 45.5175]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2024-02-07T00:00:00Z",
- "2024-03-14T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ],
- "assets": {
- "data": {
- "href": "s3://anonymous@bio230014-bucket01/vera4cast/forecasts/summaries/parquet/?endpoint_override=sdsc.osn.xsede.org",
- "type": "application/x-parquet",
- "title": "Database Access",
- "roles": [
- "data"
- ],
- "description": "Use `arrow` for remote access to the database. This R code will return results for forecasts of the variable by the specific model .\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/vera4cast/forecasts/summaries/parquet/?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- },
- "thumbnail": {
- "href": "pending",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "pending"
- }
- }
-}
diff --git a/catalog/summaries/models/model_items/.empty b/catalog/summaries/models/model_items/.empty
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/catalog/summaries/models/model_items/climatology.json b/catalog/summaries/models/model_items/climatology.json
deleted file mode 100644
index beb7f88ca8..0000000000
--- a/catalog/summaries/models/model_items/climatology.json
+++ /dev/null
@@ -1,199 +0,0 @@
-{
- "stac_version": "1.0.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Feature",
- "id": "climatology",
- "bbox": [
- [
- -122.6692,
- 45.5175,
- -74.7781,
- 45.5175
- ]
- ],
- "geometry": {
- "type": "MultiPoint",
- "coordinates": [
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- []
- ]
- },
- "properties": {
- "description": "\nmodel info: Forecasts stream chlorophyll-a based on the historic average and standard deviation for that given site and day-of-year.\n\nSites: USGS-01427510, USGS-01463500, USGS-05543010, USGS-05553700, USGS-05558300, USGS-05586300, USGS-14181500, USGS-14211010, USGS-14211720\n\nVariables: Daily Chlorophyll_a",
- "start_datetime": "2024-02-07",
- "end_datetime": "2024-03-14",
- "providers": [
- {
- "url": "pending",
- "name": "pending",
- "roles": [
- "producer",
- "processor",
- "licensor"
- ]
- },
- {
- "url": "https://www.ecoforecastprojectvt.org",
- "name": "Ecoforecast Challenge",
- "roles": [
- "host"
- ]
- }
- ],
- "license": "CC0-1.0",
- "keywords": [
- "Forecasting",
- "usgsrc4cast",
- "Daily Chlorophyll_a"
- ],
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ]
- },
- "collection": "forecasts",
- "links": [
- {
- "rel": "collection",
- "href": "../collection.json",
- "type": "application/json",
- "title": "climatology"
- },
- {
- "rel": "root",
- "href": "../../../catalog.json",
- "type": "application/json",
- "title": "Forecast Catalog"
- },
- {
- "rel": "parent",
- "href": "../collection.json",
- "type": "application/json",
- "title": "climatology"
- },
- {
- "rel": "self",
- "href": "climatology.json",
- "type": "application/json",
- "title": "Model Forecast"
- },
- {
- "rel": "item",
- "href": "https://github.com/eco4cast/usgsrc4cast-ci/blob/main/baseline_models/models/aquatics_climatology.R",
- "type": "text/html",
- "title": "Link for Model Code"
- }
- ],
- "assets": {
- "1": {
- "type": "application/json",
- "title": "Model Metadata",
- "href": "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/climatology.json",
- "description": "Use `jsonlite::fromJSON()` to download the model metadata JSON file. This R code will return metadata provided during the model registration.\n \n\n### R\n\n```{r}\n# Use code below\n\nmodel_metadata <- jsonlite::fromJSON(\"https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/climatology.json\")\n\n"
- },
- "2": {
- "type": "text/html",
- "title": "Link for Model Code",
- "href": "https://github.com/eco4cast/usgsrc4cast-ci/blob/main/baseline_models/models/aquatics_climatology.R",
- "description": "The link to the model code provided by the model submission team"
- },
- "3": {
- "type": "application/x-parquet",
- "title": "Database Access for Daily Chlorophyll_a",
- "href": "s3://anonymous@bio230014-bucket01/challenges/forecasts/summariesproject_id=/duration=P1D/variable=chla/model_id=climatology?endpoint_override=sdsc.osn.xsede.org",
- "description": "Use `arrow` for remote access to the database. This R code will return results for this variable and model combination.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/summariesproject_id=/duration=P1D/variable=chla/model_id=climatology?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- }
- }
-}
diff --git a/catalog/summaries/models/model_items/persistenceRW.json b/catalog/summaries/models/model_items/persistenceRW.json
deleted file mode 100644
index be7a7ce474..0000000000
--- a/catalog/summaries/models/model_items/persistenceRW.json
+++ /dev/null
@@ -1,200 +0,0 @@
-{
- "stac_version": "1.0.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Feature",
- "id": "persistenceRW",
- "bbox": [
- [
- -122.6692,
- 45.5175,
- -74.7781,
- 45.5175
- ]
- ],
- "geometry": {
- "type": "MultiPoint",
- "coordinates": [
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- [],
- []
- ]
- },
- "properties": {
- "description": "\nmodel info: Random walk model based on most recent stream chl-a observations using the fable::RW() model.\n\nSites: USGS-01427510, USGS-01463500, USGS-05543010, USGS-05549500, USGS-05553700, USGS-05558300, USGS-05586300, USGS-14181500, USGS-14211010, USGS-14211720\n\nVariables: Daily Chlorophyll_a",
- "start_datetime": "2024-02-07",
- "end_datetime": "2024-03-13",
- "providers": [
- {
- "url": "pending",
- "name": "pending",
- "roles": [
- "producer",
- "processor",
- "licensor"
- ]
- },
- {
- "url": "https://www.ecoforecastprojectvt.org",
- "name": "Ecoforecast Challenge",
- "roles": [
- "host"
- ]
- }
- ],
- "license": "CC0-1.0",
- "keywords": [
- "Forecasting",
- "usgsrc4cast",
- "Daily Chlorophyll_a"
- ],
- "table:columns": [
- {
- "name": "reference_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that the forecast was initiated (horizon = 0)"
- },
- {
- "name": "site_id",
- "type": "string",
- "description": "For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., alook-up table that provides lon and lat)"
- },
- {
- "name": "datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime of the forecasted value (ISO 8601)"
- },
- {
- "name": "family",
- "type": "string",
- "description": "For ensembles: “ensemble.” Default value if unspecified for probability distributions: Name of the statistical distribution associated with the reported statistics. The “sample” distribution is synonymous with “ensemble.”For summary statistics: “summary.”"
- },
- {
- "name": "pub_datetime",
- "type": "timestamp[us, tz=UTC]",
- "description": "datetime that forecast was submitted"
- },
- {
- "name": "mean",
- "type": "double",
- "description": "mean forecast prediction"
- },
- {
- "name": "median",
- "type": "double",
- "description": "median forecast prediction"
- },
- {
- "name": "sd",
- "type": "double",
- "description": "standard deviation forecasts"
- },
- {
- "name": "quantile97.5",
- "type": "double",
- "description": "upper 97.5 percentile value of forecast"
- },
- {
- "name": "quantile02.5",
- "type": "double",
- "description": "upper 2.5 percentile value of forecast"
- },
- {
- "name": "quantile90",
- "type": "double",
- "description": "upper 90 percentile value of forecast"
- },
- {
- "name": "quantile10",
- "type": "double",
- "description": "upper 10 percentile value of forecast"
- },
- {
- "name": "project_id",
- "type": "string",
- "description": "unique identifier for the forecast project"
- },
- {
- "name": "duration",
- "type": "string",
- "description": "temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "string",
- "description": "name of forecasted variable"
- },
- {
- "name": "model_id",
- "type": "string",
- "description": "unique model identifier"
- },
- {
- "name": "reference_date",
- "type": "string",
- "description": "date that the forecast was initiated"
- }
- ]
- },
- "collection": "forecasts",
- "links": [
- {
- "rel": "collection",
- "href": "../collection.json",
- "type": "application/json",
- "title": "persistenceRW"
- },
- {
- "rel": "root",
- "href": "../../../catalog.json",
- "type": "application/json",
- "title": "Forecast Catalog"
- },
- {
- "rel": "parent",
- "href": "../collection.json",
- "type": "application/json",
- "title": "persistenceRW"
- },
- {
- "rel": "self",
- "href": "persistenceRW.json",
- "type": "application/json",
- "title": "Model Forecast"
- },
- {
- "rel": "item",
- "href": "https://github.com/eco4cast/usgsrc4cast-ci/blob/main/baseline_models/models/aquatics_persistenceRW.R",
- "type": "text/html",
- "title": "Link for Model Code"
- }
- ],
- "assets": {
- "1": {
- "type": "application/json",
- "title": "Model Metadata",
- "href": "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/persistenceRW.json",
- "description": "Use `jsonlite::fromJSON()` to download the model metadata JSON file. This R code will return metadata provided during the model registration.\n \n\n### R\n\n```{r}\n# Use code below\n\nmodel_metadata <- jsonlite::fromJSON(\"https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/metadata/model_id/persistenceRW.json\")\n\n"
- },
- "2": {
- "type": "text/html",
- "title": "Link for Model Code",
- "href": "https://github.com/eco4cast/usgsrc4cast-ci/blob/main/baseline_models/models/aquatics_persistenceRW.R",
- "description": "The link to the model code provided by the model submission team"
- },
- "3": {
- "type": "application/x-parquet",
- "title": "Database Access for Daily Chlorophyll_a",
- "href": "s3://anonymous@bio230014-bucket01/challenges/forecasts/summariesproject_id=/duration=P1D/variable=chla/model_id=persistenceRW?endpoint_override=sdsc.osn.xsede.org",
- "description": "Use `arrow` for remote access to the database. This R code will return results for this variable and model combination.\n\n### R\n\n```{r}\n# Use code below\n\nall_results <- arrow::open_dataset(\"s3://anonymous@bio230014-bucket01/challenges/forecasts/summariesproject_id=/duration=P1D/variable=chla/model_id=persistenceRW?endpoint_override=sdsc.osn.xsede.org\")\ndf <- all_results |> dplyr::collect()\n\n```\n \n\nYou can use dplyr operations before calling `dplyr::collect()` to `summarise`, `select` columns, and/or `filter` rows prior to pulling the data into a local `data.frame`. Reducing the data that is pulled locally will speed up the data download speed and reduce your memory usage.\n\n\n"
- }
- }
-}
diff --git a/catalog/summaries/summaries_models.R b/catalog/summaries/summaries_models.R
deleted file mode 100644
index d790cdbb4d..0000000000
--- a/catalog/summaries/summaries_models.R
+++ /dev/null
@@ -1,306 +0,0 @@
-library(arrow)
-library(dplyr)
-library(gsheet)
-library(readr)
-
-#source('catalog/R/stac_functions.R')
-config <- yaml::read_yaml('challenge_configuration.yaml')
-catalog_config <- config$catalog_config
-
-# file.sources = list.files(c("../stac4cast/R"), full.names=TRUE,
-# ignore.case=TRUE)
-# sapply(file.sources,source,.GlobalEnv)
-
-## CREATE table for column descriptions
-summaries_description_create <- data.frame(reference_datetime = 'datetime that the forecast was initiated (horizon = 0)',
- site_id = 'For forecasts that are not on a spatial grid, use of a site dimension that maps to a more detailed geometry (points, polygons, etc.) is allowable. In general this would be documented in the external metadata (e.g., a look-up table that provides lon and lat)',
- datetime = 'datetime of the forecasted value (ISO 8601)',
- family = 'For ensembles: “ensemble.” For probability distributions: the name of the statistical distribution associated with the reported statistics (the “sample” distribution is synonymous with “ensemble”). For summary statistics: “summary.”',
- pub_datetime = 'datetime that forecast was submitted',
- depth_m = 'depth (meters) in water column of prediction',
- mean = 'mean forecast prediction',
- median = 'median forecast prediction',
- sd = 'standard deviation of the forecast',
- quantile97.5 = 'upper 97.5 percentile value of forecast',
- quantile02.5 = 'lower 2.5 percentile value of forecast',
- quantile90 = 'upper 90 percentile value of forecast',
- quantile10 = 'lower 10 percentile value of forecast',
- project_id = 'unique identifier for the forecast project',
- duration = 'temporal duration of forecast (hourly, daily, etc.); follows ISO 8601 duration convention',
- variable = 'name of forecasted variable',
- model_id = 'unique model identifier',
- reference_date = 'date that the forecast was initiated')
-
-
-summaries_theme_df <- arrow::open_dataset(arrow::s3_bucket(config$summaries_bucket, endpoint_override = config$endpoint, anonymous = TRUE)) #|>
-#filter(model_id == model_id, site_id = site_id, reference_datetime = reference_datetime)
-# NOTE IF NOT USING FILTER -- THE stac4cast::build_table_columns() NEEDS TO BE UPDATED
-#(USE strsplit(summaries_theme_df$ToString(), "\n") INSTEAD OF strsplit(summaries_theme_df[[1]]$ToString(), "\n"))
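-
-## Illustrative sketch of the filter-before-collect pattern recommended in the
-## catalog asset descriptions; the model_id and variable values below are
-## placeholders, so this is left commented out rather than executed here:
-# example_df <- summaries_theme_df |>
-#   dplyr::filter(model_id == "climatology", variable == "chla") |>
-#   dplyr::collect()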
-
-## identify model ids from bucket -- used in generate model items function
-
-summaries_data_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog/forecasts"),
- s3_endpoint = config$endpoint, anonymous=TRUE) |>
- collect() |>
- dplyr::filter(project_id == config$project_id)
-
-theme_models <- summaries_data_df |>
- distinct(model_id)
-
-forecast_date_range <- summaries_data_df |> dplyr::summarise(min(date),max(date))
-forecast_min_date <- forecast_date_range$`min(date)`
-forecast_max_date <- forecast_date_range$`max(date)`
-
-build_description <- paste0("Summaries are the forecast statistics of the raw forecasts (i.e., mean, median, confidence intervals). You can access the summaries at the top level of the dataset, where all models, variables, and forecast production dates (reference_datetime) are available. The code to access the entire dataset is provided as an asset. Given the size of the forecast catalog, it can be time-consuming to access the data at the full-dataset level. For quicker access to the forecasts of a particular model (model_id), we also provide the code to access the data at the model_id level as an asset for each model.")
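-
-## Sketch of the model_id-level access pattern mentioned in the description
-## above; the partition path is assumed from the catalog asset hrefs and the
-## duration/variable/model_id values are placeholders:
-# bucket <- arrow::s3_bucket(config$summaries_bucket, endpoint_override = config$endpoint, anonymous = TRUE)
-# model_ds <- arrow::open_dataset(bucket$path(glue::glue("project_id={config$project_id}/duration=P1D/variable=chla/model_id=climatology")))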
-
-stac4cast::build_forecast_scores(table_schema = summaries_theme_df,
- #theme_id = 'Forecasts',
- table_description = summaries_description_create,
- start_date = forecast_min_date,
- end_date = forecast_max_date,
- id_value = "summaries",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- theme_title = "Forecast Summaries",
- destination_path = catalog_config$summaries_path,
- aws_download_path = catalog_config$summaries_download_path,
- link_items = stac4cast::generate_group_values(group_values = names(config$variable_groups)),
- thumbnail_link = catalog_config$summaries_thumbnail,
- thumbnail_title = catalog_config$summaries_thumbnail_title,
- model_child = TRUE)
-
-## create separate JSON for model landing page
-
-stac4cast::build_group_variables(table_schema = summaries_theme_df,
- table_description = summaries_description_create,
- start_date = forecast_min_date,
- end_date = forecast_max_date,
- id_value = "models",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- dashboard_string = catalog_config$dashboard_url,
- dashboard_title = catalog_config$dashboard_title,
- theme_title = "Models",
- destination_path = paste0(catalog_config$summaries_path,"models"),
- aws_download_path = catalog_config$summaries_download_path,
- group_var_items = stac4cast::generate_model_items(model_list = theme_models$model_id),
- thumbnail_link = 'pending',
- thumbnail_title = 'pending',
- group_var_vector = NULL,
- group_sites = NULL)
-
-## CREATE MODELS
-variable_gsheet <- gsheet2tbl(config$target_metadata_gsheet)
-
-## READ IN MODEL METADATA
-# googlesheets4::gs4_deauth()
-#
-# registered_model_id <- googlesheets4::read_sheet(config$model_metadata_gsheet)
-
-# read in model metadata and filter for the relevant project
-registered_model_id <- gsheet2tbl(config$model_metadata_gsheet) |>
- filter(`What forecasting challenge are you registering for?` == config$project_id)
-
-forecast_sites <- c()
-
-## LOOP OVER MODEL IDS AND CREATE JSONS
-for (m in theme_models$model_id){
-
- # make model items directory
- if (!dir.exists(paste0(catalog_config$summaries_path,"models/model_items"))){
- dir.create(paste0(catalog_config$summaries_path,"models/model_items"))
- }
-
- print(m)
- model_date_range <- summaries_data_df |> filter(model_id == m) |> dplyr::summarise(min(date),max(date))
- model_min_date <- model_date_range$`min(date)`
- model_max_date <- model_date_range$`max(date)`
-
- model_var_duration_df <- summaries_data_df |> filter(model_id == m) |> distinct(variable,duration) |>
- mutate(duration_name = ifelse(duration == 'P1D', 'Daily', duration)) |>
- mutate(duration_name = ifelse(duration == 'PT1H', 'Hourly', duration_name)) |>
- mutate(duration_name = ifelse(duration == 'PT30M', '30min', duration_name)) |>
- mutate(duration_name = ifelse(duration == 'P1W', 'Weekly', duration_name))
-
- model_var_full_name <- model_var_duration_df |>
- left_join((variable_gsheet |>
- select(variable = `"official" targets name`, full_name = `Variable name`) |>
- distinct(variable, .keep_all = TRUE)), by = c('variable'))
-
- model_sites <- summaries_data_df |> filter(model_id == m) |> distinct(site_id)
-
- model_vars <- summaries_data_df |> filter(model_id == m) |> distinct(variable) |> left_join(model_var_full_name, by = 'variable')
- model_vars$var_duration_name <- paste0(model_vars$duration_name, " ", model_vars$full_name)
-
- forecast_sites <- append(forecast_sites, stac4cast::get_site_coords(site_metadata = catalog_config$site_metadata_url,
- sites = model_sites$site_id))
-
- idx <- which(registered_model_id$model_id == m)
-
- stac4cast::build_model(model_id = m,
- team_name = registered_model_id$`Long name of the model (can include spaces)`[idx],
- model_description = registered_model_id[idx,"Describe your modeling approach in your own words."][[1]],
- start_date = model_min_date,
- end_date = model_max_date,
- var_values = model_vars$var_duration_name,
- duration_names = model_var_duration_df$duration,
- site_values = model_sites$site_id,
- site_table = catalog_config$site_metadata_url,
- model_documentation = registered_model_id,
- destination_path = paste0(catalog_config$summaries_path,"models/model_items"),
- aws_download_path = config$summaries_bucket, # CHANGE THIS BUCKET NAME
- collection_name = 'forecasts',
- thumbnail_image_name = NULL,
- table_schema = summaries_theme_df,
- table_description = summaries_description_create,
- full_var_df = model_vars,
- code_web_link = registered_model_id$`Web link to model code`[idx])
- #code_web_link = 'pending')
-}
-
-
-## BUILD VARIABLE GROUPS
-
-for (i in seq_along(config$variable_groups)){ ## organize variable groups
- print(names(config$variable_groups)[i])
-
- # check data and skip if no data found
- var_group_data_check <- summaries_data_df |>
- filter(variable %in% config$variable_groups[[i]]$variable)
-
- if (nrow(var_group_data_check) == 0){
- print('No data available for group')
- next
- }
-
- ## REMOVE STALE OR UNUSED DIRECTORIES
- current_var_path <- paste0(catalog_config$summaries_path,names(config$variable_groups[i]))
- current_var_dirs <- list.dirs(current_var_path, recursive = FALSE, full.names = TRUE)
- unlink(current_var_dirs, recursive = TRUE)
-
- if (!dir.exists(paste0(catalog_config$summaries_path,names(config$variable_groups[i])))){
- dir.create(paste0(catalog_config$summaries_path,names(config$variable_groups[i])))
- }
-
-
- for (j in seq_along(config$variable_groups[[i]]$variable)){ # FOR EACH VARIABLE WITHIN A MODEL GROUP
-
- ## restructure variable names
- var_values <- config$variable_groups[[i]]$variable
- var_name <- config$variable_groups[[i]]$variable[j]
- print(var_name)
-
- # check data and skip if no data found
- var_data_check <- summaries_data_df |>
- filter(variable == var_name)
-
- if (nrow(var_data_check) == 0){
- print('No data available for variable')
- next
- }
-
-
- duration_name <- config$variable_groups[[i]]$duration[j]
-
- # # match variable with full name in gsheet
- # #var_name_full <- variable_gsheet[which(variable_gsheet$`"official" targets name` == var_values),1][[1]]
- # var_name_full <- variable_gsheet[which(variable_gsheet$`"official" targets name` %in% var_values),1][[1]]
-
- # match variable with full name in gsheet
- var_gsheet_arrange <- variable_gsheet |>
- arrange(duration)
-
- var_name_full <- var_gsheet_arrange[which(var_gsheet_arrange$`"official" targets name` %in% var_values),1][[1]]
-
-
- ## create new vector to store duration names
- duration_values <- config$variable_groups[[i]]$duration
- duration_values[which(duration_values == 'P1D')] <- 'Daily'
- duration_values[which(duration_values == 'PT1H')] <- 'Hourly'
- duration_values[which(duration_values == 'PT30M')] <- '30min'
- duration_values[which(duration_values == 'P1W')] <- 'Weekly'
-
- #var_name_combined_list <- paste0(var_values, '_',duration_values)
- #var_name_combined_list <- paste0(duration_values,' ',var_name_full)
- var_name_combined_list <- paste0(duration_values,'_',var_name_full)
-
- if (length(unique(var_name_combined_list)) == 1){
- var_name_combined_list <- unique(var_name_combined_list)
- }
-
-
- ## CREATE VARIABLE GROUP JSONS
- group_description <- paste0('This page includes variables for the ',names(config$variable_groups[i]),' group.')
-
- ## find group sites
- find_group_sites <- summaries_data_df |>
- filter(variable %in% var_values) |>
- distinct(site_id)
-
- stac4cast::build_group_variables(table_schema = summaries_theme_df,
- #theme_id = names(config$variable_groups[i]),
- table_description = summaries_description_create,
- start_date = forecast_min_date,
- end_date = forecast_max_date,
- id_value = names(config$variable_groups[i]),
- description_string = group_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- dashboard_string = catalog_config$dashboard_url,
- dashboard_title = catalog_config$dashboard_title,
- theme_title = names(config$variable_groups[i]),
- destination_path = paste0(catalog_config$summaries_path,names(config$variable_groups[i])),
- aws_download_path = catalog_config$summaries_download_path,
- group_var_items = stac4cast::generate_group_variable_items(variables = var_name_combined_list),
- thumbnail_link = config$variable_groups[[i]]$thumbnail_link,
- thumbnail_title = config$variable_groups[[i]]$thumbnail_title,
- group_var_vector = unique(var_values),
- group_sites = find_group_sites$site_id)
-
- if (!dir.exists(paste0(catalog_config$summaries_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))){
- dir.create(paste0(catalog_config$summaries_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))
- }
-
- # var_data <- summaries_data_df |>
- # filter(variable == var_name,
- # duration == duration_name)
- var_data <- summaries_data_df |>
- filter(variable == var_name)
- #duration == duration_name)
-
- var_date_range <- var_data |> dplyr::summarise(min(date),max(date))
- var_min_date <- var_date_range$`min(date)`
- var_max_date <- var_date_range$`max(date)`
-
- var_models <- var_data |> distinct(model_id)
-
- find_var_sites <- summaries_data_df |>
- filter(variable == var_name) |>
- distinct(site_id)
-
- var_description <- paste0('This page includes all models for the ',var_name_combined_list[j],' variable.')
-
- stac4cast::build_group_variables(table_schema = summaries_theme_df,
- table_description = summaries_description_create,
- start_date = var_min_date,
- end_date = var_max_date,
- id_value = var_name_combined_list[j],
- description_string = var_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- dashboard_string = catalog_config$dashboard_url,
- dashboard_title = catalog_config$dashboard_title,
- theme_title = var_name_combined_list[j],
- destination_path = file.path(catalog_config$summaries_path,names(config$variable_groups)[i],var_name_combined_list[j]),
- aws_download_path = var_data$path[1],
- group_var_items = stac4cast::generate_variable_model_items(model_list = var_models$model_id),
- thumbnail_link = 'pending',
- thumbnail_title = 'pending',
- group_var_vector = NULL,
- group_sites = find_var_sites$site_id)
-
- }
-
-}
diff --git a/catalog/targets/collection.json b/catalog/targets/collection.json
deleted file mode 100644
index 4ae1766854..0000000000
--- a/catalog/targets/collection.json
+++ /dev/null
@@ -1,117 +0,0 @@
-{
- "id": "targets",
- "description": "The targets are observations that can be used to evaluate and build forecasts. We provide the code to access different targets as an asset.",
- "stac_version": "1.0.0",
- "license": "CC0-1.0",
- "stac_extensions": [
- "https://stac-extensions.github.io/scientific/v1.0.0/schema.json",
- "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json",
- "https://stac-extensions.github.io/table/v1.2.0/schema.json"
- ],
- "type": "Collection",
- "links": [
- {
- "rel": "parent",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "root",
- "type": "application/json",
- "href": "../catalog.json"
- },
- {
- "rel": "self",
- "type": "application/json",
- "href": "collection.json"
- },
- {
- "rel": "cite-as",
- "href": "https://doi.org/10.1002/fee.2616"
- },
- {
- "rel": "about",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "type": "text/html",
- "title": "EFI-USGS River Chlorophyll Forecasting Challenge Documentation"
- },
- {
- "rel": "describedby",
- "href": "https://projects.ecoforecast.org/usgsrc4cast-docs/",
- "title": "EFI-USGS River Chlorophyll Forecast Challenge Dashboard",
- "type": "text/html"
- }
- ],
- "title": "Targets",
- "extent": {
- "spatial": {
- "bbox": [
- [
- -122.6692,
- 39.6327,
- -74.7781,
- 45.5175
- ]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2009-01-22T00:00:00Z",
- "2024-02-09T00:00:00Z"
- ]
- ]
- }
- },
- "table:columns": [
- {
- "name": "project_id",
- "type": "character",
- "description": "unique project identifier"
- },
- {
- "name": "site_id",
- "type": "character",
- "description": "unique site identifier"
- },
- {
- "name": "datetime",
- "type": "Date",
- "description": "datetime of the observed value (ISO 8601)"
- },
- {
- "name": "duration",
- "type": "character",
- "description": "temporal duration of target (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention"
- },
- {
- "name": "variable",
- "type": "character",
- "description": "observation variable"
- },
- {
- "name": "observation",
- "type": "numeric",
- "description": "observed value for variable"
- }
- ],
- "assets": {
- "thumbnail": {
- "href": "https://raw.githubusercontent.com/eco4cast/neon4cast-ci/main/catalog/thumbnail_plots/neon_stream.jpg",
- "type": "image/JPEG",
- "roles": [
- "thumbnail"
- ],
- "title": "Test Image"
- },
- "2": {
- "href": "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/targets/project_id=usgsrc4cast/duration=P1D/river-chl-targets.csv.gz",
- "type": "application/x-parquet",
- "title": "aquatics Target Access",
- "roles": [
- "data"
- ],
- "description": "This R code will return results for the relevant targets file.\n\n### R\n\n```{r}\n# Use code below\n\nurl <- \"https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/targets/project_id=usgsrc4cast/duration=P1D/river-chl-targets.csv.gz\"\ntargets <- readr::read_csv(url, show_col_types = FALSE)\n```"
- }
- }
-}
diff --git a/catalog/targets/create_targets_page.R b/catalog/targets/create_targets_page.R
deleted file mode 100644
index 0dd4a8f715..0000000000
--- a/catalog/targets/create_targets_page.R
+++ /dev/null
@@ -1,57 +0,0 @@
-library(arrow)
-library(dplyr)
-library(gsheet)
-library(readr)
-
-#source('catalog/R/stac_functions.R')
-config <- yaml::read_yaml('challenge_configuration.yaml')
-catalog_config <- config$catalog_config
-
-# file.sources = list.files(c("../stac4cast/R"), full.names=TRUE,
-# ignore.case=TRUE)
-# sapply(file.sources,source,.GlobalEnv)
-
-## CREATE table for column descriptions
-targets_description_create <- data.frame(project_id = 'unique project identifier',
- site_id = 'unique site identifier',
- datetime = 'datetime of the observed value (ISO 8601)',
- duration = 'temporal duration of target (hourly = PT1H, daily = P1D, etc.); follows ISO 8601 duration convention',
- depth_m = 'depth (meters) in water column of observation',
- variable = 'observation variable',
- observation = 'observed value for variable')
-
-#inventory_theme_df <- arrow::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog/forecasts/project_id={config$project_id}"), endpoint_override = config$endpoint, anonymous = TRUE) #|>
-
-target_url <- config$target_groups$aquatics$targets_file
-targets <- read_csv(target_url, show_col_types = FALSE)
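-
-## Illustrative only: users can subset the targets before modeling, e.g. by
-## variable and site (the site shown is one of the challenge's USGS sites):
-# chla_targets <- targets |>
-#   dplyr::filter(variable == "chla", site_id == "USGS-01427510")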
-
-# inventory_theme_df <- arrow::open_dataset(arrow::s3_bucket(config$inventory_bucket, endpoint_override = config$endpoint, anonymous = TRUE))
-#
-# inventory_data_df <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog"),
-# s3_endpoint = config$endpoint, anonymous=TRUE) |>
-# collect()
-#
-# theme_models <- inventory_data_df |>
-# distinct(model_id)
-
-target_date_range <- targets |> dplyr::summarise(min(datetime),max(datetime))
-target_min_date <- as.Date(target_date_range$`min(datetime)`)
-target_max_date <- as.Date(target_date_range$`max(datetime)`)
-
-build_description <- paste0("The targets are observations that can be used to evaluate and build forecasts. We provide the code to access different targets as an asset.")
-
-
-stac4cast::build_targets(table_schema = targets,
- table_description = targets_description_create,
- start_date = target_min_date,
- end_date = target_max_date,
- id_value = "targets",
- description_string = build_description,
- about_string = catalog_config$about_string,
- about_title = catalog_config$about_title,
- theme_title = "Targets",
- destination_path = config$targets_path,
- #link_items = stac4cast::generate_group_values(group_values = names(config$variable_groups)),
- link_items = NULL,
- thumbnail_link = config$targets_thumbnail,
- thumbnail_title = config$targets_thumbnail_title)
diff --git a/catalog/thumbnail_plots/51629805865_0ef01ffbbc_c.jpg b/catalog/thumbnail_plots/51629805865_0ef01ffbbc_c.jpg
deleted file mode 100644
index c9ed1c3a13..0000000000
Binary files a/catalog/thumbnail_plots/51629805865_0ef01ffbbc_c.jpg and /dev/null differ
diff --git a/catalog/thumbnail_plots/52760199990_d1a0f154fe_c.jpg b/catalog/thumbnail_plots/52760199990_d1a0f154fe_c.jpg
deleted file mode 100644
index 7a6bc67f7a..0000000000
Binary files a/catalog/thumbnail_plots/52760199990_d1a0f154fe_c.jpg and /dev/null differ
diff --git a/catalog/thumbnail_plots/generate_thumbnails.R b/catalog/thumbnail_plots/generate_thumbnails.R
deleted file mode 100644
index 19567f5f84..0000000000
--- a/catalog/thumbnail_plots/generate_thumbnails.R
+++ /dev/null
@@ -1,201 +0,0 @@
-library(arrow)
-library(tidyverse)
-library(ggplot2)
-library(lubridate) # needed for days() below
-library(minioclient)
-
-config <- yaml::read_yaml('challenge_configuration.yaml') # supplies the bucket names used below
-
-theme <- 'beetles' # theme being processed; set to the relevant theme before running
-
-#get model ids
-# s3 <- s3_bucket("neon4cast-inventory", endpoint_override="data.ecoforecast.org", anonymous = TRUE)
-# paths <- open_dataset(s3$path("neon4cast-scores")) |> collect()
-# models_df <- paths |> filter(...1 == "parquet", ...2 == theme) |> distinct(...3)
-#
-# theme_models <- models_df |>
-# tidyr::separate(...3, c('name','model.id'), "=")
-
-info_extract <- arrow::s3_bucket("neon4cast-scores/parquet/", endpoint_override = "data.ecoforecast.org", anonymous = TRUE)
-
-
-theme_models <- duckdbfs::open_dataset(glue::glue("s3://{config$inventory_bucket}/catalog"),
- s3_endpoint = "renc.osn.xsede.org", anonymous=TRUE) |>
- collect()
-
-theme_models <- theme_models |>
- distinct(model_id)
-
-
-## save climatology data
-#aquatics / phenology
-# baseline_df <- arrow::open_dataset(info_extract$path(glue::glue("{theme}/model_id=climatology/"))) |>
-# # filter(reference_datetime == latest_forecast_date,
-# # datetime %in% latest_forecast$datetime) |>
-# collect()
-
-
-baseline_df <- arrow::open_dataset(arrow::s3_bucket(config$scores_bucket, endpoint_override = "renc.osn.xsede.org", anonymous = TRUE)) |>
- filter(model_id == 'climatology') |>
- collect() # collect locally so the nrow() check and right_join() below operate on a data frame
-
-
-#test_models <- c(aquatic_models$model.id[1:2], 'tg_arima')
-
-for (m_id in theme_models$model_id[1:2]){
- print(m_id)
-
- info_df <- arrow::open_dataset(info_extract$path(glue::glue("{theme}/model_id={m_id}/"))) |>
- collect()
-
- # latest_forecast_date <- max(info_df$reference_datetime)
- #
- # latest_forecast <- info_df |>
- # filter(reference_datetime == latest_forecast_date)
-
- #latest_forecast$horizon <- as.Date(latest_forecast$datetime) - as.Date(latest_forecast$reference_datetime)
-
-
- for (site in unique(info_df$site_id)){
- print(site)
-
- ## FORECAST
- latest_forecast_df <- info_df |>
- filter(site_id == site,
- reference_datetime == max(reference_datetime))
-
- ## check if path for image exists locally
- img_save_path <- file.path("thumbnail_store",m_id,theme,site)
-
- if (!dir.exists(img_save_path)){
- print('creating new dir')
- dir.create(img_save_path, recursive = TRUE)
- }
-
- # Forecast Plot -- uses scores data. Raw forecasts might not be useful
- forecast_plot <- ggplot(data = latest_forecast_df, aes(datetime, mean)) +
- geom_line(color = "steelblue", linewidth = 1) +
- ggplot2::geom_ribbon(aes(ymin = quantile10, ymax = quantile90), alpha = 0.2) +
- labs(title = paste0("Latest Forecast for ", site," (",unique(latest_forecast_df$reference_datetime),')'),
- subtitle = "(plots include the mean and the +- 90% CI)",
- y = "Forecast value", x = "Date") +
- facet_grid(variable ~ ., scales = "free_y") +
- theme_bw() +
- theme(plot.title = element_text(hjust = 0.5),
- plot.subtitle=element_text(hjust=0.5))
-
- forecast_img_path <- paste0(img_save_path,'/latest_forecast.png')
-
- ggsave(filename = forecast_img_path, plot = forecast_plot,height = 6, width = 8)
-
- mc_cp(forecast_img_path, glue::glue("efi/neon4cast-catalog/{theme}/{m_id}/{site}/latest_forecast.png"))
-
-
- ## SCORES -- CHECK TO SEE IF CLIMATOLOGY EXISTS
-
- latest_scores_site <- info_df |>
- filter(site_id == site,
- reference_datetime < (as.Date(Sys.Date()) - days(30))) |>
- group_by(reference_datetime) |>
- mutate(max_horizon = max(as.Date(datetime)) - as.Date(reference_datetime)) |>
- ungroup() |>
- distinct(reference_datetime, .keep_all = TRUE) |>
- filter(max_horizon == max(max_horizon)) |>
- filter(reference_datetime == max(reference_datetime))
-
- baseline_site_df <- baseline_df |>
- filter(site_id == site) |>
- filter(reference_datetime == latest_scores_site$reference_datetime) |>
- #filter(datetime %in% latest_scores_site$date) |>
- rename(clim_crps = crps) |>
- select(datetime,variable, clim_crps)
-
- if (nrow(baseline_site_df) == 0){
- print(paste0('no climatology forecast for ',{site}, ' on ', {latest_scores_site$reference_datetime}))
- next
- }
-
- latest_scores_df <- info_df |>
- filter(site_id == site,
- reference_datetime == latest_scores_site$reference_datetime) |>
- right_join(baseline_site_df, by = c('datetime','variable'))
-
- # Scores plot
- scores_plot <- ggplot(data = latest_scores_df, aes(datetime, crps)) +
- geom_line(color = "steelblue", linewidth = 1) +
- geom_line(aes(y = clim_crps), color = 'darkred', linetype = 'dashed') +
- labs(title = paste0("Latest Scores for ", site," (",latest_scores_site$reference_datetime,')'),
- subtitle = "modeled CRPS score (blue) and the climatology CRPS score (red)",
- y = "CRPS Score", x = "Horizon (Days)") +
- facet_grid(variable ~ ., scales = "free_y") +
- theme_bw() +
- theme(plot.title = element_text(hjust = 0.5),
- plot.subtitle=element_text(hjust=0.5))
-
- ##save plot
- scores_img_path <- paste0(img_save_path,'/latest_scores.png')
-
- ggsave(filename = scores_img_path, plot = scores_plot, height = 6, width = 8)
-
- mc_cp(scores_img_path, glue::glue("efi/neon4cast-catalog/{theme}/{m_id}/{site}/latest_scores.png"))
-
- print('--- done ---')
-
-
- #mc_alias_set("efi", endpoint="data.ecoforecast.org", access='austin', secret='RokQD3E8mJUFpUbn') # needed only once per machine
- }
-
-}
-
-
-#
-#
-# ## MAKE NEW SCORES PLOT (average crps for all reference datetimes from model/site)
-#
-# info_df <- arrow::open_dataset(info_extract$path(glue::glue("{theme}/model_id={m_id}/"))) |>
-# collect()
-#
-# latest_forecast_date <- max(info_df$reference_datetime)
-#
-# latest_forecast_site <- info_df |>
-# filter(reference_datetime == latest_forecast_date,
-# site_id == site)
-#
-# ## for loop (model)
-# info_df <- arrow::open_dataset(info_extract$path(glue::glue("{theme}/model_id={m_id}/"))) |>
-# collect()
-#
-# latest_forecast_date <- max(info_df$reference_datetime)
-#
-# # for loop (site)
-#
-# latest_forecast$horizon <- as.Date(latest_forecast$datetime) - as.Date(latest_forecast$reference_datetime)
-#
-# # site_df <- info_df |>
-# # filter(site_id == site) |>
-# # group_by(variable, reference_datetime) |>
-# # mutate(crps_mean = mean(crps, na.rm = TRUE)) |>
-# # ungroup() |>
-# # distinct(variable, reference_datetime, .keep_all = TRUE) |>
-# # select(reference_datetime, site_id, variable, crps_mean)
-# #
-# # clim_site_df <- climatology_df |>
-# # filter(site_id == site,
-# # reference_datetime %in% site_df$reference_datetime) |>
-# # group_by(variable, reference_datetime) |>
-# # mutate(crps_mean = mean(crps, na.rm = TRUE)) |>
-# # ungroup() |>
-# # distinct(variable, reference_datetime, .keep_all = TRUE)
-#
-#
-# info_df <- arrow::open_dataset(info_extract$path(glue::glue("{theme}/model_id={m_id}/"))) |>
-# collect()
-#
-# site <- 'BARC'
-# today <- Sys.Date()
-#
-# latest_forecast_site <- info_df |>
-# filter(site_id == site,
-# reference_datetime < (as.Date(Sys.Date()) - days(30))) |>
-# group_by(reference_datetime) |>
-# mutate(max_horizon = max(as.Date(datetime)) - as.Date(reference_datetime)) |>
-# ungroup() |>
-# distinct(reference_datetime, .keep_all = TRUE) |>
-# filter(max_horizon == max(max_horizon)) |>
-# filter(reference_datetime == max(reference_datetime))
diff --git a/catalog/thumbnail_plots/neon_desert.jpg b/catalog/thumbnail_plots/neon_desert.jpg
deleted file mode 100644
index 4a8027ff56..0000000000
Binary files a/catalog/thumbnail_plots/neon_desert.jpg and /dev/null differ
diff --git a/catalog/thumbnail_plots/neon_forest.jpg b/catalog/thumbnail_plots/neon_forest.jpg
deleted file mode 100644
index 3fff2f70cf..0000000000
Binary files a/catalog/thumbnail_plots/neon_forest.jpg and /dev/null differ
diff --git a/catalog/thumbnail_plots/neon_stream.jpg b/catalog/thumbnail_plots/neon_stream.jpg
deleted file mode 100644
index 833073022c..0000000000
Binary files a/catalog/thumbnail_plots/neon_stream.jpg and /dev/null differ
diff --git a/catalog/thumbnail_plots/neon_wetland.jpg b/catalog/thumbnail_plots/neon_wetland.jpg
deleted file mode 100644
index 4b90ccf0f3..0000000000
Binary files a/catalog/thumbnail_plots/neon_wetland.jpg and /dev/null differ
diff --git a/catalog/thumbnail_plots/tick_drag.jpg b/catalog/thumbnail_plots/tick_drag.jpg
deleted file mode 100644
index dda7c90122..0000000000
Binary files a/catalog/thumbnail_plots/tick_drag.jpg and /dev/null differ
diff --git a/catalog/update_stac.R b/catalog/update_stac.R
deleted file mode 100644
index c4e81039ad..0000000000
--- a/catalog/update_stac.R
+++ /dev/null
@@ -1,61 +0,0 @@
-library(jsonlite)
-library(arrow)
-library(dplyr)
-library(lubridate)
-
-#reticulate::miniconda_path() |>
-# reticulate::use_miniconda()
-
-#Generate EFI model metadata
-source('catalog/model_metadata.R')
-
-# catalog
-print('BUILDING CATALOG')
-source('catalog/catalog.R')
-
-# forecasts
-print('BUILDING FORECASTS')
-source('catalog/forecasts/forecast_models.R')
-
-rm(list = ls()) # clear all objects from the R environment between the forecast and scores builds
-gc()
-
-# scores
-print('BUILDING SCORES')
-source('catalog/scores/scores_models.R')
-
-rm(list = ls())
-gc()
-
-# inventory
-print('BUILDING INVENTORY')
-source('catalog/inventory/create_inventory_page.R')
-
-rm(list = ls())
-gc()
-
-# summaries
-print('BUILDING FORECAST SUMMARIES')
-source('catalog/summaries/summaries_models.R')
-
-rm(list = ls())
-gc()
-
-# targets
-print('BUILDING TARGETS')
-source('catalog/targets/create_targets_page.R')
-
-rm(list = ls())
-gc()
-
-# noaa
-print('BUILDING NOAA')
-source('catalog/noaa_forecasts/noaa_forecasts.R')
-
-rm(list = ls())
-gc()
-
-# sites
-print('BUILDING SITES')
-source('catalog/sites/build_sites_page.R')
-
diff --git a/cron.sh b/cron.sh
deleted file mode 100644
index eb9517d604..0000000000
--- a/cron.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/with-contenv bash
-
-touch /var/log/cron.log
-cron -f
diff --git a/dashboard/.gitignore b/dashboard/.gitignore
deleted file mode 100644
index 075b2542af..0000000000
--- a/dashboard/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-/.quarto/
diff --git a/dashboard/R/build_dashboard_sites.R b/dashboard/R/build_dashboard_sites.R
deleted file mode 100644
index c2c110b88a..0000000000
--- a/dashboard/R/build_dashboard_sites.R
+++ /dev/null
@@ -1,47 +0,0 @@
-library(tidyverse)
-config <- yaml::read_yaml('challenge_configuration.yaml')
-catalog_config <- config$catalog_config
-
-project_sites <- read_csv(catalog_config$site_metadata_url, col_types = cols())
-project_sites$site_lat_lon <- lapply(1:nrow(project_sites), function(i) c(project_sites$field_longitude[i], project_sites$field_latitude[i]))
-project_sites$field_site_name_short <- gsub(' NEON', '', project_sites$field_site_name) # remove the ' NEON' suffix from the site name
-
-iterator_list <- 1:nrow(project_sites)
-
-site_name_coords <- purrr::map(iterator_list, function(i)
- list(
- "type" = "Feature",
- "properties" = list(
- "site_id" = project_sites$field_site_name_short[i],
- "Partner" = "NEON",
- "n" = 5 ),
- "geometry" = list(
- "type" = "Point",
- "coordinates" = c(project_sites$field_longitude[i], project_sites$field_latitude[i])
- )))
-
-
-site_info <- list(
- "type" = "FeatureCollection",
- "name" = "neon",
- "crs" = list(
- "type" = "name",
- "properties" = list(
- "name" = "urn:ogc:def:crs:OGC:1.3:CRS84")
- ),
- "features" = site_name_coords
-)
-
-dest <- 'dashboard/'
-json <- file.path(dest, "sites.json")
-
-
-jsonlite::write_json(site_info,
- json,
- pretty=TRUE,
- auto_unbox=TRUE)
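-
-## Optional sanity check (sketch): read the GeoJSON back and confirm that one
-## feature was written per site row
-# check <- jsonlite::read_json(json)
-# stopifnot(length(check$features) == nrow(project_sites))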
-
-#stac4cast::stac_validate(json)
-
-
-
diff --git a/dashboard/R/flare-plots.R b/dashboard/R/flare-plots.R
deleted file mode 100644
index fdd7553d7e..0000000000
--- a/dashboard/R/flare-plots.R
+++ /dev/null
@@ -1,286 +0,0 @@
-# Script of functions for different plots
-
-# Plot the temperature forecast
-# depths - what depths to facet
-plot_temp <- function(score_df, depths = 0.5) {
-
- # Generate labels for plots
- my_breaks <- lubridate::with_tz(seq(min(score_df$datetime), max(score_df$datetime), by = "1 day"),"America/New_York")
- my_label <- lubridate::with_tz(seq(lubridate::as_datetime(score_df$reference_datetime)[1], max(score_df$datetime), by = "5 days"),"America/New_York")
- my_labels <- as.character(my_breaks)
- my_labels[which(!(my_breaks %in% my_label))] <- " "
-
- my_labels <- as.Date(my_labels, format = "%Y-%m-%d")
- my_labels <- as.character(my_labels)
- my_labels[is.na(my_labels)] <- " "
-
- y_label <- expression(paste('Water temperature (',degree,'C)', sep = ""))
-
- # Generate the plot
- score_df |>
- # Filter the score_df and get in the right format
- dplyr::filter(depth %in% depths) |>
- dplyr::mutate(datetime = lubridate::with_tz(lubridate::as_datetime(datetime), "America/New_York"),
- reference_datetime = lubridate::with_tz(lubridate::as_datetime(reference_datetime), "America/New_York"),
- depth = paste0("Depth: ", depth)) |>
- dplyr::filter(datetime >= reference_datetime) |>
-
- ggplot(aes(x = datetime)) +
- geom_ribbon(aes(ymin = quantile10, ymax = quantile90), fill = "lightblue", color = "lightblue") +
- geom_line(aes(y = mean)) +
- scale_x_continuous(breaks = my_breaks, labels = my_labels) +
- facet_wrap(~depth) +
- labs(y = y_label) +
- ylim(c(-5,40)) +
- theme_bw() +
- theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0.2)) +
- theme(text = element_text(size = 20))
-}
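-
-## Example usage (sketch; `scores` is a hypothetical collected scores data
-## frame with datetime, reference_datetime, depth, mean, quantile10, quantile90):
-# plot_temp(scores, depths = c(0.5, 5))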
-
-
-plot_temp_single_panel <- function(score_data, site_identifier, model_identifier, depth_values, site_name, y_axis_limits) {
-
- # find most recent forecast run date for the model and site
- most_recent <- arrow::open_dataset(score_data) |>
- filter(site_id %in% site_identifier,
- model_id %in% model_identifier) |>
- summarize(max = max(reference_datetime)) |>
- collect() |>
- pull()
-
- # subset and collect data based off of site, model, depth, and reference datetime
- score_df <- arrow::open_dataset(score_data) |>
- filter(variable == "temperature",
- # depth %in% c(0.5),
- depth %in% depth_values,
- site_id %in% site_identifier,
- model_id %in% model_identifier,
- reference_datetime == most_recent) |>
- dplyr::collect()
-
- # Fix dates and rename columns to match plotting code
- plot_df <- score_df |>
- dplyr::mutate(datetime = lubridate::with_tz(lubridate::as_datetime(datetime), "America/New_York"),
- reference_datetime = lubridate::with_tz(lubridate::as_datetime(reference_datetime), "America/New_York")) |>#,
- dplyr::filter(datetime >= reference_datetime) |>
- rename(date = datetime, forecast_mean = mean, forecast_sd = sd, forecast_upper_90 = quantile90, forecast_lower_90 = quantile10,
- observed = observation, forecast_start_day = reference_datetime)
-
-
- curr_tibble <- plot_df
-
- p <- ggplot2::ggplot(curr_tibble, ggplot2::aes(x = as.Date(date))) +
- ggplot2::ylim(y_axis_limits) +
- ggplot2::geom_line(ggplot2::aes(y = forecast_mean, color = as.factor(depth)), size = 0.5)+
- ggplot2::geom_ribbon(ggplot2::aes(ymin = forecast_lower_90, ymax = forecast_upper_90,
- fill = as.factor(depth)),
- alpha = 0.2) +
- #ggplot2::geom_point(data = obs_hist, ggplot2::aes(y = value, color = as.factor(depth)), size = 2) +
- ggplot2::geom_vline(aes(xintercept = as.Date(forecast_start_day),
- linetype = "solid"),
- alpha = 1) +
- #alpha = forecast_start_day_alpha) +
- #ggplot2::annotate(x = as.Date(curr_tibble$forecast_start_day - 2*24*60*60), y = max(curr_tibble$forecast_upper_90), label = 'Past', geom = 'text') +
- #ggplot2::annotate(x = as.Date(curr_tibble$forecast_start_day + 3*24*60*60), y = max(curr_tibble$forecast_upper_90), label = 'Future', geom = 'text') +
- ggplot2::annotate(x = as.Date(curr_tibble$forecast_start_day - 24*60*60), y = max(curr_tibble$forecast_upper_90), label = 'Past', geom = 'text') +
- ggplot2::annotate(x = as.Date(curr_tibble$forecast_start_day + 24*60*60), y = max(curr_tibble$forecast_upper_90), label = 'Future', geom = 'text') +
- ggplot2::theme_light() +
- ggplot2::scale_fill_manual(name = "Depth (m)",
- values = c("#D55E00", '#009E73', '#0072B2'),
- labels = as.character(depth_values)) +
- #labels = c('0.1', '5.0', '10.0')) +
- ggplot2::scale_color_manual(name = "Depth (m)",
- values = c("#D55E00", '#009E73', '#0072B2'),
- labels = as.character(depth_values)) +
- #labels = c('0.1', '5.0', '10.0')) +
- ggplot2::scale_x_date(date_breaks = '4 days',
- date_labels = '%b %d\n%a',
- limits = c(as.Date(min(curr_tibble$date) - 1), as.Date(max(curr_tibble$date)))) +
- #limits = c(as.Date(min(obs_hist$date)), as.Date(max(curr_tibble$date)))) +
- #limits = c(as.Date(config$run_config$start_datetime) - 1, as.Date(config$run_config$forecast_start_datetime) + num_days_plot)) +
- ggplot2::scale_linetype_manual(name = "",
- values = c('solid'),
- labels = c('Forecast Date')) +
- ggplot2::scale_y_continuous(name = 'Temperature (°C)',
- sec.axis = sec_axis(trans = (~.*(9/5) + 32), name = 'Temperature (°F)')) +
- ggplot2::labs(x = "Date",
- y = "Temperature (°C)", #state_names[i],
- fill = 'Depth (m)',
- color = 'Depth',
- title = paste0(site_name," water temperature forecast, ", lubridate::date(curr_tibble$forecast_start_day)),
- caption = 'Points represent sensor observations of water temperature. Lines represents the mean prediction from the forecast ensembles, or the most likely outcome.\n The shaded areas represent the 90% confidence interval of the forecast, or the possible range of outcomes based on the forecast.') +
- ggplot2::theme(axis.text.x = ggplot2::element_text(size = 10),
- plot.title = element_text(size = 16))
-
- print(p)
-
-}
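-
-## Example usage (sketch; all argument values below are hypothetical):
-# plot_temp_single_panel(score_data = "scores/parquet/",
-#                        site_identifier = "fcre", model_identifier = "climatology",
-#                        depth_values = c(0.5, 5), site_name = "Falling Creek Reservoir",
-#                        y_axis_limits = c(-5, 40))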
-
-
-
-plot_depth <- function(score_df) {
- # Generate labels for plots
- my_breaks <- lubridate::with_tz(seq(min(score_df$datetime), max(score_df$datetime), by = "1 day"),"America/New_York")
- my_label <- lubridate::with_tz(seq(lubridate::as_datetime(score_df$reference_datetime)[1], max(score_df$datetime), by = "5 days"), "America/New_York")
- my_labels <- as.character(my_breaks)
- my_labels[which(!(my_breaks %in% my_label))] <- " "
-
- my_labels <- as.Date(my_labels, format = "%Y-%m-%d")
- my_labels <- as.character(my_labels)
- my_labels[is.na(my_labels)] <- " "
-
-
- # limits for axes
- depth_change <- ceiling((max(score_df$mean) - min(score_df$mean))*2)/2
- max_depth <- ceiling(max(score_df$mean)*2)/2
-
- # Generate plot
- score_df %>%
- # Filter the dataframe and get in right format
- dplyr::mutate(datetime = lubridate::with_tz(lubridate::as_datetime(datetime), "America/New_York"),
- reference_datetime = lubridate::with_tz(lubridate::as_datetime(reference_datetime), "America/New_York")) %>%
- dplyr::filter(datetime >= reference_datetime) |>
-
- ggplot(aes(x=datetime))+
- geom_ribbon(aes(ymin = quantile10, ymax = quantile90), colour = 'lightgreen', fill = 'lightgreen') +
- geom_line(aes( y=mean)) +
- scale_x_continuous(breaks = my_breaks, labels = my_labels) +
- scale_y_continuous(limits = c(max_depth - depth_change, max_depth)) +
- labs(y = 'Lake depth (m)') +
- theme_bw() +
- theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0.2)) +
- theme(text = element_text(size = 20),
- panel.grid.minor = element_blank())
-}
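-
-## Example usage (sketch; `depth_scores` is a hypothetical collected scores
-## data frame for the lake-depth variable):
-# plot_depth(depth_scores)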
-
-# Plot the % chance of being mixed - needs ensemble forecast
-# eval_depths = depths used to determine mixing, either max min or specific depths
-# use_density = use a density difference to determine mixing? T/F
-# threshold = the density or temperature difference used to determine mixing
-
-plot_mixing <- function(forecast_df, eval_depths = 'min/max', use_density = TRUE, threshold = 0.1) {
-
- # Labels for plot
- my_breaks <- lubridate::with_tz(seq(min(forecast_df$datetime), max(forecast_df$datetime), by = "1 day"),"America/New_York")
- my_label <- lubridate::with_tz(seq(lubridate::as_datetime(forecast_df$reference_datetime)[1], max(forecast_df$datetime), by = "5 days"), "America/New_York")
- my_labels <- as.character(my_breaks)
- my_labels[which(!(my_breaks %in% my_label))] <- " "
-
- my_labels <- as.Date(my_labels, format = "%Y-%m-%d")
- my_labels <- as.character(my_labels)
- my_labels[is.na(my_labels)] <- " "
-
- # which depths should be evaluated to determine mixing
- if (!is.numeric(eval_depths)) {
- # extracts the maximum and minimum in the forecast
- max_depth <- forecast_df |>
- filter(variable == "temperature") |>
- select(datetime, parameter, depth, variable, prediction) |>
- mutate(is_na = ifelse(is.na(prediction), 1, 0)) |>
- group_by(depth) |>
- summarize(num_na = sum(is_na), .groups = "drop") |>
- filter(num_na == 0) |>
- summarize(max = max(depth)) |>
- pull(max)
-
- min_depth <- min(forecast_df$depth, na.rm = T)
- } else {
- # or uses the user specified values
- max_depth <- max(eval_depths)
- min_depth <- min(eval_depths)
- }
-
- # if use_density is false uses a temperature difference
- if (use_density == FALSE) {
- message(paste0('using a ', threshold, ' C temperature difference to define mixing'))
- temp_forecast <- forecast_df |>
- filter(depth %in% c(max_depth, min_depth),
- datetime >= reference_datetime) |>
- pivot_wider(names_from = depth, names_prefix = 'wtr_', values_from = prediction)
-
- colnames(temp_forecast)[which(colnames(temp_forecast) == paste0('wtr_', min_depth))] <- 'min_depth'
- colnames(temp_forecast)[which(colnames(temp_forecast) == paste0('wtr_', max_depth))] <- 'max_depth'
-
- temp_forecast |>
- mutate(mixed = ifelse((min_depth - max_depth) < threshold,
- 1, 0)) |>
- group_by(datetime) |>
- summarise(percent_mix = 100*(sum(mixed)/n())) |>
- ggplot(aes(datetime, y=percent_mix)) +
- geom_line() +
- scale_x_continuous(breaks = my_breaks, labels = my_labels) +
- scale_y_continuous(limits = c(0,100)) +
- labs(y = '% chance of lake mixing') +
- theme_bw() +
- theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0.2)) +
- theme(text = element_text(size = 20),
- plot.caption = element_text(size = 12),
- panel.grid.minor = element_blank())
-  } else {
- message(paste0('using a ', threshold, ' kg/m3 density difference to define mixing'))
-
- temp_forecast <- forecast_df |>
- filter(depth %in% c(max_depth, min_depth),
- datetime >= reference_datetime) |>
- pivot_wider(names_from = depth, names_prefix = 'wtr_', values_from = prediction)# %>% na.omit()
-
- colnames(temp_forecast)[which(colnames(temp_forecast) == paste0('wtr_', min_depth))] <- 'min_depth'
- colnames(temp_forecast)[which(colnames(temp_forecast) == paste0('wtr_', max_depth))] <- 'max_depth'
-
- temp_forecast |>
- mutate(min_depth = rLakeAnalyzer::water.density(min_depth),
- max_depth = rLakeAnalyzer::water.density(max_depth),
- mixed = ifelse((max_depth - min_depth) < threshold,
- 1, 0)) |>
- group_by(datetime) |>
- summarise(percent_mix = 100*(sum(mixed)/n())) |>
- ggplot(aes(datetime, y=percent_mix)) +
- geom_line() +
- scale_x_continuous(breaks = my_breaks, labels = my_labels) +
- scale_y_continuous(limits = c(0,100)) +
- labs(y = '% chance of lake mixing') +
- theme_bw() +
- theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0.2)) +
- theme(text = element_text(size = 20),
- plot.caption = element_text(size = 12),
- panel.grid.minor = element_blank())
- }
-
-
-
-}
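-
-# Example usage (a sketch; `fc` stands for an illustrative ensemble temperature
-# forecast with datetime, reference_datetime, parameter, depth, variable, and
-# prediction columns):
-# plot_mixing(fc, eval_depths = 'min/max', use_density = TRUE, threshold = 0.1)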
-
-
-# Generate plot for ice chance %
-plot_ice <- function(forecast_df) {
-
-
- # Labels for plot
- my_breaks <- lubridate::with_tz(seq(min(forecast_df$datetime), max(forecast_df$datetime), by = "1 day"),"America/New_York")
- my_label <- lubridate::with_tz(seq(lubridate::as_datetime(forecast_df$reference_datetime)[1], max(forecast_df$datetime), by = "5 days"),"America/New_York")
- my_labels <- as.character(my_breaks)
- my_labels[which(!(my_breaks %in% my_label))] <- " "
-
- my_labels <- as.Date(my_labels, format = "%Y-%m-%d")
- my_labels <- as.character(my_labels)
- my_labels[is.na(my_labels)] <- " "
-
- forecast_df %>%
- mutate(ice = ifelse(prediction > 0, 1, 0)) %>%
- dplyr::filter(datetime >= reference_datetime) |>
- group_by(datetime) %>%
- summarise(percent_ice = 100*(sum(ice)/n())) %>%
- ggplot(., aes(datetime, y=percent_ice)) +
- geom_line() +
- scale_x_continuous(breaks = my_breaks, labels = my_labels) +
- scale_y_continuous(limits = c(0,100)) +
- labs(y = '% chance of ice') +
- theme_bw() +
- theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=0.2)) +
- theme(text = element_text(size = 20),
- plot.caption = element_text(size = 12),
- panel.grid.minor = element_blank())
-}
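-
-# Example usage (a sketch; `fc` is an illustrative ensemble ice forecast with
-# datetime, reference_datetime, parameter, and prediction columns):
-# plot_ice(fc)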
diff --git a/dashboard/R/plot-utils.R b/dashboard/R/plot-utils.R
deleted file mode 100644
index 945b98df98..0000000000
--- a/dashboard/R/plot-utils.R
+++ /dev/null
@@ -1,152 +0,0 @@
-library(ggiraph)
-library(patchwork)
-library(tidyverse)
-library(score4cast)
-library(glue)
-
-forecast_ggobj <- function(df, ncol = NULL, show.legend = TRUE) {
-
- df |> collect() |>
- ggplot() +
- geom_point(aes(datetime, observation)) +
- geom_ribbon_interactive(aes(x = datetime, ymin = quantile02.5, ymax = quantile97.5,
- fill = model_id, data_id = model_id, tooltip = model_id),
- alpha = 0.2, show.legend=FALSE) +
- geom_line_interactive(aes(datetime, mean, col = model_id,
- tooltip = model_id, data_id = model_id), show.legend=show.legend) +
- labs(x = 'datetime', y = 'predicted') +
- facet_wrap(~site_id, scales = "free", ncol=ncol) +
- guides(x = guide_axis(angle = 45)) +
- theme_bw()
-}
-
-
-forecast_plots <- function(df, ncol = NULL, show.legend = FALSE) {
-
- if(nrow(df)==0) return(NULL)
-
- ggobj <- forecast_ggobj(df, ncol, show.legend)
- girafe(ggobj = ggobj,
- width_svg = 8, height_svg = 4,
- options = list(
- opts_hover_inv(css = "opacity:0.20;"),
- opts_hover(css = "stroke-width:2;"),
- opts_zoom(max = 4)
- ))
-
-}
-
-
-
-by_model_id <- function(df, show.legend = FALSE) {
- leaderboard <-
- df |>
- group_by(model_id) |>
- summarise(crps = mean(crps, na.rm=TRUE),
- #logs = mean(logs, na.rm=TRUE),
- .groups = "drop") |>
- collect() |>
- mutate(model_id = fct_rev(fct_reorder(model_id, crps)))
-
- leaderboard |>
- pivot_longer(cols = c(crps), names_to="metric", values_to="score") |>
-
- ggplot(aes(x = model_id, y= score, fill=model_id)) +
- geom_col_interactive(aes(tooltip = model_id, data_id = model_id),
- show.legend = FALSE) +
- # scale_y_log10() +
- coord_flip() +
- facet_wrap(~metric, scales='free') +
- theme_bw() +
- theme(axis.text.y = element_blank()) # don't show model_id twice
-
-}
-
-
-
-
-by_reference_datetime <- function(df, show.legend = FALSE) {
- leaderboard <-
- df |>
- group_by(model_id, reference_datetime) |>
- summarise(crps = mean(crps, na.rm=TRUE),
- #logs = mean(logs, na.rm=TRUE),
- .groups = "drop") |>
- mutate(reference_datetime = lubridate::as_datetime(reference_datetime)) |>
- collect() |>
- mutate(model_id = fct_rev(fct_reorder(model_id, crps)))
-
- leaderboard |>
- pivot_longer(cols = c(crps), names_to="metric", values_to="score") |>
-
- ggplot(aes(x = reference_datetime, y= score, col=model_id)) +
- geom_line_interactive(aes(tooltip = model_id, data_id = model_id),
- show.legend = FALSE) +
- scale_y_log10() +
- facet_wrap(~metric, scales='free') +
- guides(x = guide_axis(angle = 45)) +
- theme_bw()
-}
-
-
-
-by_horizon <- function(df, show.legend=FALSE) {
-
- leaderboard2 <- df |>
- group_by(model_id, horizon) |>
- summarise(crps = mean(crps, na.rm=TRUE),
- #logs = mean(logs, na.rm=TRUE),
- .groups = "drop") |>
- collect() |>
- mutate(model_id = fct_rev(fct_reorder(model_id, crps))) # sort by score
-
- leaderboard2 |>
- pivot_longer(cols = c(crps), names_to="metric", values_to="score") |>
- ggplot(aes(x = horizon, y= score, col=model_id)) +
- geom_line_interactive(aes(tooltip = model_id, data_id = model_id),
- show.legend = show.legend) +
- facet_wrap(~metric, scales='free') +
- scale_y_log10() +
- theme_bw()
-}
-
-
-horizon_filter <- function(df, horizon_cutoff=35, horizon_units="days") {
- df |>
- mutate(horizon =
- difftime(
- lubridate::as_datetime(datetime),
- lubridate::as_datetime(reference_datetime),
- units = horizon_units)
- ) |>
- filter(horizon <= horizon_cutoff, horizon > 0)
-}
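-
-# e.g. horizon_filter(scores, horizon_cutoff = 14) keeps rows more than 0 and at
-# most 14 days past their reference_datetime (a sketch; `scores` is illustrative)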
-
-leaderboard_plots <- function(df,
- var,
- horizon_cutoff = 35,
- horizon_units = "days",
- show.legend=TRUE) {
-
- df <- df |> filter(variable == var) |> filter(!is.na(observation))
- df <- horizon_filter(df, horizon_cutoff, horizon_units)
- if(nrow(df)==0) return(NULL)
-
- board1 <- by_model_id(df, show.legend = FALSE)
- board2 <- by_reference_datetime(df, show.legend = FALSE) + theme_bw()
- board3 <- by_horizon(df, show.legend = FALSE) + theme_bw()
-
- ggob <- board1 / board2 / board3 # patchwork stack
-
- girafe(
- ggobj = ggob,
- width_svg = 8,
- height_svg = 6,
- options = list(
- opts_hover_inv(css = "opacity:0.20;"),
- opts_hover(css = "stroke-width:2;"),
- opts_zoom(max = 4)
- )
- )
-
-}
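-
-# Example (a sketch; `scores` stands for an illustrative scores data frame with
-# model_id, reference_datetime, datetime, crps, and observation columns):
-# leaderboard_plots(scores, var = "chla", horizon_cutoff = 35)
-# returns an interactive girafe widget stacking the three leaderboard views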
diff --git a/dashboard/_quarto.yml b/dashboard/_quarto.yml
deleted file mode 100644
index 0964e2146e..0000000000
--- a/dashboard/_quarto.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-project:
- type: website
- output-dir: docs
-
-website:
- title: "EFI-USGS River Chlorophyll Forecast Challenge"
- page-navigation: true
- repo-url: https://github.com/eco4cast/usgsrc4cast-ci
- repo-actions: [source, issue]
-
- navbar:
- background: primary
- search: true
- left:
- - href: targets.qmd
- icon: bullseye
- - href: instructions.qmd
- icon: info-square
- - href: performance.qmd
- icon: droplet
- - href: catalog.qmd
- icon: book
- - href: learn-more.qmd
- icon: check2-circle
- page-footer:
- left: "CC-BY"
- right:
- - icon: github
- href: https://github.com/eco4cast/usgsrc4cast-ci
-
-format:
- html:
- theme: [sandstone, custom.scss]
- css: styles.css
- title-block-style: default
- title-block-banner: "img/USGS_logo_green.png"
- title-block-banner-color: white
- page-layout: full
- toc: true
-
-editor: visual
-
diff --git a/dashboard/build_site_map.R b/dashboard/build_site_map.R
deleted file mode 100644
index 823d22429b..0000000000
--- a/dashboard/build_site_map.R
+++ /dev/null
@@ -1,34 +0,0 @@
-library(whisker)
-library(httr)
-
-config <- yaml::read_yaml('challenge_configuration.yaml')
-
-tpl <- '<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-{{#links}}
-<url>
-  <loc>{{{loc}}}</loc>
-  <lastmod>{{{lastmod}}}</lastmod>
-</url>
-{{/links}}
-</urlset>
-'
-links <- c(paste0(config$challenge_url, "/catalog.html"),
- paste0(config$challenge_url, "/targets.html"),
- paste0(config$challenge_url, "/instructions.html"),
- paste0(config$challenge_url, "/performance.html"),
- paste0(config$challenge_url, "/index.html"))
-
-links <- c(links, paste0("https://radiantearth.github.io/stac-browser/#/external/raw.githubusercontent.com/", config$github_repo, "/main/",fs::dir_ls(path = 'catalog', glob="*.json", recurse=TRUE)))
-
-map_links <- function(l) {
- tmp <- GET(l)
- d <- tmp$headers[['last-modified']]
-
- list(loc=l,
- lastmod=format(as.Date(d,format="%a, %d %b %Y %H:%M:%S")))
-}
-
-links <- lapply(links, map_links)
-
-cat(whisker.render(tpl, data = list(links = links)), file = "dashboard/docs/sitemap.xml")
diff --git a/dashboard/cache.R b/dashboard/cache.R
deleted file mode 100644
index f77c13a534..0000000000
--- a/dashboard/cache.R
+++ /dev/null
@@ -1,9 +0,0 @@
-library(minioclient)
-config <- yaml::read_yaml("challenge_configuration.yaml")
-install_mc()
-mc_alias_set("mc_bucket", endpoint = config$endpoint,
- access_key = "", secret_key = "")
-
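-# Mirror the published scores and forecast summaries for this project from the
-# public bucket into the local cache/ directories used to render the dashboard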
-mc(paste0("mirror --overwrite mc_bucket/",config$scores_bucket,"/parquet/project_id=", config$project_id," cache/scores"))
-
-mc(paste0("mirror --overwrite mc_bucket/",config$forecasts_bucket,"/summaries/project_id=", config$project_id," cache/summaries"))
diff --git a/dashboard/catalog.qmd b/dashboard/catalog.qmd
deleted file mode 100644
index 33501912f3..0000000000
--- a/dashboard/catalog.qmd
+++ /dev/null
@@ -1,38 +0,0 @@
----
-title: "Forecast catalog"
----
-
-## What types of models are submitting forecasts?
-
-*Note: This figure will become more complete as more models register and provide model descriptions.*
-
-```{r echo = FALSE}
-s3 <- arrow::s3_bucket(bucket = "bio230014-bucket01/challenges/metadata/model_id/",
- endpoint_override = "sdsc.osn.xsede.org", anonymous = TRUE)
-
-d1 <- arrow::open_dataset(s3, format = "json") |> dplyr::collect()
-
-model_type <- tidyr::unnest(d1[[3]], cols = names(d1[[3]]))$type
-
-model_type[which(stringr::str_detect(model_type, "mpirical"))] <- "Empirical"
-
-tibble::tibble(model_type = model_type) |>
-ggplot2::ggplot(ggplot2::aes(x = model_type)) +
- ggplot2::geom_bar() +
- ggplot2::labs(x = "Model Type", y = "Number submitting forecasts") +
- ggplot2::theme_bw()
-```
-
-## Catalog of forecast submissions and evaluations {#sec-spatiotemporal-asset-catalog}
-
-The catalog of submitted forecasts and the evaluation of the forecasts ("scores") is available through the SpatioTemporal Asset Catalogs browser (below).\
-
-The catalog provides the code that you can use to access forecasts and scores.\
-
-A full page version can be found [here](https://radiantearth.github.io/stac-browser/#/external/raw.githubusercontent.com/eco4cast/usgsrc4cast-ci/main/catalog/catalog.json)\
-
-
-
-```{=html}
-<iframe src="https://radiantearth.github.io/stac-browser/#/external/raw.githubusercontent.com/eco4cast/usgsrc4cast-ci/main/catalog/catalog.json" height="800" width="100%"></iframe>
-```
diff --git a/dashboard/img/USGS_logo_green.png b/dashboard/img/USGS_logo_green.png
deleted file mode 100644
index d7c1d199d4..0000000000
Binary files a/dashboard/img/USGS_logo_green.png and /dev/null differ
diff --git a/dashboard/img/workflow.png b/dashboard/img/workflow.png
deleted file mode 100644
index 8b1aed01c8..0000000000
Binary files a/dashboard/img/workflow.png and /dev/null differ
diff --git a/dashboard/index.qmd b/dashboard/index.qmd
deleted file mode 100644
index ee2e7647a9..0000000000
--- a/dashboard/index.qmd
+++ /dev/null
@@ -1,120 +0,0 @@
----
-title: "Forecasting Challenge"
-editor:
- markdown:
- wrap: 72
----
-
-We invite you to submit forecasts!
-
-The EFI-USGS River Chlorophyll Forecasting Challenge is an open platform for the ecological and data science communities to forecast data from [the U.S. Geological Survey (USGS)](https://www.usgs.gov/) before they are collected.
-
-The Challenge is hosted by the [Ecological Forecasting Initiative Research Coordination Network](https://ecoforecast.org) and sponsored by the U.S. National Science Foundation. This challenge is co-hosted by the USGS Proxies Project, an effort supported by the Water Mission Area Water Quality Processes program to develop estimation methods for PFAS, harmful algal blooms, and metals at multiple spatial and temporal scales.
-
-## Why a forecasting challenge?
-
-Our vision is to use forecasts to advance theory and to support natural resource management. We can begin to realize this vision by creating and analyzing a catalog of forecasts from a range of ecological systems, spatiotemporal scales, and environmental gradients.
-
-Our forecasting challenge is a platform for the ecological and data science communities to advance skills in forecasting ecological systems and to generate forecasts that contribute to a synthetic understanding of patterns of predictability in ecology. Rewards for contributing are skill advancement, joy, and potential involvement in manuscripts. We do not currently crown a winner or offer financial awards.
-
-The Challenge is [an excellent focal project in university courses](https://www.neonscience.org/impact/observatory-blog/efi-neon-forecasting-challenge-classroom).
-
-
-
-```{r setup, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE}
-knitr::opts_chunk$set(echo=FALSE, message=FALSE, warning=FALSE)
-library(dplyr)
-library(arrow)
-library(bslib)
-library(bsicons)
-library(leaflet)
-library(tidyverse)
-config <- yaml::read_yaml("../challenge_configuration.yaml")
-sites <- suppressMessages(sf::st_read("sites.json"))
-s3_inventory <- arrow::s3_bucket(paste0(config$inventory_bucket, "/catalog/forecasts/project_id=", config$project_id), endpoint_override = config$endpoint, anonymous = TRUE)
-n_forecasts <- arrow::open_dataset(s3_inventory) |> distinct(model_id, reference_date) |> collect() |> nrow()
-n_models <- arrow::open_dataset(s3_inventory) |> distinct(model_id) |> collect() |> nrow()
-
-s3_targets <- arrow::s3_bucket(config$targets_bucket, endpoint_override = config$endpoint, anonymous = TRUE)
-
-most_recent_targets <- arrow::open_csv_dataset(s3_targets,
- schema = arrow::schema(
- project_id = arrow::string(),
- site_id = arrow::string(),
- datetime = arrow::timestamp(unit = "ns"), # timezone = "UTC"),
- duration = arrow::string(),
- #depth_m = arrow::float(),
- variable = arrow::string(),
- observation = arrow::float()),
- skip = 1) |>
- filter(project_id == config$project_id) |>
- summarize(max = max(datetime),
- min = min(datetime),
- n = n_distinct(variable)) |>
- mutate(max = lubridate::as_date(max),
- min = lubridate::as_date(min)) |>
- dplyr::collect()
-
-#unique_forecasted_targets <- arrow::open_dataset("../cache/duration=P1D") |>
-# dplyr::distinct(variable) |> count() |>
-# dplyr::collect() |>
-# pull(n)
-
-last_updated <- Sys.Date()
-```
-
-```{r}
-
-layout_column_wrap(
- width = "250px",
- value_box(
- title = "Total forecasts submitted to the NEON Challenge",
- value = n_forecasts,
- showcase = bs_icon("graph-up"),
- theme_color = "success"
- ),
- value_box(
- title = "Most recent data for model training",
- value = most_recent_targets$max,
- showcase = bs_icon("bullseye"),
- theme_color = "info"
- ),
- value_box(
- title = "Number of years of data for model training",
- value = round(as.numeric(most_recent_targets$max - most_recent_targets$min) /365, 2),
- showcase = bs_icon("bullseye"),
- theme_color = "info"
- ),
- value_box(
- title = "Number of variables being forecasted",
- value = most_recent_targets$n,
- showcase = bs_icon("clipboard-data"),
- theme_color = "success"
- )
-)
-
-```
-
-
-
-## The Challenge is a platform
-
-Our platform is designed to empower you to contribute by providing target data, numerical weather forecasts, and tutorials. We automatically score your forecasts using the latest USGS data. All forecasts and scores are publicly available through cloud storage and discoverable through our catalog.
-
-![The NEON Ecological Forecasting Challenge platform from Thomas et al. 2023](img/workflow.png){fig-align="center"}
-
-
-Figure from [Thomas et al. 2023](https://doi.org/10.1002/fee.2616)
-
-## Contact
-
-eco4cast.initiative@gmail.com
-
-## Acknowledgements
-
-Thomas, R. Q., Boettiger, C., Carey, C. C., Dietze, M. C., Johnson, L. R., Kenney, M. A., et al. (2023). The NEON Ecological Forecasting Challenge. Frontiers in Ecology and the Environment, 21(3), 112–113. [https://doi.org/10.1002/fee.2616](https://doi.org/10.1002/fee.2616)
-
-We thank the USGS for providing the freely available data and the EFI community for feedback on the design of the Challenge. This material is based upon work supported by the National Science Foundation under Grant DEB-1926388.
-
-Page last updated on `r Sys.Date()`
diff --git a/dashboard/instructions.qmd b/dashboard/instructions.qmd
deleted file mode 100644
index d486ec1984..0000000000
--- a/dashboard/instructions.qmd
+++ /dev/null
@@ -1,222 +0,0 @@
----
-title: "How to forecast"
-number-sections: true
-editor:
- markdown:
- wrap: sentence
----
-
-## tl;dr: How to submit a forecast
-
-We provide an overview of the submission steps here, with details in the sections below:
-
-1) Explore the [data](targets.qmd#sec-targets) (e.g., targets) and build your forecast model.
-2) Register and describe your model at . You are not required to register if your forecast submission uses the word "example" in your model_id. Any forecasts with "example" in the model_id will not be used in forecast evaluation analyses. Use neon4cast as the challenge you are registering for.
-3) Generate a forecast!
-4) Write the forecast output to a file that follows our standardized format (described below).
-5) Submit your forecast using an R function (provided below).
-6) Watch your forecast be evaluated as new data are collected.
-
-## Generating a forecast
-
-### All forecasting approaches are welcome
-
-We encourage you to use any modeling approach to make a prediction about the future conditions at any of the NEON sites and variables.
-
-### Must include uncertainty
-
-Forecasts require you to make an assessment of the confidence in your prediction of the future.
-You can represent your confidence (i.e., uncertainty in the forecast) using a distribution or numerically using an ensemble (or sample) of predictions.
-
-### Any model drivers/covariates/features are welcome
-
-You can use any data as model inputs (including all of the forecast target data available to date).
-All sensor-based target data are available with a 1- to 7-day delay (latency) from the time of collection.
-You may want to use the updated target data to re-train a model or for use in data assimilation.
-
-As a genuine forecasting challenge, you will need forecasted drivers if your model uses drivers as inputs.
-If you are interested in using forecasted meteorology, we are downloading and processing NOAA Global Ensemble Forecasting System (GEFS) weather forecasts for each NEON site.
-The NOAA GEFS forecasts extend 35-days ahead.
-More information about accessing the weather forecasts can be found [here](https://projects.ecoforecast.org/neon4cast-docs/Shared-Forecast-Drivers.html).
-
-### Forecasts can be for a range of horizons
-
-Forecasts can be submitted for 1 day to 1 year-ahead, depending on the variable.
-See the [variable tables](targets.qmd#sec-targets) for the horizon that is associated with each variable.
-
-### Forecasts can be submitted every day
-
-Since forecasts can be submitted every day, automation is important.
-We provide an [example GitHub](https://github.com/eco4cast/neon4cast-example) repository that can be used to automate your forecast with GitHub Actions.
-It also includes the use of a custom Docker Container [eco4cast/rocker-neon4cast:latest](https://github.com/eco4cast/neon4cast-ci/blob/main/Dockerfile) that has many of the packages and functions needed to generate and submit forecasts.
-
-We only evaluate forecasts for any weekly variables (e.g., beetles and ticks) that were submitted on the Sunday of each week.
-Therefore we recommend only submitting forecasts of the weekly variables on Sundays.
-
-## You can forecast at any of the NEON sites
-
-If you are just getting started, we recommend a set of [focal sites](targets.qmd#sec-starting-sites) for each of the five "themes".
-You are also welcome to submit forecasts to all or a subset of NEON sites. More information about NEON sites can be found in the [site metadata](https://radiantearth.github.io/stac-browser/#/external/raw.githubusercontent.com/eco4cast/neon4cast-ci/main/catalog/sites/collection.json) and on NEON's [website](https://www.neonscience.org/field-sites/explore-field-sites).
-
-## Forecast file format
-
-The file is a csv format with the following columns:
-
-- `project_id`: use `neon4cast`
-
-- `model_id`: the short name of the model defined as the model_id in your registration. The model_id should have no spaces.
- `model_id` should reflect a method to forecast one or a set of target variables and must be unique to the neon4cast challenge.
-
-- `datetime`: forecast timestamp.
- Format `%Y-%m-%d %H:%M:%S` with UTC as the time zone.
- Forecasts submitted with a `%Y-%m-%d` format will be converted to a full datetime assuming midnight UTC.
-
-- `reference_datetime`: The start of the forecast; this should be 0 time steps in the future.
- There should only be one value of `reference_datetime` in the file.
- Format is `%Y-%m-%d %H:%M:%S` with UTC as the time zone.
- Forecasts submitted with a `%Y-%m-%d` format will be converted to a full datetime assuming midnight UTC.
-
-- `duration`: the time-step of the forecast.
- Use the value of `P1D` for a daily forecast, `P1W` for a weekly forecast, and `PT30M` for a 30-minute forecast.
- This value should match the duration of the target variable that you are forecasting.
- Formatted as [ISO 8601 duration](https://en.wikipedia.org/wiki/ISO_8601#Durations)
-
-- `site_id`: code for NEON site.
-
-- `family`: name of the probability distribution that is described by the parameter values in the parameter column (see the list below for accepted distributions).
- An ensemble forecast has a family of `ensemble`.
- See the note below about family.
-
-- `parameter`: the parameters for the distribution (see the note below about the parameter column) or the ensemble member number.
- For example, the parameters for a normal distribution are called `mu` and `sigma`.
-
-- `variable`: standardized variable name.
- It must match the variable name in the target file.
-
-- `prediction`: forecasted value for the parameter in the parameter column.
-
-## Representing uncertainty
-
-Uncertainty is represented through the `family` and `parameter` columns in the file that you submit.
-
-#### Parametric forecast
-
-For a parametric forecast with a normal distribution, the `family` column would have the word `normal` to designate a normal distribution and the parameter column must have values of `mu` and `sigma` for each forecasted variable, site_id, depth, and time combination.
-
-Parametric forecasts for binary variables should use `bernoulli` as the family and `prob` as the parameter.
-
-The following names and parameterizations of the distributions are currently supported (family: parameters):
-
-- `lognormal`: `mu`, `sigma`
-- `normal`: `mu`,`sigma`
-- `bernoulli`: `prob`
-- `beta`: `shape1`, `shape2`
-- `uniform`: `min`, `max`
-- `gamma`: `shape`, `rate`
-- `logistic`: `location`, `scale`
-- `exponential`: `rate`
-- `poisson`: `lambda`
-
-If you are submitting a forecast that is not in the supported list, we recommend using the ensemble format and sampling from your distribution to generate a set of ensemble members that represents your forecast distribution.
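-
-For instance, here is a minimal sketch of that conversion (the distribution, values, `model_id`, and `site_id` are purely illustrative):
-
-```{r eval = FALSE}
-# Draw ensemble members from a custom distribution and arrange them in the
-# standard format: one row per ensemble member per datetime/site/variable
-my_samples <- rweibull(200, shape = 2, scale = 10)
-ensemble_df <- tibble::tibble(
-  project_id = "neon4cast",
-  model_id = "myModel",
-  reference_datetime = "2023-10-19 00:00:00",
-  datetime = "2023-10-20 00:00:00",
-  duration = "P1D",
-  site_id = "BARC",
-  family = "ensemble",
-  parameter = seq_along(my_samples),
-  variable = "chla",
-  prediction = my_samples
-)
-```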
-
-#### Ensemble (or sample) forecast
-
-Ensemble (or sample) forecasts use the `family` value of `ensemble` and the `parameter` values are the ensemble index.
-
-When forecasts using the ensemble family are scored, we assume that the ensemble is a set of equally likely realizations that are sampled from a distribution that is comparable to that of the observations (i.e., no broad adjustments are required to make the ensemble more consistent with observations).
-This is referred to as a "perfect ensemble" by Bröcker and Smith (2007).
-Ensemble (or sample) forecasts are transformed to a probability distribution function (e.g., dressed) using the default methods in the `scoringRules` R package (empirical version of the quantile decomposition for the `crps`).
-
-### Example forecasts
-
-Here is an example of a forecast that uses a normal distribution:
-
-```{r}
-df <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/forecasts/raw/T20231102190926_aquatics-2023-10-19-climatology.csv.gz", show_col_types = FALSE)
-```
-```{r}
-df |>
- head() |>
- knitr::kable()
-```
-
-For an ensemble (or sample) forecast, the `family` column uses the word `ensemble` to designate that it is an ensemble forecast, and the parameter column is the ensemble member number (`1`, `2`, `3` ...)
-
-```{r}
-df <- readr::read_csv("https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/forecasts/raw/T20231102190926_aquatics-2023-10-19-persistenceRW.csv.gz", show_col_types = FALSE)
-```
-```{r}
-df |>
- dplyr::arrange(variable, site_id, datetime, parameter) |>
- head() |>
- knitr::kable()
-```
-
-
-## Submission process
-
-### File name
-
-Save your forecast as a csv file with the following naming convention:
-
-`theme_name-year-month-day-model_id.csv`.
-Compressed csv files with the csv.gz extension are also accepted.
-
-The `theme_name` options are: terrestrial_daily, terrestrial_30min, aquatics, beetles, ticks, or phenology.
-
-The year, month, and day are the year, month, and day of the reference_datetime (horizon = 0).
-For example, if a forecast starts today and tomorrow is the first forecasted day, horizon = 0 would be today, and today's date is used in the file name; a persistenceRW forecast for the aquatics theme starting on 2023-10-19 would be named `aquatics-2023-10-19-persistenceRW.csv.gz`.
-`model_id` is the id for the model name that you specified in the model metadata Google Form (model_id has no spaces in it).
-
-### Uploading forecast
-
-Individual forecast files can be uploaded any time.
-
-Teams will submit their forecast csv files through an R function.
-The csv file can only contain one unique `model_id` and one unique `project_id`.
-
-The function is available after installing the `neon4cast` package:
-
-```{r eval = FALSE}
-remotes::install_github("eco4cast/neon4cast")
-```
-
-The submit function is
-
-```{r eval = FALSE}
-library(neon4cast)
-neon4cast::submit(forecast_file = "theme_name-year-month-day-model_id.csv")
-```
-
-## Post-submission
-
-### Processing
-
-After submission, our servers will process uploaded files by converting them to a [parquet format](https://radiantearth.github.io/stac-browser/#/external/raw.githubusercontent.com/eco4cast/neon4cast-ci/main/catalog/forecasts/collection.json) on our public s3 storage.
-A `pub_datetime` column will be added that denotes when a forecast was submitted. [Summaries](https://radiantearth.github.io/stac-browser/#/external/raw.githubusercontent.com/eco4cast/neon4cast-ci/main/catalog/summaries/collection.json) of the forecasts are generated that provide descriptive statistics of each forecast.
-
-### Evaluation
-
-All forecasts are scored daily using new data until the full horizon of the forecast has been scored.
-Forecasts are scored using the `crps` function in the [`scoringRules`](https://cran.r-project.org/web/packages/scoringRules/index.html) R package. More information about the scoring metric can be found [here](https://projects.ecoforecast.org/neon4cast-docs/Evaluation.html).
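-
-For example, a single ensemble forecast-observation pair could be scored like this (a sketch with made-up values; the Challenge runs the scoring automatically):
-
-```{r eval = FALSE}
-library(scoringRules)
-# 200 equally likely ensemble members for one datetime/site/variable combination
-ensemble_members <- rnorm(200, mean = 4, sd = 1)
-observation <- 4.2
-# Lower CRPS is better; units match the forecasted variable
-crps_sample(y = observation, dat = ensemble_members)
-```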
-
-### Comparison
-
-Forecast performance can be compared to the performance of baseline models.
-We are automatically submitting the following baseline models:
-
-- `climatology`: the normal distribution (mean and standard deviation) of that day-of-year in the historical observations
-- `persistenceRW`: a random walk model that assumes no change in the mean behavior. The random walk is initialized using the most recent observation.
-- `mean`: the historical mean of the data is submitted for the beetles theme.
-
-Our [forecast performance](performance.qmd#performance) page includes evaluations of all submitted models.
-
-### Catalog
-
-Information and code for accessing the forecasts and scores can be found on our [forecast catalog](catalog.qmd#sec-spatiotemporal-asset-catalog) page.
-
-## Questions?
-
-Thanks for reading this document!\
-
-If you still have questions about how to submit your forecast to the NEON Ecological Forecasting Challenge, we encourage you to email Dr. Quinn Thomas (rqthomas{at}vt.edu).
diff --git a/dashboard/learn-more.qmd b/dashboard/learn-more.qmd
deleted file mode 100644
index 274ff471ea..0000000000
--- a/dashboard/learn-more.qmd
+++ /dev/null
@@ -1,50 +0,0 @@
-# Learn more
-
-## Tutorials
-
-Introductory tutorial for submitting to the Challenge, focused on the aquatics theme: [https://github.com/OlssonF/NEON-forecast-challenge-workshop](https://github.com/OlssonF/NEON-forecast-challenge-workshop). [A webinar version of the tutorial](https://www.youtube.com/watch?v=-5iPNr19g-4)
-
-More advanced tutorial for submitting to the Challenge, focused on the terrestrial theme: [https://github.com/mdietze/FluxCourseForecast](https://github.com/mdietze/FluxCourseForecast)
-
-Other tutorial materials about [ecological forecasting](https://ecoforecast.org/resources/educational-resources/)
-
-## Research from the Ecological Forecasting Initiative Research Coordination Network
-
-### Ecological Forecasting
-
-Lewis, A., W. Woelmer, H. Wander, D. Howard, J. Smith, R. McClure, M. Lofton, N. Hammond, R. Corrigan, R.Q. Thomas, C.C. Carey. 2022. Increased adoption of best practices in ecological forecasting enables comparisons of forecastability across systems. Ecological Applications 32: e02500 [https://doi.org/10.1002/eap.2500](https://doi.org/10.1002/eap.2500)
-
-Lewis, A. S. L., Rollinson, C. R., Allyn, A. J., Ashander, J., Brodie, S., Brookson, C. B., et al. (2023). The power of forecasts to advance ecological theory. Methods in Ecology and Evolution, 14(3), 746–756. [https://doi.org/10.1111/2041-210X.13955](https://doi.org/10.1111/2041-210X.13955)
-
-### Manuscripts about the Challenge
-
-Thomas, R. Q., Boettiger, C., Carey, C. C., Dietze, M. C., Johnson, L. R., Kenney, M. A., et al. (2023). The NEON Ecological Forecasting Challenge. Frontiers in Ecology and the Environment, 21(3), 112–113. [https://doi.org/10.1002/fee.2616](https://doi.org/10.1002/fee.2616)
-
-Thomas, R.Q, R.P. McClure, T.N. Moore, W.M. Woelmer, C. Boettiger, R.J. Figueiredo, R.T. Hensley, C.C. Carey. Near-term forecasts of NEON lakes reveal gradients of environmental predictability across the U.S. Frontiers in Ecology and Environment 21: 220–226. [https://doi.org/10.1002/fee.2623](https://doi.org/10.1002/fee.2623)
-
-Wheeler, K., M. Dietze, D. LeBauer, J. Peters, A.D. Richardson, R.Q. Thomas, K. Zhu, U. Bhat, S. Munch, R.F Buzbee, M. Chen, B. Goldstein, J.S. Guo, D. Hao, C. Jones, M. Kelly-Fair, H. Liu, C. Malmborg, N. Neupane. D. Pal, A. Ross, V. Shirey, Y. Song, M. Steen, E.A. Vance, W.M. Woelmer, J. Wynne and L. Zachmann. Predicting Spring Phenology in Deciduous Broadleaf Forests: An Open Community Forecast Challenge.
-
-### Details about the standards used in the challenge
-
-Dietze, M., R.Q. Thomas, J. Peters, C. Boettiger, A. Shiklomanov, and J. Ashander. 2023. A community convention for ecological forecasting: output files and metadata v1.0. Ecosphere 14: e4686 [https://doi.org/10.1002/ecs2.4686](https://doi.org/10.1002/ecs2.4686)
-
-### Educational manuscripts
-
-Moore, T.N., R.Q. Thomas, W.M. Woelmer, C.C Carey. 2022. Integrating ecological forecasting into undergraduate ecology curricula with an R Shiny application-based teaching module. Forecasting 4:604-633. [https://doi.org/10.3390/forecast4030033](https://doi.org/10.3390/forecast4030033)
-
-Peters, J. and R.Q. Thomas. 2021. Going Virtual: What We Learned from the Ecological Forecasting Initiative Research Coordination Network Virtual Workshop. Bulletin of the Ecological Society of America 102: e01828 [https://doi.org/10.1002/bes2.1828](https://doi.org/10.1002/bes2.1828)
-
-Willson, A.M., H. Gallo, J.A. Peters, A. Abeyta, N. Bueno Watts, C.C. Carey, T.N. Moore, G. Smies, R.Q. Thomas, W.M. Woelmer, and J.S. McLachlan. 2023. Assessing opportunities and inequities in undergraduate ecological forecasting education. Ecology and Evolution 13: e10001. [https://doi.org/10.1002/ece3.10001](https://doi.org/10.1002/ece3.10001)
-
-Woelmer, W. M., Bradley, L. M., Haber, L. T., Klinges, D. H., Lewis, A. S. L., Mohr, E. J., et al. (2021). Ten simple rules for training yourself in an emerging field. PLOS Computational Biology, 17(10), e1009440. [https://doi.org/10.1371/journal.pcbi.1009440](https://doi.org/10.1371/journal.pcbi.1009440)
-
-Woelmer, W.M., T.N. Moore, M.E. Lofton, R.Q. Thomas, and C.C. Carey. 2023. Embedding communication concepts in forecasting training increases students’ understanding of ecological uncertainty. Ecosphere 14: e4628 [https://doi.org/10.1002/ecs2.4628](https://doi.org/10.1002/ecs2.4628)
-
-## Videos
-
-{{< video https://www.youtube.com/watch?v=r3oHH1AuItI >}}
-
-
-
-
-
diff --git a/dashboard/performance.qmd b/dashboard/performance.qmd
deleted file mode 100644
index 33cdbda116..0000000000
--- a/dashboard/performance.qmd
+++ /dev/null
@@ -1,154 +0,0 @@
----
-title: "Forecast performance"
----
-
-```{r, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE, message=FALSE, warning=FALSE)
-```
-
-```{r setup}
-library(ggiraph)
-library(dplyr)
-library(ggplot2)
-library(glue)
-source("R/plot-utils.R")
-#source("../R/ignore_sigpipes.R")
-#ignore_sigpipe()
-
-# TODO: update these
-aquatics_focal_sites <- c("BARC", "CRAM")
-```
-
-This page visualizes the forecasts and forecast performance for the focal target variables.
-
-## Most recent forecasts {#sec-performance}
-
-Only the top performing models from the last 30 days are shown.
-
-```{r echo = FALSE}
-### GET FORECASTS
-reference_datetimes <- arrow::open_dataset("../cache/summaries") |>
- dplyr::summarize(reference_datetime_max = max(reference_datetime), .by = "variable") |>
- dplyr::collect() |>
- group_by(variable) |>
- dplyr::mutate(reference_datetime_max = min(c(reference_datetime_max, Sys.Date() - lubridate::days(1))))
-
-config <- yaml::read_yaml("../challenge_configuration.yaml")
-sites <- readr::read_csv(paste0("../", config$site_table), show_col_types = FALSE)
-
-df_P1D <- arrow::open_dataset("../cache/summaries/duration=P1D") |>
- left_join(reference_datetimes, by = "variable") |>
- filter(reference_datetime == reference_datetime_max) |>
- left_join(sites, by = "site_id") |>
- filter(site_id %in% sites$site_id) |>
- mutate(reference_datetime = lubridate::as_datetime(reference_datetime),
- datetime = lubridate::as_datetime(datetime)) |>
- filter(lubridate::as_date(datetime) > lubridate::as_date(reference_datetime)) |>
- collect()
-
-```
-
-```{r}
-cutoff <- Sys.Date() - lubridate::days(30)
-df_P1D_scores <- arrow::open_dataset("../cache/scores/duration=P1D") |>
- left_join(sites, by = "site_id") |>
- mutate(reference_datetime = lubridate::as_datetime(reference_datetime),
- datetime = lubridate::as_datetime(datetime)) |>
- filter(reference_datetime > cutoff) |>
- collect()
-
-
-
-ref <- Sys.Date() - lubridate::days(30)
-
-ref_P1D <- min(c(Sys.Date() - lubridate::days(30),
- lubridate::as_date(df_P1D$reference_datetime)))
-
-
-#n_data <- 10
-#who <- combined |> filter(!is.na(observation)) |> summarise(has_data = max(reference_datetime)) |> collect()
-#ref <- as.character ( as.Date(who$has_data[[1]]) - n_data )
-ex_P1D <- df_P1D_scores |>
- mutate(min_reference_datetime = min(reference_datetime)) |>
- filter(reference_datetime == min_reference_datetime)
-
-```
-
-```{r}
-#Best models
-
-best_P1D_scores <- df_P1D_scores |>
- summarise(score = mean(crps, na.rm = TRUE), .by = c("model_id","variable")) |>
- arrange(variable, score) |>
- group_by(variable) |>
- slice(1:5)
-
-```
-Forecasts submitted on `r max(lubridate::as_date(df_P1D$reference_datetime))`
-
-::: panel-tabset
-
-### Aquatics: Chlorophyll-a
-
-Forecast summaries are available [here](https://radiantearth.github.io/stac-browser/#/external/raw.githubusercontent.com/eco4cast/usgsrc4cast-ci/main/catalog/summaries/Aquatics/Daily_Chlorophyll_a/collection.json)
-
-```{r}
-best_models <- best_P1D_scores |> filter(variable == "chla") |> pull(model_id)
-
-df_P1D |>
- filter(variable == c("chla"),
- model_id %in% best_models,
- # TODO: update these
- # site_id %in% aquatics_focal_sites
- ) |>
- mutate(observation = as.numeric(NA)) |>
- forecast_plots()
-
-```
-
-:::
-
-## Forecast analysis
-
-Below are forecasts submitted 30 days ago, along with the observations used to evaluate them. Mouse over to see the team id; scroll to zoom. Only the top five performing models are shown. Information on how to access the scores can be found in our [catalog](https://radiantearth.github.io/stac-browser/#/external/raw.githubusercontent.com/eco4cast/usgsrc4cast-ci/main/catalog/scores/collection.json).
-
-::: panel-tabset
-
-### Aquatics: chlorophyll-a
-
-```{r}
-best_models <- best_P1D_scores |> filter(variable == "chla") |> pull(model_id)
-
-
-ex_P1D |>
- filter(variable == c("chla"),
- model_id %in% best_models,
- #TODO: Update
- # site_id %in% aquatics_focal_sites
- ) |>
- forecast_plots()
-
-```
-
-:::
-
-## Aggregated scores
-
-Average skill scores of each model across all sites.\
-
-Scores are shown by reference date and forecast horizon (in days).\
-
-Scores are averaged across all submissions of the model with a given horizon or a given `reference_datetime` using submissions made since `r cutoff`.\
-
-Learn about the continuous ranked probability score [here](https://projects.ecoforecast.org/neon4cast-docs/Evaluation.html).
-
-::: panel-tabset
-
-### Aquatics: chlorophyll-a
-
-```{r}
-leaderboard_plots(df_P1D_scores, "chla")
-```
-
-:::
diff --git a/dashboard/sites.json b/dashboard/sites.json
deleted file mode 100644
index 5b83645e43..0000000000
--- a/dashboard/sites.json
+++ /dev/null
@@ -1,984 +0,0 @@
-{
- "type": "FeatureCollection",
- "name": "neon",
- "crs": {
- "type": "name",
- "properties": {
- "name": "urn:ogc:def:crs:OGC:1.3:CRS84"
- }
- },
- "features": [
- {
- "type": "Feature",
- "properties": {
- "site_id": "Abby Road",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-122.3303, 45.7624]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Arikaree River",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-102.4471, 39.7582]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Lake Barco",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-82.0084, 29.676]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Utqiaġvik",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-156.6194, 71.2824]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Bartlett Experimental Forest",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-71.2874, 44.0639]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Upper Big Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-119.2575, 37.0597]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Blandy Experimental Farm",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-78.0418, 39.0337]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Blacktail Deer Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-110.5871, 44.9501]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Blue River",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-96.6242, 34.4442]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Black Warrior River",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-87.7982, 32.5415]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Caribou-Poker Creeks Research Watershed",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-147.5026, 65.154]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Caribou Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-147.504, 65.1532]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Lyndon B. Johnson National Grassland",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-97.57, 33.4012]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Como Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-105.5442, 40.035]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Central Plains Experimental Range",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-104.7456, 40.8155]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Crampton Lake",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-89.4737, 46.2097]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Rio Cupeyes",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-66.9868, 18.1135]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Dakota Coteau Field Site",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-99.1066, 47.1617]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Delta Junction",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-145.7514, 63.8811]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Dead Lake",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-87.8039, 32.5417]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Disney Wilderness Preserve",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-81.4362, 28.1251]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Flint River",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-84.4374, 31.1854]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Great Smoky Mountains National Park",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-83.5019, 35.689]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Guanica Forest",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-66.8687, 17.9696]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Rio Guilarte",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-66.7987, 18.1741]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Harvard Forest & Quabbin Watershed",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-72.1727, 42.5369]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Healy",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-149.2133, 63.8758]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Lower Hop Brook",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-72.3295, 42.4719]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "The Jones Center At Ichauway",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-84.4686, 31.1948]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Jornada Experimental Range",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-106.8425, 32.5907]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Kings Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-96.6038, 39.1051]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Konza Prairie Agroecosystem",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-96.6129, 39.1104]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Konza Prairie Biological Station",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-96.5631, 39.1008]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Lajas Experimental Station",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-67.0769, 18.0213]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "LeConte Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-83.5038, 35.6904]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Lenoir Landing",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-88.1612, 31.8539]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Lewis Run",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-77.9832, 39.0956]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Little Rock Lake",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-89.7048, 45.9983]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Martha Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-121.9338, 45.7908]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Mayfield Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-87.4077, 32.9604]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "McDiffett Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-96.443, 38.9459]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "McRae Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-122.1655, 44.2596]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Mountain Lake Biological Station",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-80.5248, 37.3783]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Moab",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-109.3883, 38.2483]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Niwot Ridge",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-105.5824, 40.0543]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Northern Great Plains Research Laboratory",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-100.9154, 46.7697]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Marvin Klemme Range Research Station",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-99.0588, 35.4106]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Oksrukuyik Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-149.143, 68.6698]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Onaqui",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-112.4524, 40.1776]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Oak Ridge",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-84.2826, 35.9641]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Ordway-Swisher Biological Station",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-81.9934, 29.6893]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Posey Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-78.1473, 38.8943]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Pringle Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-97.7823, 33.3785]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Prairie Lake",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-99.1139, 47.1591]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Prairie Pothole",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-99.2531, 47.1298]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Pu'u Maka'ala Natural Area Reserve",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-155.3173, 19.5531]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Red Butte Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-111.7979, 40.7839]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Rocky Mountains",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-105.546, 40.2759]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Smithsonian Conservation Biology Institute",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-78.1395, 38.8929]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Smithsonian Environmental Research Center",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-76.56, 38.8901]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "San Joaquin Experimental Range",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-119.7323, 37.1088]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Soaproot Saddle",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-119.2622, 37.0334]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Santa Rita Experimental Range",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-110.8355, 31.9107]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Steigerwaldt-Chequamegon",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-89.5864, 45.5089]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "North Sterling",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-103.0293, 40.4619]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Lake Suggs",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-82.0177, 29.6878]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Sycamore Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-111.5081, 33.751]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Talladega National Forest",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-87.3933, 32.9505]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Lower Teakettle",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-119.006, 37.0058]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Teakettle Creek - Watershed 2",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-119.0274, 36.9559]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Lower Tombigbee River",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-88.1589, 31.8534]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Toolik Lake",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-149.6106, 68.6307]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Toolik Field Station",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-149.3705, 68.6611]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Treehaven",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-89.5857, 45.4937]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "University of Kansas Field Station",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-95.1921, 39.0404]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "University of Notre Dame Environmental Research Center",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-89.5373, 46.2339]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Walker Branch",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-84.2793, 35.9574]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "West St Louis Creek",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-105.9154, 39.8914]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Chase Lake National Wildlife Refuge",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-99.2413, 47.1282]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Wind River Experimental Forest",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-121.9519, 45.8205]
- }
- },
- {
- "type": "Feature",
- "properties": {
- "site_id": "Yellowstone National Park",
- "Partner": "NEON",
- "n": 5
- },
- "geometry": {
- "type": "Point",
- "coordinates": [-110.5391, 44.9535]
- }
- }
- ]
-}
diff --git a/dashboard/styles.css b/dashboard/styles.css
deleted file mode 100644
index 2ddf50c7b4..0000000000
--- a/dashboard/styles.css
+++ /dev/null
@@ -1 +0,0 @@
-/* css styles */
diff --git a/dashboard/targets.qmd b/dashboard/targets.qmd
deleted file mode 100644
index 8f7303c1db..0000000000
--- a/dashboard/targets.qmd
+++ /dev/null
@@ -1,138 +0,0 @@
----
-title: "What to forecast"
-editor:
- markdown:
- wrap: sentence
----
-
-```{r message=FALSE, echo = FALSE}
-library(tidyverse)
-aquatics_focal_sites <- c("BARC", "CRAM")
-```
-
-```{r message=FALSE, echo = FALSE}
-# TODO: need to update this
-googlesheets4::gs4_deauth()
-target_metadata <- googlesheets4::read_sheet("https://docs.google.com/spreadsheets/d/10YTX9ae_C1rFdLgEDkUcCRCpUkVYv06leY01BtD1BgM/edit?usp=sharing")
-```
-
-```{r echo = FALSE}
-target_metadata <- target_metadata |>
- rename(variable = `"official" targets name`) |>
- select(variable, duration, class, Description, horizon, Latency)
-```
-
-## tl;dr: Forecast the targets!
-
-The "targets" are time-series of United States Geological Survey ([USGS](https://www.usgs.gov/)) data for use in model development and forecast evaluation.
-
-The targets are updated as new USGS data are made available.
-
-This challenge focuses on forecasting river chlorophyll-a at select USGS monitoring locations. The links to targets files are included below.
-
-## Where to start {#sec-starting-sites}
-
-
-
-
-
-
-As you develop your forecasting skills and want to expand to more sites, the targets are available at all 10 USGS sites.
-You may also consider submitting forecasts to sites that match your interests.
-For example, a class being taught in the winter may be more interested in forecasting southern sites while a summer class may focus on more northern sites.
-
-More information about USGS sites can be found in the [site metadata](https://radiantearth.github.io/stac-browser/#/external/raw.githubusercontent.com/eco4cast/usgsrc4cast-ci/main/catalog/sites/collection.json) and on USGS's [website](https://dashboard.waterdata.usgs.gov/app/nwd/en/).
-
-## Explore the targets and themes {#sec-targets}
-
-Information on the targets files for the "themes" is below.
-In the tables,
-
-- "duration" is the time step of the variable, where `P1D` indicates a daily mean.
-
-- The "forecast horizon" is the number of days ahead that we want you to forecast.
-
-- The "latency" is the time between data collection and data availability in the targets file.
-
-
-### Aquatics
-
-![](https://projects.ecoforecast.org/neon4cast-catalog/img/neon_buoy.jpg)
-
-Freshwater surface water temperature, dissolved oxygen, and chlorophyll-a all influence drinking water quality, are critical for life in aquatic environments, and can represent the health of the ecosystem.
-
-The aquatics theme challenges you to forecast daily mean water quality variables at up to 10 USGS river sites.
-
-```{r echo = FALSE}
-url <- "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/targets/project_id=usgsrc4cast/duration=P1D/river-chl-targets.csv.gz"
-read_csv(url, show_col_types = FALSE) |>
- distinct(variable, duration) |>
- # TODO: need to fix target_metadata
- # left_join(target_metadata, by = c("variable","duration")) |>
- filter(variable %in% c("chla")) |>
- # select(-class) |>
- knitr::kable()
-```
-
-The daily mean target file is located at the following URL.
-
-```{r}
-url <- "https://sdsc.osn.xsede.org/bio230014-bucket01/challenges/targets/project_id=usgsrc4cast/duration=P1D/river-chl-targets.csv.gz"
-```
-
-You can load it directly into R using the following:
-
-```{r}
-aquatics_targets <- read_csv(url, show_col_types = FALSE)
-```
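-
-If you work with the targets repeatedly, a minimal caching sketch like the one below (the local file name is illustrative) avoids re-downloading on every run:
-
-```{r}
-# download once, then read the cached copy from disk
-local_copy <- basename(url)
-if (!file.exists(local_copy)) download.file(url, local_copy, mode = "wb")
-aquatics_targets <- read_csv(local_copy, show_col_types = FALSE)
-```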
-
-The file contains the following columns:
-
-```{r echo = FALSE}
-aquatics_targets |>
- na.omit() |>
- head() |>
- knitr::kable()
-```
-
-The time series for the focal sites are plotted below:
-
-```{r}
-aquatics_targets |>
- # TODO: need to update focal sites
- # filter(site_id %in% aquatics_focal_sites) |>
- ggplot(aes(x = datetime, y = observation)) +
- geom_point() +
- facet_grid(variable~site_id, scales = "free_y") +
- theme_bw()
-```
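-
-Before fitting a model, it can help to check how much data each site actually has. A minimal sketch, assuming `aquatics_targets` is loaded as above:
-
-```{r}
-# count non-missing observations per site and variable
-aquatics_targets |>
-  filter(!is.na(observation)) |>
-  count(site_id, variable)
-```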
-
-
-## Explore the sites
-
-```{r include=FALSE, echo=FALSE, message=FALSE, warning=FALSE}
-library(leaflet)
-sites <- suppressMessages(sf::st_read("sites.json"))
-```
-
-```{r fig.height=5, fig.width=5, echo=FALSE, include=TRUE, message = FALSE}
-leaflet() %>%
- setView(lat = 43.47839356422085, lng = -98.74777398430538, zoom= 2) %>%
- addTiles(group="OSM") %>%
- addProviderTiles(providers$Esri.WorldImagery, group="Imagery") %>%
- addProviderTiles(providers$Esri.WorldTopoMap, group="Topo Map") %>%
- addLayersControl(baseGroups=c('Imagery','OSM', 'Topo Map')) |>
- addMarkers(data = sites, popup=~as.character(site_id), group = ~as.character(Partner), clusterOptions = markerClusterOptions())
-```
-
-
-The following table lists all the sites in the EFI-USGS Ecological Forecasting Challenge.
-The columns with "theme" names indicate whether that site is included in that theme's target file.
-
-```{r echo = FALSE}
-site_list <- read_csv("../USGS_site_metadata.csv", show_col_types = FALSE) |>
- select(site_id, site_no, station_nm, site_url)
-```
-
-```{r echo = FALSE}
-site_list |> knitr::kable()
-```
diff --git a/drivers/download_stage1_pseudo.R b/drivers/download_stage1_pseudo.R
deleted file mode 100644
index e940df9961..0000000000
--- a/drivers/download_stage1_pseudo.R
+++ /dev/null
@@ -1,58 +0,0 @@
-## setup
-library(gdalcubes)
-library(gefs4cast)
-
-gdalcubes::gdalcubes_options(parallel=2*parallel::detectCores())
-#gdalcubes::gdalcubes_options(parallel=TRUE)
-
-config <- yaml::read_yaml("challenge_configuration.yaml")
-driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/")
-driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/")
-
-sites <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/",
- "raw/prod/USGS_site_metadata.csv"),
- col_select = c("site_id", "latitude", "longitude"))
-
-Sys.setenv("GEFS_VERSION"="v12")
-dates <- seq(as.Date("2020-09-24"), Sys.Date()-1, by=1)
-dates_pseudo <- seq(as.Date("2020-09-24"), Sys.Date(), by=1)
-
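-# Each product below is updated incrementally: list the reference_datetime
-# partitions already on the bucket, then process only the missing dates.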
-message("GEFS v12 stage1-stats")
-bench::bench_time({ # thelio
- s3 <- gefs4cast::gefs_s3_dir(product = "stage1-stats",
- path = driver_path,
- endpoint = config$endpoint,
- bucket = driver_bucket)
- have_dates <- gsub("reference_datetime=", "", s3$ls())
- missing_dates <- dates[!(as.character(dates) %in% have_dates)]
- gefs4cast::gefs_to_parquet(dates = missing_dates,
- ensemble = c("geavg", "gespr"),
- path = s3,
- sites = sites)
-})
-
-message("GEFS v12 pseudo")
-bench::bench_time({ #32xlarge
- s3 <- gefs4cast::gefs_s3_dir(product = "pseudo",
- path = driver_path,
- endpoint = config$endpoint,
- bucket = driver_bucket)
- have_dates <- gsub("reference_datetime=", "", s3$ls())
- missing_dates <- dates_pseudo[!(as.character(dates_pseudo) %in% have_dates)]
- gefs4cast:::gefs_pseudo_measures(dates = missing_dates,
- path = s3,
- sites = sites)
-})
-
-message("GEFS v12 stage1")
-bench::bench_time({ # cirrus ~ 6days for full set
- s3 <- gefs4cast::gefs_s3_dir(product = "stage1",
- path = driver_path,
- endpoint = config$endpoint,
- bucket = driver_bucket)
- have_dates <- gsub("reference_datetime=", "", s3$ls())
- missing_dates <- dates[!(as.character(dates) %in% have_dates)]
- gefs4cast::gefs_to_parquet(dates = missing_dates,
- path = s3,
- sites = sites)
-})
diff --git a/drivers/generate_stage2.R b/drivers/generate_stage2.R
deleted file mode 100644
index 1cb8e4da39..0000000000
--- a/drivers/generate_stage2.R
+++ /dev/null
@@ -1,81 +0,0 @@
-## setup
-library(gdalcubes)
-library(gefs4cast)
-# need to source to_hourly.R here rather than use neon4cast, because there is NEON-specific code in neon4cast
-source("R/eco4cast-helpers/to_hourly.R")
-
-Sys.setenv("GEFS_VERSION"="v12")
-
-site_list <- readr::read_csv("USGS_site_metadata.csv",
- show_col_types = FALSE)
-
-config <- yaml::read_yaml("challenge_configuration.yaml")
-driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/")
-driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/")
-
-# s3_stage2 <- arrow::s3_bucket("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage2",
-# endpoint_override = "sdsc.osn.xsede.org",
-# access_key= Sys.getenv("OSN_KEY"),
-# secret_key= Sys.getenv("OSN_SECRET"))
-s3_stage2 <- gefs4cast::gefs_s3_dir(product = "stage2",
- path = driver_path,
- endpoint = config$endpoint,
- bucket = driver_bucket)
-
-# if there aren't any data (i.e., this is the first time we're creating this dataset),
-# then skip the distinct(reference_datetime) filter
-df <- arrow::open_dataset(s3_stage2)
-if(length(df$files) > 0){
- df <- arrow::open_dataset(s3_stage2) |>
- dplyr::distinct(reference_datetime) |>
- dplyr::collect()
-}
-
-curr_date <- Sys.Date()
-last_week <- dplyr::tibble(reference_datetime = as.character(seq(curr_date - lubridate::days(7),
- curr_date - lubridate::days(1),
- by = "1 day")))
-
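-# process only the past week's reference dates that are not already in stage2
-# (on a first run, all of them)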
-if(length(df$files) > 0){
- missing_dates <- dplyr::anti_join(last_week, df,
- by = "reference_datetime") |>
- dplyr::pull(reference_datetime)
-}else{
- missing_dates <- dplyr::pull(last_week, reference_datetime)
-}
-
-
-if(length(missing_dates) > 0){
- for(i in 1:length(missing_dates)){
-
- print(missing_dates[i])
-
- # bucket <- paste0("bio230014-bucket01/neon4cast-drivers/noaa/gefs-v12/stage1/reference_datetime=",
- # missing_dates[i])
- bucket <- glue::glue("{config$driver_bucket}/gefs-v12/stage1/reference_datetime={missing_dates[i]}")
-
- s3_stage1 <- arrow::s3_bucket(bucket = bucket,
- endpoint_override = config$endpoint,
- anonymous = TRUE)
-
- site_df <- arrow::open_dataset(s3_stage1) |>
- dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |>
- dplyr::filter(site_id %in% site_list$site_id) |>
- dplyr::collect() |>
- dplyr::mutate(reference_datetime = missing_dates[i])
-
- hourly_df <- to_hourly(site_df,
- site_list = dplyr::select(site_list, site_id, latitude, longitude),
- use_solar_geom = TRUE,
- pseudo = FALSE) |>
- dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5)),
- reference_datetime = lubridate::as_date(reference_datetime)) |>
- dplyr::rename(parameter = ensemble)
-
- arrow::write_dataset(dataset = hourly_df,
- path = s3_stage2,
- partitioning = c("reference_datetime", "site_id"))
- }
-}
-
-
diff --git a/drivers/generate_stage3.R b/drivers/generate_stage3.R
deleted file mode 100644
index 5987853513..0000000000
--- a/drivers/generate_stage3.R
+++ /dev/null
@@ -1,54 +0,0 @@
-## setup
-library(minioclient)
-library(gdalcubes)
-library(gefs4cast)
-source("R/eco4cast-helpers/to_hourly.R")
-
-config <- yaml::read_yaml("challenge_configuration.yaml")
-driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/")
-driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/")
-
-Sys.setenv("GEFS_VERSION"="v12")
-
-#install_mc()
-mc_alias_set("osn", "sdsc.osn.xsede.org", "", "")
-
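-# mirror the pseudo product to local disk once so the reads below use the
-# local copy instead of repeatedly hitting the bucket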
-mc_mirror(glue::glue("osn/{driver_bucket}/{driver_path}/gefs-v12/pseudo"), "pseudo")
-
-df <- arrow::open_dataset("pseudo") |>
- dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF"))
-
-
-site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/",
- "raw/prod/USGS_site_metadata.csv"),
- show_col_types = FALSE)
-
-
-s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3",
- path = driver_path,
- endpoint = config$endpoint,
- bucket = driver_bucket)
-
-future::plan("future::multisession", workers = 10)
-
-furrr::future_walk(dplyr::pull(site_list, site_id), function(curr_site_id){
-
- df <- arrow::open_dataset("pseudo") |>
- dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |>
- dplyr::filter(site_id == curr_site_id) |>
- dplyr::collect()
-
- s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3",
- path = driver_path,
- endpoint = config$endpoint,
- bucket = driver_bucket)
-
- print(curr_site_id)
- df |>
- to_hourly(site_list = dplyr::select(site_list, site_id, latitude, longitude),
- use_solar_geom = TRUE,
- pseudo = TRUE) |>
- dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |>
- dplyr::rename(parameter = ensemble) |>
- arrow::write_dataset(path = s3_stage3, partitioning = "site_id")
-})
diff --git a/drivers/update_stage3.R b/drivers/update_stage3.R
deleted file mode 100644
index aed4446e8b..0000000000
--- a/drivers/update_stage3.R
+++ /dev/null
@@ -1,87 +0,0 @@
-library(gdalcubes)
-library(gefs4cast)
-source("R/eco4cast-helpers/to_hourly.R")
-
-site_list <- readr::read_csv(paste0("https://github.com/eco4cast/usgsrc4cast-ci/",
- "raw/prod/USGS_site_metadata.csv"),
- show_col_types = FALSE)
-
-Sys.setenv("GEFS_VERSION"="v12")
-
-config <- yaml::read_yaml("challenge_configuration.yaml")
-driver_bucket <- stringr::word(config$driver_bucket, 1, sep = "/")
-driver_path <- stringr::word(config$driver_bucket, 2, -1, sep = "/")
-
-future::plan("future::multisession", workers = 10)
-
-furrr::future_walk(dplyr::pull(site_list, site_id), function(curr_site_id){
-
- print(curr_site_id)
-
- s3_stage3 <- gefs4cast::gefs_s3_dir(product = "stage3",
- path = driver_path,
- endpoint = config$endpoint,
- bucket = driver_bucket)
-
-  # handle the case where this is the first time creating stage3 drivers
- stage3_dataset <- arrow::open_dataset(s3_stage3)
- if(length(stage3_dataset$files) > 0){
- stage3_df <- stage3_dataset |>
- dplyr::filter(site_id == curr_site_id) |>
- dplyr::collect()
-
- max_date <- stage3_df |>
- dplyr::summarise(max = as.character(lubridate::as_date(max(datetime)))) |>
- dplyr::pull(max)
- }else{
- max_date <- NA
- }
-
- s3_pseudo <- gefs4cast::gefs_s3_dir(product = "pseudo",
- path = driver_path,
- endpoint = config$endpoint,
- bucket = driver_bucket)
-
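-  # rebuild from three days before the current stage3 maximum so recently
-  # updated pseudo files are captured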
- if(length(stage3_dataset$files) > 0){
- cut_off <- as.character(lubridate::as_date(max_date) - lubridate::days(3))
- }
-
- if(length(stage3_dataset$files) > 0){
- pseudo_df <- arrow::open_dataset(s3_pseudo) |>
- dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |>
- dplyr::filter(site_id == curr_site_id,
- reference_datetime >= cut_off) |>
- dplyr::collect()
- }else{
- pseudo_df <- arrow::open_dataset(s3_pseudo) |>
- dplyr::filter(variable %in% c("PRES","TMP","RH","UGRD","VGRD","APCP","DSWRF","DLWRF")) |>
- dplyr::filter(site_id == curr_site_id) |>
- dplyr::collect()
- }
-
-
- if(nrow(pseudo_df) > 0){
-
- df2 <- pseudo_df |>
- to_hourly(site_list = dplyr::select(site_list, site_id, latitude, longitude),
- use_solar_geom = TRUE,
- pseudo = TRUE) |>
- dplyr::mutate(ensemble = as.numeric(stringr::str_sub(ensemble, start = 4, end = 5))) |>
- dplyr::rename(parameter = ensemble)
-
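-  # keep the existing stage3 rows that predate the refreshed window, then
-  # append the update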
- if(length(stage3_dataset$files) > 0){
- stage3_df_update <- stage3_df |>
- dplyr::filter(datetime < min(df2$datetime))
-
- df2 |>
- dplyr::bind_rows(stage3_df_update) |>
- dplyr::arrange(variable, datetime, parameter) |>
- arrow::write_dataset(path = s3_stage3, partitioning = "site_id")
- }else{
- df2 |>
- dplyr::arrange(variable, datetime, parameter) |>
- arrow::write_dataset(path = s3_stage3, partitioning = "site_id")
- }
-
- }
-})
diff --git a/scoring/build_score_inventory.R b/scoring/build_score_inventory.R
deleted file mode 100644
index 399354fe4e..0000000000
--- a/scoring/build_score_inventory.R
+++ /dev/null
@@ -1,33 +0,0 @@
-library(tidyverse)
-config <- yaml::read_yaml("challenge_configuration.yaml")
-
-s3 <- arrow::s3_bucket(paste0(config$scores_bucket, "/parquet"),
- endpoint_override = config$endpoint,
- anonymous = TRUE)
-
-bucket <- config$scores_bucket
-inventory_df <- arrow::open_dataset(s3) |>
- mutate(reference_date = lubridate::as_date(reference_datetime),
- date = lubridate::as_date(datetime),
- pub_date = lubridate::as_date(pub_datetime)) |>
- filter(project_id == config$project_id) |>
- distinct(duration, model_id, site_id, reference_date, variable, date, project_id, pub_date) |>
- collect() |>
- mutate(path = glue::glue("{bucket}/parquet/project_id={project_id}/duration={duration}/variable={variable}"),
- path_full = glue::glue("{bucket}/parquet/project_id={project_id}/duration={duration}/variable={variable}/model_id={model_id}/date={date}/part-0.parquet"),
- endpoint =config$endpoint)
-
-sites <- readr::read_csv(config$site_table,
- show_col_types = FALSE) |>
- select(site_id, latitude, longitude)
-
-inventory_df <- dplyr::left_join(inventory_df, sites,
- by = "site_id")
-
-s3_inventory <- arrow::s3_bucket(config$inventory_bucket,
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-arrow::write_dataset(inventory_df,
- path = s3_inventory$path(glue::glue("catalog/scores/project_id={config$project_id}")))
diff --git a/scoring/scoring.R b/scoring/scoring.R
deleted file mode 100644
index 5a02d0ce29..0000000000
--- a/scoring/scoring.R
+++ /dev/null
@@ -1,189 +0,0 @@
-library(score4cast)
-library(arrow)
-
-past_days <- 365
-n_cores <- 8
-
-setwd(here::here())
-
-Sys.setenv(AWS_ACCESS_KEY_ID=Sys.getenv("OSN_KEY"),
- AWS_SECRET_ACCESS_KEY=Sys.getenv("OSN_SECRET"))
-
-ignore_sigpipe()
-
-config <- yaml::read_yaml("challenge_configuration.yaml")
-
-endpoint <- config$endpoint
-
-s3 <- arrow::s3_bucket(dirname(config$scores_bucket),
- endpoint_override = endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-s3$CreateDir("inventory")
-s3$CreateDir("prov")
-s3$CreateDir("scores")
-
-Sys.setenv("AWS_EC2_METADATA_DISABLED"="TRUE")
-Sys.unsetenv("AWS_DEFAULT_REGION")
-
-s3_inv <- arrow::s3_bucket(paste0(config$inventory_bucket,"/catalog/forecasts/project_id=",
- config$project_id),
- endpoint_override = endpoint)
-
-variable_duration <- arrow::open_dataset(s3_inv) |>
- dplyr::distinct(variable, duration, project_id) |>
- dplyr::collect()
-
-future::plan("future::multisession", workers = n_cores)
-
-#future::plan("future::sequential", workers = n_cores)
-
-furrr::future_walk(1:nrow(variable_duration), function(k, variable_duration, config, endpoint){
-
- Sys.setenv(AWS_ACCESS_KEY_ID=Sys.getenv("OSN_KEY"),
- AWS_SECRET_ACCESS_KEY=Sys.getenv("OSN_SECRET"))
-
- variable <- variable_duration$variable[k]
- duration <- variable_duration$duration[k]
- project_id <- variable_duration$project_id[k]
-
- print(variable_duration[k,])
-
- s3_targets <- arrow::s3_bucket(glue::glue(config$targets_bucket,"/project_id={project_id}"),
- endpoint_override = endpoint)
- s3_scores <- arrow::s3_bucket(config$scores_bucket,
- endpoint_override = endpoint)
- s3_prov <- arrow::s3_bucket(config$prov_bucket,
- endpoint_override = endpoint)
- s3_inv <- arrow::s3_bucket(paste0(config$inventory_bucket,"/catalog/forecasts"),
- endpoint_override = endpoint)
-
- local_prov <- paste0(project_id,"-",duration,"-",variable, "-scoring_provenance.csv")
-
- if (!(local_prov %in% s3_prov$ls())) {
- prov_df <- dplyr::tibble(date = Sys.Date(),
- new_id = "start",
- model_id = "start",
- reference_date = "start",
- pub_date = "start")
- }else{
- path <- s3_prov$path(local_prov)
- prov_df <- arrow::read_csv_arrow(path)
- }
-
- s3_scores_path <- s3_scores$path(glue::glue("parquet/project_id={project_id}/duration={duration}/variable={variable}"))
-
- s3_targets <- arrow::s3_bucket(glue::glue(config$targets_bucket),
- endpoint_override = endpoint)
-
- target <- arrow::open_csv_dataset(s3_targets,
- schema = arrow::schema(
- project_id = arrow::string(),
- site_id = arrow::string(),
- datetime = arrow::timestamp(unit = "ns"), # timezone = "UTC"),
- duration = arrow::string(),
- #depth_m = arrow::float(), #project_specific
- variable = arrow::string(),
- observation = arrow::float()),
- skip = 1) |>
- dplyr::filter(variable == variable_duration$variable[k],
- duration == variable_duration$duration[k],
- project_id == variable_duration$project_id[k]) |>
- dplyr::collect()
-
- curr_variable <- variable
- curr_duration <- duration
- curr_project_id <- project_id
-
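-  # build one row per model_id x date for this variable/duration/project,
-  # collapsing the contributing reference and publication dates into
-  # comma-separated strings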
- groupings <- arrow::open_dataset(s3_inv,
- schema = arrow::schema(
- duration = arrow::string(),
- model_id = arrow::string(),
- site_id = arrow::string(),
- reference_date = arrow::date32(),
- variable = arrow::string(),
- date = arrow::date32(),
- project_id = arrow::string(),
- pub_date = arrow::date32(),
- path = arrow::string(),
- path_full = arrow::string(),
- path_summaries = arrow::string(),
- endpoint = arrow::string(),
- latitude = arrow::float(),
-                                    longitude = arrow::float()
- )) |>
- dplyr::filter(variable == curr_variable,
- duration == curr_duration,
- project_id == curr_project_id) |>
- dplyr::select(-site_id) |>
- dplyr::collect() |>
- dplyr::distinct() |>
- dplyr::filter(date > Sys.Date() - lubridate::days(past_days),
- date <= lubridate::as_date(max(target$datetime))) |>
- dplyr::group_by(model_id, date, duration, path, endpoint) |>
- dplyr::arrange(reference_date, pub_date) |>
- dplyr::summarise(reference_date = paste(unique(reference_date), collapse=","),
- pub_date = paste(unique(pub_date), collapse=","),
- .groups = "drop")
-
- if(nrow(groupings) > 0){
-
- new_prov <- purrr::map_dfr(1:nrow(groupings), function(j, groupings, prov_df, s3_scores_path, curr_variable){
-
- group <- groupings[j,]
- ref <- group$date
-
- tg <- target |>
- #dplyr::mutate(depth_m = ifelse(!is.na(depth_m), round(depth_m, 2), depth_m)) |> #project_specific
- dplyr::filter(lubridate::as_date(datetime) >= ref,
- lubridate::as_date(datetime) < ref+lubridate::days(1))
-
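-      # hash the grouping together with its matching targets; if either the
-      # forecast set or the observations change, the id changes and the date
-      # is re-scored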
- id <- rlang::hash(list(group[, c("model_id","reference_date","date","pub_date")], tg))
-
- if (!(score4cast:::prov_has(id, prov_df, "new_id"))){
-
- print(group)
-
-
- reference_dates <- unlist(stringr::str_split(group$reference_date, ","))
-
- ref_upper <- (lubridate::as_date(ref)+lubridate::days(1))
- fc <- arrow::open_dataset(paste0("s3://anonymous@",group$path,"/model_id=",group$model_id,"?endpoint_override=",group$endpoint)) |>
- dplyr::filter(reference_date %in% reference_dates,
- lubridate::as_date(datetime) >= ref,
- lubridate::as_date(datetime) < ref_upper) |>
- dplyr::collect()
-
- fc |>
- #dplyr::mutate(depth_m = ifelse(!is.na(depth_m), round(depth_m, 2), depth_m)) |> #project_specific
- dplyr::mutate(variable = curr_variable,
- project_id = curr_project_id) |>
- #If for some reason, a forecast has multiple values for a parameter from a specific forecast, then average
- dplyr::summarise(prediction = mean(prediction), .by = dplyr::any_of(c("site_id", "datetime", "reference_datetime", "family",
- "parameter", "pub_datetime", "reference_date", "variable", "project_id"))) |>
- #score4cast::crps_logs_score(tg, extra_groups = c("depth_m","project_id")) |> #project_specific
- score4cast::crps_logs_score(tg, extra_groups = c("project_id")) |> #project_specific
- dplyr::mutate(date = group$date,
- model_id = group$model_id) |>
- dplyr::select(-variable,-project_id) |>
- arrow::write_dataset(s3_scores_path,
- partitioning = c("model_id", "date"))
-
- curr_prov <- dplyr::tibble(date = Sys.Date(),
- new_id = id,
- model_id = group$model_id,
- reference_date = group$reference_date,
- pub_date = group$pub_date)
- }else{
- curr_prov <- NULL
- }
- },
- groupings, prov_df, s3_scores_path,curr_variable)
-
- prov_df <- dplyr::bind_rows(prov_df, new_prov)
- arrow::write_csv_arrow(prov_df, s3_prov$path(local_prov))
- }
-},
-variable_duration, config, endpoint
-)
diff --git a/submission_processing/process_submissions.R b/submission_processing/process_submissions.R
deleted file mode 100644
index 61551d4e6c..0000000000
--- a/submission_processing/process_submissions.R
+++ /dev/null
@@ -1,259 +0,0 @@
-
-library(read4cast)
-library(score4cast)
-library(readr)
-library(dplyr)
-library(arrow)
-library(glue)
-library(here)
-library(minioclient)
-library(tools)
-library(fs)
-library(stringr)
-library(lubridate)
-source("R/eco4cast-helpers/forecast_output_validator.R")
-
-install_mc()
-
-config <- yaml::read_yaml("challenge_configuration.yaml")
-
-sites <- readr::read_csv(config$site_table,
- show_col_types = FALSE) |>
- select(site_id, latitude, longitude)
-
-minioclient::mc_alias_set("s3_store",
- config$endpoint,
- Sys.getenv("OSN_KEY"),
- Sys.getenv("OSN_SECRET"))
-
-minioclient::mc_alias_set("submit",
- config$submissions_endpoint,
- Sys.getenv("AWS_ACCESS_KEY_SUBMISSIONS"),
- Sys.getenv("AWS_SECRET_ACCESS_KEY_SUBMISSIONS"))
-
-message(paste0("Starting Processing Submissions ", Sys.time()))
-
-local_dir <- file.path(here::here(), "submissions")
-unlink(local_dir, recursive = TRUE)
-fs::dir_create(local_dir)
-
-
-## see if there are any files to download and process
-submit_files <- minioclient::mc_ls(target = fs::path("submit", config$submissions_bucket, config$project_id),
- recursive = TRUE,
- details = TRUE)
-
-if(nrow(submit_files) > 0){
- message("Downloading forecasts ...")
-
- minioclient::mc_mirror(from = fs::path("submit", config$submissions_bucket, config$project_id),
- to = local_dir)
-
- submissions <- fs::dir_ls(local_dir,
- recurse = TRUE,
- type = "file")
- submissions_filenames <- basename(submissions)
- print(submissions)
-
- if(length(submissions) > 0){
-
- Sys.unsetenv("AWS_DEFAULT_REGION")
- Sys.unsetenv("AWS_S3_ENDPOINT")
- Sys.setenv(AWS_EC2_METADATA_DISABLED="TRUE")
-
- s3 <- arrow::s3_bucket(config$forecasts_bucket,
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
- s3_scores <- arrow::s3_bucket(file.path(config$scores_bucket,"parquet"),
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-
- s3_inventory <- arrow::s3_bucket(dirname(config$inventory_bucket),
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
- s3_inventory$CreateDir(paste0("inventory/catalog/forecasts/project_id=", config$project_id))
-
- s3_inventory <- arrow::s3_bucket(paste0(config$inventory_bucket,
- "/catalog/forecasts/project_id=",
- config$project_id),
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
- inventory_df <- arrow::open_dataset(s3_inventory) |> dplyr::collect()
-
- time_stamp <- format(Sys.time(), format = "%Y%m%d%H%M%S")
-
- print(inventory_df)
-
- for(i in 1:length(submissions)){
-
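-      # submission filenames are assumed to follow theme-YYYY-MM-DD-model_id.ext:
-      # element 1 is the theme, elements 2:4 are the reference date, and
-      # element 5 is the model_id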
- curr_submission <- basename(submissions[i])
- theme <- stringr::str_split(curr_submission, "-")[[1]][1]
- file_name_model_id <- stringr::str_split(tools::file_path_sans_ext(tools::file_path_sans_ext(curr_submission)), "-")[[1]][5]
- file_name_reference_datetime <- lubridate::as_datetime(paste0(stringr::str_split(curr_submission, "-")[[1]][2:4], collapse = "-"))
- submission_dir <- dirname(submissions[i])
- print(curr_submission)
-
- if((tools::file_ext(curr_submission) %in% c("gz", "csv", "nc"))){
-
- valid <- forecast_output_validator(file.path(local_dir, curr_submission))
-
- if(valid){
-
-        # still OK to use read4cast as there isn't challenge-specific code
-        # in the package, other than the list of all potential target variables,
-        # which could be updated if we forecast new variables (for usgsrc4cast we only forecast chla)
- fc <- read4cast::read_forecast(submissions[i])
-
- pub_datetime <- strftime(Sys.time(), format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
-
- if(!"duration" %in% names(fc)){
- # if(theme == "terrestrial_30min"){
- # fc <- fc |> dplyr::mutate(duration = "PT30M")
- # }else if(theme %in% c("ticks","beetles")){
- # fc <- fc |> dplyr::mutate(duration = "P1W")
- # }else if(theme %in% c("aquatics","phenology","terrestrial_daily")){
- # fc <- fc |> dplyr::mutate(duration = "P1D")
- # }else{
- # if(stringr::str_detect(fc$datetime[1], ":")){
- # fc <- fc |> dplyr::mutate(duration = "P1H")
- # }else{
- fc <- fc |> dplyr::mutate(duration = "P1D") # currently only have "P1D" duration for usgsrc4cast
- # }
- }
-
-
- if(!("model_id" %in% colnames(fc))){
- fc <- fc |> mutate(model_id = file_name_model_id)
- }else if(fc$model_id[1] == "null"){
- fc <- fc |> mutate(model_id = file_name_model_id)
- }
-
-
- if(!("reference_datetime" %in% colnames(fc))){
- fc <- fc |> mutate(reference_datetime = file_name_reference_datetime)
- }
-
- fc <- fc |>
- dplyr::mutate(pub_datetime = lubridate::as_datetime(pub_datetime),
- datetime = lubridate::as_datetime(datetime),
- reference_datetime = lubridate::as_datetime(reference_datetime),
- reference_date = lubridate::as_date(reference_datetime),
- parameter = as.character(parameter),
- project_id = config$project_id) |>
- dplyr::filter(datetime >= reference_datetime)
-
- print(head(fc))
- s3$CreateDir(paste0("parquet/"))
- fc |> arrow::write_dataset(s3$path(paste0("parquet")), format = 'parquet',
- partitioning = c("project_id",
- "duration",
- "variable",
- "model_id",
- "reference_date"))
-
- s3$CreateDir(paste0("summaries"))
- fc |>
- dplyr::summarise(prediction = mean(prediction),
- .by = dplyr::any_of(c("site_id", "datetime",
- "reference_datetime", "family",
- "depth_m", "duration", "model_id",
- "parameter", "pub_datetime",
- "reference_date", "variable", "project_id"))) |>
- score4cast::summarize_forecast(extra_groups = c("duration", "project_id", "depth_m")) |>
- dplyr::mutate(reference_date = lubridate::as_date(reference_datetime)) |>
- arrow::write_dataset(s3$path("summaries"), format = 'parquet',
- partitioning = c("project_id",
- "duration",
- "variable",
- "model_id",
- "reference_date"))
-
- bucket <- config$forecasts_bucket
- curr_inventory <- fc |>
- mutate(reference_date = lubridate::as_date(reference_datetime),
- date = lubridate::as_date(datetime),
- pub_date = lubridate::as_date(pub_datetime)) |>
- distinct(duration, model_id, site_id, reference_date, variable, date, project_id, pub_date) |>
- mutate(path = glue::glue("{bucket}/parquet/project_id={project_id}/duration={duration}/variable={variable}"),
- path_full = glue::glue("{bucket}/parquet/project_id={project_id}/duration={duration}/variable={variable}/model_id={model_id}/reference_date={reference_date}/part-0.parquet"),
- path_summaries = glue::glue("{bucket}/summaries/project_id={project_id}/duration={duration}/variable={variable}/model_id={model_id}/reference_date={reference_date}/part-0.parquet"),
- endpoint =config$endpoint)
-
-
- curr_inventory <- dplyr::left_join(curr_inventory, sites, by = "site_id")
-
- inventory_df <- dplyr::bind_rows(inventory_df, curr_inventory)
-
- arrow::write_dataset(inventory_df, path = s3_inventory)
-
- submission_timestamp <- paste0(submission_dir,"/T", time_stamp, "_", basename(submissions[i]))
- fs::file_copy(submissions[i], submission_timestamp)
- raw_bucket_object <- paste0("s3_store/",
- config$forecasts_bucket,
- "/raw/project_id=", config$project_id, "/",
- basename(submission_timestamp))
-
- minioclient::mc_cp(submission_timestamp, paste0(dirname(raw_bucket_object),"/", basename(submission_timestamp)))
-
- if(length(minioclient::mc_ls(raw_bucket_object)) > 0){
- minioclient::mc_rm(file.path("submit",
- config$submissions_bucket,
- config$project_id,
- curr_submission))
- }
-
- rm(fc)
- gc()
-
- } else {
-
- submission_timestamp <- paste0(submission_dir,"/T", time_stamp, "_", basename(submissions[i]))
- fs::file_copy(submissions[i], submission_timestamp)
- raw_bucket_object <- paste0("s3_store/",
- config$forecasts_bucket,
- "/raw/project_id=", config$project_id, "/",
- basename(submission_timestamp))
-
- minioclient::mc_cp(submission_timestamp, paste0(dirname(raw_bucket_object),"/", basename(submission_timestamp)))
-
- if(length(minioclient::mc_ls(raw_bucket_object)) > 0){
- minioclient::mc_rm(file.path("submit",
- config$submissions_bucket,
- config$project_id,
- curr_submission))
- }
-
- }
- }
- }
-
- message("writing inventory")
-
- arrow::write_dataset(inventory_df, path = s3_inventory)
-
- s3_inventory <- arrow::s3_bucket(paste0(config$inventory_bucket),
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
- inventory_df |> dplyr::distinct(model_id, project_id) |>
- arrow::write_csv_arrow(s3_inventory$path("model_id/model_id-project_id-inventory.csv"))
-
- }
-
- unlink(local_dir, recursive = TRUE)
-
- message(paste0("Completed Processing Submissions ", Sys.time()))
-}else{
- message("No submitted files to process")
-}
-
-
diff --git a/targets/Dockerfile b/targets/Dockerfile
deleted file mode 100644
index f34c3fafd8..0000000000
--- a/targets/Dockerfile
+++ /dev/null
@@ -1,21 +0,0 @@
-FROM eco4cast/rocker-neon4cast:latest
-
-# Import GitHub Secret
-ARG GITHUB_PAT
-ENV GITHUB_PAT=$GITHUB_PAT
-
-# Declares build arguments
-# ARG NB_USER
-# ARG NB_UID
-
-# COPY --chown=${NB_USER} . ${HOME}
-
-#USER ${NB_USER}
-RUN install2.r --error \
- clustermq \
- dataRetrieval \
- tarchetypes \
- targets \
- && rm -rf /tmp/downloaded_packages /tmp/*.rds /tmp/Rtmp*
-
-COPY cron.sh /etc/services.d/cron/run
diff --git a/targets/_targets.R b/targets/_targets.R
deleted file mode 100644
index c8fdce01aa..0000000000
--- a/targets/_targets.R
+++ /dev/null
@@ -1,160 +0,0 @@
-library(targets)
-
-options(tidyverse.quiet = TRUE,
- clustermq.scheduler = "multicore")
-
-# set package needs
-tar_option_set(packages = c("dataRetrieval",
- "tidyverse"))
-
-source("src/download_nwis_data.R")
-source("src/s3_utils.R")
-
-# End this file with a list of target objects.
-list(
-
- tar_target(
- config_file,
- "../challenge_configuration.yaml",
- format = "file"
- ),
-
- tar_target(
- config,
- yaml::read_yaml(config_file)
- ),
-
- tar_target(
- site_list_file,
- "in/FY23_ecological_forecast_challenge_USGS_sites.csv",
- format = "file"
- ),
-
- tar_target(
- site_list_id,
- read_csv(site_list_file) %>%
- filter(include_in_challenge == "yes") %>%
- pull(NWIS_site_no)
- ),
-
- tar_target(
- metadata,
- {
- out_file <- "out/USGS_site_metadata.csv"
- whatNWISsites(sites = site_list_id) %>%
- tibble() %>%
- mutate(site_id = paste(agency_cd, site_no, sep = "-"),
- project_id = "usgsrc4cast",
- site_url = paste0("https://waterdata.usgs.gov/monitoring-location/", site_no)) %>%
- relocate(site_id, project_id) %>%
- relocate(site_url, .before = colocated) %>%
- rename(latitude = dec_lat_va,
- longitude = dec_long_va) %>%
- write_csv(file = out_file)
- return(out_file)
- }
- ),
-
- tar_target(
- start_date,
- # Sys.Date() - 2
- as.Date("2000-01-01")
- # {
- # max_date_per_site
- # },
- # pattern = map(historic_data)
- ),
-
- tar_target(
- end_date,
- Sys.Date()
- ),
-
- tar_target(
- char_names_yml,
- "in/characteristic_names.yml",
- format = "file"
- ),
-
- tar_target(
- char_names,
- yaml::read_yaml(char_names_yml)
- ),
-
- tar_target(
- pcodes_yml,
- "in/pcodes.yml",
- format = "file"
- ),
-
- tar_target(
- pcodes,
- yaml::read_yaml(pcodes_yml)
- ),
-
- tar_target(
- historic_data_rds,
- download_historic_data(sites = site_list_id,
- start_date = start_date,
- end_date = end_date,
- pcodes = pcodes,
- service = "dv", # dv is daily values
- statCd = "00003", # 00003 is mean
- out_file = "out/historic_data.rds"),
- format = "file"
- ),
-
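-  # some sites have no daily-value (dv) records at all; the next two targets
-  # fall back to instantaneous (uv) data for those sites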
- tar_target(
- sites_without_dv,
- {
- sites_with_dv = read_rds(historic_data_rds)
- site_list_id[!site_list_id %in% sites_with_dv$site_no]
- }
- ),
-
- tar_target(
- uv_historic_data_rds,
- download_historic_uv_data(sites = sites_without_dv,
- start_date = start_date,
- end_date = end_date,
- pcodes = pcodes,
- service = "uv",
- out_file = "out/historic_uv_data.rds"),
- format = "file"
- ),
-
- tar_target(
- all_historic_data_csv,
- {
- dv <- read_rds(historic_data_rds)
- uv <- read_rds(uv_historic_data_rds)
- out_file <- "out/USGS_chl_data.csv"
- out <- bind_rows(dv, uv) %>%
- rename(datetime = dateTime,
- site_id = site_no,
- observation = chl_ug_L) %>%
- mutate(variable = "chla",
- site_id = paste0("USGS-", site_id),
- project_id = "usgsrc4cast",
- duration = "P1D") %>%
- select(project_id, site_id, datetime,
- duration, variable, observation)
- write_csv(out, file = out_file)
- return(out_file)
- },
- format = "file"
- ),
-
- tar_target(
- push_to_targets_s3,
- push_to_s3(
- config = config,
- local_file_name = all_historic_data_csv,
- s3_file_name = config$targets_file_name)
- )
-
-)
-
-
-
-
diff --git a/targets/aquatics_targets.R b/targets/aquatics_targets.R
deleted file mode 100644
index b507c5a9e1..0000000000
--- a/targets/aquatics_targets.R
+++ /dev/null
@@ -1,797 +0,0 @@
-message(paste0("Creating Aquatics Targets at ", Sys.time()))
-
-Sys.unsetenv("AWS_DEFAULT_REGION")
-Sys.unsetenv("AWS_S3_ENDPOINT")
-Sys.setenv("AWS_EC2_METADATA_DISABLED"="TRUE")
-
-Sys.setenv(TZ = 'UTC')
-## 02_generate_targets_aquatics
-## Process the raw data into the target variable product
-library(neonstore)
-library(tidyverse)
-library(lubridate)
-library(contentid)
-library(sparklyr)
-library(sparkavro)
-source('R/avro_functions.R')
-source('R/data_processing.R')
-# spark_install(version = '3.0')
-
-`%!in%` <- Negate(`%in%`) # not in function
-
-avro_file_directory <- "/home/rstudio/data/aquatic_avro"
-parquet_file_directory <- "/home/rstudio/data/aquatic_parquet"
-EDI_file_directory <- "/home/rstudio/data/aquatic_EDI"
-
-readRenviron("~/.Renviron") # compatible with littler
-Sys.setenv("NEONSTORE_HOME" = "/home/rstudio/data/neonstore")
-Sys.getenv("NEONSTORE_DB")
-
-#temporary aquatic repo during test of new workflow
-site_data <- readr::read_csv("https://raw.githubusercontent.com/eco4cast/neon4cast-targets/main/NEON_Field_Site_Metadata_20220412.csv")
-aq_sites <- site_data |> filter(aquatics == 1) |> pull(field_site_id)
-
-sites <- readr::read_csv("NEON_Field_Site_Metadata_20220412.csv") |>
- dplyr::filter(aquatics == 1)
-
-nonwadable_rivers <- sites$field_site_id[(which(sites$field_site_subtype == "Non-wadeable River"))]
-lake_sites <- sites$field_site_id[(which(sites$field_site_subtype == "Lake"))]
-stream_sites <- sites$field_site_id[(which(sites$field_site_subtype == "Wadeable Stream"))]
-profiling_sites <- c('CRAM', 'LIRO', 'BARC', 'TOOK')
-
-df <- neonstore:::neon_data(product = "DP1.20288.001",
- # start_date = "2023-01-01",
- # end_date = "2023-10-01",
- type="basic")
-
-urls <- df |>
- dplyr::filter(grepl("waq_instantaneous", name)) |>
- dplyr::pull(url)
-
-wq_portal <- duckdbfs::open_dataset(urls, format="csv", filename = TRUE) |>
- dplyr::mutate(siteID = stringr::str_sub(filename, 77,80)) |>
- dplyr::select(siteID, startDateTime, sensorDepth,
-                dissolvedOxygen, dissolvedOxygenFinalQF,
-                chlorophyll, chlorophyllFinalQF,
- chlaRelativeFluorescence, chlaRelFluoroFinalQF) %>%
- dplyr::mutate(sensorDepth = as.numeric(sensorDepth),
- dissolvedOxygen = as.numeric(dissolvedOxygen),
- chla = as.numeric(chlorophyll),
- chla_RFU = as.numeric(chlaRelativeFluorescence),
- startDateTime = as_datetime(startDateTime),
- time = as_date(startDateTime)) %>%
- # sites that are not profiling do not have accurate depths - set to 0.5
- dplyr::mutate(sensorDepth = ifelse(siteID %in% profiling_sites, sensorDepth,
- 0.5)) |>
- dplyr::filter(sensorDepth > 0 & sensorDepth < 1) |>
- dplyr::mutate(dissolvedOxygen = ifelse(dissolvedOxygenFinalQF == 1, NA, dissolvedOxygen),
- chla = ifelse(chlorophyllFinalQF == 1, NA, chla)) |>
- dplyr::rename(site_id = siteID) |>
- dplyr::group_by(site_id, time) %>%
- dplyr::summarize(oxygen = mean(dissolvedOxygen, na.rm = TRUE),
- chla = mean(chla, na.rm = TRUE),
- chla_RFU = mean(chla_RFU, na.rm = TRUE), .groups = "drop") %>%
- dplyr::select(time, site_id,
- oxygen, chla, chla_RFU) %>%
- pivot_longer(cols = -c("time", "site_id"), names_to = "variable", values_to = "observation") %>%
- dplyr::filter(!((variable == "chla" & site_id %in% stream_sites) |
- (variable == "chla_RFU" & site_id %in% stream_sites))) |>
- collect()
-
-#====================================================#
-##### low latency WQ data =======
-message("# download the 24/48hr pre_release data from the Google Cloud")
-
-# where should these files be saved?
-
-fs::dir_create(file.path(avro_file_directory,"DP1.20288.001")) # ignores existing directories unlike dir.create()
-
-# need to figure out which month's data are required
-# what is in the NEON store db?
-cur_wq_month <- wq_portal %>%
- group_by(site_id) %>%
- summarise(cur_wq_date = as.Date(max(time)),
- new_date = ceiling_date(max(time), unit = 'month'))
-
-
-# Download any new files from the Google Cloud
-download.neon.avro(months = cur_wq_month,
- data_product = '20288', # WQ data product
- path = file.path(avro_file_directory,"DP1.20288.001"))
-
-# Delete superseded files
-# Files that have been superseded by the NEON store files can be deleted from the relevant repository
-# Look in each repository to see if there are files that exceed the current maximum date of the NEON
-# store data
-delete.neon.parquet(months = cur_wq_month,
- path = file.path(parquet_file_directory, "wq"),
- data_product = '20288')
-
-delete.neon.avro(months = cur_wq_month,
- path = file.path(avro_file_directory,"DP1.20288.001"),
- data_product = '20288')
-
-# The variables (term names that should be kept)
-wq_vars <- c('siteName',
- 'startDate',
- 'sensorDepth',
- 'dissolvedOxygen',
- 'dissolvedOxygenExpUncert',
- 'dissolvedOxygenFinalQF',
- 'chlorophyll',
- 'chlorophyllExpUncert',
- 'chlorophyllFinalQF',
- 'chlaRelativeFluorescence',
- 'chlaRelFluoroFinalQF')
-columns_keep <- c('siteName', 'termName', 'startDate', 'Value', 'verticalIndex')
-
-# Generate a list of files to be read
-wq_avro_files <- list.files(path = file.path(avro_file_directory, 'DP1.20288.001'),
- pattern = '*20288',
- recursive = T, full.names = T)
-
-wq_parquet_files <- list.files(path = file.path(parquet_file_directory, "wq"))
-
-new_files <- map_lgl(wq_avro_files, function(x){
- new_file <- TRUE
- if(basename(x) %in% tools::file_path_sans_ext(wq_parquet_files)){
- new_file <- FALSE
- }
- return(new_file)
-})
-
-wq_avro_files <- wq_avro_files[which(new_files)]
-
-if(length(wq_avro_files) > 0){
- sc <- sparklyr::spark_connect(master = "local")
- # Read in each of the files and then bind by rows
- purrr::walk(.x = wq_avro_files, ~ read.avro.wq(sc= sc,
- path = .x,
- columns_keep = columns_keep,
- dir = file.path(parquet_file_directory, "wq")))
- spark_disconnect(sc)
-}
-
-
-wq_pre_release <- arrow::open_dataset(file.path(parquet_file_directory, "wq")) |>
- collect()
-
-# Combine the avro files with the portal data
-wq_full <- dplyr::bind_rows(wq_portal, wq_pre_release) %>%
- dplyr::arrange(site_id, time)
-
-wq_full <- wq_full |>
- group_by(site_id, time, variable) |>
- summarise(observation = mean(observation, na.rm = TRUE), .groups = "drop")
-
-#==============================#
-
-message("##### WQ QC protocol =======")
-# additional QC steps implemented (FO, 2022-07-13)
-##### check 1 Gross range tests on DO and chlorophyll
-# DO ranges for each sensor and each season
-DO_max <- 15 # gross max
-DO_min <- 2 # gross min
-
-# chlorophyll ranges
-chla_max <- 200
-chla_min <- 0
-
-# The gross range (GR) flag will be true if either the DO concentration or the chlorophyll is
-# outside the ranges specified above
-
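-# for the site/date windows below, the chlorophyll-a record is replaced with
-# the relative-fluorescence (chla_RFU) record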
-wq_cleaned <- wq_full |>
- tidyr::pivot_wider(names_from = variable,
- values_from = observation,
- id_cols = c(time, site_id)) |>
- dplyr::mutate(chla = ifelse(site_id == "BARC" & (lubridate::as_date(time) >= lubridate::as_date("2021-09-21") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla),
- chla = ifelse(site_id == "BLWA" & (lubridate::as_date(time) >= lubridate::as_date("2021-09-15") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla),
- chla = ifelse(site_id == "CRAM" & (lubridate::as_date(time) >= lubridate::as_date("2022-04-01") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla),
- chla = ifelse(site_id == "FLNT" & (lubridate::as_date(time) >= lubridate::as_date("2021-11-09") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla),
- chla = ifelse(site_id == "LIRO" & (lubridate::as_date(time) >= lubridate::as_date("2022-04-01") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla),
- chla = ifelse(site_id == "PRLA" & (lubridate::as_date(time) >= lubridate::as_date("2022-04-01") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla),
- chla = ifelse(site_id == "PRPO" & (lubridate::as_date(time) >= lubridate::as_date("2022-04-01") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla),
- chla = ifelse(site_id == "SUGG" & (lubridate::as_date(time) >= lubridate::as_date("2021-09-21") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla),
- chla = ifelse(site_id == "TOMB" & (lubridate::as_date(time) >= lubridate::as_date("2021-09-21") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla),
- chla = ifelse(site_id == "TOOK" & (lubridate::as_date(time) >= lubridate::as_date("2022-04-01") &
- lubridate::as_date(time) <= lubridate::as_date("2023-03-16")),
- chla_RFU,
- chla)) |>
- tidyr::pivot_longer(cols = -c("time", "site_id"), names_to = 'variable', values_to = 'observation') |>
- dplyr::filter(variable != 'chla_RFU') %>%
- dplyr::mutate(observation = ifelse(is.na(observation),
- observation, ifelse(observation >= DO_min & observation <= DO_max & variable == 'oxygen',
- observation, ifelse(observation >= chla_min & observation <= chla_max & variable == 'chla', observation, NA)))) %>%
- # manual cleaning based on visual inspection
- dplyr::mutate(observation = ifelse(site_id == "MAYF" &
- between(time, ymd("2019-01-20"), ymd("2019-02-05")) &
- variable == "oxygen", NA, observation),
- observation = ifelse(site_id == "WLOU" &
- !between(observation, 7.5, 11) &
- variable == "oxygen", NA, observation),
- observation = ifelse(site_id == "BARC" &
- observation < 4 &
- variable == "oxygen", NA, observation),
- observation = ifelse(site_id == "BLDE" &
- between(time, ymd("2020-07-01"), ymd("2020-12-31")) &
- variable == "oxygen", NA, observation),
- observation = ifelse(site_id == "BIGC" &
- between(time, ymd("2021-10-25"), ymd("2021-10-27")) &
- variable == "oxygen", NA, observation),
- observation = ifelse(site_id == "REDB" &
- time == ymd("2022-04-28") &
- variable == "oxygen", NA, observation))
-#===============================================#
-message("#### Generate hourly temperature profiles for lake #############")
-message("##### NEON portal data #####")
-
-df <- neonstore:::neon_data(product = "DP1.20264.001",
- # start_date = "2023-06-01",
- # end_date = "2023-08-01",
- type="basic",
- site = lake_sites)
-
-urls <- df |>
- dplyr::filter(grepl("TSD_30_min", name)) |>
- dplyr::pull(url)
-
-
-hourly_temp_profile_portal <- duckdbfs::open_dataset(urls, format="csv", filename = TRUE) |>
- dplyr::mutate(site_id = stringr::str_sub(filename, 77,80),
- verticalPosition = stringr::str_sub(filename, 153,155)) |>
- dplyr::select(startDateTime, site_id, tsdWaterTempMean, thermistorDepth, tsdWaterTempFinalQF, verticalPosition) |>
- dplyr::mutate(tsdWaterTempMean = as.numeric(tsdWaterTempMean),
- thermistorDepth = as.numeric(thermistorDepth),
- tsdWaterTempFinalQF = as.numeric(tsdWaterTempFinalQF),
- verticalPosition = as.numeric(verticalPosition)) |>
- dplyr::mutate(tsdWaterTempMean = ifelse(tsdWaterTempFinalQF == 1, NA, tsdWaterTempMean)) %>%
- dplyr::rename(depth = thermistorDepth) |>
- dplyr::mutate(date = as_date(startDateTime),
- hour = str_pad(hour(startDateTime), width = 2, side = "left", pad = "0"),
- depth = round(depth, 1)) %>% # round to the nearest 0.1 m
- dplyr::summarize(temperature = mean(tsdWaterTempMean, na.rm = TRUE),.by = c("site_id", "depth", "date", "hour")) %>%
- dplyr::select(date, hour, site_id, temperature, depth) |>
- rename(observation = temperature) |>
- mutate(variable = "temperature",
- time = as_datetime(paste0(date, " ",hour, ":00:00"))) |>
- select(-date, - hour) |>
- collect() |>
- QC.temp(range = c(-5, 40), spike = 5, by.depth = T) %>%
- mutate(data_source = 'NEON_portal')
-
-message("##### Sonde EDI data #####")
-# Only 6 lake sites available on EDI
-edi_url_lake <- c("https://pasta.lternet.edu/package/data/eml/edi/1071/1/7f8aef451231d5388c98eef889332a4b",
- "https://pasta.lternet.edu/package/data/eml/edi/1071/1/2c8893684d94b9a52394060a76cab798",
- "https://pasta.lternet.edu/package/data/eml/edi/1071/1/770e2ab9d957991a787a2f990d5a2fad",
- "https://pasta.lternet.edu/package/data/eml/edi/1071/1/2e52d63ba4dc2040d1e5e2d11114aa93",
- "https://pasta.lternet.edu/package/data/eml/edi/1071/1/60df35a34bb948c0ca5e5556d129aa98",
- "https://pasta.lternet.edu/package/data/eml/edi/1071/1/004857d60d6fe7587b112d714e0380d0")
-lake_edi_profile <- c("NEON.D03.BARC.DP0.20005.001.01378.csv",
- "NEON.D05.CRAM.DP0.20005.001.01378.csv",
- "NEON.D05.LIRO.DP0.20005.001.01378.csv",
- "NEON.D09.PRLA.DP0.20005.001.01378.csv",
- "NEON.D09.PRPO.DP0.20005.001.01378.csv",
- "NEON.D03.SUGG.DP0.20005.001.01378.csv")
-
-fs::dir_create(EDI_file_directory) # ignores existing directories unlike dir.create()
-# Download the data
-
-for(i in 1:length(edi_url_lake)){
- if (!file.exists(file.path(EDI_file_directory, lake_edi_profile[i]))) {
- if (!dir.exists(dirname(file.path(EDI_file_directory,
- lake_edi_profile[i])))) {
- dir.create(dirname(file.path(EDI_file_directory,
- lake_edi_profile[i])))
- }
- download.file(edi_url_lake[i], destfile = file.path(EDI_file_directory, lake_edi_profile[i]))
- }
-}
-
-
-# List all the files in the EDI directory
-edi_data <- list.files(file.path(EDI_file_directory), full.names = T)
-# Get the lake sites subset
-edi_lake_files <- c(edi_data[grepl(x = edi_data, pattern= lake_sites[1])],
- edi_data[grepl(x = edi_data, pattern= lake_sites[2])],
- edi_data[grepl(x = edi_data, pattern= lake_sites[3])],
- edi_data[grepl(x = edi_data, pattern= lake_sites[4])],
- edi_data[grepl(x = edi_data, pattern= lake_sites[5])],
- edi_data[grepl(x = edi_data, pattern= lake_sites[6])])
-
-# Calculate the hourly average profile
-hourly_temp_profile_EDI <- purrr::map_dfr(.x = edi_lake_files, ~ read.csv(file = .x)) %>%
- rename('site_id' = siteID,
- 'depth' = sensorDepth,
- 'observation' = waterTemp) %>%
- mutate(startDate = lubridate::ymd_hm(startDate),
- time = lubridate::ymd_h(format(startDate, '%Y-%m-%d %H')),
- depth = round(depth, digits = 1)) %>%
- group_by(site_id, time, depth) %>%
- summarise(observation = mean(observation),.groups = "drop") %>%
- mutate(variable = "temperature") %>%
- # include first QC of data
- QC.temp(range = c(-5, 40), spike = 5, by.depth = T) %>%
- mutate(data_source = 'MS_raw')
-
-message("##### avros data #####")
-message("# Download any new files from the Google Cloud")
-
-# need to figure out which data are required
-# what is in the NEON store db?
-cur_tsd_month <- hourly_temp_profile_portal %>%
- group_by(site_id) %>%
- summarise(cur_wq_date = as.Date(max(time)),
- new_date = as.Date(ceiling_date(max(time), unit = 'month')))
-
-# Download any new files from the Google Cloud
-download.neon.avro(months = cur_tsd_month,
- data_product = '20264', # TSD data product
- path = file.path(avro_file_directory,"DP1.20264.001"))
-# Start by deleting superseded files
-# Files that have been superseded by the NEON store files can be deleted from the relevant repository
-# Look in each repository to see if there are files that match the current maximum month of the NEON
-# store data
-
-delete.neon.parquet(months = cur_tsd_month,
- path = file.path(parquet_file_directory, "tsd"),
- data_product = '20264')
-
-delete.neon.avro(months = cur_tsd_month,
- path = file.path(avro_file_directory, "DP1.20264.001"),
- data_product = '20264')
-
-# The variables (term names that should be kept)
-tsd_vars <- c('siteName',
- 'startDate',
- 'tsdWaterTempMean',
- 'thermistorDepth',
- 'tsdWaterTempExpUncert',
- 'tsdWaterTempFinalQF')
-
-columns_keep <- c('siteName', 'termName', 'startDate', 'Value', 'verticalIndex')
-thermistor_depths <- readr::read_csv('thermistorDepths.csv', col_types = 'ccd')
-
-# Generate a list of files to be read
-tsd_avro_files <- paste0(avro_file_directory, '/',
- list.files(path = avro_file_directory,
- pattern = '*20264',
- recursive = T))
-
-lake_avro_files <- c(tsd_avro_files[grepl(x = tsd_avro_files, pattern= lake_sites[1])],
- tsd_avro_files[grepl(x = tsd_avro_files, pattern= lake_sites[2])],
- tsd_avro_files[grepl(x = tsd_avro_files, pattern= lake_sites[3])],
- tsd_avro_files[grepl(x = tsd_avro_files, pattern= lake_sites[4])],
- tsd_avro_files[grepl(x = tsd_avro_files, pattern= lake_sites[5])],
- tsd_avro_files[grepl(x = tsd_avro_files, pattern= lake_sites[6])],
- tsd_avro_files[grepl(x = tsd_avro_files, pattern= lake_sites[7])])
-
-tsd_parquet_files <- list.files(path = file.path(parquet_file_directory, "tsd"))
-
-new_files <- map_lgl(lake_avro_files, function(x){
- new_file <- TRUE
- if(basename(x) %in% tools::file_path_sans_ext(tsd_parquet_files)){
- new_file <- FALSE
- }
- return(new_file)
-})
-
-lake_avro_files <- lake_avro_files[which(new_files)]
-
-if(length(lake_avro_files) > 0){
- sc <- sparklyr::spark_connect(master = "local")
- message("# Read in each of the files and then bind by rows")
-  purrr::walk(.x = lake_avro_files,
- ~ read.avro.tsd.profile(sc= sc,
- path = .x,
- thermistor_depths = thermistor_depths,
- columns_keep = columns_keep,
- dir = file.path(parquet_file_directory, "tsd"),
- delete_files = FALSE))
- spark_disconnect(sc)
-}
-
-# Read in the pre-release
-hourly_temp_profile_prerelease <- arrow::open_dataset(file.path(parquet_file_directory, "tsd")) |>
- collect()
-
-# Combine the three data sources
-hourly_temp_profile_lakes <- bind_rows(hourly_temp_profile_portal, hourly_temp_profile_EDI, hourly_temp_profile_prerelease) %>%
- arrange(time, site_id, depth) %>%
- group_by(time, site_id, depth) %>%
- summarise(observation = mean(observation, na.rm = T), .groups = "drop") |>
- mutate(variable = "temperature") |>
- select(time, site_id, depth, variable, observation)
-#======================================================#
-
-message("#### Generate surface (< 1 m) temperature #############")
-message("###### Lake temperatures #####")
-# Daily surface lake temperatures generated from the hourly profiles created above
-
-daily_temp_surface_lakes <- hourly_temp_profile_lakes %>%
- dplyr::filter(depth <= 1) %>%
- mutate(time = lubridate::as_date(time)) %>%
- group_by(site_id, time) %>%
- summarise(observation = mean(observation, na.rm = T),.groups = "drop") %>%
- mutate(variable = 'temperature')
-
-message("##### Stream temperatures #####")
-
-
-df <- neonstore:::neon_data(product = "DP1.20053.001",
- # start_date = "2023-06-01",
- # end_date = "2023-08-01",
- type="basic",
- site = stream_sites)
-
-urls <- df |>
- dplyr::filter(grepl("TSW_30min", name)) |>
- dplyr::pull(url)
-
-temp_streams_portal <-
- duckdbfs::open_dataset(urls, format="csv", filename = TRUE) |>
- dplyr::mutate(site_id = stringr::str_sub(filename, 77,80),
- verticalPosition = stringr::str_sub(filename, 153,155),
- horizontalPosition = stringr::str_sub(filename, 149,151)) |>
- dplyr::filter(horizontalPosition == "101" |
- horizontalPosition == "111" | # take upstream to match WQ data
- (horizontalPosition == "112" & site_id == "BLUE"), # no data at BLUE upstream
- finalQF == 0) %>%
- dplyr::select(startDateTime, site_id, surfWaterTempMean, finalQF) %>%
- dplyr::mutate(time = as_date(startDateTime),
- surfWaterTempMean = as.numeric(surfWaterTempMean)) %>%
- # dplyr::group_by(time, site_id) %>%
- dplyr::summarize(temperature = mean(surfWaterTempMean, na.rm = TRUE), .by = c('time', 'site_id')) %>%
- dplyr::select(time, site_id, temperature) %>%
- rename(observation = temperature) |>
- mutate(variable = "temperature") |>
- collect()
-
-temp_streams_portal_QC <- temp_streams_portal %>%
- QC.temp(range = c(-5, 40), spike = 7, by.depth = F)
-#===========================================#
-message("##### Stream temperatures2 #####")
-#### avros
-
-# need to figure out which month's data are required
-# what is in the NEON store db?
-cur_prt_month <- temp_streams_portal_QC %>%
- group_by(site_id) %>%
- summarise(cur_wq_date = as.Date(max(time)),
- new_date = as.Date(ceiling_date(max(time), unit = 'month')))
-
-
-# what is the next month from this plus the current month? These might be the same
-# new_month_prt <- unique(format(c((as.Date(max(temp_streams_portal_QC$time)) %m+% months(1)), (Sys.Date() - days(2))), "%Y-%m"))
-
-# Download any new files from the Google Cloud
-download.neon.avro(months = cur_prt_month,
- data_product = '20053', # PRT data product
- path = file.path(avro_file_directory,"DP1.20053.001"))
-
-# Start by deleting superseded files
-# Files that have been superseded by the NEON store files can be deleted from the relevant repository
-# Look in each repository to see if there are files that match the current maximum month of the NEON
-# store data
-
-delete.neon.parquet(months = cur_prt_month,
- path = file.path(parquet_file_directory, "prt"),
- data_product = '20053')
-
-delete.neon.avro(months = cur_prt_month,
- path = file.path(avro_file_directory, "DP1.20053.001"),
- data_product = '20053')
-
-# The variables (term names that should be kept)
-prt_vars <- c('siteName',
- 'startDate',
- 'surfWaterTempMean',
- 'surfWaterTempExpUncert',
- 'finalQF')
-
-columns_keep <- c('siteName', 'termName', 'startDate', 'Value', 'verticalIndex')
-
-
-# Generate a list of files to be read
-prt_avro_files <- paste0(avro_file_directory, '/',
- list.files(path = avro_file_directory,
- pattern = '*20053',
- recursive = T))
-
-prt_parquet_files <- list.files(path = file.path(parquet_file_directory, "prt"))
-
-new_files <- map_lgl(prt_avro_files, function(x){
- new_file <- TRUE
- if(basename(x) %in% tools::file_path_sans_ext(prt_parquet_files)){
- new_file <- FALSE
- }
- return(new_file)
-})
-
-prt_avro_files <- prt_avro_files[which(new_files)]
-
-## check for bad NEON files and remove if present
-problem_files <- c('/home/rstudio/data/aquatic_avro/site=BIGC/BIGC_L0_to_L1_Surface_Water_Temperature_DP1.20053.001__2021-12-27.avro',
- '/home/rstudio/data/aquatic_avro/site=BIGC/BIGC_L0_to_L1_Surface_Water_Temperature_DP1.20053.001__2023-02-01.avro',
- '/home/rstudio/data/aquatic_avro/site=BIGC/BIGC_L0_to_L1_Surface_Water_Temperature_DP1.20053.001__2023-01-28.avro')
-
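-# the pattern match below flags every BIGC surface-water-temperature avro,
-# which covers (and supersedes) the explicit list above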
-problem_files <- map_lgl(prt_avro_files, function(x){
- problem_file <- FALSE
- if(stringr::str_detect(x, "BIGC_L0_to_L1_Surface_Water_Temperature_DP1.20053.001")){
- problem_file <- TRUE
- }
- return(problem_file)
-})
-
-if(any(problem_files == TRUE)){
- prt_avro_files <- prt_avro_files[!problem_files]
- message('Problem files removed')
-}
-
-
-if(length(prt_avro_files) > 0){
- sc <- sparklyr::spark_connect(master = "local")
- # Read in each of the files and then bind by rows
- purrr::walk(.x = prt_avro_files, ~ read.avro.prt(sc= sc,
- path = .x,
- columns_keep = columns_keep,
- dir = file.path(parquet_file_directory, "prt")))
- spark_disconnect(sc)
-}
-
-
-temp_streams_prerelease <- arrow::open_dataset(file.path(parquet_file_directory, "prt")) |>
- collect()
-
-#===============================================#
-
-message("##### River temperature ######")
-# For non-wadeable rivers need portal, EDI and avro data
-
-df <- neonstore:::neon_data(product = "DP1.20264.001",
- # start_date = "2023-06-01",
- # end_date = "2023-08-01",
- type="basic",
- site = nonwadable_rivers)
-
-urls <- df |>
- dplyr::filter(grepl("TSD_30_min", name)) |>
- dplyr::pull(url)
-
-
-temp_rivers_portal <- duckdbfs::open_dataset(urls, format="csv", filename = TRUE) |>
-  # extract the 4-character NEON site code from its fixed position in the file path
-  dplyr::mutate(site_id = stringr::str_sub(filename, 77, 80)) |>
- dplyr::mutate(depth = as.numeric(thermistorDepth),
- tsdWaterTempMean = as.numeric(tsdWaterTempMean),
- tsdWaterTempFinalQF = as.numeric(tsdWaterTempFinalQF)) %>%
- dplyr::select(startDateTime, site_id, tsdWaterTempMean, depth, tsdWaterTempFinalQF) %>%
- dplyr::filter(tsdWaterTempFinalQF == 0) %>%
- dplyr::mutate(time = as_date(startDateTime)) %>%
- dplyr::summarize(temperature = mean(tsdWaterTempMean, na.rm = TRUE), .by = c("time", "site_id")) %>%
- dplyr::select(time, site_id, temperature) %>%
- rename(observation = temperature) |>
- mutate(variable = "temperature") |>
- collect()
-
-temp_rivers_portal_QC <- temp_rivers_portal %>%
- QC.temp(range = c(-5, 40), spike = 7, by.depth = F)
-
-# EDI data
-edi_url_river <- c("https://pasta.lternet.edu/package/data/eml/edi/1185/1/fb9cf9ba62ee8e8cf94cb020175e9165",
- "https://pasta.lternet.edu/package/data/eml/edi/1185/1/fac068cff680ae28473c3e13dc75aa9f",
- "https://pasta.lternet.edu/package/data/eml/edi/1185/1/5567ad7252b598ee40f5653e7b732ff4" )
-
-river_edi_profile <- c("NEON.D03.FLNT.DP0.20005.001.01378.csv",
- "NEON.D03.BLWA.DP0.20005.001.01378.csv",
- "NEON.D03.TOMB.DP0.20005.001.01378.csv")
-
-for(i in seq_along(edi_url_river)){
- if (!file.exists(file.path(EDI_file_directory,river_edi_profile[i]))) {
- if (!dir.exists(dirname(file.path(EDI_file_directory,
- river_edi_profile[i])))) {
- dir.create(dirname(file.path(EDI_file_directory,
- river_edi_profile[i])))
- }
- download.file(edi_url_river[i], destfile = file.path(EDI_file_directory, river_edi_profile[i]))
- }
-}
-
-edi_data <- list.files(file.path(EDI_file_directory), full.names = T)
-
-edi_rivers <- c(edi_data[grepl(x = edi_data, pattern= nonwadable_rivers[1])],
- edi_data[grepl(x = edi_data, pattern= nonwadable_rivers[2])],
- edi_data[grepl(x = edi_data, pattern= nonwadable_rivers[3])])
-
-# The hourly data set is for the whole water column.
-temp_rivers_EDI <- purrr::map_dfr(.x = edi_rivers, ~ read.csv(file = .x)) %>%
- rename('site_id' = siteID,
- 'observation' = waterTemp) %>%
- mutate(startDate = lubridate::ymd_hm(startDate),
- time = as.Date(startDate)) %>%
- group_by(site_id, time) %>%
-  summarise(observation = mean(observation), .groups = "drop") %>%
- # include first QC of data
- QC.temp(range = c(-5, 40), spike = 5, by.depth = F) |>
- mutate(variable = "temperature")
-
-
-message('download non-wadeable river avros')
-cur_tsd_month <- temp_rivers_portal_QC %>%
- group_by(site_id) %>%
- summarise(cur_wq_date = as.Date(max(time)),
- new_date = as.Date(ceiling_date(max(time), unit = 'month')))
-
-# Download any new files from the Google Cloud
-download.neon.avro(months = cur_tsd_month,
- data_product = '20264', # TSD data product
- path = file.path(avro_file_directory, "DP1.20264.001"))
-
-message("Generate a list of nonwadable_rivers avro files to be read")
-tsd_avro_files <- list.files(path = avro_file_directory,
-                             pattern = '20264',
-                             recursive = TRUE,
-                             full.names = TRUE)
-
-river_avro_files <- c(tsd_avro_files[grepl(x = tsd_avro_files, pattern= nonwadable_rivers[1])],
- tsd_avro_files[grepl(x = tsd_avro_files, pattern= nonwadable_rivers[2])],
- tsd_avro_files[grepl(x = tsd_avro_files, pattern= nonwadable_rivers[3])])
-
-tsd_parquet_files <- list.files(path = file.path(parquet_file_directory, "river_tsd"))
-
-# Keep only avro files that have not already been converted to parquet
-new_files <- !basename(river_avro_files) %in% tools::file_path_sans_ext(tsd_parquet_files)
-
-river_avro_files <- river_avro_files[new_files]
-
-
-if(length(river_avro_files) > 0){
- sc <- sparklyr::spark_connect(master = "local")
- # Read in each of the files and then bind by rows
- purrr::walk(.x = river_avro_files, ~ read.avro.tsd(sc= sc,
- path = .x,
- thermistor_depths = thermistor_depths,
- dir = file.path(parquet_file_directory, "river_tsd"),
- delete_files = FALSE))
- spark_disconnect(sc)
-}
-
-
-temp_rivers_prerelease <- arrow::open_dataset(file.path(parquet_file_directory, "river_tsd")) |>
- collect()
-
-#===========================================#
-
-message("#### surface temperatures ####")
-
-# Combine the avro files with the portal data
-temp_full <- dplyr::bind_rows(# Lakes surface temperature
- daily_temp_surface_lakes,
-
- # Stream temperature data
- temp_streams_portal_QC,
- temp_streams_prerelease,
-
- # River temperature data
- temp_rivers_portal_QC,
- temp_rivers_EDI,
- temp_rivers_prerelease) %>%
- dplyr::arrange(site_id, time) %>%
- group_by(site_id, time) %>%
-  summarise(observation = mean(observation, na.rm = TRUE), .groups = "drop") %>%
- mutate(variable = 'temperature')
-
-
-#### Temp QC protocol=================
-
-# additional QC steps implemented (FO, 2022-07-13)
-##### check 1 Gross range tests on temperature
-# temperature ranges
-T_max <- 32 # gross max
-T_min <- -2 # gross min
-
-# Gross-range check: observations outside the specified range are set to NA
-temp_cleaned <- temp_full %>%
-  dplyr::mutate(observation = ifelse(observation >= T_min & observation <= T_max,
-                                     observation, NA)) %>%
- # manual cleaning based on observation
-  dplyr::mutate(observation = ifelse(site_id == "PRLA" & time %
- dplyr::arrange(site_id, time, variable) %>%
- dplyr::mutate(observation = ifelse(is.nan(observation), NA, observation))
-
-message("#### Writing forecasts to file ####")
-
-targets_long <- targets_long |>
- rename(datetime = time)
-
-hourly_temp_profile_lakes <- hourly_temp_profile_lakes |>
- rename(datetime = time)
-
-readRenviron("~/.Renviron") # compatible with littler
-
-s3 <- arrow::s3_bucket("neon4cast-targets/aquatics",
- endpoint_override = "data.ecoforecast.org",
- access_key = Sys.getenv("AWS_ACCESS_KEY"),
- secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY"))
-
-arrow::write_csv_arrow(targets_long, sink = s3$path("aquatics-targets.csv.gz"))
-
-targets_long2 <- targets_long |>
- mutate(datetime = lubridate::as_datetime(datetime),
- duration = "P1D",
- project_id = "neon4cast") |>
- select(project_id, site_id, datetime, duration, variable, observation)
-
-s3 <- arrow::s3_bucket("bio230014-bucket01/challenges/targets/project_id=neon4cast/duration=P1D",
- endpoint_override = "sdsc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-arrow::write_csv_arrow(targets_long2, sink = s3$path("aquatics-targets.csv.gz"))
-
-
-s3 <- arrow::s3_bucket("neon4cast-targets/aquatics",
- endpoint_override = "data.ecoforecast.org",
- access_key = Sys.getenv("AWS_ACCESS_KEY"),
- secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY"))
-
-arrow::write_csv_arrow(hourly_temp_profile_lakes, sink = s3$path("aquatics-expanded-observations.csv.gz"))
-
-hourly_temp_profile_lakes2 <- hourly_temp_profile_lakes |>
- mutate(datetime = lubridate::as_datetime(datetime),
- duration = "PT1H",
- project_id = "neon4cast") |>
- select(project_id, site_id, datetime, duration, variable, observation)
-
-#s3 <- arrow::s3_bucket("bio230014-bucket01/challenges/targets/project_id=neon4cast/duration=PT1H",
-# endpoint_override = "sdsc.osn.xsede.org",
-# access_key = Sys.getenv("OSN_KEY"),
-# secret_key = Sys.getenv("OSN_SECRET"))
-
-s3 <- arrow::s3_bucket("bio230014-bucket01/challenges/",
- endpoint_override = "sdsc.osn.xsede.org",
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
-arrow::write_csv_arrow(hourly_temp_profile_lakes2, sink = s3$path("supporting_data/project_id=neon4cast/aquatics-expanded-observations.csv.gz"))
-
-#arrow::write_csv_arrow(hourly_temp_profile_lakes2, sink = s3$path("aquatics-expanded-observations.csv.gz"))
-
-message(paste0("Completed Aquatics Target at ", Sys.time()))
diff --git a/targets/cron_automation.R b/targets/cron_automation.R
deleted file mode 100644
index cd1807b412..0000000000
--- a/targets/cron_automation.R
+++ /dev/null
@@ -1,61 +0,0 @@
-#remotes::install_github("rqthomas/cronR")
-#remotes::install_deps()
-library(cronR)
-
-home_dir <- path.expand("~")
-log_dir <- path.expand("~/log/cron")
-
-targets_repo <- "neon4cast-ci/targets"
-
-## Phenocam Download and Target Generation
-
-#cmd <- cronR::cron_rscript(rscript = file.path(home_dir, targets_repo, "phenology_targets.R"),
-# rscript_log = file.path(log_dir, "phenology-targets.log"),
-# log_append = FALSE,
-# workdir = file.path(home_dir, targets_repo),
-# trailing_arg = "curl -fsS -m 10 --retry 5 -o /dev/null https://hc-ping.com/f5d48d96-bb41-4c21-b028-930fa2b01c5a")
-#cronR::cron_add(command = cmd, frequency = '0 */2 * * *', id = 'phenocam_download')
-#cronR::cron_add(command = cmd, frequency = 'daily', at = '2PM', id = 'phenocam-targets')
-
-## Aquatics Targets
-
-cmd <- cronR::cron_rscript(rscript = file.path(home_dir, targets_repo,"aquatics_targets.R"),
- rscript_log = file.path(log_dir, "aquatics-target.log"),
- log_append = FALSE,
- workdir = file.path(home_dir, targets_repo),
-                           cmd = "/usr/local/bin/r", # use littler, more robust on the CLI
- trailing_arg = "curl -fsS -m 10 --retry 5 -o /dev/null https://hc-ping.com/1267b13e-8980-4ddf-8aaa-21aa7e15081c")
-cronR::cron_add(command = cmd, frequency = 'daily', at = "7AM", id = 'aquatics-targets')
-
-## Beetles
-
-#cmd <- cronR::cron_rscript(rscript = file.path(home_dir, targets_repo,"beetles_targets.R"),
-# rscript_log = file.path(log_dir, "beetles-targets.log"),
-# log_append = FALSE,
-# workdir = file.path(home_dir, targets_repo),
-# trailing_arg = "curl -fsS -m 10 --retry 5 -o /dev/null https://hc-ping.com/ed35da4e-01d3-4750-ae5a-ad2f5dfa6e99")
-#cronR::cron_add(command = cmd, frequency = "0 10 * * SUN", id = 'beetles-targets')
-
-## Terrestrial targets
-
-cmd <- cronR::cron_rscript(rscript = file.path(home_dir, targets_repo,"terrestrial_targets.R"),
- rscript_log = file.path(log_dir, "terrestrial-targets.log"),
- log_append = FALSE,
-                           cmd = "/usr/local/bin/r", # use littler, more robust on the CLI
- workdir = file.path(home_dir, targets_repo),
- trailing_arg = "curl -fsS -m 10 --retry 5 -o /dev/null https://hc-ping.com/c1fb635f-95f8-4ba2-a348-98924548106c")
-cronR::cron_add(command = cmd, frequency = 'daily', at = "9AM", id = 'terrestrial-targets')
-
-## Ticks
-
-#cmd <- cronR::cron_rscript(rscript = file.path(home_dir, targets_repo,"ticks_targets.R"),
-# rscript_log = file.path(log_dir, "ticks-targets.log"),
-# log_append = FALSE,
-# workdir = file.path(home_dir, targets_repo),
-# trailing_arg = "curl -fsS -m 10 --retry 5 -o /dev/null https://hc-ping.com/09c7ab10-eb4e-40ef-a029-7a4addc3295b")
-#cronR::cron_add(command = cmd, frequency = "0 11 * * SUN", id = 'ticks-targets')
-
-
-cronR::cron_ls()
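-
-# A maintenance sketch (assumes the job id already exists): cron ids must be
-# unique, so remove an old entry before re-adding it with a new schedule:
-# cronR::cron_rm(id = 'aquatics-targets')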
-
-
diff --git a/targets/fetch_sites.R b/targets/fetch_sites.R
deleted file mode 100644
index 2dd22cdfa2..0000000000
--- a/targets/fetch_sites.R
+++ /dev/null
@@ -1,7 +0,0 @@
-
-# Wrapper file for running the forecast pipeline; called from GitLab CI
-
-library(targets)
-tar_make()
-tar_meta(fields = error, complete_only = TRUE)
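-
-# A minimal sketch for failing the CI job when any target errored (an
-# assumption; the original simply prints the errors):
-# errs <- tar_meta(fields = error, complete_only = TRUE)
-# if (nrow(errs) > 0) stop("targets pipeline reported errors")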
-
diff --git a/targets/in/.empty b/targets/in/.empty
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/targets/in/FY23_ecological_forecast_challenge_USGS_sites.csv b/targets/in/FY23_ecological_forecast_challenge_USGS_sites.csv
deleted file mode 100644
index d82e45fe66..0000000000
--- a/targets/in/FY23_ecological_forecast_challenge_USGS_sites.csv
+++ /dev/null
@@ -1,14 +0,0 @@
-IWAAs_basin,NWIS_site_no,site_name,has_fchla,reason_for_inclusion,other_notes,include_in_challenge
-DRB,01463500,"Delaware River at Trenton, NJ",yes,High profile WQ site for basin stakeholders,,yes
-DRB,01427510,"Delaware River at Calicoon, PA",yes,Upstream pristine site of interest due to reservoir releases,,yes
-ILRB,05553700,"Illinois River at Starved Rock, IL",yes,several algal blooms in recent years,,yes
-ILRB,05586300,"Illinois River at Florence, IL",yes,NWQN / high profile site,,yes
-ILRB,05543010,"Illinois River at Seneca, IL",yes,Upstream of Starved Rock,,yes
-ILRB,05558300,"Illinois River at Henry, IL",yes,Downstream of Starved Rock & upstream of Peoria (water intake),,yes
-ILRB,05549500,"Fox River at McHenry, IL",yes,"High chl, IEPA routinely samples for algal toxins at this site. Stakeholder interest.",Sensor has seasonal deployment b/c site freezes over,yes
-UCOL,09014050,Grand Lake Outlet blw. Chipmunk Lane,yes,High profile WQ site for basin stakeholders related to the Three Lakes Clarity and the C-BT Project diverting water to the Front Range of Colorado,Only has RFUs for the last 3 years so not included in the forecast challenge,no
-WRB,,Detroit Lake ,,,,no
-WRB,14211720,Willamette @ Morrison St Br,yes,Important based on conversation with Elena Nilsen,has tidal influence and the upstream lagoon seems to be the algal hotspot,yes
-WRB,14211010,"CLACKAMAS RIVER NEAR OREGON CITY, OR",yes,extend transport ideas with this site ,,yes
-WRB,14181500,"NORTH SANTIAM RIVER AT NIAGARA, OR",yes,,,yes
-NA,03378500,"WABASH RIVER AT NEW HARMONY, IN",yes,Important to The Nature Conservancy,contact is Carrie Parmenter at carrie.parmenter@tnc.org,no
diff --git a/targets/in/characteristic_names.yml b/targets/in/characteristic_names.yml
deleted file mode 100644
index 72eb8bb6d1..0000000000
--- a/targets/in/characteristic_names.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-# Characteristic names to use in WQP query. See all characteristics at
-# https://www.waterqualitydata.us/Codes/Characteristicname?mimeType=xml
-
-- "Chlorophyll a"
-- "Chlorophyll b"
-- "Chlorophyll c"
-- "Chlorophyll a (probe)"
-- "Chlorophyll a - Periphyton (attached)"
-- "Chlorophyll a, corrected for pheophytin"
-- "Chlorophyll a - Phytoplankton (suspended)"
-- "Chlorophyll a, uncorrected for pheophytin"
-- "Chlorophyll a (probe relative fluorescence)"
-- "Phaeophytin - Phytoplankton (suspended)"
-- "Phaeophytin - Periphyton (attached)"
-- "Chlorophyll"
-- "Chlorophyll A"
-- "Biomass/chlorophyll ratio"
-- "Chlorophyll/Pheophytin ratio"
-- "Chlorophyll a, free of pheophytin"
-- "Chlorophyll a, collected/analyzed (YES/NO) (choice list)"
-- "Chlorophyll a (probe) concentration, Cyanobacteria (bluegreen)"
-- "Pheophytin"
-- "Pheophytin a"
-- "Pheophytin ratio"
-- "Phytoplankton"
-- "Phytoplankton Density"
-- "Biomass, phytoplankton"
-- "Phytoplankton biovolume"
-- "Phytoplankton productivity"
-- "Phytoplankton Relative Density"
-- "Phytoplankton, settling volume"
-- "Population diversity, phytoplankton, # of species"
diff --git a/targets/in/pcodes.yml b/targets/in/pcodes.yml
deleted file mode 100644
index 6626d92e8a..0000000000
--- a/targets/in/pcodes.yml
+++ /dev/null
@@ -1,73 +0,0 @@
-
-# algal-ish
-# - "32209"
-# - "32210"
-# - "32211"
-# - "32217"
-# - "32218"
-# - "32230"
-# - "32231"
-# - "32232"
-# - "32234"
-# - "32241"
-# - "32269"
-# - "32283"
-# - "32284"
-# - "32285"
-# - "32286"
-# - "32287"
-# - "32288"
-# - "32317"
-# - "32318"
-# - "32320"
-# - "32329"
-# - "32334"
-# - "32735"
-# - "32736"
-# - "32737"
-# - "32738"
-# - "49953"
-# - "60050"
-# - "62360"
-# - "62361"
-# - "65228"
-# - "65229"
-# - "65230"
-# - "65231"
-# - "70951"
-# - "70952"
-# - "70953"
-# - "70954"
-# - "70955"
-# - "70956"
-# - "70957"
-# - "70958"
-# - "92209"
-# - "92217"
-# - "95201"
-# - "95203"
-# - "95204"
-# - "32335"
-# - "32336"
-# - "32337"
-# - "32338"
-# - "32339"
-# - "01325"
-# - "31883"
-# - "31884"
-# - "31885"
-# - "31886"
-# - "31887"
-# - "31888"
-# - "32315"
-# - "32316"
-# - "32319"
-# - "32321"
-# - "32323"
-# - "95202"
-
-
-# UV pcodes
-- "62361"
-- "32316"
-- "32318"
diff --git a/targets/out/.empty b/targets/out/.empty
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/targets/src/download_nwis_data.R b/targets/src/download_nwis_data.R
deleted file mode 100644
index 848e0d6c52..0000000000
--- a/targets/src/download_nwis_data.R
+++ /dev/null
@@ -1,142 +0,0 @@
-
-
-do_fetch_by_site <- function(file_out, site, pcodes, service, start_date, end_date) {
- message(file_out)
- fetch_by_pcode_service_by_site(
- site, pcodes, service,
- start_date, end_date) %>%
- write_csv(file = file_out)
- return(file_out)
-}
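-
-# Usage sketch (all argument values hypothetical):
-# do_fetch_by_site(file_out = "targets/out/01463500_uv.csv",
-#                  site = "01463500", pcodes = c("62361", "32318"),
-#                  service = "uv",
-#                  start_date = "2023-01-01", end_date = "2023-01-31")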
-
-do_alt_fetch_by_site <- function(file_out, site, alt_site, pcodes, service, start_date, end_date) {
- message(file_out)
- returned_data <- fetch_by_pcode_service_by_site(
- alt_site, pcodes, service,
- start_date, end_date) %>%
- {
- # Only attempt to add the alternative site no if there was data
- if(nrow(.) > 0)
- # The site number that was downloaded is actually our alternative site,
- # so add a column to retain that info, but substitute the site we are
- # actually using for metabolism modeling into `site_no`
- mutate(., alt_site_no = site_no) %>%
- mutate(site_no = site)
- else .
- } %>%
- write_csv(file = file_out)
- return(file_out)
-}
-
-fetch_by_pcode_service_by_site <- function(site, pcodes, service, start_date, end_date) {
-
- raw_data <- fetch_nwis_fault_tolerantly_bysite(site, pcodes, service, start_date, end_date)
-
-  # Remove attributes, which typically have a timestamp associated
-  # with them; these can cause strange rebuilds of downstream data,
-  # even if the data itself is the same.
- attr(raw_data, "comment") <- NULL
- attr(raw_data, "queryTime") <- NULL
- attr(raw_data, "headerInfo") <- NULL
-
- return(raw_data)
-}
-
-fetch_nwis_fault_tolerantly_bysite <- function(site, pcodes, service, start_date, end_date, max_tries = 10) {
-  # retry() (from the {retry} package) re-runs the NWIS call until it
-  # returns a data.frame or max_tries is reached
-  data_returned <- tryCatch(
- retry(readNWISdata(siteNumber = site,
- parameterCd = pcodes,
- startDate = start_date,
- endDate = end_date,
- service = service),
- until = function(val, cnd) "data.frame" %in% class(val),
- max_tries = max_tries),
-    error = function(e) NULL  # on persistent failure, yield NULL
- )
-
- # Noticed that some UV calls return a data.frame with a tz_cd column
- # and nothing else. These should be considered empty.
- # For example:
- # readNWISdata(siteNumber = "05579630", parameterCd = "00060", startDate = "2020-12-01",
- # endDate = "2020-12-31", service = "uv")
-  # The error handler above yields NULL; treat that, and the tz_cd-only
-  # frames described above, as an empty result
-  if(is.null(data_returned) ||
-     (nrow(data_returned) == 0 && "tz_cd" %in% names(data_returned))) {
-    return(data.frame())
-  } else {
-    return(data_returned)
-  }
-}
-
-
-
-download_historic_data <- function(
- sites,
- start_date,
- end_date,
- pcodes,
- service,
- statCd = NULL,
-    min_chl = 0, # minimum chlorophyll-a to keep
- out_file
-){
-
- daily_data <- dataRetrieval::readNWISdata(siteNumbers = sites,
- parameterCd = pcodes,
- startDate = start_date,
- endDate = end_date,
- service = service,
- statCd = statCd) %>%
-    # pivot the per-statistic columns (names ending in statCd) to long format
-    pivot_longer(cols = matches(paste0(statCd, "$")),
- names_to = "parameter_name",
- values_to = "chl_ug_L") %>%
- select(site_no, dateTime, parameter_name, chl_ug_L) %>%
-    # TODO: add a filter based on whether data are provisional?
- filter(!is.na(chl_ug_L), chl_ug_L >= min_chl) %>%
- mutate(dateTime = as.Date(dateTime)) %>%
- group_by(site_no, dateTime) %>%
- summarise(chl_ug_L = mean(chl_ug_L), .groups = "drop")
-
- write_rds(x = daily_data, file = out_file)
- return(out_file)
-}
-
-
-download_historic_uv_data <- function(
- sites,
- start_date,
- end_date,
- pcodes,
- service,
-    min_chl = 0, # minimum chlorophyll-a to keep
- out_file
-){
-
- daily_data <- dataRetrieval::readNWISdata(siteNumbers = sites,
- parameterCd = pcodes,
- startDate = start_date,
- endDate = end_date,
- service = service) %>%
- pivot_longer(cols = matches("00000$"),
- names_to = "parameter_name",
- values_to = "chl_ug_L") %>%
- select(site_no, dateTime, parameter_name, chl_ug_L) %>%
-    # TODO: add a filter based on whether data are provisional?
- filter(!is.na(chl_ug_L), chl_ug_L >= min_chl) %>%
- mutate(dateTime = as.Date(dateTime)) %>%
- group_by(site_no, dateTime) %>%
- summarise(chl_ug_L = mean(chl_ug_L), .groups = "drop")
-
- write_rds(x = daily_data, file = out_file)
- return(out_file)
-}
-
-
-aggregate_to_daily <- function(
-    subdaily_data
-){
-  # summarise() with no summary expressions collapses to one row per group,
-  # i.e. the distinct site/date combinations
-  daily_data <- subdaily_data %>%
-    group_by(siteNumbers, date) %>%
-    summarise(.groups = "drop")
-
-  return(daily_data)
-}
-
diff --git a/targets/src/s3_utils.R b/targets/src/s3_utils.R
deleted file mode 100644
index e2a741b02c..0000000000
--- a/targets/src/s3_utils.R
+++ /dev/null
@@ -1,30 +0,0 @@
-
-#' helper function for pushing file to s3
-#'
-#' @param config configuration file of the challenge
-#' @param local_file_name targets file name
-#' @param s3_file_name file name to write to in S3
-#'
-push_to_s3 <- function(
- config,
- local_file_name,
- s3_file_name
-){
-
- targets <- read_csv(local_file_name)
- # duration hard coded for now
- bucket_path <- glue::glue("{config$targets_bucket}/project_id={config$project_id}/duration=P1D")
-
- s3 <- arrow::s3_bucket(bucket_path,
- endpoint_override = config$endpoint,
- access_key = Sys.getenv("OSN_KEY"),
- secret_key = Sys.getenv("OSN_SECRET"))
-
- sink <- s3$path(s3_file_name)
-
- # write to s3
- arrow::write_csv_arrow(x = targets,
- sink = sink)
-
- return(sink)
-}
-
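-# Usage sketch (all values hypothetical):
-# config <- list(targets_bucket = "bio230014-bucket01/challenges/targets",
-#                project_id = "usgsrc4cast",
-#                endpoint = "sdsc.osn.xsede.org")
-# push_to_s3(config, "targets/out/targets.csv.gz", "chl-targets.csv.gz")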
diff --git a/usgsrc4cast-ci.Rproj b/usgsrc4cast-ci.Rproj
deleted file mode 100644
index b9255bc959..0000000000
--- a/usgsrc4cast-ci.Rproj
+++ /dev/null
@@ -1,20 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: pdfLaTeX
-
-AutoAppendNewline: Yes
-StripTrailingWhitespace: Yes
-
-BuildType: Package
-PackageUseDevtools: Yes
-PackageInstallArgs: --no-multiarch --with-keep.source