Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

main to prod #57

Merged
merged 3 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
304 changes: 177 additions & 127 deletions catalog/forecasts/forecast_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ forecast_data_df <- arrow::open_dataset(forecast_s3) |>
theme_models <- forecast_data_df |>
distinct(model_id)

forecast_sites <- forecast_data_df |>
distinct(site_id)

forecast_date_range <- forecast_data_df |> dplyr::summarise(min(date),max(date))
forecast_min_date <- forecast_date_range$`min(date)`
forecast_max_date <- forecast_date_range$`max(date)`
Expand All @@ -84,31 +87,32 @@ stac4cast::build_forecast_scores(table_schema = forecast_theme_df,
link_items = stac4cast::generate_group_values(group_values = names(config$variable_groups)),
thumbnail_link = catalog_config$forecasts_thumbnail,
thumbnail_title = catalog_config$forecasts_thumbnail_title,
group_sites = forecast_sites$site_id,
model_child = TRUE)

## create separate JSON for model landing page
if (!dir.exists(paste0(catalog_config$forecast_path,"models"))){
dir.create(paste0(catalog_config$forecast_path,"models"))
}

stac4cast::build_group_variables(table_schema = forecast_theme_df,
table_description = forecast_description_create,
start_date = forecast_min_date,
end_date = forecast_max_date,
id_value = "models",
description_string = build_description,
about_string = catalog_config$about_string,
about_title = catalog_config$about_title,
dashboard_string = catalog_config$dashboard_url,
dashboard_title = catalog_config$dashboard_title,
theme_title = "Models",
destination_path = paste0(catalog_config$forecast_path,"models"),
aws_download_path = catalog_config$aws_download_path_forecasts,
group_var_items = stac4cast::generate_model_items(model_list = theme_models$model_id),
thumbnail_link = 'pending',
thumbnail_title = 'pending',
group_var_vector = NULL,
group_sites = NULL)
# if (!dir.exists(paste0(catalog_config$forecast_path,"models"))){
# dir.create(paste0(catalog_config$forecast_path,"models"))
# }

# stac4cast::build_group_variables(table_schema = forecast_theme_df,
# table_description = forecast_description_create,
# start_date = forecast_min_date,
# end_date = forecast_max_date,
# id_value = "models",
# description_string = build_description,
# about_string = catalog_config$about_string,
# about_title = catalog_config$about_title,
# dashboard_string = catalog_config$dashboard_url,
# dashboard_title = catalog_config$dashboard_title,
# theme_title = "Models",
# destination_path = paste0(catalog_config$forecast_path,"models"),
# aws_download_path = catalog_config$aws_download_path_forecasts,
# group_var_items = stac4cast::generate_model_items(model_list = theme_models$model_id),
# thumbnail_link = 'pending',
# thumbnail_title = 'pending',
# group_var_vector = NULL,
# group_sites = NULL)

## CREATE MODELS
variable_gsheet <- gsheet2tbl(config$target_metadata_gsheet)
Expand All @@ -119,8 +123,18 @@ variable_gsheet <- gsheet2tbl(config$target_metadata_gsheet)
# registered_model_id <- googlesheets4::read_sheet(config$model_metadata_gsheet)

# read in model metadata and filter for the relevant project
registered_model_id <- gsheet2tbl(config$model_metadata_gsheet) |>
filter(`What forecasting challenge are you registering for?` == config$project_id)
# registered_model_id <- gsheet2tbl(config$model_metadata_gsheet) |>
# filter(`What forecasting challenge are you registering for?` == config$project_id)

gsheet_read <- gsheet2tbl(config$model_metadata_gsheet)
gsheet_read$row_non_na <- rowSums(!is.na(gsheet_read))

registered_model_id <- gsheet_read |>
filter(`What forecasting challenge are you registering for?` == config$project_id) |>
rename(project_id = `What forecasting challenge are you registering for?`) |>
arrange(row_non_na) |>
distinct(model_id, project_id, .keep_all = TRUE)#|>
#filter(row_non_na > 20) ## estimate based on current number of rows assuming everything (minus model and project) are empty

forecast_sites <- c()

Expand Down Expand Up @@ -158,6 +172,12 @@ for (m in theme_models$model_id){

idx = which(registered_model_id$model_id == m)

if (is.null(registered_model_id$`Web link to model code`[idx])){
model_code_link <- 'https://projects.ecoforecast.org/neon4cast-ci/'
} else{
model_code_link <- registered_model_id$`Web link to model code`[idx]
}

stac4cast::build_model(model_id = m,
team_name = registered_model_id$`Long name of the model (can include spaces)`[idx],
model_description = registered_model_id[idx,"Describe your modeling approach in your own words."][[1]],
Expand All @@ -169,14 +189,13 @@ for (m in theme_models$model_id){
site_table = catalog_config$site_metadata_url,
model_documentation = registered_model_id,
destination_path = paste0(catalog_config$forecast_path,"models/model_items"),
aws_download_path = config$forecasts_bucket, # CHANGE THIS BUCKET NAME
aws_download_path = catalog_config$aws_download_path_forecasts,
collection_name = 'forecasts',
thumbnail_image_name = NULL,
table_schema = forecast_theme_df,
table_description = forecast_description_create,
full_var_df = model_vars,
#code_web_link = registered_model_id$`Web link to model code`[idx],
code_web_link = 'pending')
code_web_link = model_code_link)
}


Expand Down Expand Up @@ -204,106 +223,137 @@ for (i in 1:length(config$variable_groups)){ ## organize variable groups
dir.create(paste0(catalog_config$forecast_path,names(config$variable_groups[i])))
}

for(j in 1:length(config$variable_groups[[i]]$variable)){ # FOR EACH VARIABLE WITHIN A MODEL GROUP
# match variable with full name in gsheet
var_gsheet_arrange <- variable_gsheet |>
arrange(duration)

## restructure variable names
var_values <- config$variable_groups[[i]]$variable
var_name <- config$variable_groups[[i]]$variable[j]
print(var_name)
var_values <- names(config$variable_groups[[i]]$group_vars)

# check data and skip if no data found
var_data_check <- forecast_data_df |>
filter(variable == var_name)

if (nrow(var_data_check) == 0){
print('No data available for variable')
next
}

duration_name <- config$variable_groups[[i]]$duration[j]

# match variable with full name in gsheet
#var_name_full <- variable_gsheet[which(variable_gsheet$`"official" targets name` == var_values),1][[1]]
var_name_full <- variable_gsheet[which(variable_gsheet$`"official" targets name` %in% var_values),1][[1]]



## create new vector to store duration names
duration_values <- config$variable_groups[[i]]$duration
duration_values[which(duration_values == 'P1D')] <- 'Daily'
duration_values[which(duration_values == 'PT1H')] <- 'Hourly'
duration_values[which(duration_values == 'PT30M')] <- '30min'
duration_values[which(duration_values == 'P1W')] <- 'Weekly'

#var_name_combined_list <- paste0(var_values, '_',duration_values)
var_name_combined_list <- paste0(duration_values,'_',var_name_full)

## CREATE VARIABLE GROUP JSONS
group_description <- paste0('This page includes variables for the ',names(config$variable_groups[i]),' group.')

## find group sites
find_group_sites <- forecast_data_df |>
filter(variable %in% var_values) |>
distinct(site_id)

stac4cast::build_group_variables(table_schema = forecast_theme_df,
#theme_id = names(config$variable_groups[i]),
table_description = forecast_description_create,
start_date = forecast_min_date,
end_date = forecast_max_date,
id_value = names(config$variable_groups[i]),
description_string = group_description,
about_string = catalog_config$about_string,
about_title = catalog_config$about_title,
dashboard_string = catalog_config$dashboard_url,
dashboard_title = catalog_config$dashboard_title,
theme_title = names(config$variable_groups[i]),
destination_path = paste0(catalog_config$forecast_path,names(config$variable_groups[i])),
aws_download_path = catalog_config$aws_download_path_forecasts,
group_var_items = stac4cast::generate_group_variable_items(variables = var_name_combined_list),
thumbnail_link = config$variable_groups[[i]]$thumbnail_link,
thumbnail_title = config$variable_groups[[i]]$thumbnail_title,
group_var_vector = unique(var_values),
group_sites = find_group_sites$site_id)

if (!dir.exists(paste0(catalog_config$forecast_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))){
dir.create(paste0(catalog_config$forecast_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))
}

var_data <- forecast_data_df |>
filter(variable == var_name,
duration == duration_name)

var_date_range <- var_data |> dplyr::summarise(min(date),max(date))
var_min_date <- var_date_range$`min(date)`
var_max_date <- var_date_range$`max(date)`

var_models <- var_data |> distinct(model_id)

find_var_sites <- forecast_data_df |>
filter(variable == var_name) |>
distinct(site_id)

var_description <- paste0('This page includes all models for the ',var_name_combined_list[j],' variable.')

stac4cast::build_group_variables(table_schema = forecast_theme_df,
table_description = forecast_description_create,
start_date = var_min_date,
end_date = var_max_date,
id_value = var_name_combined_list[j],
description_string = var_description,
about_string = catalog_config$about_string,
about_title = catalog_config$about_title,
dashboard_string = catalog_config$dashboard_url,
dashboard_title = catalog_config$dashboard_title,
theme_title = var_name_combined_list[j],
destination_path = file.path(catalog_config$forecast_path,names(config$variable_groups)[i],var_name_combined_list[j]),
aws_download_path = var_data$path[1],
group_var_items = stac4cast::generate_variable_model_items(model_list = var_models$model_id),
thumbnail_link = 'pending',
thumbnail_title = 'pending',
group_var_vector = NULL,
group_sites = find_var_sites$site_id)
var_name_full <- var_gsheet_arrange[which(var_gsheet_arrange$`"official" targets name` %in% var_values),1][[1]]

}
}
## CREATE VARIABLE GROUP JSONS
group_description <- paste0('This page includes variables for the ',names(config$variable_groups[i]),' group.')

## find group sites
find_group_sites <- forecast_data_df |>
filter(variable %in% var_values) |>
distinct(site_id)

## create empty vector to track publication information
citation_build <- c()
doi_build <- c()

## create empty vector to track variable information
variable_name_build <- c()

for(j in 1:length(config$variable_groups[[i]]$group_vars)){ # FOR EACH VARIABLE WITHIN A MODEL GROUP


var_name <- names(config$variable_groups[[i]]$group_vars[j])
print(var_name)

for (k in 1:length(config$variable_groups[[i]]$group_vars[[j]]$duration)){
duration_value <- config$variable_groups[[i]]$group_vars[[j]]$duration[k]
print(duration_value)

## save original duration name for reference
duration_name <- config$variable_groups[[i]]$group_vars[[j]]$duration[k]

## create formal variable name
duration_value[which(duration_value == 'P1D')] <- 'Daily'
duration_value[which(duration_value == 'PT1H')] <- 'Hourly'
duration_value[which(duration_value == 'PT30M')] <- '30min'
duration_value[which(duration_value == 'P1W')] <- 'Weekly'

var_formal_name <- paste0(duration_value,'_',var_name_full[j])

# check data and skip if no data found
var_data_check <- forecast_data_df |>
filter(variable == var_name)

if (nrow(var_data_check) == 0){
print('No data available for variable')
next
}

if (!dir.exists(file.path(catalog_config$forecast_path,names(config$variable_groups)[i],var_formal_name))){
dir.create(file.path(catalog_config$forecast_path,names(config$variable_groups)[i],var_formal_name))
}

var_data <- forecast_data_df |>
filter(variable == var_name,
duration == duration_name)

var_date_range <- var_data |> dplyr::summarise(min(date),max(date))
var_min_date <- var_date_range$`min(date)`
var_max_date <- var_date_range$`max(date)`

var_models <- var_data |> distinct(model_id)

find_var_sites <- forecast_data_df |>
filter(variable == var_name) |>
distinct(site_id)

var_description <- paste0('This page includes all models for the ',var_formal_name,' variable.')

var_path <- var_data$path[1]

## build lists for creating publication items
var_citations <- config$variable_groups[[i]]$group_vars[[j]]$var_citation
doi_citations <- config$variable_groups[[i]]$group_vars[[j]]$var_doi

#update group list of publication information
citation_build <- append(citation_build, var_citations)
doi_build <- append(doi_build, doi_citations)

variable_name_build <- append(variable_name_build, var_formal_name)

variable_name_build <- append(variable_name_build, var_formal_name)

stac4cast::build_group_variables(table_schema = forecast_data_df,
#theme_id = var_formal_name[j],
table_description = forecast_description_create,
start_date = var_min_date,
end_date = var_max_date,
id_value = var_formal_name,
description_string = var_description,
about_string = catalog_config$about_string,
about_title = catalog_config$about_title,
dashboard_string = catalog_config$dashboard_url,
dashboard_title = catalog_config$dashboard_title,
theme_title = var_formal_name,
destination_path = file.path(catalog_config$forecast_path,names(config$variable_groups)[i],var_formal_name),
aws_download_path = var_path,
group_var_items = stac4cast::generate_variable_model_items(model_list = var_models$model_id),
thumbnail_link = 'pending',
thumbnail_title = 'pending',
group_var_vector = NULL,
group_sites = find_var_sites$site_id,
citation_values = var_citations,
doi_values = config$variable_groups[[i]]$group_vars[[j]]$var_doi)
} ## end duration loop

} ## end variable loop

## BUILD THE GROUP PAGES WITH UPDATED VAR/PUB INFORMATION
stac4cast::build_group_variables(table_schema = forecast_data_df,
table_description = forecast_description_create,
start_date = forecast_min_date,
end_date = forecast_max_date,
id_value = names(config$variable_groups)[i],
description_string = group_description,
about_string = catalog_config$about_string,
about_title = catalog_config$about_title,
dashboard_string = catalog_config$dashboard_url,
dashboard_title = catalog_config$dashboard_title,
theme_title = names(config$variable_groups[i]),
destination_path = file.path(catalog_config$scores_path,names(config$variable_groups)[i]),
aws_download_path = catalog_config$aws_download_path_scores,
group_var_items = stac4cast::generate_group_variable_items(variables = variable_name_build),
thumbnail_link = config$variable_groups[[i]]$thumbnail_link,
thumbnail_title = config$variable_groups[[i]]$thumbnail_title,
group_var_vector = unique(var_values),
group_sites = find_group_sites$site_id,
citation_values = citation_build,
doi_values = doi_build)
} # end group loop
5 changes: 4 additions & 1 deletion challenge_configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ variable_groups:
duration: ['P1D']
thumbnail_link: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg'
thumbnail_title: 'USGS Streamgage'
group_vars:
chla:
duration: "P1D"

# Forecast catalog configuration
catalog_config:
Expand Down Expand Up @@ -87,6 +90,6 @@ catalog_config:
base_image_path: 'https://data.ecoforecast.org/usgsrc4cast-catalog'
citation_doi: "https://doi.org/10.1002/fee.2616"
citation_text: "Thomas, R.Q., C. Boettiger, C.C. Carey, M.C. Dietze, L.R. Johnson, M.A. Kenney, J.S. Mclachlan, J.A. Peters, E.R. Sokol, J.F. Weltzin, A. Willson, W.M. Woelmer, and Challenge Contributors. 2023. The NEON Ecological Forecasting Challenge. Frontiers in Ecology and Environment 21: 112-113."
dashboard_url: "https://projects.ecoforecast.org/usgsrc4cast-docs/"
dashboard_url: "https://projects.ecoforecast.org/usgsrc4cast-ci/"
dashboard_title: "EFI-USGS River Chlorophyll Forecast Challenge Dashboard"
site_metadata_url: 'https://raw.githubusercontent.com/eco4cast/usgsrc4cast-ci/main/USGS_site_metadata.csv'
Loading