Skip to content

Commit

Permalink
Merge pull request #57 from eco4cast/main
Browse files Browse the repository at this point in the history
main to prod
  • Loading branch information
jzwart authored May 14, 2024
2 parents 7840488 + 7697a31 commit fb78bea
Show file tree
Hide file tree
Showing 2 changed files with 181 additions and 128 deletions.
304 changes: 177 additions & 127 deletions catalog/forecasts/forecast_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ forecast_data_df <- arrow::open_dataset(forecast_s3) |>
theme_models <- forecast_data_df |>
distinct(model_id)

forecast_sites <- forecast_data_df |>
distinct(site_id)

forecast_date_range <- forecast_data_df |> dplyr::summarise(min(date),max(date))
forecast_min_date <- forecast_date_range$`min(date)`
forecast_max_date <- forecast_date_range$`max(date)`
Expand All @@ -84,31 +87,32 @@ stac4cast::build_forecast_scores(table_schema = forecast_theme_df,
link_items = stac4cast::generate_group_values(group_values = names(config$variable_groups)),
thumbnail_link = catalog_config$forecasts_thumbnail,
thumbnail_title = catalog_config$forecasts_thumbnail_title,
group_sites = forecast_sites$site_id,
model_child = TRUE)

## create separate JSON for model landing page
if (!dir.exists(paste0(catalog_config$forecast_path,"models"))){
dir.create(paste0(catalog_config$forecast_path,"models"))
}

stac4cast::build_group_variables(table_schema = forecast_theme_df,
table_description = forecast_description_create,
start_date = forecast_min_date,
end_date = forecast_max_date,
id_value = "models",
description_string = build_description,
about_string = catalog_config$about_string,
about_title = catalog_config$about_title,
dashboard_string = catalog_config$dashboard_url,
dashboard_title = catalog_config$dashboard_title,
theme_title = "Models",
destination_path = paste0(catalog_config$forecast_path,"models"),
aws_download_path = catalog_config$aws_download_path_forecasts,
group_var_items = stac4cast::generate_model_items(model_list = theme_models$model_id),
thumbnail_link = 'pending',
thumbnail_title = 'pending',
group_var_vector = NULL,
group_sites = NULL)
# if (!dir.exists(paste0(catalog_config$forecast_path,"models"))){
# dir.create(paste0(catalog_config$forecast_path,"models"))
# }

# stac4cast::build_group_variables(table_schema = forecast_theme_df,
# table_description = forecast_description_create,
# start_date = forecast_min_date,
# end_date = forecast_max_date,
# id_value = "models",
# description_string = build_description,
# about_string = catalog_config$about_string,
# about_title = catalog_config$about_title,
# dashboard_string = catalog_config$dashboard_url,
# dashboard_title = catalog_config$dashboard_title,
# theme_title = "Models",
# destination_path = paste0(catalog_config$forecast_path,"models"),
# aws_download_path = catalog_config$aws_download_path_forecasts,
# group_var_items = stac4cast::generate_model_items(model_list = theme_models$model_id),
# thumbnail_link = 'pending',
# thumbnail_title = 'pending',
# group_var_vector = NULL,
# group_sites = NULL)

## CREATE MODELS
variable_gsheet <- gsheet2tbl(config$target_metadata_gsheet)
Expand All @@ -119,8 +123,18 @@ variable_gsheet <- gsheet2tbl(config$target_metadata_gsheet)
# registered_model_id <- googlesheets4::read_sheet(config$model_metadata_gsheet)

# read in model metadata and filter for the relevant project
registered_model_id <- gsheet2tbl(config$model_metadata_gsheet) |>
filter(`What forecasting challenge are you registering for?` == config$project_id)
# registered_model_id <- gsheet2tbl(config$model_metadata_gsheet) |>
# filter(`What forecasting challenge are you registering for?` == config$project_id)

# Read the model registration sheet and count, for every row, how many fields
# are filled in. The count is used below to choose between duplicate
# registrations of the same model.
gsheet_read <- gsheet2tbl(config$model_metadata_gsheet)
gsheet_read$row_non_na <- rowSums(!is.na(gsheet_read))

# Keep only registrations for this project, one row per model_id/project_id.
# distinct(.keep_all = TRUE) keeps the FIRST row of each duplicate set, so the
# most complete registration (largest row_non_na) is sorted to the top first;
# an ascending sort would keep the sparsest registration instead.
registered_model_id <- gsheet_read |>
  filter(`What forecasting challenge are you registering for?` == config$project_id) |>
  rename(project_id = `What forecasting challenge are you registering for?`) |>
  arrange(dplyr::desc(row_non_na)) |>
  distinct(model_id, project_id, .keep_all = TRUE)#|>
#filter(row_non_na > 20) ## estimate based on current number of rows assuming everything (minus model and project) are empty

forecast_sites <- c()

Expand Down Expand Up @@ -158,6 +172,12 @@ for (m in theme_models$model_id){

idx = which(registered_model_id$model_id == m)

# Look up the code link registered for model `m` (row(s) `idx`).
# Subsetting an existing column never yields NULL, so an is.null() test cannot
# detect a missing link: an unregistered model gives a zero-length vector and a
# blank cell gives NA. Drop NA/empty entries and fall back to the default URL
# when nothing usable remains. A missing column (NULL) also ends up here.
model_code_link <- registered_model_id$`Web link to model code`[idx]
model_code_link <- model_code_link[!is.na(model_code_link) & nzchar(model_code_link)]

if (length(model_code_link) == 0){
  model_code_link <- 'https://projects.ecoforecast.org/neon4cast-ci/'
} else {
  # build_model() receives a single link; use the first usable one.
  model_code_link <- model_code_link[1]
}

stac4cast::build_model(model_id = m,
team_name = registered_model_id$`Long name of the model (can include spaces)`[idx],
model_description = registered_model_id[idx,"Describe your modeling approach in your own words."][[1]],
Expand All @@ -169,14 +189,13 @@ for (m in theme_models$model_id){
site_table = catalog_config$site_metadata_url,
model_documentation = registered_model_id,
destination_path = paste0(catalog_config$forecast_path,"models/model_items"),
aws_download_path = config$forecasts_bucket, # CHANGE THIS BUCKET NAME
aws_download_path = catalog_config$aws_download_path_forecasts,
collection_name = 'forecasts',
thumbnail_image_name = NULL,
table_schema = forecast_theme_df,
table_description = forecast_description_create,
full_var_df = model_vars,
#code_web_link = registered_model_id$`Web link to model code`[idx],
code_web_link = 'pending')
code_web_link = model_code_link)
}


Expand Down Expand Up @@ -204,106 +223,137 @@ for (i in 1:length(config$variable_groups)){ ## organize variable groups
dir.create(paste0(catalog_config$forecast_path,names(config$variable_groups[i])))
}

for(j in 1:length(config$variable_groups[[i]]$variable)){ # FOR EACH VARIABLE WITHIN A MODEL GROUP
# match variable with full name in gsheet
var_gsheet_arrange <- variable_gsheet |>
arrange(duration)

## restructure variable names
var_values <- config$variable_groups[[i]]$variable
var_name <- config$variable_groups[[i]]$variable[j]
print(var_name)
var_values <- names(config$variable_groups[[i]]$group_vars)

# check data and skip if no data found
var_data_check <- forecast_data_df |>
filter(variable == var_name)

if (nrow(var_data_check) == 0){
print('No data available for variable')
next
}

duration_name <- config$variable_groups[[i]]$duration[j]

# match variable with full name in gsheet
#var_name_full <- variable_gsheet[which(variable_gsheet$`"official" targets name` == var_values),1][[1]]
var_name_full <- variable_gsheet[which(variable_gsheet$`"official" targets name` %in% var_values),1][[1]]



## create new vector to store duration names
duration_values <- config$variable_groups[[i]]$duration
duration_values[which(duration_values == 'P1D')] <- 'Daily'
duration_values[which(duration_values == 'PT1H')] <- 'Hourly'
duration_values[which(duration_values == 'PT30M')] <- '30min'
duration_values[which(duration_values == 'P1W')] <- 'Weekly'

#var_name_combined_list <- paste0(var_values, '_',duration_values)
var_name_combined_list <- paste0(duration_values,'_',var_name_full)

## CREATE VARIABLE GROUP JSONS
group_description <- paste0('This page includes variables for the ',names(config$variable_groups[i]),' group.')

## find group sites
find_group_sites <- forecast_data_df |>
filter(variable %in% var_values) |>
distinct(site_id)

stac4cast::build_group_variables(table_schema = forecast_theme_df,
#theme_id = names(config$variable_groups[i]),
table_description = forecast_description_create,
start_date = forecast_min_date,
end_date = forecast_max_date,
id_value = names(config$variable_groups[i]),
description_string = group_description,
about_string = catalog_config$about_string,
about_title = catalog_config$about_title,
dashboard_string = catalog_config$dashboard_url,
dashboard_title = catalog_config$dashboard_title,
theme_title = names(config$variable_groups[i]),
destination_path = paste0(catalog_config$forecast_path,names(config$variable_groups[i])),
aws_download_path = catalog_config$aws_download_path_forecasts,
group_var_items = stac4cast::generate_group_variable_items(variables = var_name_combined_list),
thumbnail_link = config$variable_groups[[i]]$thumbnail_link,
thumbnail_title = config$variable_groups[[i]]$thumbnail_title,
group_var_vector = unique(var_values),
group_sites = find_group_sites$site_id)

if (!dir.exists(paste0(catalog_config$forecast_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))){
dir.create(paste0(catalog_config$forecast_path,names(config$variable_groups)[i],'/',var_name_combined_list[j]))
}

var_data <- forecast_data_df |>
filter(variable == var_name,
duration == duration_name)

var_date_range <- var_data |> dplyr::summarise(min(date),max(date))
var_min_date <- var_date_range$`min(date)`
var_max_date <- var_date_range$`max(date)`

var_models <- var_data |> distinct(model_id)

find_var_sites <- forecast_data_df |>
filter(variable == var_name) |>
distinct(site_id)

var_description <- paste0('This page includes all models for the ',var_name_combined_list[j],' variable.')

stac4cast::build_group_variables(table_schema = forecast_theme_df,
table_description = forecast_description_create,
start_date = var_min_date,
end_date = var_max_date,
id_value = var_name_combined_list[j],
description_string = var_description,
about_string = catalog_config$about_string,
about_title = catalog_config$about_title,
dashboard_string = catalog_config$dashboard_url,
dashboard_title = catalog_config$dashboard_title,
theme_title = var_name_combined_list[j],
destination_path = file.path(catalog_config$forecast_path,names(config$variable_groups)[i],var_name_combined_list[j]),
aws_download_path = var_data$path[1],
group_var_items = stac4cast::generate_variable_model_items(model_list = var_models$model_id),
thumbnail_link = 'pending',
thumbnail_title = 'pending',
group_var_vector = NULL,
group_sites = find_var_sites$site_id)
var_name_full <- var_gsheet_arrange[which(var_gsheet_arrange$`"official" targets name` %in% var_values),1][[1]]

}
}
## CREATE VARIABLE GROUP JSONS
# Description text shown on the group page; built from the name of group i.
group_description <- paste0('This page includes variables for the ',names(config$variable_groups[i]),' group.')

## find group sites
# Distinct site_id values among forecast rows whose variable is in var_values.
find_group_sites <- forecast_data_df |>
filter(variable %in% var_values) |>
distinct(site_id)

## create empty vector to track publication information
# Both vectors are appended to inside the variable/duration loop that follows
# and are passed to the group-level build_group_variables() call after it.
citation_build <- c()
doi_build <- c()

## create empty vector to track variable information
# Collects the formal (duration_variable) names built in the loop that follows.
variable_name_build <- c()

## Build one catalog page per variable/duration pair in group i.
## Side effects: creates the destination directory for each pair and appends to
## citation_build, doi_build and variable_name_build, which are initialised
## before this loop and used by the group-level page built after it.
## seq_along() is used so an empty group_vars/duration list is skipped rather
## than iterated as 1:0.
for (j in seq_along(config$variable_groups[[i]]$group_vars)){ # FOR EACH VARIABLE WITHIN A MODEL GROUP

  var_name <- names(config$variable_groups[[i]]$group_vars[j])
  print(var_name)

  for (k in seq_along(config$variable_groups[[i]]$group_vars[[j]]$duration)){
    duration_value <- config$variable_groups[[i]]$group_vars[[j]]$duration[k]
    print(duration_value)

    ## save original duration name for reference
    duration_name <- config$variable_groups[[i]]$group_vars[[j]]$duration[k]

    ## create formal variable name
    duration_value[which(duration_value == 'P1D')] <- 'Daily'
    duration_value[which(duration_value == 'PT1H')] <- 'Hourly'
    duration_value[which(duration_value == 'PT30M')] <- '30min'
    duration_value[which(duration_value == 'P1W')] <- 'Weekly'

    # NOTE(review): var_name_full is indexed by j, which assumes the gsheet
    # lookup returns names in the same order as group_vars -- confirm.
    var_formal_name <- paste0(duration_value,'_',var_name_full[j])

    # check data and skip if no data found
    var_data_check <- forecast_data_df |>
      filter(variable == var_name)

    if (nrow(var_data_check) == 0){
      print('No data available for variable')
      next
    }

    var_destination <- file.path(catalog_config$forecast_path,names(config$variable_groups)[i],var_formal_name)

    if (!dir.exists(var_destination)){
      dir.create(var_destination)
    }

    var_data <- forecast_data_df |>
      filter(variable == var_name,
             duration == duration_name)

    var_date_range <- var_data |> dplyr::summarise(min(date),max(date))
    var_min_date <- var_date_range$`min(date)`
    var_max_date <- var_date_range$`max(date)`

    var_models <- var_data |> distinct(model_id)

    find_var_sites <- forecast_data_df |>
      filter(variable == var_name) |>
      distinct(site_id)

    var_description <- paste0('This page includes all models for the ',var_formal_name,' variable.')

    var_path <- var_data$path[1]

    ## build lists for creating publication items
    var_citations <- config$variable_groups[[i]]$group_vars[[j]]$var_citation
    doi_citations <- config$variable_groups[[i]]$group_vars[[j]]$var_doi

    #update group list of publication information
    citation_build <- append(citation_build, var_citations)
    doi_build <- append(doi_build, doi_citations)

    # Record each variable page exactly once; appending twice would list every
    # variable twice in the group page items.
    variable_name_build <- append(variable_name_build, var_formal_name)

    # NOTE(review): other build calls in this script pass forecast_theme_df as
    # table_schema -- confirm forecast_data_df is intended here.
    stac4cast::build_group_variables(table_schema = forecast_data_df,
                                     #theme_id = var_formal_name[j],
                                     table_description = forecast_description_create,
                                     start_date = var_min_date,
                                     end_date = var_max_date,
                                     id_value = var_formal_name,
                                     description_string = var_description,
                                     about_string = catalog_config$about_string,
                                     about_title = catalog_config$about_title,
                                     dashboard_string = catalog_config$dashboard_url,
                                     dashboard_title = catalog_config$dashboard_title,
                                     theme_title = var_formal_name,
                                     destination_path = var_destination,
                                     aws_download_path = var_path,
                                     group_var_items = stac4cast::generate_variable_model_items(model_list = var_models$model_id),
                                     thumbnail_link = 'pending',
                                     thumbnail_title = 'pending',
                                     group_var_vector = NULL,
                                     group_sites = find_var_sites$site_id,
                                     citation_values = var_citations,
                                     doi_values = doi_citations)
  } ## end duration loop

} ## end variable loop

## BUILD THE GROUP PAGES WITH UPDATED VAR/PUB INFORMATION
# Build the landing page for variable group i, listing every variable page
# created in the loop above together with the collected citations/DOIs.
# This is the forecasts catalog, so the page is written under forecast_path
# (the same directory created for the group earlier in this loop) and points at
# the forecasts download path, not the scores ones.
# NOTE(review): other build calls in this script pass forecast_theme_df as
# table_schema -- confirm forecast_data_df is intended here.
stac4cast::build_group_variables(table_schema = forecast_data_df,
                                 table_description = forecast_description_create,
                                 start_date = forecast_min_date,
                                 end_date = forecast_max_date,
                                 id_value = names(config$variable_groups)[i],
                                 description_string = group_description,
                                 about_string = catalog_config$about_string,
                                 about_title = catalog_config$about_title,
                                 dashboard_string = catalog_config$dashboard_url,
                                 dashboard_title = catalog_config$dashboard_title,
                                 theme_title = names(config$variable_groups[i]),
                                 destination_path = file.path(catalog_config$forecast_path,names(config$variable_groups)[i]),
                                 aws_download_path = catalog_config$aws_download_path_forecasts,
                                 group_var_items = stac4cast::generate_group_variable_items(variables = variable_name_build),
                                 thumbnail_link = config$variable_groups[[i]]$thumbnail_link,
                                 thumbnail_title = config$variable_groups[[i]]$thumbnail_title,
                                 group_var_vector = unique(var_values),
                                 group_sites = find_group_sites$site_id,
                                 citation_values = citation_build,
                                 doi_values = doi_build)
} # end group loop
5 changes: 4 additions & 1 deletion challenge_configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ variable_groups:
duration: ['P1D']
thumbnail_link: 'https://d9-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/s3fs-public/thumbnails/image/Back-b.jpg'
thumbnail_title: 'USGS Streamgage'
group_vars:
chla:
duration: "P1D"

# Forecast catalog configuration
catalog_config:
Expand Down Expand Up @@ -87,6 +90,6 @@ catalog_config:
base_image_path: 'https://data.ecoforecast.org/usgsrc4cast-catalog'
citation_doi: "https://doi.org/10.1002/fee.2616"
citation_text: "Thomas, R.Q., C. Boettiger, C.C. Carey, M.C. Dietze, L.R. Johnson, M.A. Kenney, J.S. Mclachlan, J.A. Peters, E.R. Sokol, J.F. Weltzin, A. Willson, W.M. Woelmer, and Challenge Contributors. 2023. The NEON Ecological Forecasting Challenge. Frontiers in Ecology and Environment 21: 112-113."
dashboard_url: "https://projects.ecoforecast.org/usgsrc4cast-docs/"
dashboard_url: "https://projects.ecoforecast.org/usgsrc4cast-ci/"
dashboard_title: "EFI-USGS River Chlorophyll Forecast Challenge Dashboard"
site_metadata_url: 'https://raw.githubusercontent.com/eco4cast/usgsrc4cast-ci/main/USGS_site_metadata.csv'

0 comments on commit fb78bea

Please sign in to comment.