diff --git a/vignettes/example_output/FLARE1_FOR_H_2018_07_18_2018_07_25_F_16_20200817T112017.nc b/vignettes/example_output/FLARE1_FOR_H_2018_07_18_2018_07_25_F_16_20200817T112017.nc new file mode 100644 index 0000000..0d5cd75 Binary files /dev/null and b/vignettes/example_output/FLARE1_FOR_H_2018_07_18_2018_07_25_F_16_20200817T112017.nc differ diff --git a/vignettes/flare-metadata-example.Rmd b/vignettes/flare-metadata-example.Rmd index 79b682f..4ab41be 100644 --- a/vignettes/flare-metadata-example.Rmd +++ b/vignettes/flare-metadata-example.Rmd @@ -1,556 +1,314 @@ --- -title: "Example metadata for a FLARE forecast" +title: "Generating metadata from EFI NETCDF output" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Example metadata for a FLARE forecast} + %\VignetteIndexEntry{Generating metadata from EFI NETCDF output} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- - ```{r setup, message=FALSE} library(EML) -library(uuid) library(ncdf4) library(emld) library(lubridate) library(tibble) library(dplyr) library(tidyr) +library(EFIstandards) emld::eml_version("eml-2.2.0") ``` -First, set the forecast identifiers - -ForecastProject_id represents the launch of an automated, iterative forecast. It is created -each time a human modifies the forecast code. It can be a DOI because this is the level that -we envision citations occuring. - -Forecast_id represents each forecast cycle within a ForecastProject_id +## Metadata from output forecast generated by FLARE +This example uses a netcdf output file produced by the Forecasting Lakes and Reservoir +Ecosystems project (https://github.com/CareyLabVT/FLARE). The netcdf output file +already follows the EFI recommended format for forecast output. This vingnettes +shows how to create the EFI metadata from the output file. -For example, if you have a forecast code base on GitHub and launch a forecast from that code that -runs daily for 365 days, then there will be one ForecastProject_id and 365 Forecast_ids. A paper -analyzing the forecasts would cite the ForecastProject_id. +We recommend reading the `logistic-metadata-example` for guidence on generating +output that meets the EFI Standard. ```{r} - -ForecastProject_id <- 30405043 #Some ID that applies to a bunch of forecasts -Forecast_id <- uuid::UUIDgenerate() #The ID that applies to the specific forecast in the cycel +file_name <- "example_output/FLARE1_FOR_H_2018_07_18_2018_07_25_F_16_20200817T112017.nc" ``` -# Scenario 1: Oxygen System turned on - -## Generating the forecast +First, there is set of information that FLARE has in memory that is used to generate +the eml. + +```{r} +nstates <- 560 #Number of modeled states x number of depths +npars <- 10 #Number of parameters that fit in FLARE +n_met_members <- 21 #Number NOAA weather forecast ensemble members +n_ds_members <- 5 #Number of weather downscaling uncertainty members +lake_name <- "Falling Creek Reservoir, Vinton, VA" +lake_latitude <- 37.307 #Degrees North +lake_longitude <- 79.837 #Degrees West +``` -First, get the output as we are currently saving it +Second, there is a set of information requires for generating the metadata that +is FLARE already has in memory when creating the metadata. It needs to be +loaded here. ```{r} -netcdf_file <- system.file("extdata", "IW2_1_SSSon_H_2019_05_27_2019_06_03_F_16_2132020_21_32.nc", - package="EFIstandards", mustWork = TRUE) -nc <- nc_open(netcdf_file) - -#Dimnesions -depth <- ncvar_get(nc, "z") -time <- ncvar_get(nc,'time') -local_tzone <- ncatt_get(nc, 0)$time_zone_of_simulation -time <- as.POSIXct(time, origin = '1970-01-01 00:00.00 UTC', tz = local_tzone) -time <- strftime(time , "%Y-%m-%dT%H:%M:%S%z") - -#States -temperature <- ncvar_get(nc, "temp") -oxygen <- ncvar_get(nc, "OXY_oxy") -zone1temp <- ncvar_get(nc, "zone1temp") -zone2temp <- ncvar_get(nc, "zone2temp") -sw_factor <- ncvar_get(nc, "sw_factor") -inflow_factor <- ncvar_get(nc, "inflow_factor") - -#Forecast timings -forecasted <- ncvar_get(nc, "forecasted") -data_assimilation <- if_else(forecasted == 0, 1, 0) -#This doesn't work but we need to update our writing script to just have the date -forecast_issue_time <- ncatt_get(nc, varid=0, attname = "history") - -forecast_issue_time <- "2020-02-13T21:32:52-05:00" - -nc_close(nc) +abstract_text <- "This is where a longer description of the forest can be added" +forecast_title <- "FLARE" +intellectualRights <- "insert license" +model_description <- list( + name = "General Lake Model", + type = "process-based", + repository = "https://github.com/AquaticEcoDynamics/GLM/releases/tag/v3.1.0" +) +me <- list(individualName = list(givenName = "Quinn", + surName = "Thomas"), + electronicMailAddress = "rqthomas@vt.edu", + id = "https://orcid.org/0000-0003-1282-7825") ``` -## Saving to a standardized output format (Option 1) +## Building the metadata -Second, build a data frame for all variables with a depth dimension +Open and extract information for building the metadata file ```{r} - -n_depths <- length(depth) -state_names <- c("temperature", "oxygen") -states <- list(temperature, oxygen) -n_states <- length(state_names) -df_combined <- list() - -for(k in 1:n_states){ -for(i in 1:n_depths){ - df <- as_tibble(states[[k]][, ,i]) - names(df) <- as.character(seq(1, ncol(states[[k]][, ,i]))) - df <- cbind(time, df, data_assimilation) - df <- df %>% - pivot_longer(cols = -c(time,data_assimilation), - names_to = "ensemble", - values_to = state_names[k]) %>% - mutate(ensemble = as.integer(ensemble)) %>% - mutate(depth = depth[i]) - if(i == 1){ - running_df <- df - }else{ - running_df <- rbind(running_df, df) - } -} - df_combined[[k]] <- running_df +nc <- nc_open(file_name) +t <- ncvar_get(nc,'time') #Saved as seconds since 1970-01-01 00:00.00 UTC +time <- as.POSIXct(t, + origin = '1970-01-01 00:00.00 UTC', + tz = "UTC") + +#data assimilation flag +data_assimilation <- ncvar_get(nc, "data_assimilation") + +#These are in the netcdf global attributed +forecast_issue_time <-ncatt_get(nc, varid = 0)$forecast_issue_time +forecast_iteration_id <- ncatt_get(nc, varid = 0)$forecast_iteration_id +forecast_project_id <- ncatt_get(nc, varid = 0)$forecast_project_id + +#Number of ensemble members +nmembers <- length(ncvar_get(nc, "ensemble")) + +#Vectors for building the attributes table +var_names <- names(nc$var) +attributeName <- rep(NA, length(var_names)) +attributeDefinition <- rep(NA, length(var_names)) +unit <- rep(NA, length(var_names)) +formatString <- rep(NA, length(var_names)) +numberType <- rep(NA, length(var_names)) +definition <- rep(NA, length(var_names)) +col_classes <- rep("numeric", length(var_names)) +for(i in 1:length(var_names)){ + curr_var <- nc$var[[i]] + attributeName[i] <- curr_var$name #Name of variable in model + attributeDefinition[i] <- curr_var$longname #longname in netcdf + unit[i] <- "dimensionless" #use dimesionless in the eml and have the correct + #units in the netcdf + formatString[i] <- NA + numberType[i] <- typeof(ncvar_get(nc, var_names[i])) + if(numberType[i] == "double"){numberType[i] = "real"} } -df_combined <- right_join(df_combined[[1]], df_combined[[2]], - by = c("time", "ensemble", "depth", "data_assimilation")) %>% - mutate(forecast_issue_time = forecast_issue_time, - ForecastProject_id = ForecastProject_id, - Forecast_id = Forecast_id, - scenario = "oxygen_on") %>% - select(time, depth, scenario, ensemble, temperature, - oxygen, forecast_issue_time, - data_assimilation, Forecast_id, ForecastProject_id) - -df_combined_scenario_1 <- df_combined - +nc_close(nc) ``` -Second, build a data frame for all variables without a depth dimension (parameters). +Create the eml attributes table ```{r} -df <- as_tibble(zone1temp[ , ]) -names(df) <- as.character(seq(1, ncol(zone1temp))) -df <- cbind(time, data_assimilation, df) -df_combined_p1 <- df %>% - pivot_longer(cols = -c("time", "data_assimilation"), - names_to = "ensemble", - values_to = "zone1temp") %>% - mutate(ensemble = as.integer(ensemble), - scenario = "oxygen_on") - -df <- as_tibble(zone2temp[ , ]) -names(df) <- as.character(seq(1, ncol(zone2temp))) -df <- cbind(time, data_assimilation, df) -df_combined_p2 <- df %>% - pivot_longer(cols = -c("time", "data_assimilation"), - names_to = "ensemble", - values_to = "zone2temp") %>% - mutate(ensemble = as.integer(ensemble), - scenario = "oxygen_on") - -df <- as_tibble(sw_factor[ , ]) -names(df) <- as.character(seq(1, ncol(sw_factor))) -df <- cbind(time, data_assimilation, df) -df_combined_p3 <- df %>% - pivot_longer(cols = -c("time", "data_assimilation"), - names_to = "ensemble", - values_to = "sw_factor") %>% - mutate(ensemble = as.integer(ensemble), - scenario = "oxygen_on") - -df <- as_tibble(inflow_factor[ , ]) -names(df) <- as.character(seq(1, ncol(inflow_factor))) -df <- cbind(time, data_assimilation, df) -df_combined_p4 <- df %>% - pivot_longer(cols = -c("time", "data_assimilation"), - names_to = "ensemble", - values_to = "inflow_factor") %>% - mutate(ensemble = as.integer(ensemble), - scenario = "oxygen_on") - -df_parameters_1 <- right_join(df_combined_p1, - df_combined_p2, - by = c("time", - "data_assimilation", - "ensemble", - "scenario")) %>% - right_join(df_combined_p3, - by = c("time", - "data_assimilation", - "ensemble", - "scenario")) %>% - right_join(df_combined_p4, - by = c("time", - "data_assimilation", - "ensemble", - "scenario")) %>% - mutate(forecast_issue_time = forecast_issue_time, - Forecast_id = Forecast_id, - ForecastProject_id = ForecastProject_id) %>% - select(time, scenario, ensemble, forecast_issue_time, - data_assimilation, Forecast_id, ForecastProject_id, - zone1temp, zone2temp, sw_factor, inflow_factor) -``` +attributes <- tibble( + attributeName = attributeName, + attributeDefinition = attributeDefinition, + unit = unit, + formatString = formatString, + numberType = numberType, + definition = definition +) -# Scenario 2: Oxygen System turned off +attrList <- set_attributes(attributes, + col_classes = col_classes) -## Generating the forecast +``` -First, get the output as we are currently saving it +Set metadata about the file itself (name, file type, size, MD5, etc) ```{r} -netcdf_file2 <- system.file("extdata", "IW2_1_SSSoff_H_2019_05_27_2019_06_03_F_16_2132020_21_19.nc", package="EFIstandards") -nc <- nc_open(netcdf_file2) - -#Dimnesions -depth <- ncvar_get(nc, "z") -time <- ncvar_get(nc,'time') -local_tzone <- ncatt_get(nc, 0)$time_zone_of_simulation -time <- as.POSIXct(time, origin = '1970-01-01 00:00.00 UTC', tz = local_tzone) -time <- strftime(time , "%Y-%m-%dT%H:%M:%S%z") - -#States -temperature <- ncvar_get(nc, "temp") -oxygen <- ncvar_get(nc, "OXY_oxy") -zone1temp <- ncvar_get(nc, "zone1temp") -zone2temp <- ncvar_get(nc, "zone2temp") -sw_factor <- ncvar_get(nc, "sw_factor") -inflow_factor <- ncvar_get(nc, "inflow_factor") - -#Forecast timings -forecasted <- ncvar_get(nc, "forecasted") -data_assimilation <- if_else(forecasted == 0, 1, 0) -#This doesn't work but we need to update our writing script to just have the date -forecast_issue_time <- ncatt_get(nc, varid=0, attname = "history") -forecast_issue_time <- "2020-02-13T21:32:52-05:00" - -nc_close(nc) +physical <- set_physical(file_name) ``` -## Saving to a standardized output format (Option 1) +Describing the output file ```{r} -n_depths <- length(depth) -state_names <- c("temperature", "oxygen") -states <- list(temperature, oxygen) -n_states <- length(state_names) -df_combined <- list() - -for(k in 1:n_states){ -for(i in 1:n_depths){ - df <- as_tibble(states[[k]][, ,i]) - names(df) <- as.character(seq(1, ncol(states[[k]][, ,i]))) - df <- cbind(time, df, data_assimilation) - df <- df %>% - pivot_longer(cols = -c(time,data_assimilation), - names_to = "ensemble", - values_to = state_names[k]) %>% - mutate(ensemble = as.integer(ensemble)) %>% - mutate(depth = depth[i]) - if(i == 1){ - running_df <- df - }else{ - running_df <- rbind(running_df, df) - } -} - df_combined[[k]] <- running_df -} - -df_combined <- right_join(df_combined[[1]], df_combined[[2]], - by = c("time", "ensemble", "depth", "data_assimilation")) %>% - mutate(forecast_issue_time = forecast_issue_time, - Forecast_id = Forecast_id, - ForecastProject_id = ForecastProject_id, - scenario = "oxygen_off") %>% - select(time, depth, scenario, ensemble, temperature, - oxygen, forecast_issue_time, - data_assimilation, Forecast_id, ForecastProject_id) - -df_combined_scenario_2 <- df_combined - +dataTable <- eml$otherEntity( + entityName = "forecast", + entityDescription = "Forecast of water physics, chemistry, and biology", + physical = physical, + attributeList = attrList) ``` -Second, build a data frame for all variables without a depth dimension (parameters). +Let the eml know that the correct units can be found in the netcdf ```{r} -df <- as_tibble(zone1temp[ , ]) -names(df) <- as.character(seq(1, ncol(zone1temp))) -df <- cbind(time, data_assimilation, df) -df_combined_p1 <- df %>% - pivot_longer(cols = -c("time", "data_assimilation"), - names_to = "ensemble", - values_to = "zone1temp") %>% - mutate(ensemble = as.integer(ensemble), - scenario = "oxygen_off") - -df <- as_tibble(zone2temp[ , ]) -names(df) <- as.character(seq(1, ncol(zone2temp))) -df <- cbind(time, data_assimilation, df) -df_combined_p2 <- df %>% - pivot_longer(cols = -c("time", "data_assimilation"), - names_to = "ensemble", - values_to = "zone2temp") %>% - mutate(ensemble = as.integer(ensemble), - scenario = "oxygen_off") - -df <- as_tibble(sw_factor[ , ]) -names(df) <- as.character(seq(1, ncol(sw_factor))) -df <- cbind(time, data_assimilation, df) -df_combined_p3 <- df %>% - pivot_longer(cols = -c("time", "data_assimilation"), - names_to = "ensemble", - values_to = "sw_factor") %>% - mutate(ensemble = as.integer(ensemble), - scenario = "oxygen_off") - -df <- as_tibble(inflow_factor[ , ]) -names(df) <- as.character(seq(1, ncol(inflow_factor))) -df <- cbind(time, data_assimilation, df) -df_combined_p4 <- df %>% - pivot_longer(cols = -c("time", "data_assimilation"), - names_to = "ensemble", - values_to = "inflow_factor") %>% - mutate(ensemble = as.integer(ensemble), - scenario = "oxygen_off") - -df_parameters_2 <- right_join(df_combined_p1, - df_combined_p2, - by = c("time", - "data_assimilation", - "ensemble", - "scenario")) %>% - right_join(df_combined_p3, - by = c("time", - "data_assimilation", - "ensemble", - "scenario")) %>% - right_join(df_combined_p4, - by = c("time", - "data_assimilation", - "ensemble", - "scenario")) %>% - mutate(forecast_issue_time = forecast_issue_time, - Forecast_id = Forecast_id, - ForecastProject_id = ForecastProject_id) %>% - select(time, scenario, ensemble, forecast_issue_time, - data_assimilation, Forecast_id, ForecastProject_id, - zone1temp, zone2temp, sw_factor, inflow_factor) +set_unitList(data.frame(id = 'netcdf', + unitType="dimensionless", + "parentSI"="dimensionless", + "multiplierToSI" = 1, + "description"="units are defined in the netcdf file")) ``` -## Saving to a standardized output format (Option 1) - -Create the 1D and 0D tables +Set the coverage for the output file ```{r} - -df_combined_0D <- rbind(df_parameters_1, df_parameters_2) -df_combined_1D <- rbind(df_combined_scenario_1, df_combined_scenario_2) +coverage <- + set_coverage(begin = as_date(first(time)), + end = as_date(last(time)), + geographicDescription = lake_name, + west = lake_longitude, east = lake_longitude, + north = lake_latitude, south = lake_latitude) ``` -Write tables to CSV. +Set the keywords for the output file ```{r} -write.csv(df_combined_1D, "flare-forecast-ensemble-multi-variable-1D.csv", row.names = FALSE) -write.csv(df_combined_0D, "flare-forecast-ensemble-parameters-0D.csv", row.names = FALSE) +keywordSet <- list( + list( + keywordThesaurus = "EFI controlled vocabulary", + keyword = list("forecast", + "water quality", + "timeseries") + )) ``` -## Standardized Metadata - -Let's document the metadata of the data table itself. It may well be that we decide an Ecological Forecast has to have specific columns like the ones described above, which would thus correspond to a partially pre-defined attributes table (e.g. the units would probably still be allowed to vary, but format would be the same.) - -Note one weakness of this format is that it assumes all data in a column have the same units. This common assumption might be violoated by transformations to "long" form data, where you have columns like "variable", "value", and "units". (The long form may be useful, but it exposes much less information in the metadata layer -- e.g. we no longer know what's actually being measured without looking at the data file itself). +Set the abstract for the output file ```{r} -#attributes <- tibble::tribble( -# ~attributeName, ~attributeDefinition, ~unit, ~formatString, ~numberType, -# "time", "time", "year", "YYYY-MM-DD%THH:MM_SS%Z", "numberType", -# "depth", "depth in reservior", "meter", NA, "real", -# "ensemble", "index of ensemble member", "dimensionless", NA, "integer", -# "scenario", "forecast scenario", "dimensionless", NA, NA, NA, "scenario name", -# -# ##### EDIT STARTING HERE -# "temperature", "water temperature", "celsius", NA, "real", NA, -# "oxygen", "oxygen concentration", "numberPerMeterSquared", NA, "real", NA, -# #### EDIT ENDING HERE -# -# "forecast_issue_time", "time that forecast was created", "dimensionless", "YYYY-MM-DD", "numberType", -# "data_assimilation", "Flag whether time step included data assimilation", "dimensionless", NA, "integer", -# "Forecast_id", "ID for specific forecast cycle", "dimensionless", NA, "character", -# "ForecastProject_id", "ID for forecasting project", "dimensionless", NA, "character", -# ) - -attributes <- tibble::tribble( - ~attributeName, ~attributeDefinition, ~unit, ~formatString, ~numberType, ~definition, - "time", "time", "year", "YYYY-MM-DD", "numberType", NA, - "depth", "depth in reservior", "meter", NA, "real", NA, - "ensemble", "index of ensemble member", "dimensionless", NA, "integer", NA, - "scenario", "forecast scenario", NA, NA, NA, "scenario name", - "temperature", "water temperature", "celsius", NA, "real", NA, - "oxygen", "oxygen concentration", "numberPerMeterSquared", NA, "real", NA, - "forecast_issue_time", "time that forecast was created", NA, "YYYY-MM-DD", NA, NA, - "data_assimilation", "Flag whether time step included data assimilation", "dimensionless", NA, "integer", NA, - "Forecast_id", "ID for specific forecast cycle", NA, NA, NA, "forecast id", - "ForecastProject_id", "ID for forecasting project", NA, NA, NA, "project id" -) - - -attrList <- set_attributes(attributes, - col_classes = c("Date", "numeric", "character", "numeric", - #EDIT STARTING HERE - "numeric","numeric", - #EDIT ENDING HERE - "Date", - "numeric", "character", "character")) -physical <- set_physical("flare-forecast-ensemble-multi-variable-1D.csv") - -dataTable <- eml$dataTable( - entityName = "flare-forecast-ensemble-multi-variable-1D.csv", - entityDescription = "Falling Creek Reservior Forecast", - physical = physical, - attributeList = attrList) +abstract <- abstract_text ``` - -There's a lot more optional terminology that could be exploited here -- for instance, the specification lets us define different missing value codes (and explanations) for each column, and allows us to indicate `precision`, `minimum` and `maximum`. - -Note that `physical` type can document almost any formats as well, including NetCDF etc. A NetCDF file would still document the variables measured in much the same way regardless of the underlying representation. Note that - -Now that we've documented the actual data.frame itself, we can add additional metadata to the record describing our forecast, which is essential for citing, discovering, and interpreting the result. We start with some authorship information. +Describing the data set ```{r} - -me <- list(individualName = list(givenName = "Quinn", - surName = "Thomas"), - electronicMailAddress = "rqthomas@vt.edu", - id = "https://orcid.org/0000-0003-1282-7825") - - +dataset <- eml$dataset( + title = forecast_title, + creator = me, + contact = list(references=me$id), + pubDate = as_date(as_datetime(forecast_issue_time)), + intellectualRights = intellectualRights, + abstract = abstract, + dataTable = dataTable, + keywordSet = keywordSet, + coverage = coverage +) ``` -Set Taxonomic, Temporal, and Geographic Coverage. +### EFI specific attributes ```{r} -coverage <- - set_coverage(begin = as_datetime(time[1]), - end = as_datetime(tail((time)[1])), - geographicDescription = "Falling Creek Reservior", - west = -79.9, east = -79.9, - north = 37.27, south = 37.27) -``` +initial_conditions <- list( + # Possible values: no, contains, data_driven, propagates, assimilates + uncertainty = "assimilates", + # Number of parameters / dimensionality + complexity = nstates, + propagation = list( + type = "ensemble", # ensemble vs analytic + size = nmembers # required if ensemble + ), + assimilation = list( + type = "EnKF", + reference = "https://www.biorxiv.org/content/10.1101/2020.01.22.915538v2.abstract", + complexity = nstates + ) +) -Set key words. We will need to develop a EFI controlled vocabulary +parameters <- list( + uncertainty = "assimilates", + complexity = npars, + propagation = list( + type = "ensemble", # ensemble vs analytic + size = nmembers # required if ensemble + ), + assimilation = list( + type = "EnKF", + reference = "https://www.biorxiv.org/content/10.1101/2020.01.22.915538v2.abstract", + complexity = npars + ) +) -```{r} -keywordSet <- list( - list( - keywordThesaurus = "EFI controlled vocabulary", - keyword = list("forecast", - "ecosystem", - "timeseries") - )) -``` +process_error <- list( + uncertainty = "propagates", + propagation = list( + type = "ensemble", # ensemble vs analytic + size = nmembers # required if ensemble + ), + complexity = nstates, + covariance = TRUE, + localization = "Distance extinction of covariance" +) +random_effects <- list( + uncertainty = "no" +) -Our dataset needs an abstract describing what this is all about. Also, a methods section is not required but it's probably a good idea. Here we import a methods section that was written in Markdown. - -**Forecast timestep**: 1 day - -**Forecast time horizon** 16 days - -**Data assimilation** - - * Data Assimilation used: Yes - * If, DA used - type of method: EnKF - * If, DA used - Number of parameters calibrated: 3 - * If, DA used - Sources of training data (DOI, GitHub): https://github.com/CareyLabVT/SCCData/tree/carina-data - -**Model Description** - - * Type of model (Empirical, process-based, machine learning): Process-based - * Model name: General Lake Model-AED V3 - * Location of repository with model code: - * GLM: https://github.com/AquaticEcoDynamics/GLM - * AED: https://github.com/AquaticEcoDynamics/libaed2 - * Model citation: Hipsey et al. 2019 GMD - * Total number of model process parameters: Hard to count - -**Model Covariates** - - * Type (i.e., meteorology): meteorology, Stream Inflow - * Source (i.e., NOAA GEFS): NOAA GEFS 16-day, https://github.com/CareyLabVT/SCCData/tree/diana-data - -**Uncertainty (No, Derived from data, Propagates, Assimilates)** - - * Initial conditions: Assimilates - * Parameter: Assimilates - * Parameter Random Effects: No - * Process (within model): Propagates, Assimilates - * Multi-model: No - * Driver: Derived from data - * Scenario: Yes - oxygen system on and off - * Method for propagating uncertainty (Analytic, ensemble numeric): ensemble numeric - * If Analytic, specific method - * If ensemble numeric, number of ensembles: 210 - +drivers <- list( + uncertainty = "propagates", + complexity = n_met_members * n_ds_members, + propagation = list( + type = "ensemble", # ensemble vs analytic + size = n_met_members * n_ds_members) +) +``` +Get the forecast horizon using the data assimilation flag ```{r} -abstract_text <- system.file("extdata", "abstract.md", package="EFIstandards") -methods_text <- system.file("extdata", "methods.md", package="EFIstandards") - -abstract <- list(markdown = paste(readLines(abstract_text), collapse = "\n")) -methods <- list(id="forecast", - methodStep = list(description = - list(markdown = paste(readLines(methods_text), - collapse = "\n")))) ## to be dropped +forecast_horizon <- length(which(data_assimilation == 0)) ``` +Build the EFI additional metadata ```{r} -dataset = eml$dataset( - title = "FLARE forecast", - creator = me, - contact = list(references="https://orcid.org/0000-0003-1282-7825"), - pubDate = forecast_issue_time, - intellectualRights = "http://www.lternet.edu/data/netpolicy.html.", - abstract = "An illustration of how we might use EML metadata to describe an ecological forecast", - dataTable = dataTable, - keywordSet = keywordSet, - coverage = coverage, - methods = methods - ) +additionalMetadata <- eml$additionalMetadata( + metadata = list( + forecast = list( + ## Basic elements + timestep = "1 day", + forecast_horizon = forecast_horizon, + forecast_issue_time = forecast_issue_time, + forecast_iteration_id = forecast_iteration_id, + forecast_project_id = forecast_project_id, + model_description = model_description, + ## UNCERTAINTY CLASSES + initial_conditions = initial_conditions, + parameters = parameters, + random_effects = random_effects, + process_error = process_error, + drivers = drivers + ) # forecast + ) # metadata +) # eml$additionalMetadata ``` -All we need now is to add a unique identifier for the project and we are good to go! This could be a DOI or merely an identifier we create, e.g. a UUID. +Build the EML ```{r} my_eml <- eml$eml(dataset = dataset, - packageId = ForecastProject_id, - system = "uuid" - ) + additionalMetadata = additionalMetadata, + packageId = forecast_project_id, + system = "uuid" ## system used to generate packageId +) ``` -Once we have finished building our EML metadata, we can confirm it is valid. This will catch any missing elements. (Recall that what is 'required' depends on what you include -- for example, you don't have to document a `dataTable` at all, but if you do, you have to document the "physical" file format it is in (e.g. `csv`) and the attributes and units it uses!) - -```{r} -eml_validate(my_eml) -``` +Check if the EML is a valid EML and whether is meets the EFI Forecasting +Standards -We are now ready to write out a valid EML document: ```{r} -write_eml(my_eml, "forecast-flare-eml.xml") +EFIstandards::forecast_validator(my_eml) ``` -At this point, we could easily upload this metadata along with the data itself to DataONE via the API (or `dataone` R package.) - -We can also generate a JSON-LD version of EML: +Write EML file ```{r} -emld::as_json(as_emld("forecast-flare-eml.xml"), file = "forecast-flare-eml.json") +write_eml(my_eml, paste0(file_name,"-eml.xml")) ``` - - ```{r include=FALSE} ## Cleanup -lapply(list.files(pattern = "[.]csv"), unlink) -lapply(list.files(pattern = "[.]json"), unlink) lapply(list.files(pattern = "[.]xml"), unlink) ``` \ No newline at end of file