# forecast_model.R
library(tidyverse)
library(neon4cast)
library(lubridate)
library(rMR)
library(glue)
source("ignore_sigpipe.R")
forecast_date <- Sys.Date()
noaa_date <- Sys.Date() - days(3) # Use a NOAA forecast from a few days ago because the most recent one may not be available yet
# Step 0: Define a unique name that identifies your model on the leaderboard and connects it to your team members' info, etc.
model_id <- "neon4cast_example"
# Step 1: Download latest target data and site description data
target <- readr::read_csv(paste0("https://data.ecoforecast.org/neon4cast-targets/",
                                 "aquatics/aquatics-targets.csv.gz"), guess_max = 1e6)
site_data <- readr::read_csv(paste0("https://raw.githubusercontent.com/eco4cast/neon4cast-targets/",
                                    "main/NEON_Field_Site_Metadata_20220412.csv")) |>
  dplyr::filter(aquatics == 1)
# Step 2: Get meteorological predictions as drivers
df_past <- neon4cast::noaa_stage3()
## Helper function: for each site, average over the 0h-horizon ensemble members to get daily 'historic' values
noaa_mean_historical <- function(df_past, site, var) {
  df_past |>
    dplyr::filter(site_id == site,
                  variable == var) |>
    dplyr::rename(ensemble = parameter) |>
    dplyr::select(datetime, prediction, ensemble) |>
    dplyr::mutate(date = as_date(datetime)) |>
    dplyr::group_by(date) |>
    dplyr::summarize(air_temperature = mean(prediction, na.rm = TRUE),
                     .groups = "drop") |>
    dplyr::rename(datetime = date) |>
    dplyr::mutate(air_temperature = air_temperature - 273.15) |> # convert Kelvin to Celsius
    dplyr::collect()
}
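## Example usage (illustrative only; "BARC" stands in for any aquatics site_id
## present in the targets file). Uncomment to try:
# noaa_mean_historical(df_past, "BARC", "air_temperature")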
## Helper function: get the daily average temperature from each ensemble member over the forecast horizon
noaa_mean_forecast <- function(site, var, reference_date) {
  endpoint <- "data.ecoforecast.org"
  bucket <- glue::glue("neon4cast-drivers/noaa/gefs-v12/stage1/0/{reference_date}")
  s3 <- arrow::s3_bucket(bucket, endpoint_override = endpoint, anonymous = TRUE)
  # stage1 air temp is Celsius
  arrow::open_dataset(s3) |>
    dplyr::filter(site_id == site,
                  datetime >= lubridate::as_datetime(forecast_date), # keep only future dates (note: uses the global forecast_date)
                  variable == var) |>
    dplyr::select(datetime, prediction, parameter) |>
    dplyr::mutate(datetime = as_date(datetime)) |>
    dplyr::group_by(datetime, parameter) |>
    dplyr::summarize(air_temperature = mean(prediction), .groups = "drop") |>
    dplyr::select(datetime, air_temperature, parameter) |>
    dplyr::rename(ensemble = parameter) |>
    dplyr::collect()
}
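## Example usage (illustrative only; returns one daily mean per ensemble member):
# noaa_mean_forecast("BARC", "TMP", noaa_date)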
# Step 2.5: We'll skip any site that doesn't have both temperature and oxygen
sites <- target |>
  na.omit() |>
  distinct(site_id, variable) |>
  filter(variable %in% c("oxygen", "temperature")) |>
  count(site_id) |>
  filter(n == 2) |>
  pull(site_id)
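## Optional sanity check: see which sites survived the filter. Uncomment to try:
# print(sites)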
# Step 3.0: Define the forecast model for a site
forecast_site <- function(site) {
  message(paste0("Running site: ", site))
  # Get site information (needed for elevation)
  site_info <- site_data |> dplyr::filter(field_site_id == site)
  # Historical air temperatures
  noaa_past_mean <- noaa_mean_historical(df_past, site, "air_temperature")
  # Merge past NOAA data into the targets file, matching by date
  site_target <- target |>
    dplyr::select(datetime, site_id, variable, observation) |>
    dplyr::filter(variable %in% c("temperature", "oxygen"),
                  site_id == site) |>
    tidyr::pivot_wider(names_from = "variable", values_from = "observation") |>
    dplyr::left_join(noaa_past_mean, by = c("datetime"))
  rm(noaa_past_mean) # save RAM
  # Fit a linear model to past data: water temperature = m * air temperature + b
  fit <- lm(temperature ~ air_temperature, data = site_target)
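  ## Optional: inspect the fit; coef(fit) returns the intercept b and the
  ## slope m (named "air_temperature"). Uncomment for a full summary:
  # summary(fit)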
  # Get the 30-day predicted temperature ensemble at the site
  noaa_future <- noaa_mean_forecast(site, "TMP", noaa_date)
  # Use the linear model (predict.lm) to forecast water temperature for each ensemble member
  temperature <-
    noaa_future |>
    mutate(site_id = site,
           prediction = predict(fit, tibble(air_temperature)),
           variable = "temperature")
  # Use the forecasted water temperature to predict oxygen, assuming oxygen is at saturation
  forecasted_oxygen <-
    rMR::Eq.Ox.conc(temperature$prediction,
                    elevation.m = site_info$field_mean_elevation_m,
                    bar.press = NULL,
                    bar.units = NULL,
                    out.DO.meas = "mg/L",
                    salinity = 0,
                    salinity.units = "pp.thou")
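  ## Rough check on the saturation assumption (hypothetical values): in fresh
  ## water at sea level, saturated DO is roughly 9 mg/L at 20 C and drops as
  ## temperature rises. Uncomment to verify with the same call signature:
  # rMR::Eq.Ox.conc(20, elevation.m = 0, bar.press = NULL, bar.units = NULL,
  #                 out.DO.meas = "mg/L", salinity = 0, salinity.units = "pp.thou")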
  # stick the pieces together
  oxygen <-
    noaa_future |>
    mutate(site_id = site,
           prediction = forecasted_oxygen,
           variable = "oxygen")
  forecast <- dplyr::bind_rows(temperature, oxygen)
  # Format results to the EFI standard
  forecast |>
    mutate(reference_datetime = forecast_date,
           family = "ensemble",
           model_id = model_id) |>
    rename(parameter = ensemble) |>
    select(model_id, datetime, reference_datetime,
           site_id, family, parameter, variable, prediction)
}
### AND HERE WE GO! We're ready to start forecasting ###
## Test with a single site first!
forecast <- forecast_site(sites[1])
# Visualize the ensemble predictions -- what do you think?
forecast |>
  ggplot(aes(x = datetime, y = prediction, group = parameter)) +
  geom_line(alpha = 0.3) +
  facet_wrap(~variable, scales = "free")
# Run all sites -- may be slow!
forecast <- map_dfr(sites, forecast_site)
# The forecast file name must follow the standard required by the Challenge;
# the csv.gz extension means the file will be compressed
file_date <- Sys.Date() #forecast$reference_datetime[1]
forecast_file <- paste0("aquatics", "-", file_date, "-", model_id, ".csv.gz")
# Write the csv to disk
write_csv(forecast, forecast_file)
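## Optional: validate the file against the EFI standard before submitting.
## (forecast_output_validator() ships with the neon4cast package; skip this
## step if your version doesn't have it.) Uncomment to run:
# neon4cast::forecast_output_validator(forecast_file)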
# Step 4: Submit forecast!
neon4cast::submit(forecast_file = forecast_file, metadata = NULL, ask = FALSE)