-
Notifications
You must be signed in to change notification settings - Fork 11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Compute percentiles for daily values and their colors #45
Changes from all commits
7206599
bd04778
16cec6f
6c3fa3e
b1a783a
919cabc
30ba280
9c0b0c1
a9d6e60
f9fc643
f4cb8ec
c1ebb51
af242e2
8d09f41
be8a91a
afaa891
5e82f3b
def11ed
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
hash: 1526c1af515d9c34c65bfda095715f4d | ||
hash: b9097707855bed84ababea6e279bf73c | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
hash: a2002214eb302dee23e475fa6379df85 | ||
hash: d25d6b36a48a4c9ffbb95997df49802e | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
hash: 4481da677798c03c96badb7d7303ac09 | ||
hash: 64a273b80a416f15a80da2ec7d4f05eb | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
hash: 53ac4d4371319321a9840cb6801ca326 | ||
hash: 643764a5a20f82946253c41b1e503a5d | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,33 +1,42 @@ | ||
#' @title Download the discharge from NWIS for each dv gage | ||
#' | ||
#' | ||
#' @param ind_file character file name where the output should be saved | ||
#' @param sites_ind indicator file for the vector of site numbers | ||
#' @param dates object from viz_config.yml that specifies dates as string | ||
fetch_dv_data <- function(ind_file, sites_ind, dates){ | ||
|
||
#' @param request_limit number indicating how many sites to include per dataRetrieval request (from viz_config.yml) | ||
fetch_dv_data <- function(ind_file, sites_ind, dates, request_limit){ | ||
|
||
sites <- readRDS(scipiper::sc_retrieve(sites_ind, remake_file = '1_fetch.yml')) | ||
|
||
dv_sites_data <- lapply(sites, FUN = function(x){ | ||
d <- dataRetrieval::readNWISdata( | ||
service="dv", | ||
site = x, | ||
|
||
req_bks <- seq(1, length(sites), by=request_limit) | ||
dv_data <- data.frame() | ||
for(i in req_bks) { | ||
last_site <- min(i+request_limit-1, length(sites)) | ||
get_sites <- sites[i:last_site] | ||
data_i <- | ||
dataRetrieval::readNWISdata( | ||
service = "dv", | ||
statCd = "00003", # need this to avoid NAs | ||
site = get_sites, | ||
parameterCd = "00060", | ||
startDate = dates$start, | ||
endDate = dates$end) %>% | ||
endDate = dates$end) %>% | ||
dataRetrieval::renameNWISColumns() | ||
if(nrow(d) > 0 && any(names(d) == "Flow")) { | ||
d[, c("dateTime", "Flow")] # keep only dateTime and Flow columns | ||
|
||
if(nrow(data_i) > 0 && any(names(data_i) == "Flow")) { | ||
data_i <- data_i[, c("site_no", "dateTime", "Flow")] # keep only the site_no, dateTime, and Flow columns | ||
} else { | ||
NULL # no data returned situation | ||
data_i <- NULL # no data returned situation | ||
} | ||
|
||
}) | ||
|
||
names(dv_sites_data) <- sites | ||
|
||
|
||
dv_data <- rbind(dv_data, data_i) | ||
message(paste("Completed", last_site, "of", length(sites))) | ||
} | ||
|
||
dv_data_unique <- dplyr::distinct(dv_data) # need this to avoid some duplicates | ||
|
||
# Write the data file and the indicator file | ||
data_file <- scipiper::as_data_file(ind_file) | ||
saveRDS(dv_sites_data, data_file) | ||
saveRDS(dv_data_unique, data_file) | ||
scipiper::gd_put(ind_file, data_file) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,26 @@ | ||
#' @title Fetch appropriate daily value sites from NWIS | ||
#' | ||
#' | ||
#' @param ind_file character file name where the output should be saved | ||
#' @param dates object from viz_config.yml that specifies dates as string | ||
fetch_dv_sites <- function(ind_file, dates){ | ||
|
||
hucs <- zeroPad(1:21, 2) # all hucs | ||
|
||
sites <- c() | ||
for(huc in hucs){ | ||
sites <- | ||
sites <- | ||
dataRetrieval::whatNWISdata( | ||
huc = huc, | ||
service = "dv", | ||
huc = huc, | ||
service = "dv", | ||
startDate = dates$start, | ||
endDate = dates$end, | ||
parameterCd = "00060", | ||
statCd = "00003") %>% | ||
dplyr::pull(site_no) %>% | ||
parameterCd = "00060", | ||
statCd = "00003") %>% | ||
dplyr::pull(site_no) %>% | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider swapping the above two lines. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. True, but There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, right. |
||
unique() %>% | ||
c(sites) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it possible that some sites show up in multiple HUCs? I don't know how that would happen, but since you're still seeing duplicates after many other fixes... |
||
} | ||
|
||
# Write the data file and the indicator file | ||
data_file <- scipiper::as_data_file(ind_file) | ||
saveRDS(sites, data_file) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
hash: 2d6859e8e3aa3e660fe89227c7ee7e36 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
hash: 0dffd490e10569053d4900287b3783dd | ||
|
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
hash: ad0e59822f9a265a44ebf6c143494c06 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
hash: 0bcfda07c078bd4ad4d3c9b995e50712 | ||
hash: 0a7f4eb0123a895c664e964592d19f8c | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
choose_timesteps <- function(ind_file, dates) { | ||
timesteps <- seq(as.POSIXct(dates$start, tz = "UTC"), as.POSIXct(dates$end, tz = "UTC"), by = 'hours') | ||
timesteps <- seq(as.POSIXct(dates$start, tz = "UTC"), as.POSIXct(dates$end, tz = "UTC"), by = 'days') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks good. Note for the future: if we ever decide to try out 12-hour timesteps instead of daily ones (#23), this is where I think we'd want to adjust that. |
||
data_file <- as_data_file(ind_file) | ||
saveRDS(timesteps, data_file) | ||
gd_put(ind_file, data_file) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#' @title Compute the color for each daily value percentile | ||
#' | ||
#' @description Reads the table referenced by `dv_stats_ind`, drops the rows whose | ||
#' percentile (`per`) is NA, adds a `color` column by passing `per` through a color | ||
#' ramp built from `color_palette`, and saves the result as the data file for `ind_file`. | ||
#' | ||
#' @param ind_file character file name where the output should be saved | ||
#' @param dv_stats_ind indicator file for the data.frame of daily value percentiles; | ||
#' the data.frame must contain a `per` column | ||
#' @param color_palette list of colors to use for the color ramp (from viz_config.yml) | ||
process_dv_stat_colors <- function(ind_file, dv_stats_ind, color_palette){ | ||
|
||
dv_stats <- readRDS(scipiper::sc_retrieve(dv_stats_ind, remake_file = '2_process.yml')) | ||
# colorRamp() returns a function that maps values in [0, 1] to a matrix of RGB values | ||
# NOTE(review): this presumes `per` is a fraction in [0, 1] rather than 0-100 - confirm | ||
col_fun <- colorRamp(color_palette) | ||
|
||
# just removing NA percentiles for now | ||
dv_stats_with_color <- dv_stats %>% | ||
filter(!is.na(per)) %>% | ||
mutate(color = rgb(col_fun(per), maxColorValue = 255)) # maxColorValue = 255 is required: colorRamp() functions return RGB on a 0-255 scale, while rgb() defaults to a 0-1 scale | ||
|
||
# Write the data file and the indicator file | ||
saveRDS(dv_stats_with_color, scipiper::as_data_file(ind_file)) | ||
# NOTE(review): the other functions in this pipeline call gd_put(ind_file, data_file); | ||
# here the data file is not passed - confirm that gd_put() locates it from ind_file | ||
scipiper::gd_put(ind_file) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#' @title Calculate the stat category for each gage's discharge value | ||
#' | ||
#' @description Joins each daily value to the percentile statistics for the same site, | ||
#' month, and day, then estimates the percentile of the daily value by linear | ||
#' interpolation between the statistic values. The result (columns `site_no`, | ||
#' `dateTime`, `dv_val`, `per`, `p50_va`) is saved as the data file for `ind_file`. | ||
#' | ||
#' @param ind_file character file name where the output should be saved | ||
#' @param dv_data_ind indicator file for the data.frame of dv_data | ||
#' @param site_stats_clean_ind indicator file for the data.frame of dv stats for each site | ||
#' @param dates object from viz_config.yml that specifies dates as string | ||
#' (not referenced in the function body) | ||
#' @param percentiles character vector of the types of stats to include, i.e. `c("10", "75")` | ||
#' will return the 10th and 75th percentiles (from viz_config.yml). The first and last | ||
#' entries are used as the values assigned below/above the range of statistics, so the | ||
#' vector is presumably expected in ascending order - confirm against viz_config.yml | ||
process_dv_stats <- function(ind_file, dv_data_ind, site_stats_clean_ind, dates, percentiles){ | ||
|
||
dv_data <- readRDS(scipiper::sc_retrieve(dv_data_ind, remake_file = '1_fetch.yml')) | ||
site_stats <- readRDS(scipiper::sc_retrieve(site_stats_clean_ind, remake_file = '2_process.yml')) | ||
|
||
# breakdown date into month & day pairs | ||
# (numeric month_nu/day_nu are the keys used to join against site_stats below) | ||
dv_data_md <- dv_data %>% | ||
dplyr::mutate(month_nu = as.numeric(format(dateTime, "%m")), | ||
day_nu = as.numeric(format(dateTime, "%d"))) | ||
|
||
# merge stats with the dv data | ||
# merge still results in extra rows - 24 extra to be exact | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think I was seeing those with the test data you shared - they're duplicates in dv_data_md, right? They might be resolved by the suggestion above to call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. even after adding that step, I am still getting 24 extra observations There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmmm. Can you figure out which ones they are? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Working on this FYI! |
||
dv_with_stats <- left_join(dv_data_md, site_stats, by = c("site_no", "month_nu", "day_nu")) | ||
|
||
# column names of the requested statistics (e.g. "p10_va") and the same | ||
# percentiles expressed as fractions (e.g. 0.10) | ||
stat_colnames <- sprintf("p%s_va", percentiles) | ||
stat_perc <- as.numeric(percentiles)/100 | ||
|
||
interpolate_percentile <- function(df){ | ||
# This function takes the current daily value and interpolates its percentile based | ||
# on the percentiles for the matching site and day of the year | ||
# Returns one value per row of df: | ||
#   - NA when fewer than two distinct non-NA statistic values are available | ||
#   - the first entry of stat_perc when the daily value is below the first statistic | ||
#   - the last entry of stat_perc when the daily value is above the last statistic | ||
#   - otherwise the linearly interpolated percentile (as a fraction) | ||
df <- select(df, "dv_val", one_of(stat_colnames)) | ||
out <- rep(NA, nrow(df)) | ||
|
||
# NOTE(review): 1:length(out) evaluates to c(1, 0) when df has no rows; | ||
# seq_along(out) would skip the loop in that case | ||
for (i in 1:length(out)){ | ||
dv_val <- df$dv_val[i] | ||
|
||
# reshape this row's statistic columns to long form: one row per statistic, with the | ||
# percentile fraction (stat_type) recovered from the column name | ||
df_i <- slice(df, i) %>% | ||
select(-dv_val) %>% | ||
tidyr::gather(stat_name, stat_value) %>% | ||
mutate(stat_value = as.numeric(stat_value), | ||
stat_type = as.numeric(gsub("p|_va", "", stat_name))/100) | ||
|
||
# x = statistic values (discharge), y = percentile fractions; drop missing statistics | ||
y <- df_i$stat_type | ||
x <- df_i$stat_value | ||
nas <- is.na(x) | ||
x <- x[!nas] | ||
y <- y[!nas] | ||
if (length(unique(x)) < 2){ | ||
out[i] <- NA | ||
} else if (dv_val < x[1L]){ # the first and last *have* to be numbers per filtering criteria | ||
out[i] <- head(stat_perc, 1) | ||
} else if (dv_val > tail(x, 1L)){ | ||
out[i] <- tail(stat_perc, 1) | ||
} else { | ||
out[i] <- approx(x, y, xout = dv_val)$y | ||
} | ||
} | ||
return(out) | ||
|
||
} | ||
|
||
# keep only rows where the first and last requested statistics and the daily value are | ||
# all present, then compute the percentile for every remaining row in one call | ||
# (`.` is the data.frame being piped into mutate) | ||
# NOTE(review): filter_() is deprecated in current dplyr releases | ||
# NOTE(review): the final select() assumes a p50_va column is present in site_stats, | ||
# independent of what `percentiles` contains - confirm | ||
dv_stats <- dv_with_stats %>% | ||
mutate(dv_val = Flow) %>% | ||
filter_(sprintf("!is.na(%s)", stat_colnames[1]), | ||
sprintf("!is.na(%s)", tail(stat_colnames,1)), | ||
sprintf("!is.na(%s)", "dv_val")) %>% | ||
mutate(per = interpolate_percentile(.)) %>% | ||
select(site_no, dateTime, dv_val, per, p50_va) | ||
|
||
# Write the data file and the indicator file | ||
data_file <- scipiper::as_data_file(ind_file) | ||
saveRDS(dv_stats, data_file) | ||
scipiper::gd_put(ind_file, data_file) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#' @title Clean up the site statistics data to eliminate duplicates | ||
#' | ||
#' @description Reduces the site statistics so that competing sets of stats for the same | ||
#' site, month, and day are resolved, then saves the result as the data file for `ind_file`. | ||
#' The helper columns `nyears` and `same_window` created along the way are not | ||
#' removed, so they are part of the saved data.frame. | ||
#' | ||
#' @param ind_file character file name where the output should be saved | ||
#' @param site_stats_ind indicator file for the data frame of site statistics | ||
process_site_stats <- function(ind_file, site_stats_ind){ | ||
|
||
stat_data <- readRDS(scipiper::sc_retrieve(site_stats_ind, remake_file = '1_fetch.yml')) | ||
|
||
# For duplicated site stats, pick the result covering more years of data; when two | ||
# or more results cover the same number of years, pick the one with the more recent end_yr | ||
# E.g. Site number 12010000 has two sets of stats for some of its data | ||
# filter by January 1 and you will see one set from 1930 - 2003 and one | ||
# from 1930 - 2018. Filter so that only the 2018 one is used. | ||
# | ||
# site_no, month_nu and day_nu are temporarily combined into a single key column | ||
# (`mashed`, joined with "_") for grouping, and split back out afterwards; the split | ||
# pieces come back as character, hence the as.numeric() conversion at the end | ||
stat_data_unique <- stat_data %>% | ||
dplyr::mutate(nyears = end_yr - begin_yr) %>% | ||
tidyr::unite(mashed, site_no, month_nu, day_nu) %>% | ||
dplyr::distinct() %>% # some of the stats are literally exact copies | ||
dplyr::group_by(mashed) %>% | ||
dplyr::mutate(same_window = any(duplicated(nyears))) %>% | ||
dplyr::filter(ifelse(!same_window, | ||
# pick the stat that has more years of data | ||
nyears == max(nyears), | ||
# if there are > 1 with the same number of years, | ||
# pick the more recent stat | ||
end_yr == max(end_yr))) %>% | ||
dplyr::ungroup() %>% | ||
tidyr::separate(mashed, c("site_no", "month_nu", "day_nu"), sep = "_") %>% | ||
dplyr::mutate(month_nu = as.numeric(month_nu), day_nu = as.numeric(day_nu)) | ||
|
||
# Write the data file and the indicator file | ||
data_file <- scipiper::as_data_file(ind_file) | ||
saveRDS(stat_data_unique, data_file) | ||
scipiper::gd_put(ind_file, data_file) | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Other duplicates may be discoverable here if you look for rows that are distinct just among the columns agency_cd, site_no, and dateTime (e.g., check the results of
dup_dv <- dv_data %>% group_by(site_no, month_nu, day_nu) %>% summarize(n=n()) %>% filter(n > 1) %>% left_join(dv_data, by=c('site_no','month_nu','day_nu'))
)