Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Compute percentiles for daily values and their colors #45

Merged
merged 18 commits into from
Nov 7, 2018
Merged
7 changes: 2 additions & 5 deletions 1_fetch/src/fetch_dv_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ fetch_dv_data <- function(ind_file, sites_ind, dates, request_limit){
req_bks <- seq(1, length(sites), by=request_limit)
dv_data <- data.frame()
for(i in req_bks) {
last_site <- i+request_limit-1
if(i == tail(req_bks, 1) && last_site > length(sites)) {
last_site <- length(sites)
}
last_site <- min(i+request_limit-1, length(sites))
get_sites <- sites[i:last_site]
data_i <-
dataRetrieval::readNWISdata(
Expand All @@ -33,7 +30,7 @@ fetch_dv_data <- function(ind_file, sites_ind, dates, request_limit){
}

dv_data <- rbind(dv_data, data_i)
print(paste("Completed", last_site, "of", length(sites)))
message(paste("Completed", last_site, "of", length(sites)))
}

dv_data_unique <- dplyr::distinct(dv_data) # need this to avoid some duplicates
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other duplicates may be discoverable here if you look for rows that are distinct just among the columns agency_cd, site_no, and dateTime (e.g., check the results of dup_dv <- dv_data %>% group_by(site_no, month_nu, day_nu) %>% summarize(n=n()) %>% filter(n > 1) %>% left_join(dv_data, by=c('site_no','month_nu','day_nu')))

Expand Down
2 changes: 1 addition & 1 deletion 1_fetch/src/fetch_dv_sites.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ fetch_dv_sites <- function(ind_file, dates){
endDate = dates$end,
parameterCd = "00060",
statCd = "00003") %>%
dplyr::distinct() %>%
dplyr::pull(site_no) %>%
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider swapping the above two lines. distinct on multiple columns could still leave duplicates in site_no...which I think is what you want to avoid, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, but distinct can't work on a vector (which is the output of dplyr::pull) so I will need to use unique()

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, right. unique, then.

unique() %>%
c(sites)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible that some sites show up in multiple HUCs? I don't know how that would happen, but since you're still seeing duplicates after many other fixes...

}

Expand Down
5 changes: 1 addition & 4 deletions 1_fetch/src/fetch_site_stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@ fetch_site_stats <- function(ind_file, sites_ind, request_limit, percentiles){
req_bks <- seq(1, length(sites), by=request_limit)
stat_data <- data.frame()
for(i in req_bks) {
last_site <- i+request_limit-1
if(i == tail(req_bks, 1) && last_site > length(sites)) {
last_site <- length(sites)
}
last_site <- min(i+request_limit-1, length(sites))
get_sites <- sites[i:last_site]
current_sites <- suppressMessages(
dataRetrieval::readNWISstat(
Expand Down
6 changes: 3 additions & 3 deletions 2_process.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ targets:
# -- config --
proj_str:
command: viz_config[[I('projection')]]
color_palette:
command: viz_config[[I('color_palette')]]
sites_color_palette:
command: viz_config[[I('sites_color_palette')]]

2_process/out/timesteps.rds.ind:
command: choose_timesteps(target_name, dates = dates)
Expand Down Expand Up @@ -64,7 +64,7 @@ targets:
command: process_dv_stat_colors(
ind_file = target_name,
dv_stats_ind = '2_process/out/dv_stats.rds.ind',
color_palette = color_palette)
color_palette = sites_color_palette)
2_process/out/dv_stat_colors.rds:
command: gd_get('2_process/out/dv_stat_colors.rds.ind')

Expand Down
17 changes: 8 additions & 9 deletions 2_process/src/process_dv_stat_colors.R
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
#' @title Compute the color for each daily value percentile
#'
#'
#' @param ind_file character file name where the output should be saved
#' @param dv_stats_ind indicator file for the data.frame of dv_data
#' @param color_palette list of colors to use for the color ramp (from viz_config.yml)
process_dv_stat_colors <- function(ind_file, dv_stats_ind, color_palette){

dv_stats <- readRDS(scipiper::sc_retrieve(dv_stats_ind, remake_file = '2_process.yml'))
col_fun <- colorRamp(color_palette)

# just removing NA percentiles for now
dv_stats_with_color <- dv_stats %>%
filter(!is.na(per)) %>%
dv_stats_with_color <- dv_stats %>%
filter(!is.na(per)) %>%
mutate(color = rgb(col_fun(per), maxColorValue = 255)) # don't know how necessary maxColorValue is

# Write the data file and the indicator file
data_file <- scipiper::as_data_file(ind_file)
saveRDS(dv_stats_with_color, data_file)
scipiper::gd_put(ind_file, data_file)
saveRDS(dv_stats_with_color, scipiper::as_data_file(ind_file))
scipiper::gd_put(ind_file)
}
48 changes: 25 additions & 23 deletions 2_process/src/process_dv_stats.R
Original file line number Diff line number Diff line change
@@ -1,41 +1,43 @@
#' @title Calculate the stat category for each gage's discharge value
#'
#'
#' @param ind_file character file name where the output should be saved
#' @param dv_data_ind indicator file for the data.frame of dv_data
#' @param site_stats_clean_ind indicator file for the data.frame of dv stats for each site
#' @param dates object from viz_config.yml that specifies dates as string
#' @param percentiles character vector of the types of stats to include, i.e. `c("10", "75")`
#' @param percentiles character vector of the types of stats to include, i.e. `c("10", "75")`
#' will return the 10th and 75th percentiles (from viz_config.yml)
process_dv_stats <- function(ind_file, dv_data_ind, site_stats_clean_ind, dates, percentiles){

dv_data <- readRDS(scipiper::sc_retrieve(dv_data_ind, remake_file = '1_fetch.yml'))
site_stats <- readRDS(scipiper::sc_retrieve(site_stats_clean_ind, remake_file = '2_process.yml'))

# breakdown date into month & day pairs
dv_data_md <- dv_data %>%
dv_data_md <- dv_data %>%
dplyr::mutate(month_nu = as.numeric(format(dateTime, "%m")),
day_nu = as.numeric(format(dateTime, "%d")))

# merge stats with the dv data
# merge still results in extra rows - 24 extra to be exact
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I was seeing those with the test data you shared - they're duplicates in dv_data_md, right? They might be resolved by the suggestion above to call pull before distinct in fetch_dv_sites, or you could remove the duplicates in lines just above this one or at the end of fetch_dv_data

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

even after adding that step, I am still getting 24 extra observations

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm. Can you figure out which ones they are?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Working on this FYI!

dv_with_stats <- left_join(dv_data_md, site_stats, by = c("site_no", "month_nu", "day_nu"))

stat_colnames <- sprintf("p%s_va", percentiles)
stat_perc <- as.numeric(percentiles)/100

int_per <- function(df){

interpolate_percentile <- function(df){
# This function takes the current daily value and interpolates it's percentile based
# on the percentiles for the matching site and day of the year
df <- select(df, "dv_val", one_of(stat_colnames))
out <- rep(NA, nrow(df))

for (i in 1:length(out)){
dv_val <- df$dv_val[i]
df_i <- slice(df, i) %>%
select(-dv_val) %>%
tidyr::gather(stat_name, stat_value) %>%

df_i <- slice(df, i) %>%
select(-dv_val) %>%
tidyr::gather(stat_name, stat_value) %>%
mutate(stat_value = as.numeric(stat_value),
stat_type = as.numeric(gsub("p|_va", "", stat_name))/100)

y <- df_i$stat_type
x <- df_i$stat_value
nas <- is.na(x)
Expand All @@ -52,17 +54,17 @@ process_dv_stats <- function(ind_file, dv_data_ind, site_stats_clean_ind, dates,
}
}
return(out)

}
dv_stats <- dv_with_stats %>%
mutate(dv_val = Flow) %>%
filter_(sprintf("!is.na(%s)", stat_colnames[1]),
sprintf("!is.na(%s)", tail(stat_colnames,1)),

dv_stats <- dv_with_stats %>%
mutate(dv_val = Flow) %>%
filter_(sprintf("!is.na(%s)", stat_colnames[1]),
sprintf("!is.na(%s)", tail(stat_colnames,1)),
sprintf("!is.na(%s)", "dv_val")) %>%
mutate(per = int_per(.)) %>%
mutate(per = interpolate_percentile(.)) %>%
select(site_no, dateTime, dv_val, per, p50_va)

# Write the data file and the indicator file
data_file <- scipiper::as_data_file(ind_file)
saveRDS(dv_stats, data_file)
Expand Down
2 changes: 1 addition & 1 deletion viz_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ percentiles: ["05","10","20","25","50","75","80","90","95"]

# styling
background_col: "gray90"
color_palette: ['#ca0020','#f4a582','#efefef','#efefef','#92c5de','#034064']
sites_color_palette: ['#ca0020','#f4a582','#efefef','#efefef','#92c5de','#034064']
gage_line_col: "#3c829c"
gage_norm_col: "#4BA3C3"
legend_text:
Expand Down