DOI-USGS · lindsayplatt · Nov 7, 2018 · Nov 5, 2018 · Nov 6, 2018 · Nov 6, 2018
diff --git a/1_fetch.yml b/1_fetch.yml
@@ -5,6 +5,7 @@ packages:
   - dplyr
   - scipiper
   - sf
+  - tidyr
   - yaml
 
 file_extensions:
@@ -77,7 +78,8 @@ targets:
     command: fetch_dv_data(
       ind_file = target_name,
       sites_ind = '1_fetch/out/sites.rds.ind',
-      dates = dates)
+      dates = dates,
+      request_limit = request_limit)
   1_fetch/out/dv_data.rds:
     command: gd_get('1_fetch/out/dv_data.rds.ind')
 

diff --git a/1_fetch/out/dv_data.rds.ind b/1_fetch/out/dv_data.rds.ind
@@ -1,2 +1,2 @@
-hash: 1526c1af515d9c34c65bfda095715f4d
+hash: b9097707855bed84ababea6e279bf73c
 
diff --git a/1_fetch/out/site_locations.rds.ind b/1_fetch/out/site_locations.rds.ind
@@ -1,2 +1,2 @@
-hash: a2002214eb302dee23e475fa6379df85
+hash: d25d6b36a48a4c9ffbb95997df49802e
 
diff --git a/1_fetch/out/site_stats.rds.ind b/1_fetch/out/site_stats.rds.ind
@@ -1,2 +1,2 @@
-hash: 4481da677798c03c96badb7d7303ac09
+hash: 64a273b80a416f15a80da2ec7d4f05eb
 
diff --git a/1_fetch/out/sites.rds.ind b/1_fetch/out/sites.rds.ind
@@ -1,2 +1,2 @@
-hash: 53ac4d4371319321a9840cb6801ca326
+hash: 643764a5a20f82946253c41b1e503a5d
 
diff --git a/1_fetch/src/fetch_dv_data.R b/1_fetch/src/fetch_dv_data.R
@@ -1,33 +1,42 @@
 #' @title Download the discharge from NWIS for each dv gage
-#' 
+#'
 #' @param ind_file character file name where the output should be saved
 #' @param sites_ind indicator file for the vector of site numbers
 #' @param dates object from viz_config.yml that specifies dates as string
-fetch_dv_data <- function(ind_file, sites_ind, dates){
-
+#' @param request_limit number indicating how many sites to include per dataRetrieval request (from viz_config.yml)
+fetch_dv_data <- function(ind_file, sites_ind, dates, request_limit){
+
   sites <- readRDS(scipiper::sc_retrieve(sites_ind, remake_file = '1_fetch.yml'))
-
-  dv_sites_data <- lapply(sites, FUN = function(x){
-    d <- dataRetrieval::readNWISdata(
-        service="dv",
-        site = x,
+
+  req_bks <- seq(1, length(sites), by=request_limit)
+  dv_data <- data.frame()
+  for(i in req_bks) {
+    last_site <- min(i+request_limit-1, length(sites))
+    get_sites <- sites[i:last_site]
+    data_i <-
+      dataRetrieval::readNWISdata(
+        service = "dv",
+        statCd = "00003", # need this to avoid NAs
+        site = get_sites,
         parameterCd = "00060",
         startDate = dates$start,
-        endDate = dates$end) %>% 
+        endDate = dates$end) %>%
       dataRetrieval::renameNWISColumns()
-  
-    if(nrow(d) > 0 && any(names(d) == "Flow")) {
-      d[, c("dateTime", "Flow")] # keep only dateTime and Flow columns
+
+    if(nrow(data_i) > 0 && any(names(data_i) == "Flow")) {
+      data_i <- data_i[, c("site_no", "dateTime", "Flow")] # keep only site_no, dateTime, and Flow columns
     } else {
-      NULL # no data returned situation
+      data_i <- NULL # no data returned situation
     }
-
-  })
-
-  names(dv_sites_data) <- sites
-
+
+    dv_data <- rbind(dv_data, data_i)
+    message(paste("Completed", last_site, "of", length(sites)))
+  }
+
+  dv_data_unique <- dplyr::distinct(dv_data) # need this to avoid some duplicates
+
   # Write the data file and the indicator file
   data_file <- scipiper::as_data_file(ind_file)
-  saveRDS(dv_sites_data, data_file)
+  saveRDS(dv_data_unique, data_file)
   scipiper::gd_put(ind_file, data_file)
 }
diff --git a/1_fetch/src/fetch_dv_sites.R b/1_fetch/src/fetch_dv_sites.R
@@ -1,25 +1,26 @@
 #' @title Fetch appropriate daily value sites from NWIS
-#' 
+#'
 #' @param ind_file character file name where the output should be saved
 #' @param dates object from viz_config.yml that specifies dates as string
 fetch_dv_sites <- function(ind_file, dates){
 
   hucs <- zeroPad(1:21, 2) # all hucs
-  
+
   sites <- c()
   for(huc in hucs){
-    sites <- 
+    sites <-
       dataRetrieval::whatNWISdata(
-        huc = huc, 
-        service = "dv", 
+        huc = huc,
+        service = "dv",
         startDate = dates$start,
         endDate = dates$end,
-        parameterCd = "00060", 
-        statCd = "00003") %>% 
-      dplyr::pull(site_no) %>% 
+        parameterCd = "00060",
+        statCd = "00003") %>%
+      dplyr::pull(site_no) %>%
+      unique() %>%
       c(sites)
   }
-  
+
   # Write the data file and the indicator file
   data_file <- scipiper::as_data_file(ind_file)
   saveRDS(sites, data_file)

diff --git a/1_fetch/src/fetch_site_stats.R b/1_fetch/src/fetch_site_stats.R
@@ -1,30 +1,31 @@
 #' @title Get the discharge quantiles for each dv gage
-#' 
+#'
 #' @param ind_file character file name where the output should be saved
 #' @param sites_ind indicator file for the vector of site numbers
 #' @param request_limit number indicating how many sites to include per dataRetrieval request (from viz_config.yml)
-#' @param percentiles character vector of the types of stats to include, i.e. `c("10", "75")` 
+#' @param percentiles character vector of the types of stats to include, i.e. `c("10", "75")`
 #' will return the 10th and 75th percentiles (from viz_config.yml)
 fetch_site_stats <- function(ind_file, sites_ind, request_limit, percentiles){
-   
+
   sites <- readRDS(scipiper::sc_retrieve(sites_ind, remake_file = '1_fetch.yml'))
-  
+
   req_bks <- seq(1, length(sites), by=request_limit)
   stat_data <- data.frame()
   for(i in req_bks) {
-    last_site <- i+request_limit-1
+    last_site <- min(i+request_limit-1, length(sites))
     get_sites <- sites[i:last_site]
     current_sites <- suppressMessages(
       dataRetrieval::readNWISstat(
         siteNumbers = get_sites,
-        parameterCd = "00060", 
+        parameterCd = "00060",
         statReportType="daily",
         statType=paste0("P", percentiles)
-      ))
+      )) %>%
+      dplyr::select(-agency_cd, -parameter_cd, -ts_id, -loc_web_ds)
     stat_data <- rbind(stat_data, current_sites)
     print(paste("Completed", last_site, "of", length(sites)))
   }
-  
+
   # Write the data file and the indicator file
   data_file <- scipiper::as_data_file(ind_file)
   saveRDS(stat_data, data_file)

diff --git a/2_process.yml b/2_process.yml
@@ -15,6 +15,9 @@ file_extensions:
 
 sources:
   - 2_process/src/choose_timesteps.R
+  - 2_process/src/process_site_stats.R
+  - 2_process/src/process_dv_stats.R
+  - 2_process/src/process_dv_stat_colors.R
   - 2_process/src/project_shift_points.R
   - 2_process/src/project_shift_states.R
 
@@ -23,29 +26,60 @@ targets:
   2_process:
     depends:
       - 2_process/out/timesteps.rds.ind
+      - 2_process/out/site_stats_clean.rds.ind
+      - 2_process/out/dv_stats.rds.ind
+      - 2_process/out/dv_stat_colors.rds.ind
       - states_shifted
       - site_locations_shifted
 
   # -- config --
   proj_str:
     command: viz_config[[I('projection')]]
+  sites_color_palette:
+    command: viz_config[[I('sites_color_palette')]]
 
   2_process/out/timesteps.rds.ind:
     command: choose_timesteps(target_name, dates = dates)
   2_process/out/timesteps.rds:
     command: gd_get('2_process/out/timesteps.rds.ind')
 
+  # -- process site data --
+  2_process/out/site_stats_clean.rds.ind:
+    command: process_site_stats(
+      ind_file = target_name,
+      site_stats_ind = '1_fetch/out/site_stats.rds.ind')
+  2_process/out/site_stats_clean.rds:
+    command: gd_get('2_process/out/site_stats_clean.rds.ind')
+
+  2_process/out/dv_stats.rds.ind:
+    command: process_dv_stats(
+      ind_file = target_name,
+      dv_data_ind = '1_fetch/out/dv_data.rds.ind',
+      site_stats_clean_ind = '2_process/out/site_stats_clean.rds.ind',
+      dates = dates,
+      percentiles = percentiles)
+  2_process/out/dv_stats.rds:
+    command: gd_get('2_process/out/dv_stats.rds.ind')
+
+  2_process/out/dv_stat_colors.rds.ind:
+    command: process_dv_stat_colors(
+      ind_file = target_name,
+      dv_stats_ind = '2_process/out/dv_stats.rds.ind',
+      color_palette = sites_color_palette)
+  2_process/out/dv_stat_colors.rds:
+    command: gd_get('2_process/out/dv_stat_colors.rds.ind')
+
   # -- scaling, shifting, rotating config (applies to both states and points) --
   shift_cfg:
     command: viz_config[[I('shift')]]
 
-  # -- states --
+  # -- process states --
   states_projected:
     command: project_states('1_fetch/out/pre_state_boundaries_census.zip.ind', projection)
   states_shifted:
     command: shift_states(states_projected, shift_cfg)
 
-  #-- sites --
+  # -- process sites --
   site_locations_projected:
     command: project_points('1_fetch/out/site_locations.rds.ind', projection)
   site_locations_shifted:

diff --git a/2_process/out/dv_stat_colors.rds.ind b/2_process/out/dv_stat_colors.rds.ind
@@ -0,0 +1,2 @@
+hash: 2d6859e8e3aa3e660fe89227c7ee7e36
+
diff --git a/2_process/out/dv_stats.rds.ind b/2_process/out/dv_stats.rds.ind
@@ -0,0 +1,2 @@
+hash: 0dffd490e10569053d4900287b3783dd
+
diff --git a/2_process/out/site_locations_sp.rds.ind b/2_process/out/site_locations_sp.rds.ind
diff --git a/2_process/out/site_stats_clean.rds.ind b/2_process/out/site_stats_clean.rds.ind
@@ -0,0 +1,2 @@
+hash: ad0e59822f9a265a44ebf6c143494c06
+
diff --git a/2_process/out/timesteps.rds.ind b/2_process/out/timesteps.rds.ind
@@ -1,2 +1,2 @@
-hash: 0bcfda07c078bd4ad4d3c9b995e50712
+hash: 0a7f4eb0123a895c664e964592d19f8c
 
diff --git a/2_process/src/choose_timesteps.R b/2_process/src/choose_timesteps.R
@@ -1,5 +1,5 @@
 choose_timesteps <- function(ind_file, dates) {
-  timesteps <- seq(as.POSIXct(dates$start, tz = "UTC"), as.POSIXct(dates$end, tz = "UTC"), by = 'hours')
+  timesteps <- seq(as.POSIXct(dates$start, tz = "UTC"), as.POSIXct(dates$end, tz = "UTC"), by = 'days')
   data_file <- as_data_file(ind_file)
   saveRDS(timesteps, data_file)
   gd_put(ind_file, data_file)

diff --git a/2_process/src/process_dv_stat_colors.R b/2_process/src/process_dv_stat_colors.R
@@ -0,0 +1,19 @@
+#' @title Compute the color for each daily value percentile
+#'
+#' @param ind_file character file name where the output should be saved
+#' @param dv_stats_ind indicator file for the data.frame of dv stats (daily values with interpolated percentiles)
+#' @param color_palette list of colors to use for the color ramp (from viz_config.yml)
+process_dv_stat_colors <- function(ind_file, dv_stats_ind, color_palette){
+
+  dv_stats <- readRDS(scipiper::sc_retrieve(dv_stats_ind, remake_file = '2_process.yml'))
+  col_fun <- colorRamp(color_palette)
+
+  # just removing NA percentiles for now
+  dv_stats_with_color <- dv_stats %>%
+    filter(!is.na(per)) %>%
+    mutate(color = rgb(col_fun(per), maxColorValue = 255)) # colorRamp() returns values on a 0-255 scale, so maxColorValue = 255 is required
+
+  # Write the data file and the indicator file
+  saveRDS(dv_stats_with_color, scipiper::as_data_file(ind_file))
+  scipiper::gd_put(ind_file)
+}
diff --git a/2_process/src/process_dv_stats.R b/2_process/src/process_dv_stats.R
@@ -0,0 +1,72 @@
+#' @title Calculate the stat category for each gage's discharge value
+#'
+#' @param ind_file character file name where the output should be saved
+#' @param dv_data_ind indicator file for the data.frame of dv_data
+#' @param site_stats_clean_ind indicator file for the data.frame of dv stats for each site
+#' @param dates object from viz_config.yml that specifies dates as string
+#' @param percentiles character vector of the types of stats to include, i.e. `c("10", "75")`
+#' will return the 10th and 75th percentiles (from viz_config.yml)
+process_dv_stats <- function(ind_file, dv_data_ind, site_stats_clean_ind, dates, percentiles){
+
+  dv_data <- readRDS(scipiper::sc_retrieve(dv_data_ind, remake_file = '1_fetch.yml'))
+  site_stats <- readRDS(scipiper::sc_retrieve(site_stats_clean_ind, remake_file = '2_process.yml'))
+
+  # breakdown date into month & day pairs
+  dv_data_md <- dv_data %>%
+    dplyr::mutate(month_nu = as.numeric(format(dateTime, "%m")),
+                  day_nu = as.numeric(format(dateTime, "%d")))
+
+  # merge stats with the dv data
+  # merge still results in extra rows - 24 extra to be exact
+  dv_with_stats <- left_join(dv_data_md, site_stats, by = c("site_no", "month_nu", "day_nu"))
+
+  stat_colnames <- sprintf("p%s_va", percentiles)
+  stat_perc <- as.numeric(percentiles)/100
+
+  interpolate_percentile <- function(df){
+    # This function takes the current daily value and interpolates its percentile based
+    # on the percentiles for the matching site and day of the year
+    df <- select(df, "dv_val", one_of(stat_colnames))
+    out <- rep(NA, nrow(df))
+
+    for (i in 1:length(out)){
+      dv_val <- df$dv_val[i]
+
+      df_i <- slice(df, i) %>%
+        select(-dv_val) %>%
+        tidyr::gather(stat_name, stat_value) %>%
+        mutate(stat_value = as.numeric(stat_value),
+               stat_type = as.numeric(gsub("p|_va", "", stat_name))/100)
+
+      y <- df_i$stat_type
+      x <- df_i$stat_value
+      nas <- is.na(x)
+      x <- x[!nas]
+      y <- y[!nas]
+      if (length(unique(x)) < 2){
+        out[i] <- NA
+      } else if (dv_val < x[1L]){ # the first and last *have* to be numbers per filtering criteria
+        out[i] <- head(stat_perc, 1)
+      } else if (dv_val > tail(x, 1L)){
+        out[i] <- tail(stat_perc, 1)
+      } else {
+        out[i] <- approx(x, y, xout = dv_val)$y
+      }
+    }
+    return(out)
+
+  }
+
+  dv_stats <- dv_with_stats %>%
+    mutate(dv_val = Flow) %>%
+    filter_(sprintf("!is.na(%s)", stat_colnames[1]),
+            sprintf("!is.na(%s)", tail(stat_colnames,1)),
+            sprintf("!is.na(%s)", "dv_val")) %>%
+    mutate(per = interpolate_percentile(.)) %>%
+    select(site_no, dateTime, dv_val, per, p50_va)
+
+  # Write the data file and the indicator file
+  data_file <- scipiper::as_data_file(ind_file)
+  saveRDS(dv_stats, data_file)
+  scipiper::gd_put(ind_file, data_file)
+}
diff --git a/2_process/src/process_site_stats.R b/2_process/src/process_site_stats.R
@@ -0,0 +1,33 @@
+#' @title Clean up the site statistics data to eliminate duplicates
+#' 
+#' @param ind_file character file name where the output should be saved
+#' @param site_stats_ind indicator file for the data frame of site statistics
+process_site_stats <- function(ind_file, site_stats_ind){
+
+  stat_data <- readRDS(scipiper::sc_retrieve(site_stats_ind, remake_file = '1_fetch.yml'))
+
+  # For duplicated site stats, pick the result with more years of data, or the more recent end_yr when record lengths tie
+  #   E.g. site number 12010000 has two sets of stats for some of its data:
+  #   filter by January 1 and you will see one set from 1930 - 2003 and one
+  #   from 1930 - 2018. Filter so that only the 2018 one is used.
+  stat_data_unique <- stat_data %>%
+    dplyr::mutate(nyears = end_yr - begin_yr) %>% 
+    tidyr::unite(mashed, site_no, month_nu, day_nu) %>% 
+    dplyr::distinct() %>% # some of the stats are literally exact copies
+    dplyr::group_by(mashed) %>% 
+    dplyr::mutate(same_window = any(duplicated(nyears))) %>% 
+    dplyr::filter(ifelse(!same_window, 
+                         # pick the stat that has more years of data
+                         nyears == max(nyears), 
+                         # if there are > 1 with the same number of years,
+                         # pick the more recent stat
+                         end_yr == max(end_yr))) %>% 
+    dplyr::ungroup() %>% 
+    tidyr::separate(mashed, c("site_no", "month_nu", "day_nu"), sep = "_") %>% 
+    dplyr::mutate(month_nu = as.numeric(month_nu), day_nu = as.numeric(day_nu))
+
+  # Write the data file and the indicator file
+  data_file <- scipiper::as_data_file(ind_file)
+  saveRDS(stat_data_unique, data_file)
+  scipiper::gd_put(ind_file, data_file)
+}
Original file line number	Diff line number	Diff line change
		@@ -1,2 +1,2 @@
		hash: 1526c1af515d9c34c65bfda095715f4d
		hash: b9097707855bed84ababea6e279bf73c
Original file line number	Diff line number	Diff line change
		@@ -1,2 +1,2 @@
		hash: a2002214eb302dee23e475fa6379df85
		hash: d25d6b36a48a4c9ffbb95997df49802e
Original file line number	Diff line number	Diff line change
		@@ -1,2 +1,2 @@
		hash: 4481da677798c03c96badb7d7303ac09
		hash: 64a273b80a416f15a80da2ec7d4f05eb
Original file line number	Diff line number	Diff line change
		@@ -1,2 +1,2 @@
		hash: 53ac4d4371319321a9840cb6801ca326
		hash: 643764a5a20f82946253c41b1e503a5d
Original file line number	Diff line number	Diff line change
		@@ -1,2 +1,2 @@
		hash: 0bcfda07c078bd4ad4d3c9b995e50712
		hash: 0a7f4eb0123a895c664e964592d19f8c