Compute percentiles for daily values and their colors #45

Merged: 18 commits, Nov 7, 2018
Changes from 12 commits
4 changes: 3 additions & 1 deletion 1_fetch.yml
@@ -5,6 +5,7 @@ packages:
- dplyr
- scipiper
- sf
- tidyr
- yaml

file_extensions:
@@ -76,7 +77,8 @@ targets:
command: fetch_dv_data(
ind_file = target_name,
sites_ind = '1_fetch/out/sites.rds.ind',
dates = dates)
dates = dates,
request_limit = request_limit)
1_fetch/out/dv_data.rds:
command: gd_get('1_fetch/out/dv_data.rds.ind')

2 changes: 1 addition & 1 deletion 1_fetch/out/dv_data.rds.ind
@@ -1,2 +1,2 @@
hash: 1526c1af515d9c34c65bfda095715f4d
hash: 9d304e4bada2c2c115e4c664d129823b

2 changes: 1 addition & 1 deletion 1_fetch/out/site_locations.rds.ind
@@ -1,2 +1,2 @@
hash: a2002214eb302dee23e475fa6379df85
hash: bdf819be202ded812280a27a0c45c9de

2 changes: 1 addition & 1 deletion 1_fetch/out/site_stats.rds.ind
@@ -1,2 +1,2 @@
hash: 4481da677798c03c96badb7d7303ac09
hash: f9de080a31361f1bd22a2a6b59f746cb
Contributor:

I think the build/status file for this .ind file may not have made it in here. If updates aren't present in your local repository, you may need to run scipiper:::YAMLify_build_status('1_fetch/out/site_stats.rds.ind')


2 changes: 1 addition & 1 deletion 1_fetch/out/sites.rds.ind
@@ -1,2 +1,2 @@
hash: 53ac4d4371319321a9840cb6801ca326
hash: 2e5992fb50c4fd9cd43fe4b6ac5335a6

50 changes: 31 additions & 19 deletions 1_fetch/src/fetch_dv_data.R
@@ -1,33 +1,45 @@
#' @title Download the discharge from NWIS for each dv gage
#'
#'
#' @param ind_file character file name where the output should be saved
#' @param sites_ind indicator file for the vector of site numbers
#' @param dates object from viz_config.yml that specifies dates as string
fetch_dv_data <- function(ind_file, sites_ind, dates){

#' @param request_limit number indicating how many sites to include per dataRetrieval request (from viz_config.yml)
fetch_dv_data <- function(ind_file, sites_ind, dates, request_limit){

sites <- readRDS(scipiper::sc_retrieve(sites_ind, remake_file = '1_fetch.yml'))

dv_sites_data <- lapply(sites, FUN = function(x){
d <- dataRetrieval::readNWISdata(
service="dv",
site = x,

req_bks <- seq(1, length(sites), by=request_limit)
dv_data <- data.frame()
for(i in req_bks) {
last_site <- i+request_limit-1
if(i == tail(req_bks, 1) && last_site > length(sites)) {
last_site <- length(sites)
}
Contributor:
Wondering if the above 4 lines could be compressed to:

last_site <- min(i+request_limit-1, length(sites))

Contributor (author):

oooo nice!
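A minimal sketch of the batching logic under discussion, with invented site IDs, showing how min() clamps the final partial chunk in one expression:

```r
sites <- paste0("site_", 1:10)  # hypothetical stand-in site numbers
request_limit <- 4

req_bks <- seq(1, length(sites), by = request_limit)  # 1, 5, 9
for (i in req_bks) {
  # min() replaces the four-line boundary check above
  last_site <- min(i + request_limit - 1, length(sites))
  chunk <- sites[i:last_site]  # final chunk is sites[9:10], not out of bounds
  message("Requesting sites ", i, " through ", last_site)
}
```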

get_sites <- sites[i:last_site]
data_i <-
dataRetrieval::readNWISdata(
service = "dv",
statCd = "00003", # need this to avoid NAs
site = get_sites,
parameterCd = "00060",
startDate = dates$start,
endDate = dates$end) %>%
endDate = dates$end) %>%
dataRetrieval::renameNWISColumns()
if(nrow(d) > 0 && any(names(d) == "Flow")) {
d[, c("dateTime", "Flow")] # keep only dateTime and Flow columns

if(nrow(data_i) > 0 && any(names(data_i) == "Flow")) {
data_i <- data_i[, c("site_no", "dateTime", "Flow")] # keep only site_no, dateTime, and Flow columns
} else {
NULL # no data returned situation
data_i <- NULL # no data returned situation
}

})

names(dv_sites_data) <- sites


dv_data <- rbind(dv_data, data_i)
print(paste("Completed", last_site, "of", length(sites)))
Contributor:

Use message instead of print?

}

dv_data_unique <- dplyr::distinct(dv_data) # need this to avoid some duplicates
Contributor:

Other duplicates may be discoverable here if you look for rows that are distinct just among the columns agency_cd, site_no, and dateTime (e.g., check the results of dup_dv <- dv_data %>% group_by(site_no, month_nu, day_nu) %>% summarize(n=n()) %>% filter(n > 1) %>% left_join(dv_data, by=c('site_no','month_nu','day_nu')))


# Write the data file and the indicator file
data_file <- scipiper::as_data_file(ind_file)
saveRDS(dv_sites_data, data_file)
saveRDS(dv_data_unique, data_file)
scipiper::gd_put(ind_file, data_file)
}
19 changes: 10 additions & 9 deletions 1_fetch/src/fetch_dv_sites.R
@@ -1,25 +1,26 @@
#' @title Fetch appropriate daily value sites from NWIS
#'
#'
#' @param ind_file character file name where the output should be saved
#' @param dates object from viz_config.yml that specifies dates as string
fetch_dv_sites <- function(ind_file, dates){

hucs <- zeroPad(1:21, 2) # all hucs

sites <- c()
for(huc in hucs){
sites <-
sites <-
dataRetrieval::whatNWISdata(
huc = huc,
service = "dv",
huc = huc,
service = "dv",
startDate = dates$start,
endDate = dates$end,
parameterCd = "00060",
statCd = "00003") %>%
dplyr::pull(site_no) %>%
parameterCd = "00060",
statCd = "00003") %>%
dplyr::distinct() %>%
dplyr::pull(site_no) %>%
Contributor:

Consider swapping the above two lines. distinct on multiple columns could still leave duplicates in site_no...which I think is what you want to avoid, right?

Contributor (author):

True, but distinct can't work on a vector (which is the output of dplyr::pull), so I will need to use unique()

Contributor:

Oh, right. unique, then.
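A toy illustration of the point above, with made-up data: distinct() on the full data frame keeps rows that differ in any column, so duplicate site_no values can survive, while unique() on the pulled vector removes them:

```r
library(dplyr)

# Two records for the same site that differ in another column
site_info <- data.frame(site_no = c("01010000", "01010000"),
                        data_type_cd = c("dv", "uv"))

nrow(dplyr::distinct(site_info))                 # 2: site_no still duplicated
site_info %>% dplyr::pull(site_no) %>% unique()  # "01010000" only
```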

c(sites)
Contributor:

Is it possible that some sites show up in multiple HUCs? I don't know how that would happen, but since you're still seeing duplicates after many other fixes...

}

# Write the data file and the indicator file
data_file <- scipiper::as_data_file(ind_file)
saveRDS(sites, data_file)
Expand Down
18 changes: 11 additions & 7 deletions 1_fetch/src/fetch_site_stats.R
@@ -1,30 +1,34 @@
#' @title Get the discharge quantiles for each dv gage
#'
#'
#' @param ind_file character file name where the output should be saved
#' @param sites_ind indicator file for the vector of site numbers
#' @param request_limit number indicating how many sites to include per dataRetrieval request (from viz_config.yml)
#' @param percentiles character vector of the types of stats to include, i.e. `c("10", "75")`
#' @param percentiles character vector of the types of stats to include, i.e. `c("10", "75")`
#' will return the 10th and 75th percentiles (from viz_config.yml)
fetch_site_stats <- function(ind_file, sites_ind, request_limit, percentiles){

sites <- readRDS(scipiper::sc_retrieve(sites_ind, remake_file = '1_fetch.yml'))

req_bks <- seq(1, length(sites), by=request_limit)
stat_data <- data.frame()
for(i in req_bks) {
last_site <- i+request_limit-1
if(i == tail(req_bks, 1) && last_site > length(sites)) {
last_site <- length(sites)
}
Contributor:

See comment under fetch_dv_data... I might be missing something, but I think this could be simplified

get_sites <- sites[i:last_site]
current_sites <- suppressMessages(
dataRetrieval::readNWISstat(
siteNumbers = get_sites,
parameterCd = "00060",
parameterCd = "00060",
statReportType="daily",
statType=paste0("P", percentiles)
))
)) %>%
dplyr::select(-agency_cd, -parameter_cd, -ts_id, -loc_web_ds)
stat_data <- rbind(stat_data, current_sites)
print(paste("Completed", last_site, "of", length(sites)))
}

# Write the data file and the indicator file
data_file <- scipiper::as_data_file(ind_file)
saveRDS(stat_data, data_file)
Expand Down
38 changes: 36 additions & 2 deletions 2_process.yml
Expand Up @@ -16,6 +16,9 @@ file_extensions:
sources:
- 2_process/src/choose_timesteps.R
- 2_process/src/process_site_locations_to_sp.R
- 2_process/src/process_site_stats.R
- 2_process/src/process_dv_stats.R
- 2_process/src/process_dv_stat_colors.R
- 2_process/src/project_shift_states.R

targets:
Expand All @@ -24,17 +27,48 @@ targets:
depends:
- 2_process/out/timesteps.rds.ind
- 2_process/out/site_locations_sp.rds.ind
- 2_process/out/site_stats_clean.rds.ind
- 2_process/out/dv_stats.rds.ind
- 2_process/out/dv_stat_colors.rds.ind

# -- config --
proj_str:
command: viz_config[[I('projection')]]
color_palette:
command: viz_config[[I('color_palette')]]
Contributor:

Specify sites_color_palette to distinguish from the overall map palette (ocean and state colors, borders, etc.)?


2_process/out/timesteps.rds.ind:
command: choose_timesteps(target_name, dates = dates)
2_process/out/timesteps.rds:
command: gd_get('2_process/out/timesteps.rds.ind')

##### states
# -- process site data --
2_process/out/site_stats_clean.rds.ind:
command: process_site_stats(
ind_file = target_name,
site_stats_ind = '1_fetch/out/site_stats.rds.ind')
2_process/out/site_stats_clean.rds:
command: gd_get('2_process/out/site_stats_clean.rds.ind')

2_process/out/dv_stats.rds.ind:
command: process_dv_stats(
ind_file = target_name,
dv_data_ind = '1_fetch/out/dv_data.rds.ind',
site_stats_clean_ind = '2_process/out/site_stats_clean.rds.ind',
dates = dates,
percentiles = percentiles)
2_process/out/dv_stats.rds:
command: gd_get('2_process/out/dv_stats.rds.ind')

2_process/out/dv_stat_colors.rds.ind:
command: process_dv_stat_colors(
ind_file = target_name,
dv_stats_ind = '2_process/out/dv_stats.rds.ind',
color_palette = color_palette)
2_process/out/dv_stat_colors.rds:
command: gd_get('2_process/out/dv_stat_colors.rds.ind')

# -- process state spatial data --
2_state_boundaries_census_sp:
command: read_shp_zip('1_fetch/in/pre_state_boundaries_census.zip', skip = I(list(STATEFP=c('69','66','60'))))

Expand All @@ -45,7 +79,7 @@ targets:
command: mutate_sp_coords(`2_state_boundaries_albers_sp`, STATEFP = I(list('02', c('72', '78'), c('15'))),
scale = I(c(0.47, 3, 1.5)), shift_x = I(c(90, -120, 520)), shift_y = I(c(-465, 90, -110)), rotate = I(c(-50, 20, -35)))

######## sites
# -- process site spatial data --
2_site_locations_sp:
command: process_site_locations_to_sp(site_locations_ind = '1_fetch/out/site_locations.rds.ind', projection = projection)
2_site_locations_mutated_sp:
Expand Down
2 changes: 2 additions & 0 deletions 2_process/out/dv_stat_colors.rds.ind
@@ -0,0 +1,2 @@
hash: 2fffd2ca1a8032c5709770869fab3bbe

2 changes: 2 additions & 0 deletions 2_process/out/dv_stats.rds.ind
@@ -0,0 +1,2 @@
hash: 4167874102333723dd8b0191c52a612d

2 changes: 1 addition & 1 deletion 2_process/out/site_locations_sp.rds.ind
@@ -1,2 +1,2 @@
hash: 432df1d7c12cc9ba0a42c544be5b3c33
hash: 0d690807b3803cbb87ba1818fec9ce33

2 changes: 2 additions & 0 deletions 2_process/out/site_stats_clean.rds.ind
@@ -0,0 +1,2 @@
hash: 5854dcede0a39113e95d3fecce8982de
Contributor:

I'm not spotting a build/status file in this PR that contains name: 2_process/out/site_stats_clean.rds.ind. This is scipiper's fault, but it'd still be useful if you compensated (until we can fix it in scipiper). If the build/status file doesn't exist, try running scipiper:::YAMLify_build_status('2_process/out/site_stats_clean.rds.ind')


2 changes: 1 addition & 1 deletion 2_process/out/timesteps.rds.ind
@@ -1,2 +1,2 @@
hash: 0bcfda07c078bd4ad4d3c9b995e50712
hash: 0a7f4eb0123a895c664e964592d19f8c

2 changes: 1 addition & 1 deletion 2_process/src/choose_timesteps.R
@@ -1,5 +1,5 @@
choose_timesteps <- function(ind_file, dates) {
timesteps <- seq(as.POSIXct(dates$start, tz = "UTC"), as.POSIXct(dates$end, tz = "UTC"), by = 'hours')
timesteps <- seq(as.POSIXct(dates$start, tz = "UTC"), as.POSIXct(dates$end, tz = "UTC"), by = 'days')
Contributor:

Looks good. Note for the future: if we ever decide to try out 12-hour timesteps instead of daily ones (#23), this is where I think we'd want to adjust that.

data_file <- as_data_file(ind_file)
saveRDS(timesteps, data_file)
gd_put(ind_file, data_file)
Expand Down
20 changes: 20 additions & 0 deletions 2_process/src/process_dv_stat_colors.R
@@ -0,0 +1,20 @@
#' @title Compute the color for each daily value percentile
#'
#' @param ind_file character file name where the output should be saved
#' @param dv_stats_ind indicator file for the data.frame of dv_data
#' @param color_palette list of colors to use for the color ramp (from viz_config.yml)
process_dv_stat_colors <- function(ind_file, dv_stats_ind, color_palette){

dv_stats <- readRDS(scipiper::sc_retrieve(dv_stats_ind, remake_file = '2_process.yml'))
col_fun <- colorRamp(color_palette)

# just removing NA percentiles for now
dv_stats_with_color <- dv_stats %>%
filter(!is.na(per)) %>%
mutate(color = rgb(col_fun(per), maxColorValue = 255)) # don't know how necessary maxColorValue is

# Write the data file and the indicator file
data_file <- scipiper::as_data_file(ind_file)
saveRDS(dv_stats_with_color, data_file)
scipiper::gd_put(ind_file, data_file)
Contributor:

FYI, this pattern is fine, but newer versions of scipiper also allow you to just pass the ind_file to gd_put, so these three lines could even be two if you want:

   saveRDS(dv_stats_with_color, scipiper::as_data_file(ind_file))
   scipiper::gd_put(ind_file)

lindsayplatt (author), Nov 7, 2018:

Ah, OK! I have just been copying and pasting this command and changing the object to save. Good to know :) I'll change this one and hopefully remember to use this as an example of how to do it!

}
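On the maxColorValue question in the code above: colorRamp() returns a function mapping [0, 1] to a matrix of RGB values on the 0-255 scale, so maxColorValue = 255 is needed; by default rgb() expects values in [0, 1] and errors on anything larger. A small sketch with an example palette (the column indexing avoids relying on matrix support in rgb()):

```r
col_fun <- colorRamp(c("#b35806", "#f7f7f7", "#542788"))  # hypothetical palette

m <- col_fun(0.5)  # 1 x 3 matrix on the 0-255 scale, here the middle color
rgb(m[, 1], m[, 2], m[, 3], maxColorValue = 255)  # "#F7F7F7"
```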
70 changes: 70 additions & 0 deletions 2_process/src/process_dv_stats.R
@@ -0,0 +1,70 @@
#' @title Calculate the stat category for each gage's discharge value
#'
#' @param ind_file character file name where the output should be saved
#' @param dv_data_ind indicator file for the data.frame of dv_data
#' @param site_stats_clean_ind indicator file for the data.frame of dv stats for each site
#' @param dates object from viz_config.yml that specifies dates as string
#' @param percentiles character vector of the types of stats to include, i.e. `c("10", "75")`
#' will return the 10th and 75th percentiles (from viz_config.yml)
process_dv_stats <- function(ind_file, dv_data_ind, site_stats_clean_ind, dates, percentiles){

dv_data <- readRDS(scipiper::sc_retrieve(dv_data_ind, remake_file = '1_fetch.yml'))
site_stats <- readRDS(scipiper::sc_retrieve(site_stats_clean_ind, remake_file = '2_process.yml'))

# breakdown date into month & day pairs
dv_data_md <- dv_data %>%
dplyr::mutate(month_nu = as.numeric(format(dateTime, "%m")),
day_nu = as.numeric(format(dateTime, "%d")))

# merge stats with the dv data
# merge still results in extra rows - 24 extra to be exact
Contributor:

I think I was seeing those with the test data you shared - they're duplicates in dv_data_md, right? They might be resolved by the suggestion above to call pull before distinct in fetch_dv_sites, or you could remove the duplicates in lines just above this one or at the end of fetch_dv_data

Contributor (author):

even after adding that step, I am still getting 24 extra observations

Contributor:

Hmmm. Can you figure out which ones they are?

Contributor (author):

Working on this FYI!

dv_with_stats <- left_join(dv_data_md, site_stats, by = c("site_no", "month_nu", "day_nu"))

stat_colnames <- sprintf("p%s_va", percentiles)
stat_perc <- as.numeric(percentiles)/100

int_per <- function(df){
Contributor:

What does int_per stand for? A comment explaining the goal of this function might be useful.

Contributor (author):

I keep thinking interpolated percentile, but @jread-usgs created this code that I have more or less been cutting and pasting. That is what it is doing, though: taking the current daily value and interpolating its percentile based on the percentiles for the site and day of the year

Contributor:

Yes, that is what it does: interpolate to the percentile.
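A condensed sketch of that interpolation idea for a single daily value, using invented threshold numbers: approx() linearly interpolates between the site's percentile thresholds, and values outside the configured range are clamped to the lowest or highest percentile:

```r
stat_perc  <- c(0.10, 0.25, 0.50, 0.75, 0.90)  # percentile levels from config
thresholds <- c(12, 30, 55, 80, 120)           # hypothetical p10..p90 discharge values

dv_val <- 67.5  # today's discharge for one site
if (dv_val < thresholds[1]) {
  per <- head(stat_perc, 1)          # below the lowest threshold: clamp down
} else if (dv_val > tail(thresholds, 1)) {
  per <- tail(stat_perc, 1)          # above the highest threshold: clamp up
} else {
  per <- approx(thresholds, stat_perc, xout = dv_val)$y
}
per  # 0.625: halfway between the 50th and 75th percentile thresholds
```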

df <- select(df, "dv_val", one_of(stat_colnames))
out <- rep(NA, nrow(df))

for (i in 1:length(out)){
dv_val <- df$dv_val[i]

df_i <- slice(df, i) %>%
select(-dv_val) %>%
tidyr::gather(stat_name, stat_value) %>%
mutate(stat_value = as.numeric(stat_value),
stat_type = as.numeric(gsub("p|_va", "", stat_name))/100)

y <- df_i$stat_type
x <- df_i$stat_value
nas <- is.na(x)
x <- x[!nas]
y <- y[!nas]
if (length(unique(x)) < 2){
out[i] <- NA
} else if (dv_val < x[1L]){ # the first and last *have* to be numbers per filtering criteria
out[i] <- head(stat_perc, 1)
} else if (dv_val > tail(x, 1L)){
out[i] <- tail(stat_perc, 1)
} else {
out[i] <- approx(x, y, xout = dv_val)$y
}
}
return(out)

}

dv_stats <- dv_with_stats %>%
mutate(dv_val = Flow) %>%
filter_(sprintf("!is.na(%s)", stat_colnames[1]),
sprintf("!is.na(%s)", tail(stat_colnames,1)),
sprintf("!is.na(%s)", "dv_val")) %>%
mutate(per = int_per(.)) %>%
select(site_no, dateTime, dv_val, per, p50_va)

# Write the data file and the indicator file
data_file <- scipiper::as_data_file(ind_file)
saveRDS(dv_stats, data_file)
scipiper::gd_put(ind_file, data_file)
}
33 changes: 33 additions & 0 deletions 2_process/src/process_site_stats.R
@@ -0,0 +1,33 @@
#' @title Clean up the site statistics data to eliminate duplicates
#'
#' @param ind_file character file name where the output should be saved
#' @param site_stats_ind indicator file for the data frame of site statistics
process_site_stats <- function(ind_file, site_stats_ind){

stat_data <- readRDS(scipiper::sc_retrieve(site_stats_ind, remake_file = '1_fetch.yml'))

# For duplicated site stats, pick the result with the more recent end_yr
# E.g. Site number 12010000 has two sets of stats for some of its data:
# filter by January 1 and you will see one set from 1930 - 2003 and one
# from 1930 - 2018. Filter so that only the 2018 one is used.
stat_data_unique <- stat_data %>%
dplyr::mutate(nyears = end_yr - begin_yr) %>%
tidyr::unite(mashed, site_no, month_nu, day_nu) %>%
dplyr::distinct() %>% # some of the stats are literally exact copies
dplyr::group_by(mashed) %>%
dplyr::mutate(same_window = any(duplicated(nyears))) %>%
dplyr::filter(ifelse(!same_window,
# pick the stat that has more years of data
nyears == max(nyears),
# if there are > 1 with the same number of years,
# pick the more recent stat
end_yr == max(end_yr))) %>%
dplyr::ungroup() %>%
tidyr::separate(mashed, c("site_no", "month_nu", "day_nu"), sep = "_") %>%
dplyr::mutate(month_nu = as.numeric(month_nu), day_nu = as.numeric(day_nu))

# Write the data file and the indicator file
data_file <- scipiper::as_data_file(ind_file)
saveRDS(stat_data_unique, data_file)
scipiper::gd_put(ind_file, data_file)
}
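A toy demonstration of the dedup rule in process_site_stats(), with made-up years: for a duplicated site/day, keep the stat row with more years of record, and fall back to the more recent end_yr when record lengths tie:

```r
library(dplyr)

# Two stat records for the same site and day, mimicking the 12010000 case
stats <- data.frame(site_no  = "12010000", month_nu = 1, day_nu = 1,
                    begin_yr = c(1930, 1930), end_yr = c(2003, 2018))

stats %>%
  mutate(nyears = end_yr - begin_yr) %>%
  group_by(site_no, month_nu, day_nu) %>%
  # same_window logic from above: tie on nyears -> prefer recent end_yr
  filter(if (any(duplicated(nyears))) end_yr == max(end_yr)
         else nyears == max(nyears)) %>%
  ungroup()
# keeps only the 1930-2018 row
```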