diff --git a/.Rbuildignore b/.Rbuildignore
index a655369..ae0d771 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,3 +1,5 @@
+^renv$
+^renv\.lock$
^.*\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
diff --git a/DESCRIPTION b/DESCRIPTION
index 79e2d70..c0d5fbd 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,13 +1,13 @@
Type: Package
Package: tc.sensors
Title: Retrieve Loop Detector Data from the MnDOT JSON Feed
-Version: 0.2.3
-Date: 2023-07-21
+Version: 0.2.4
+Date: 2024-05-01
Authors@R: c(
person("Metropolitan Council", role = "cph"),
person("Liz", "Roten", , "liz.roten@metc.state.mn.us", role = c("cre", "aut"),
comment = c(ORCID = "0000-0002-5346-3549")),
- person("Nicole", "Sullivan", , "nicole.sullivan@metc.state.mn.us", role = "aut"),
+ person("Nicole", "Sullivan", , "nicole.sullivan@metc.state.mn.us", role = "ctb"),
person("Ashley", "Asmus", , "ashley.asmus@metc.state.mn.us", role = "ctb",
comment = c(ORCID = "0000-0001-5505-1372")),
person("Yaxuan", "Zhang", , "yaxuan.zhang@metc.state.mn.us", role = "ctb",
@@ -15,26 +15,27 @@ Authors@R: c(
)
Description: Process data collected from Minnesota Department of
Transportation (MnDOT) loop detectors installed on the Minnesota
- Freeway system in 30-second interval measurements of occupancy and
- volume, data which are pushed daily to a public JSON feed. Occupancy
- and volume data can be used to calculate speed and delay.
+ Freeway system. Data are published to a public JSON feed in 30-second
+ interval measurements of occupancy and volume. Occupancy and volume
+ data can be used to calculate speed and delay.
License: MIT + file LICENSE
BugReports: https://github.com/Metropolitan-Council/tc.sensors/issues
Imports:
- cli (>= 3.6.1),
- curl (>= 5.0.1),
- data.table (>= 1.14.8),
- dplyr (>= 1.1.2),
+ cli (>= 3.6.2),
+ curl (>= 5.2.1),
+ data.table (>= 1.15.4),
+ dplyr (>= 1.1.4),
geosphere (>= 1.5-18),
- jsonlite (>= 1.8.7),
+ jsonlite (>= 1.8.8),
magrittr (>= 2.0.3),
- purrr (>= 1.0.1),
- sf (>= 1.0-14),
+ purrr (>= 1.0.2),
+ sf (>= 1.0-16),
tibble (>= 3.2.1),
- tidyr (>= 1.3.0),
+ tidyr (>= 1.3.1),
tis (>= 1.39),
+ units (>= 0.8-5),
utils,
- xml2 (>= 1.3.5)
+ xml2 (>= 1.3.6)
Suggests:
cowplot (>= 1.0.0),
furrr (>= 0.1.0),
@@ -52,4 +53,4 @@ VignetteBuilder:
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE, roclets = c("rd", "namespace", "collate"))
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
diff --git a/LICENSE.md b/LICENSE.md
index 681e5e0..508b74d 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,6 +1,6 @@
# MIT License
-Copyright (c) 2022 Metropolitan Council
+Copyright (c) 2024 Metropolitan Council
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/NAMESPACE b/NAMESPACE
index 9afbbfb..b0f6b7d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -47,6 +47,7 @@ importFrom(purrr,map2)
importFrom(sf,st_as_sf)
importFrom(sf,st_cast)
importFrom(sf,st_length)
+importFrom(sf,st_make_valid)
importFrom(sf,st_set_crs)
importFrom(stats,median)
importFrom(tibble,as_tibble)
@@ -55,6 +56,8 @@ importFrom(tidyr,separate)
importFrom(tidyr,unite)
importFrom(tis,day)
importFrom(tis,isHoliday)
+importFrom(units,as_units)
+importFrom(units,set_units)
importFrom(utils,download.file)
importFrom(utils,read.csv)
importFrom(xml2,read_xml)
diff --git a/R/add_day_type.R b/R/add_day_type.R
index 4a79ebd..8e977cb 100644
--- a/R/add_day_type.R
+++ b/R/add_day_type.R
@@ -14,17 +14,21 @@
#' @importFrom dplyr case_when
#'
add_day_type <- function(sensor_data) {
- sensor_data[, `:=`(c("day_of_week", "day_type", "holiday", "day_category"), {
- day_of_week <- weekdays(date)
- day_type <- dplyr::case_when(day_of_week %in% c(
- "Saturday",
- "Sunday"
- ) ~ "Weekend", TRUE ~ "Weekday")
- holiday <- tis::isHoliday(date)
- day_category <- dplyr::case_when(
- holiday == TRUE ~ "Holiday",
- TRUE ~ day_type
- )
- .(day_of_week, day_type, holiday, day_category)
- })][, holiday := NULL]
+ sensor_data[
+ , `:=`(c("day_of_week", "day_type", "holiday", "day_category"), {
+ day_of_week <- weekdays(date)
+ day_type <- dplyr::case_when(day_of_week %in% c(
+ "Saturday",
+ "Sunday"
+ ) ~ "Weekend", TRUE ~ "Weekday")
+ holiday <- tis::isHoliday(date)
+ day_category <- dplyr::case_when(
+ holiday == TRUE ~ "Holiday",
+ TRUE ~ day_type
+ )
+ .(day_of_week, day_type, holiday, day_category)
+ })
+ ][
+ , holiday := NULL
+ ]
}
diff --git a/R/add_distance.R b/R/add_distance.R
index 3ac2163..4d4f511 100644
--- a/R/add_distance.R
+++ b/R/add_distance.R
@@ -131,7 +131,8 @@ add_distance <- function(config,
))
]
}
- config_final_joined <- config_final[as.data.table(config),
+ config_final_joined <- config_final[
+ as.data.table(config),
on = .(
detector_name, detector_label, detector_category,
detector_lane, detector_field, detector_abandoned, r_node_name,
diff --git a/R/aggregate_sensor.R b/R/aggregate_sensor.R
index 38d7e1a..fff452a 100644
--- a/R/aggregate_sensor.R
+++ b/R/aggregate_sensor.R
@@ -1,214 +1,273 @@
-#' Aggregate raw sensor data to a chosen level
-#'
-#' @param sensor_data data frame for single sensor returned from `pull_sensor()`
-#' @param interval_length numeric, the interval length in hours.
-#' `NA` indicates no aggregation (30 second data)
-#' `0.25` indicates 15 minutes.
-#' Default is `1`.
-#' @param config data.table, a configuration file for the given sensor
-#' @param replace_impossible logical, whether to replace impossible values with `NA`.
-#' Default is `TRUE` and *highly* recommended.
-#' @param interpolate_missing logical, whether to interpolate missing volume and occupancy
-#' values at the raw data level. Only applies if `replace_impossible` is `TRUE`. Note
-#' that this option increases the function runtime.
-#' @param occupancy_pct_threshold numeric, the lowest possible occupancy percentage
-#' to use when calculating speed. Default is `0.0020` or 0.2%. Increasing the threshold
-#' results in more stable speed values, while lowering it may increase speed variability.
-#' A higher occupancy threshold is recommended for shorter interval lengths
-#'
-#'
-#' @return a data.table with values for volume, occupancy, and speed
-#' - `date` IDate, the given date
-#' - `interval_bin` numeric, the observations interval bin
-#' - `{measure}.pct_null` numeric, the percentage of observations with null
-#' values for the given measure
-#' - `{measure}.sum` numeric, the measure's total over the given interval
-#' - `{measure}.mean` numeric, the measure's mean over the given interval
-#' - `speed` numeric, the mean traffic speed over the given interval
-#'
-#' @details
-#' ## Calculating speed
-#'
-#' There are 60 scans per second, which means there are 60*60 = 1,800 scans per
-#' 30-second interval. The occupancy value in the 30-second interval data
-#' represents the number of scans that were occupied of the 1,800 scans in that
-#' interval.
-#'
-#' With 60 scans per second, 60 seconds per minute there are 3,600 scans per minute.
-#' With 3,600 scans per minute, 60 minutes per hour there are 216,000 scans per hour.
-#' To find the number of scans in 15 minutes, we can multiply 0.25 * 216000 = 54,000 scans.
-#'
-#' ## Impossible values
-#'
-#' Any observation with a volume that exceeds 20 vehicles or an occupancy that exceeds 1,800 scans
-#' will be replaced with `NA`. It is impossible for more than twenty vehicles to pass over a sensor
-#' in only 30 seconds, and the maximum number of scans in 30 seconds is 1,800 (60 scans/second * 30 seconds).
-#'
-#' ### Interpolating missing values
-#'
-#' `interpolate_missing` indicates whether to interpolate missing volume and occupancy values
-#' at the raw data level. The interpolated value for a given observation is the mean of
-#' the two observations on either side of the observation. This method preserves the variable's
-#' overall distribution.
-#'
-#' @export
-#'
-#' @import data.table
-#' @importFrom cli cli_abort
-#'
-#' @examples
-#' \dontrun{
-#'
-#' library(tc.sensors)
-#' library(dplyr)
-#' config <- pull_configuration()
-#'
-#' config_sample <- dplyr::filter(config, config$detector_abandoned == "f") %>%
-#' dplyr::sample_n(1)
-#' yesterday <- as.Date(Sys.Date() - 365)
-#'
-#' sensor_results <- pull_sensor(
-#' sensor = config_sample$detector_name[[1]],
-#' pull_date = yesterday
-#' )
-#'
-#' aggregate_sensor(sensor_results,
-#' interval_length = 1,
-#' config = config_sample
-#' )
-#' }
-aggregate_sensor <- function(sensor_data, config, interval_length,
- replace_impossible = TRUE,
- interpolate_missing = FALSE,
- occupancy_pct_threshold = 0.0020) {
- # input checks ---------------------------------------------------------------
- if (is.na(interval_length)) {
- cli::cli_abort("No aggregation to do!")
- }
-
- if (interval_length > 24) {
- cli::cli_abort("Interval cannot exceed 24 hours.")
-
- if (length(unique(sensor_data$date)) <= 1) {
- cli::cli_abort("For intervals greater than 24 hours, you must have data for more than one date")
- }
- }
-
- if (nrow(sensor_data) != 2880 * length(unique(sensor_data$date))) {
- cli::cli_abort("For multiple dates, you must have at least 2,880 rows for each date you want covered.")
- }
-
-
- if (length(unique(sensor_data$sensor)) > 1) {
- cli::cli_abort("More than one sensor is in this dataset.")
- }
-
- # format data ----------------------------------------------------------------
- sensor_data <- data.table::as.data.table(sensor_data)
- config <- data.table::as.data.table(config)[detector_name == sensor_data$sensor[[1]]]
-
- # number of scans in the given interval length
- interval_scans <- interval_length * 216000
- field_length <- as.numeric(config[, "detector_field"][[1]])
-
- if (replace_impossible == TRUE) {
- sensor_data <- tc.sensors::replace_impossible(
- sensor_data = sensor_data,
- interval_length = NA
- )
-
- if (interpolate_missing == TRUE) {
- sensor_data <- sensor_data[
- , `:=`(volume.rollmean = data.table::shift(
- data.table::frollapply(volume, 3, mean,
- align = "center",
- na.rm = TRUE, hasNA = TRUE
- )
- )),
- by = .(sensor)
- ][
- , volume := ifelse(is.na(volume), volume.rollmean, volume)
- ][
- , `:=`(occupancy.rollmean = data.table::shift(
- data.table::frollapply(occupancy, 3, mean,
- align = "center", na.rm = TRUE,
- hasNA = TRUE
- )
- )),
- by = .(sensor)
- ][
- , occupancy := ifelse(is.na(occupancy), occupancy.rollmean, occupancy)
- ][, .(volume, occupancy, date, sensor, hour, min)]
- }
- }
-
-
- interval_length_min <- interval_length * 60
- n_rows_expected <- interval_length_min * 2 # two scans/observations per minute
-
- # there are 60 scans/second
- # 60*30 = 1,800 scans/ 30 sec (the interval we are given)
- # 60*60 = 3,600 scans/minute
- # 3600*60 = 216,000 scans per hour
- # 216,000 = number of scans in one hour
-
- if (interval_length < 1) { # if the interval length is less than an hour
- # browser()
-
- bins <- seq(0, 60, interval_length * 60)
-
- sensor_data[, interval_min_bin := findInterval(sensor_data$min, bins)][
- , start_min := min(min),
- by = .(date, hour, interval_min_bin)
- ]
-
- sensor_data_agg <- sensor_data[, as.list(unlist(lapply(.SD, function(x) {
- list(
- #sum = round(mean(x, na.rm = TRUE) * n_rows_expected),
- sum = round(sum(x, na.rm = TRUE)),
- mean = mean(x, na.rm = TRUE),
- pct.null = round(100 * sum(is.na(x)) / length(x),1)
- )
- }))),
- by = .(date, hour, start_min, interval_min_bin, sensor),
- .SDcols = c("volume", "occupancy")
- ][, start_datetime := as.character(as.POSIXct(paste(date, hour, start_min), format = "%Y-%m-%d %H %M"))][
- , occupancy.pct := (occupancy.sum / interval_scans)
- ][
- , speed := ifelse(volume.sum != 0 & occupancy.pct >= occupancy_pct_threshold,
- (volume.sum * (60 / interval_length_min) * field_length)
- / (5280 * occupancy.pct), NA
- )
- ]
- } else { # if the interval length is greater than or equal to 1 hour
- bins <- seq(0, 24, interval_length)
-
- sensor_data[, date := data.table::as.IDate(date)][
- , year := data.table::year(date)
- ][
- , interval_bin := findInterval(sensor_data$hour, bins)
- ]
-
- data.table::setorder(sensor_data, date)
-
- sensor_data_agg <- sensor_data[, as.list(unlist(lapply(.SD, function(x) {
- list(
- #sum = round(mean(x, na.rm = TRUE) * n_rows_expected),
- sum = round(sum(x, na.rm = TRUE)),
- mean = mean(x, na.rm = TRUE),
- pct.null = round(100 * sum(is.na(x)) / length(x),1)
- )
- }))),
- by = .(date, interval_bin, sensor),
- .SDcols = c("volume", "occupancy")
- ][, occupancy.sum := ifelse(occupancy.sum >= interval_scans, NA, occupancy.sum)][
- , occupancy.pct := (occupancy.sum / interval_scans)
- ][
- , speed := ifelse(volume.sum != 0 & occupancy.pct >= occupancy_pct_threshold,
- ((volume.sum * field_length) /
- (5280 * occupancy.pct)) / interval_length, NA
- )
- ]
- }
- return(sensor_data_agg)
-}
+#' Aggregate raw sensor data to a chosen level
+#'
+#' @param sensor_data data frame for single sensor returned from `pull_sensor()`
+#' @param interval_length numeric, the interval length in hours.
+#' `NA` indicates no aggregation (30 second data)
+#' `0.25` indicates 15 minutes.
+#' Default is `1`.
+#' @param config data.table, a configuration file for the given sensor
+#' @param replace_impossible logical, whether to replace impossible values with `NA`.
+#' Default is `TRUE` and *highly* recommended.
+#' @param interpolate_missing logical, whether to interpolate missing volume and occupancy
+#' values at the raw data level. Only applies if `replace_impossible` is `TRUE`. Note
+#' that this option increases the function runtime.
+#' @param occupancy_pct_threshold numeric, the lowest possible occupancy percentage
+#' to use when calculating speed. Default is `0.0020` or 0.2%. Increasing the threshold
+#' results in more stable speed values, while lowering it may increase speed variability.
+#' A higher occupancy threshold is recommended for shorter interval lengths
+#'
+#'
+#' @return a data.table with values for volume, occupancy, and speed
+#' - `date` IDate, the given date
+#' - `interval_bin` numeric, the observations interval bin
+#' - `{measure}.pct_null` numeric, the percentage of observations with null
+#' values for the given measure
+#' - `{measure}.sum` numeric, the measure's total over the given interval
+#' - `{measure}.mean` numeric, the measure's mean over the given interval
+#' - `speed` numeric, the mean traffic speed over the given interval
+#'
+#' @details
+#'
+#' ## Calculating speed
+#'
+#' There are 60 scans per second, which means there are 60 * 30 = 1,800 scans per
+#' 30-second interval. The occupancy value in the 30-second interval data
+#' represents the number of scans that were occupied of the 1,800 scans in that
+#' interval.
+#'
+#' With 60 scans per second, 60 seconds per minute there are 3,600 scans per minute.
+#' With 3,600 scans per minute, 60 minutes per hour there are 216,000 scans per hour.
+#' To find the number of scans in 15 minutes, we can multiply 0.25 * 216000 = 54,000 scans.
+#'
+#' Speed, in miles per hour, is calculated by multiplying the number of
+#' vehicles per hour by the detector field length in miles, then dividing by
+#' the occupancy percentage for the given interval.
+#'
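+#' A minimal sketch of this formula at a one-hour interval, using hypothetical
+#' values (not package code):
+#'
+#' ```r
+#' volume_sum <- 1500 # vehicles observed in one hour
+#' field_length_miles <- 20 / 5280 # a 20-foot detector field, in miles
+#' occupancy_pct <- 0.10 # share of the interval's scans occupied
+#' (volume_sum * field_length_miles) / occupancy_pct # about 57 mph
+#' ```
+#'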
+#' ## Impossible values
+#'
+#' Any observation with a volume that exceeds 20 vehicles or an occupancy that exceeds 1,800 scans
+#' will be replaced with `NA`. It is impossible for more than twenty vehicles to pass over a sensor
+#' in only 30 seconds, and the maximum number of scans in 30 seconds is 1,800
+#' (60 scans/second * 30 seconds).
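+#' For example, a 30-second observation of 25 vehicles or 2,000 scans would be
+#' replaced with `NA`.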
+#'
+#' ### Interpolating missing values
+#'
+#' `interpolate_missing` indicates whether to interpolate missing volume and occupancy values
+#' at the raw data level. The interpolated value for a given observation is the mean of
+#' the two observations on either side of the observation. This method preserves the variable's
+#' overall distribution.
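+#' For example (hypothetical values), if consecutive volumes are `8, NA, 12`,
+#' the missing observation is replaced with their mean, `10`.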
+#'
+#' @export
+#'
+#' @import data.table
+#' @importFrom cli cli_abort
+#' @importFrom units set_units as_units
+#'
+#' @examples
+#' \dontrun{
+#'
+#' library(tc.sensors)
+#' library(dplyr)
+#' config <- pull_configuration()
+#'
+#' config_sample <- dplyr::filter(config, config$detector_abandoned == "f") %>%
+#' dplyr::sample_n(1)
+#' yesterday <- as.Date(Sys.Date() - 365)
+#'
+#' sensor_results <- pull_sensor(
+#' sensor = config_sample$detector_name[[1]],
+#' pull_date = yesterday
+#' )
+#'
+#' aggregate_sensor(sensor_results,
+#' interval_length = 1,
+#' config = config_sample
+#' )
+#' }
+aggregate_sensor <- function(sensor_data,
+ config,
+ interval_length,
+ replace_impossible = TRUE,
+ interpolate_missing = FALSE,
+ occupancy_pct_threshold = 0.0020) {
+ # input checks ---------------------------------------------------------------
+ if (is.na(interval_length)) {
+ cli::cli_abort("No aggregation to do!")
+ }
+
+  if (interval_length > 24) {
+    # aggregation beyond a single day is not supported
+    cli::cli_abort("Interval cannot exceed 24 hours.")
+  }
+
+ if (nrow(sensor_data) != (2880 * length(unique(sensor_data$date)))) {
+ cli::cli_abort("For multiple dates, you must have at least 2,880 rows for each date you want covered.")
+ }
+
+ if (length(unique(sensor_data$sensor)) > 1) {
+ cli::cli_abort("More than one sensor is in this dataset.")
+ }
+
+ # format data ----------------------------------------------------------------
+ sensor_data <- data.table::as.data.table(sensor_data)
+ config <- data.table::as.data.table(config)[detector_name == sensor_data$sensor[[1]]]
+
+ # number of scans in the given interval length
+ # 3600 * 60 = 216,000 scans per hour
+ interval_scans <- interval_length * 216000
+ field_length <- as.numeric(config[, "detector_field"][[1]])
+ # convert field length from feet to miles
+ field_length_miles <- field_length %>%
+ units::as_units("feet") %>%
+ units::set_units("miles") %>%
+ as.numeric()
+
+ if (replace_impossible == TRUE) {
+ sensor_data <- tc.sensors::replace_impossible(
+ sensor_data = sensor_data,
+ interval_length = NA
+ )
+
+ if (interpolate_missing == TRUE) {
+ sensor_data <- sensor_data[
+ , `:=`(volume.rollmean = data.table::shift(
+ # calculate rolling average volume
+ data.table::frollapply(volume, 3, mean,
+ align = "center",
+ na.rm = TRUE,
+ hasNA = TRUE
+ )
+ )),
+ by = .(sensor)
+ ][
+ , volume := ifelse(is.na(volume), volume.rollmean, volume)
+ ][
+ # calculate rolling average occupancy
+ , `:=`(occupancy.rollmean = data.table::shift(
+ data.table::frollapply(occupancy, 3, mean,
+ align = "center",
+ na.rm = TRUE,
+ hasNA = TRUE
+ )
+ )),
+ by = .(sensor)
+ ][
+ , occupancy := ifelse(is.na(occupancy), occupancy.rollmean, occupancy)
+ ][
+ , .(volume, occupancy, date, sensor, hour, min)
+ ]
+ }
+ }
+
+ # find interval length in minutes
+ interval_length_min <- interval_length * 60
+  n_rows_expected <- interval_length_min * 2 # two 30-second observations per minute
+
+ # there are 60 scans/second in the raw data
+ # 60 * 30 = 1,800 scans/ 30 sec (the interval we are given)
+ # 60 * 60 = 3,600 scans/minute
+ # 3600 * 60 = 216,000 scans per hour
+ # 216,000 = number of scans in one hour
+
+ if (interval_length < 1) {
+ # if the interval length is less than an hour
+
+ # create minute bins
+ bins <- seq(0, 60, interval_length * 60)
+
+ # add minute bins to sensor_data
+ sensor_data[
+ , interval_min_bin := findInterval(sensor_data$min, bins)
+ ][
+ , start_min := min(min),
+ by = .(date, hour, interval_min_bin)
+ ]
+
+ sensor_data_agg <- sensor_data[
+ , as.list(unlist(lapply(.SD, function(x) {
+ list(
+ # create sum, mean, and pct.null columns for volume and occupancy
+ # resulting in "volume.sum", "volume.mean", etc.
+ # grouped by date, hour, start_min, interval, and sensor
+ sum = round(sum(x, na.rm = TRUE)),
+ mean = mean(x, na.rm = TRUE),
+ pct.null = round(100 * sum(is.na(x)) / length(x), 1)
+ )
+ }))),
+ by = .(date, hour, start_min, interval_min_bin, sensor),
+ .SDcols = c("volume", "occupancy")
+ ][
+ , start_datetime := as.character(as.POSIXct(
+ paste(date, hour, start_min),
+ format = "%Y-%m-%d %H %M"
+ ))
+ ][
+ , occupancy.pct := (occupancy.sum / interval_scans)
+ ][
+ , speed := ifelse(
+ # if volume is not 0 and occupancy.pct is gte occupancy threshold
+ volume.sum != 0 & occupancy.pct >= occupancy_pct_threshold,
+ # speed is volume * number of interval periods in a single hour * field length in miles
+ # divided by occupancy
+ (volume.sum * (60 / interval_length_min) * field_length_miles) / occupancy.pct,
+ # otherwise, NA
+ NA
+ )
+ ]
+ } else {
+ # if the interval length is greater than or equal to 1 hour
+ # create hour bins
+ bins <- seq(0, 24, interval_length)
+
+ # add hour bins to sensor_data
+ sensor_data[
+ , date := data.table::as.IDate(date)
+ ][
+ , year := data.table::year(date)
+ ][
+ , interval_bin := findInterval(sensor_data$hour, bins)
+ ]
+
+ # order by date
+ data.table::setorder(sensor_data, date)
+
+ sensor_data_agg <- sensor_data[
+ , as.list(unlist(lapply(.SD, function(x) {
+ list(
+ # create sum, mean, and pct.null columns for volume and occupancy
+ # resulting in "volume.sum", "volume.mean", etc.
+ # grouped by date, interval, and sensor
+ sum = round(sum(x, na.rm = TRUE)),
+ mean = mean(x, na.rm = TRUE),
+ pct.null = round(100 * sum(is.na(x)) / length(x), 1)
+ )
+ }))),
+ by = .(date, interval_bin, sensor),
+ .SDcols = c("volume", "occupancy")
+ ][
+ # if total occupancy is gte the number of interval scans, make NA
+ , occupancy.sum := ifelse(occupancy.sum >= interval_scans, NA, occupancy.sum)
+ ][
+ # calculate occupancy percent of all interval scans
+ , occupancy.pct := (occupancy.sum / interval_scans)
+ ][
+ , speed := ifelse(
+ # if volume is not 0 and occupancy percent meets threshold value,
+ volume.sum != 0 & occupancy.pct >= occupancy_pct_threshold,
+ # calculate speed
+ # volume * field length in miles
+ # divided by occupancy
+ # all over the interval length in hours
+ ((volume.sum * field_length_miles) /
+ (occupancy.pct)) / interval_length,
+ # otherwise, speed is NA
+ NA
+ )
+ ]
+ }
+
+ return(sensor_data_agg)
+}
diff --git a/R/generate_spatial_lines.R b/R/generate_spatial_lines.R
index f8287bc..0b43374 100644
--- a/R/generate_spatial_lines.R
+++ b/R/generate_spatial_lines.R
@@ -41,7 +41,8 @@
#' }
#'
#' @import data.table
-#' @importFrom sf st_as_sf st_cast st_set_crs st_length
+#' @importFrom sf st_as_sf st_cast st_set_crs st_length st_make_valid
+#' @importFrom units set_units
#' @importFrom dplyr mutate group_by summarize
generate_spatial_lines <- function(config) {
# browser()
@@ -59,13 +60,14 @@ generate_spatial_lines <- function(config) {
# keyby = .(corridor_id)]
config_coords <- as.data.table(config)[r_node_n_type == "Station", ][
- , corridor_category := ifelse(corridor_route == "I-35" & r_node_lat > 45, "I35 north of cities",
- ifelse(corridor_route == "I-35" & r_node_lat <= 45, "I35 south of cities",
- ifelse(corridor_route == "T.H.5" & r_node_lon < -93.3, "5 west of cities",
- ifelse(corridor_route == "T.H.5" & r_node_lon > -93.3, "5 east of cities", "Other")
+ , corridor_category :=
+ ifelse(corridor_route == "I-35" & r_node_lat > 45, "I35 north of cities",
+ ifelse(corridor_route == "I-35" & r_node_lat <= 45, "I35 south of cities",
+ ifelse(corridor_route == "T.H.5" & r_node_lon < -93.3, "5 west of cities",
+ ifelse(corridor_route == "T.H.5" & r_node_lon > -93.3, "5 east of cities", "Other")
+ )
)
)
- )
][
, corridor_id := paste(corridor_route, corridor_dir, corridor_category, sep = "_")
][
@@ -75,9 +77,12 @@ generate_spatial_lines <- function(config) {
lines_sf <- sf::st_as_sf(config_coords, coords = c("r_node_lon", "r_node_lat")) %>%
dplyr::group_by(corridor_id) %>%
dplyr::summarise(do_union = FALSE, .groups = "keep") %>%
- sf::st_cast("LINESTRING") %>%
+ sf::st_make_valid() %>%
+    {suppressMessages(sf::st_cast(., "LINESTRING", warn = FALSE))} %>%
sf::st_set_crs(4326) %>%
- dplyr::mutate(length_miles = as.numeric(sf::st_length(geometry)) * 0.00062137)
+ dplyr::mutate(length_miles = sf::st_length(geometry) %>%
+ units::set_units("mile") %>%
+ as.numeric())
return(lines_sf)
}
diff --git a/R/pull_configuration.R b/R/pull_configuration.R
index 5e703e3..2b15d7f 100644
--- a/R/pull_configuration.R
+++ b/R/pull_configuration.R
@@ -4,9 +4,9 @@
#' Useful for mapping (contains lat/lons) and calculating performance measures (contains detector_field).
#'
#' @param return_opt character, indicate how to return the data.
-#' \code{"within_dir"} will return the data within the directory as a csv entitled
+#' `"within_dir"` will return the data within the directory as a csv entitled
#' "Configuration of Metro Detectors YYYY-MM-DD".
-#' \code{"in-memory"} will return the data in R, but requires assignment.
+#' `"in-memory"` will return the data in R, but requires assignment.
#' @param .quiet logical, whether to hide messages. Default is `TRUE`
#'
#' @return dataframe containing 20 variables, including detector_field and lat/lons,
diff --git a/R/pull_sensor.R b/R/pull_sensor.R
index 347f36e..6105a2b 100644
--- a/R/pull_sensor.R
+++ b/R/pull_sensor.R
@@ -2,12 +2,12 @@
#'
#' @description Create a tidy data frame, containing volume and occupancy,
#' for a single date and sensor.
-#' Use \code{\link{pull_sensor_ids}} to obtain metro sensor IDs.
+#' Use [pull_sensor_ids()] to obtain metro sensor IDs.
#'
#' @param pull_date character, the date of data to pull.
-#' Needs to by in "YYYY-MM-DD" format.
+#' Needs to be in `"YYYY-MM-DD"` format.
#' @param sensor character, the sensor ID.
-#' See documentation for \code{\link{pull_sensor_ids}} to obtain metro sensor IDs.
+#' See [pull_sensor_ids()] to obtain metro sensor IDs.
#' @param fill_gaps logical, whether to fill gaps in the time series with `NA`
#' values. Default is `TRUE`
#' @inheritParams pull_configuration
@@ -21,7 +21,7 @@
#' usually results in a file that is around ~30-31KB.
#'
#' Approximate time to pull one sensor's and one extension's
-#' (v or c for volume or occupancy, respectively) data across
+#' ("v" or "c" for volume or occupancy, respectively) data across
#' a year on a Mac is 1.33 minutes.
#'
#' Also note that if you assign `pull_sensor()`'s output, the result is returned in-memory,
@@ -69,8 +69,6 @@ pull_sensor <- function(sensor, pull_date,
fill_gaps = TRUE,
.quiet = TRUE) {
# browser()
- # exts <- c("v", "c")
- # loops_ls <- map(exts, extension_pull)
volume <- extension_pull("v", "volume", pull_date = pull_date, sensor = sensor, quiet = .quiet)
occupancy <- extension_pull("c", "occupancy", pull_date = pull_date, sensor = sensor, quiet = .quiet)
@@ -97,12 +95,11 @@ pull_sensor <- function(sensor, pull_date,
)
)
} else if (fill_gaps == FALSE) {
- # Return essentially empty data.table if both volume and occupancy are missing for entire day
+ # Return empty data.table if both volume and occupancy are missing for entire day
loop_date_sensor[, `:=`(hour = NA, min = NA)]
}
} else {
# Add hour and minutes if either volume or occupancy (or both) are available
-
loop_date_sensor[, `:=`(
hour = rep(0:23, each = 120),
min = rep(seq(0, 59.5, by = 0.5), 24)
@@ -116,9 +113,9 @@ pull_sensor <- function(sensor, pull_date,
#' Pull extension
#'
-#' @param ext string, either \code{"v"} for volume or \code{"c"} for occupancy
-#' @param ext_name string, either \code{"volume"} or \code{"occupancy"}
-#' @param quiet logical, whether to hide messages. Default is TRUE
+#' @param ext string, either `"v"` for volume or `"c"` for occupancy
+#' @param ext_name string, either `"volume"` or `"occupancy"`
+#' @param quiet logical, whether to hide messages. Default is `TRUE`
#' @inheritParams pull_sensor
#' @keywords internal
#'
@@ -127,6 +124,7 @@ pull_sensor <- function(sensor, pull_date,
#' @export
extension_pull <- function(ext, ext_name, sensor, pull_date, quiet = TRUE) {
# browser()
+
pull_year <- format.Date(as.Date(pull_date, format = "%Y-%m-%d"), "%Y")
pull_month <- format.Date(as.Date(pull_date, format = "%Y-%m-%d"), "%m")
pull_day <- format.Date(as.Date(pull_date, format = "%Y-%m-%d"), "%d")
diff --git a/R/scrub.R b/R/scrub.R
index f1ed939..d94112b 100644
--- a/R/scrub.R
+++ b/R/scrub.R
@@ -23,16 +23,14 @@ scrub_sensor <- function(sensor_data, interval_length = NA) {
#'
#' @details
#' ## Criteria
-#' - Hourly
-#' - total hourly occupancy exceeds 216,000 scans
-#' - total hourly volume exceeds 2,300 cars
#' - 30-sec
-#' - total 30-second volume exceeds 20 cars
-#' - total 30-second occupancy exceed 1,800 scans
-#' - Percent nulls > 10.
-#'
-#' @author@R c(person("Ashley", "Asmus"),
-#' person("Liz", "Roten"))
+#' - Total 30-second volume exceeds 20 cars
+#' - Total 30-second occupancy exceeds 1,800 scans
+#' - Hourly
+#' - Total hourly occupancy exceeds 216,000 scans (60 scans per second \* 60 secs per min \*
+#' 60 mins per hour = 216,000 scans per hour)
+#' - Total hourly volume exceeds 2,300 cars
+#' - Percent nulls >= 10%
#'
replace_impossible <- function(sensor_data,
interval_length = NA) {
@@ -40,20 +38,45 @@ replace_impossible <- function(sensor_data,
cli::cli_abort("More than one sensor is in this dataset.")
}
+ # if interval length is NA (no aggregation)
if (is.na(interval_length)) {
+ # check that each date has 2,880 rows
if (nrow(sensor_data) != 2880 * length(unique(sensor_data$date))) {
cli::cli_abort("For multiple dates, you must have at least 2,880 rows for each date you want covered.")
}
- sensor_data[, volume := ifelse(volume >= 20, NA, volume)][, occupancy := ifelse(occupancy >= 1800, NA, occupancy)]
+ sensor_data[
+ # if volume is gte 20 vehicles, make NA
+ , volume := ifelse(volume >= 20, NA, volume)
+ ][
+ # if occupancy is gte 1800 scans, make NA
+ , occupancy := ifelse(occupancy >= 1800, NA, occupancy)
+ ]
} else {
if (interval_length > 24) {
cli::cli_abort("Interval cannot exceed 24 hours.")
}
- sensor_data[, volume.sum := ifelse(volume.sum >= (interval_length * 2300), NA, volume.sum)][, occupancy.sum := ifelse(occupancy.sum >= (interval_length * 216000), NA, occupancy.sum)][, volume.sum := ifelse(volume.pct.null >= 10, NA, volume.sum)][, occupancy.sum := ifelse(occupancy.pct.null >= 10, NA, occupancy.sum)][, speed := ifelse(is.na(volume.sum), NA, speed)][, speed := ifelse(is.na(occupancy.sum), NA, speed)]
+ sensor_data[
+ # if the total volume is gte 2,300 vehicles per hour, make NA
+ , volume.sum := ifelse(volume.sum >= (interval_length * 2300), NA, volume.sum)
+ ][
+ # if occupancy is gte 216,000 scans per hour, make NA
+ , occupancy.sum := ifelse(occupancy.sum >= (interval_length * 216000), NA, occupancy.sum)
+ ][
+      # if the percent of null volume values is gte 10, make volume.sum NA
+ , volume.sum := ifelse(volume.pct.null >= 10, NA, volume.sum)
+ ][
+      # if the percent of null occupancy values is gte 10, make occupancy.sum NA
+ , occupancy.sum := ifelse(occupancy.pct.null >= 10, NA, occupancy.sum)
+ ][
+ # if volume is NA, make speed NA
+ , speed := ifelse(is.na(volume.sum), NA, speed)
+ ][
+ # if occupancy is NA, make speed NA
+ , speed := ifelse(is.na(occupancy.sum), NA, speed)
+ ]
}
return(sensor_data)
- # 60 scans per second * 60 secs per min * 60 mins per hour = 216,000 scans per hour
}
diff --git a/R/utils-pipe.R b/R/utils-pipe.R
index fd0b1d1..b0dc29e 100644
--- a/R/utils-pipe.R
+++ b/R/utils-pipe.R
@@ -1,6 +1,6 @@
#' Pipe operator
#'
-#' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
+#' See [`magrittr::%>%`][magrittr::pipe] for details.
#'
#' @name %>%
#' @rdname pipe
diff --git a/man/aggregate_sensor.Rd b/man/aggregate_sensor.Rd
index a5ecc1f..88c7cab 100644
--- a/man/aggregate_sensor.Rd
+++ b/man/aggregate_sensor.Rd
@@ -31,7 +31,7 @@ values at the raw data level. Only applies if \code{replace_impossible} is \code
that this option increases the function runtime.}
\item{occupancy_pct_threshold}{numeric, the lowest possible occupancy percentage
-to use when calculating speed. Default is \code{0.0020} or 0.2\%. Increasing the threshold
+to use when calculating speed. Default is \code{0.0020} or 0.2\%. Increasing the threshold
results in more stable speed values, while lowering it may increase speed variability.
A higher occupancy threshold is recommended for shorter interval lengths}
}
@@ -53,7 +53,7 @@ Aggregate raw sensor data to a chosen level
\details{
\subsection{Calculating speed}{
-\if{html}{\out{
}}\preformatted{There are 60 scans per second, which means there are 60*60 = 1,800 scans per
+\if{html}{\out{
}}\preformatted{There are 60 scans per second, which means there are 60 * 30 = 1,800 scans per
30-second interval. The occupancy value in the 30-second interval data
represents the number of scans that were occupied of the 1,800 scans in that
interval.
@@ -61,6 +61,10 @@ interval.
With 60 scans per second, 60 seconds per minute there are 3,600 scans per minute.
With 3,600 scans per minute, 60 minutes per hour there are 216,000 scans per hour.
To find the number of scans in 15 minutes, we can multiply 0.25 * 216000 = 54,000 scans.
+
+Speed, in miles per hour, is calculated by multiplying the number of
+vehicles per hour by the detector field length in miles, then dividing by
+the occupancy percentage for the given interval.
}\if{html}{\out{
}}
}
@@ -68,15 +72,18 @@ To find the number of scans in 15 minutes, we can multiply 0.25 * 216000 = 54,00
\if{html}{\out{
}}\preformatted{Any observation with a volume that exceeds 20 vehicles or an occupancy that exceeds 1,800 scans
will be replaced with `NA`. It is impossible for more than twenty vehicles to pass over a sensor
-in only 30 seconds, and the maximum number of scans in 30 seconds is 1,800 (60 scans/second * 30 seconds).
-
-### Interpolating missing values
+in only 30 seconds, and the maximum number of scans in 30 seconds is 1,800
+(60 scans/second * 30 seconds).
+}\if{html}{\out{
}}
+\subsection{Interpolating missing values}{
- `interpolate_missing` indicates whether to interpolate missing volume and occupancy values
- at the raw data level. The interpolated value for a given observation is the mean of
- the two observations on either side of the observation. This method preserves the variable's
- overall distribution.
+\if{html}{\out{
}}\preformatted{`interpolate_missing` indicates whether to interpolate missing volume and occupancy values
+at the raw data level. The interpolated value for a given observation is the mean of
+the two observations on either side of the observation. This method preserves the variable's
+overall distribution.
}\if{html}{\out{
}}
+}
+
}
}
\examples{
diff --git a/man/extension_pull.Rd b/man/extension_pull.Rd
index a52b8bb..7590351 100644
--- a/man/extension_pull.Rd
+++ b/man/extension_pull.Rd
@@ -12,12 +12,12 @@ extension_pull(ext, ext_name, sensor, pull_date, quiet = TRUE)
\item{ext_name}{string, either \code{"volume"} or \code{"occupancy"}}
\item{sensor}{character, the sensor ID.
-See documentation for \code{\link{pull_sensor_ids}} to obtain metro sensor IDs.}
+See \code{\link[=pull_sensor_ids]{pull_sensor_ids()}} to obtain metro sensor IDs.}
\item{pull_date}{character, the date of data to pull.
-Needs to by in "YYYY-MM-DD" format.}
+Needs to be in \code{"YYYY-MM-DD"} format.}
-\item{quiet}{logical, whether to hide messages. Default is TRUE}
+\item{quiet}{logical, whether to hide messages. Default is \code{TRUE}}
}
\value{
a tibble
diff --git a/man/pipe.Rd b/man/pipe.Rd
index a648c29..250ab19 100644
--- a/man/pipe.Rd
+++ b/man/pipe.Rd
@@ -15,6 +15,6 @@ lhs \%>\% rhs
The result of calling \code{rhs(lhs)}.
}
\description{
-See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
+See \code{\link[magrittr:pipe]{magrittr::\%>\%}} for details.
}
\keyword{internal}
diff --git a/man/pull_configuration.Rd b/man/pull_configuration.Rd
index 707ec0d..fa4d360 100644
--- a/man/pull_configuration.Rd
+++ b/man/pull_configuration.Rd
@@ -74,7 +74,7 @@ pull_configuration("within_dir") # No assignment necessary
}
\seealso{
Other loop sensor functions:
-\code{\link{pull_sensor_ids}()},
-\code{\link{pull_sensor}()}
+\code{\link{pull_sensor}()},
+\code{\link{pull_sensor_ids}()}
}
\concept{loop sensor functions}
diff --git a/man/pull_sensor.Rd b/man/pull_sensor.Rd
index 1114a04..116bca0 100644
--- a/man/pull_sensor.Rd
+++ b/man/pull_sensor.Rd
@@ -8,10 +8,10 @@ pull_sensor(sensor, pull_date, fill_gaps = TRUE, .quiet = TRUE)
}
\arguments{
\item{sensor}{character, the sensor ID.
-See documentation for \code{\link{pull_sensor_ids}} to obtain metro sensor IDs.}
+See \code{\link[=pull_sensor_ids]{pull_sensor_ids()}} to obtain metro sensor IDs.}
\item{pull_date}{character, the date of data to pull.
-Needs to by in "YYYY-MM-DD" format.}
+Needs to be in \code{"YYYY-MM-DD"} format.}
\item{fill_gaps}{logical, whether to fill gaps in the time series with \code{NA}
values. Default is \code{TRUE}}
@@ -24,7 +24,7 @@ data frame containing variables volume, occupancy, sensor, date, time.
\description{
Create a tidy data frame, containing volume and occupancy,
for a single date and sensor.
-Use \code{\link{pull_sensor_ids}} to obtain metro sensor IDs.
+Use \code{\link[=pull_sensor_ids]{pull_sensor_ids()}} to obtain metro sensor IDs.
}
\details{
\subsection{Output}{
@@ -33,7 +33,7 @@ Use \code{\link{pull_sensor_ids}} to obtain metro sensor IDs.
usually results in a file that is around ~30-31KB.
Approximate time to pull one sensor's and one extension's
- (v or c for volume or occupancy, respectively) data across
+ ("v" or "c" for volume or occupancy, respectively) data across
a year on a Mac is 1.33 minutes.
Also note that if you assign `pull_sensor()`'s output, the result is returned in-memory,
diff --git a/man/replace_impossible.Rd b/man/replace_impossible.Rd
index ea120e4..68b5803 100644
--- a/man/replace_impossible.Rd
+++ b/man/replace_impossible.Rd
@@ -24,17 +24,14 @@ Replace impossible volume and occupancy values with \code{NA} at given interval
\details{
\subsection{Criteria}{
-\if{html}{\out{
}}\preformatted{- Hourly
- - total hourly occupancy exceeds 216,000 scans
- - total hourly volume exceeds 2,300 cars
-- 30-sec
- - total 30-second volume exceeds 20 cars
- - total 30-second occupancy exceed 1,800 scans
-- Percent nulls > 10.
+\if{html}{\out{
}}\preformatted{- 30-sec
+ - Total 30-second volume exceeds 20 cars
+  - Total 30-second occupancy exceeds 1,800 scans
+- Hourly
+ - Total hourly occupancy exceeds 216,000 scans (60 scans per second \\* 60 secs per min \\*
+ 60 mins per hour = 216,000 scans per hour)
+ - Total hourly volume exceeds 2,300 cars
+- Percent nulls >= 10\%
}\if{html}{\out{
}}
}
}
-\author{
-@R c(person("Ashley", "Asmus"),
-person("Liz", "Roten"))
-}
diff --git a/man/tc.sensors-package.Rd b/man/tc.sensors-package.Rd
index d488dfa..2324813 100644
--- a/man/tc.sensors-package.Rd
+++ b/man/tc.sensors-package.Rd
@@ -8,7 +8,7 @@
\description{
\if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}}
-Process data collected from Minnesota Department of Transportation (MnDOT) loop detectors installed on the Minnesota Freeway system in 30-second interval measurements of occupancy and volume, data which are pushed daily to a public JSON feed. Occupancy and volume data can be used to calculate speed and delay.
+Process data collected from Minnesota Department of Transportation (MnDOT) loop detectors installed on the Minnesota Freeway system. Data are published to a public JSON feed in 30-second interval measurements of occupancy and volume. Occupancy and volume data can be used to calculate speed and delay.
}
\seealso{
Useful links:
@@ -20,14 +20,10 @@ Useful links:
\author{
\strong{Maintainer}: Liz Roten \email{liz.roten@metc.state.mn.us} (\href{https://orcid.org/0000-0002-5346-3549}{ORCID})
-Authors:
-\itemize{
- \item Nicole Sullivan \email{nicole.sullivan@metc.state.mn.us}
-}
-
Other contributors:
\itemize{
\item Metropolitan Council [copyright holder]
+ \item Nicole Sullivan \email{nicole.sullivan@metc.state.mn.us} [contributor]
\item Ashley Asmus \email{ashley.asmus@metc.state.mn.us} (\href{https://orcid.org/0000-0001-5505-1372}{ORCID}) [contributor]
\item Yaxuan Zhang \email{yaxuan.zhang@metc.state.mn.us} (\href{https://orcid.org/0000-0003-2343-8307}{ORCID}) [contributor]
}
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index 58fb9c0..63b6a89 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -2,4 +2,12 @@
yesterday <- as.Date(Sys.Date() - 3)
-config <- pull_configuration()
+config_raw <- pull_configuration()
+
+config <- pull_configuration() %>%
+ dplyr::filter(
+ detector_abandoned == "f",
+ detector_category == "",
+ r_node_n_type == "Station"
+ ) %>%
+ data.table::as.data.table()
diff --git a/tests/testthat/test-add_weather.R b/tests/testthat/test-add_weather.R
index a67384b..938ab86 100644
--- a/tests/testthat/test-add_weather.R
+++ b/tests/testthat/test-add_weather.R
@@ -1,45 +1,44 @@
testthat::skip_on_ci()
testthat::test_that("Weather data functions as expected", {
- config_sample <- dplyr::filter(config, config$detector_abandoned == "f") %>%
- dplyr::sample_n(1)
-
- sensor_results <- pull_sensor(
- sensor = config_sample$detector_name[[1]],
- pull_date = yesterday,
- fill_gaps = TRUE
- )
-
-
-
- # test aggregation at 15 minutes----------------------------------------------
-
- # test aggregation at 1 hour--------------------------------------------------
-
- agg_hour <- aggregate_sensor(sensor_results,
- interval_length = 1,
- config = config_sample
+ testthat::try_again(
+ times = 5,
+ code = {
+ config_sample <- config %>%
+ dplyr::sample_n(1)
+
+ sensor_results <- pull_sensor(
+ sensor = config_sample$detector_name[[1]],
+ pull_date = yesterday,
+ fill_gaps = TRUE
+ )
+
+ # test aggregation at 1 hour--------------------------------------------------
+
+ agg_hour <- aggregate_sensor(sensor_results,
+ interval_length = 1,
+ config = config_sample
+ )
+
+ # test argument checks--------------------------------------------------------
+ testthat::expect_error(add_weather(agg_hour,
+ interval_length = 0.25
+ ))
+
+ agg_hour_weather <- add_weather(agg_hour,
+ interval_length = 1
+ )
+
+ testthat::expect_equal(dim(agg_hour_weather)[[1]], 24)
+
+ # test aggregation at 24 hours------------------------------------------------
+ agg_day_weather <- aggregate_sensor(sensor_results,
+ interval_length = 24,
+ config = config_sample
+ ) %>%
+ add_weather(interval_length = 24)
+
+ testthat::expect_equal(dim(agg_day_weather)[[1]], 1)
+ }
)
-
-
- # test argument checks--------------------------------------------------------
- testthat::expect_error(add_weather(agg_hour,
- interval_length = 0.25
- ))
-
-
- agg_hour_weather <- add_weather(agg_hour,
- interval_length = 1
- )
-
- testthat::expect_equal(dim(agg_hour_weather)[[1]], 24)
-
- # test aggregation at 24 hours------------------------------------------------
- agg_day_weather <- aggregate_sensor(sensor_results,
- interval_length = 24,
- config = config_sample
- ) %>%
- add_weather(interval_length = 24)
-
- testthat::expect_equal(dim(agg_day_weather)[[1]], 1)
})
diff --git a/tests/testthat/test-aggregate_sensor.R b/tests/testthat/test-aggregate_sensor.R
index 7e6a48c..5d90dc4 100644
--- a/tests/testthat/test-aggregate_sensor.R
+++ b/tests/testthat/test-aggregate_sensor.R
@@ -4,7 +4,7 @@ testthat::test_that("Aggregation functions as expected", {
# re-try this code with different sensors if it fails
times = 5,
code = {
- config_sample <- dplyr::filter(config, config$detector_abandoned == "f") %>%
+ config_sample <- config %>%
dplyr::sample_n(1)
sensor_results <- pull_sensor(
@@ -13,7 +13,6 @@ testthat::test_that("Aggregation functions as expected", {
fill_gaps = TRUE
)
-
# test aggregation at 15 minutes----------------------------------------------
agg <- aggregate_sensor(sensor_results,
interval_length = 0.25,
@@ -44,10 +43,14 @@ testthat::test_that("Aggregation functions as expected", {
round(mean(agg_hour$volume.mean))
)
- testthat::expect_equal(sum(sensor_results$occupancy, na.rm = TRUE), sum(agg_hour$occupancy.sum, na.rm = TRUE))
+ testthat::expect_equal(
+ sum(sensor_results$occupancy, na.rm = TRUE),
+ sum(agg_hour$occupancy.sum, na.rm = TRUE)
+ )
ifelse(!is.na(agg$speed),
- testthat::expect_true(round(mean(agg$speed, na.rm = TRUE)) - round(mean(agg_hour$speed, na.rm = TRUE)) < 3), NA
+ testthat::expect_true(round(mean(agg$speed, na.rm = TRUE)) -
+ round(mean(agg_hour$speed, na.rm = TRUE)) < 3), NA
)
# test aggregation at 24 hours------------------------------------------------
agg_day <- aggregate_sensor(sensor_results,
diff --git a/tests/testthat/test-generate_spatial_lines.R b/tests/testthat/test-generate_spatial_lines.R
index 4c40f7d..908a22d 100644
--- a/tests/testthat/test-generate_spatial_lines.R
+++ b/tests/testthat/test-generate_spatial_lines.R
@@ -1,6 +1,4 @@
-test_that("spatial lines are generated", {
- config <- as.data.table(pull_configuration())
-
+testthat::test_that("spatial lines are generated", {
lines_sf <- generate_spatial_lines(config = config)
testthat::expect_true("sf" %in% class(lines_sf))
diff --git a/tests/testthat/test-pull_configuration.R b/tests/testthat/test-pull_configuration.R
index 0d7deba..e9d73a3 100644
--- a/tests/testthat/test-pull_configuration.R
+++ b/tests/testthat/test-pull_configuration.R
@@ -1,5 +1,5 @@
testthat::test_that("Pull configuration returns 9,114 x 20 tibble", {
- testthat::expect_equal(class(config)[[1]], "tbl_df")
- testthat::expect_true(dim(config)[[1]] > 9000)
- testthat::expect_equal(dim(config)[[2]], 20)
+ testthat::expect_equal(class(config_raw)[[1]], "tbl_df")
+ testthat::expect_true(dim(config_raw)[[1]] > 9000)
+ testthat::expect_equal(dim(config_raw)[[2]], 20)
})
diff --git a/tests/testthat/test-pull_sensor.R b/tests/testthat/test-pull_sensor.R
index 3c4daa9..4e5545f 100644
--- a/tests/testthat/test-pull_sensor.R
+++ b/tests/testthat/test-pull_sensor.R
@@ -1,15 +1,20 @@
testthat::test_that("Test that data can be pulled from a random sensor id for yesterday's date", {
- config_sample <- dplyr::filter(config, config$detector_abandoned == "f") %>%
- dplyr::sample_n(1)
+ testthat::try_again(
+ times = 5,
+ code = {
+ config_sample <- config %>%
+ dplyr::sample_n(1)
- sensor_results <- pull_sensor(
- sensor = config_sample$detector_name[[1]],
- pull_date = yesterday,
- fill_gaps = TRUE
- )
+ sensor_results <- pull_sensor(
+ sensor = config_sample$detector_name[[1]],
+ pull_date = yesterday,
+ fill_gaps = TRUE
+ )
- testthat::expect_equal(class(sensor_results)[[1]], "data.table")
- testthat::expect_equal(dim(sensor_results)[[1]], 2880)
- testthat::expect_equal(dim(sensor_results)[[2]], 6)
+ testthat::expect_equal(class(sensor_results)[[1]], "data.table")
+ testthat::expect_equal(dim(sensor_results)[[1]], 2880)
+ testthat::expect_equal(dim(sensor_results)[[2]], 6)
+ }
+ )
})
diff --git a/tests/testthat/test-replace_impossible.R b/tests/testthat/test-replace_impossible.R
index cdd2c70..dbd89cf 100644
--- a/tests/testthat/test-replace_impossible.R
+++ b/tests/testthat/test-replace_impossible.R
@@ -1,16 +1,8 @@
testthat::test_that("Impossible values are replaced", {
- config_sample <- dplyr::filter(config, config$detector_abandoned == "f") %>%
- dplyr::sample_n(1)
-
- sensor_results <- pull_sensor(
- sensor = config_sample$detector_name[[1]],
- pull_date = yesterday,
- fill_gaps = TRUE
- )
-
- ifelse(is.na(sensor_results$volume),
- {
- config_sample <- dplyr::filter(config, config$detector_abandoned == "f") %>%
+ testthat::try_again(
+ times = 5,
+ code = {
+ config_sample <- config %>%
dplyr::sample_n(1)
sensor_results <- pull_sensor(
@@ -18,70 +10,83 @@ testthat::test_that("Impossible values are replaced", {
pull_date = yesterday,
fill_gaps = TRUE
)
- },
- NA
- )
- imp_rem <- replace_impossible(
- sensor_data = data.table::as.data.table(sensor_results),
- interval_length = NA
- )
+ ifelse(is.na(sensor_results$volume),
+ {
+ config_sample <- config %>%
+ dplyr::sample_n(1)
- testthat::expect_true(max(imp_rem$volume, na.rm = TRUE) < 20 | is.na(max(imp_rem$volume)))
- testthat::expect_true(max(imp_rem$occupancy, na.rm = TRUE) < 1800 | is.na(max(imp_rem$occupancy)))
+ sensor_results <- pull_sensor(
+ sensor = config_sample$detector_name[[1]],
+ pull_date = yesterday,
+ fill_gaps = TRUE
+ )
+ },
+ NA
+ )
+
+ imp_rem <- replace_impossible(
+ sensor_data = data.table::as.data.table(sensor_results),
+ interval_length = NA
+ )
- # test aggregation at 15 minutes----------------------------------------------
- agg <- aggregate_sensor(sensor_results,
- interval_length = 0.25,
- config = config_sample
- ) %>%
- replace_impossible(interval_length = 0.25)
+ testthat::expect_true(max(imp_rem$volume, na.rm = TRUE) < 20 | is.na(max(imp_rem$volume)))
+ testthat::expect_true(max(imp_rem$occupancy, na.rm = TRUE) < 1800 | is.na(max(imp_rem$occupancy)))
- testthat::expect_true(max(agg$volume.sum, na.rm = TRUE) < 0.25 * 2300)
- testthat::expect_true(max(agg$occupancy.sum, na.rm = TRUE) < 0.25 * 216000)
+ # test aggregation at 15 minutes----------------------------------------------
+ agg <- aggregate_sensor(sensor_results,
+ interval_length = 0.25,
+ config = config_sample
+ ) %>%
+ replace_impossible(interval_length = 0.25)
- testthat::expect_equal(dim(agg)[[1]], 96)
+ testthat::expect_true(max(agg$volume.sum, na.rm = TRUE) < 0.25 * 2300)
+ testthat::expect_true(max(agg$occupancy.sum, na.rm = TRUE) < 0.25 * 216000)
- # test aggregation at 1 hour--------------------------------------------------
- agg_hour <- aggregate_sensor(sensor_results,
- interval_length = 1,
- config = config_sample
- ) %>%
- replace_impossible(interval_length = 1)
+ testthat::expect_equal(dim(agg)[[1]], 96)
- testthat::expect_true(max(agg_hour$volume.sum, na.rm = TRUE) < 2300)
- testthat::expect_true(max(agg_hour$occupancy.sum, na.rm = TRUE) < 216000)
+ # test aggregation at 1 hour--------------------------------------------------
+ agg_hour <- aggregate_sensor(sensor_results,
+ interval_length = 1,
+ config = config_sample
+ ) %>%
+ replace_impossible(interval_length = 1)
- # test aggregation at 24 hours------------------------------------------------
- agg_day <- aggregate_sensor(sensor_results,
- interval_length = 24,
- config = config_sample
- ) %>%
- replace_impossible(interval_length = 24)
+ testthat::expect_true(max(agg_hour$volume.sum, na.rm = TRUE) < 2300)
+ testthat::expect_true(max(agg_hour$occupancy.sum, na.rm = TRUE) < 216000)
- testthat::expect_true(max(agg_day$volume.sum, na.rm = TRUE) < 24 * 2300)
- testthat::expect_true(max(agg_day$occupancy.sum, na.rm = TRUE) < 24 * 216000)
+ # test aggregation at 24 hours------------------------------------------------
+ agg_day <- aggregate_sensor(sensor_results,
+ interval_length = 24,
+ config = config_sample
+ ) %>%
+ replace_impossible(interval_length = 24)
+ testthat::expect_true(max(agg_day$volume.sum, na.rm = TRUE) < 24 * 2300)
+ testthat::expect_true(max(agg_day$occupancy.sum, na.rm = TRUE) < 24 * 216000)
- # test argument checks--------------------------------------------------------
- testthat::expect_error(replace_impossible(sensor_results,
- interval_length = 48
- ))
- testthat::expect_error(
- replace_impossible(
- rbind(
- sensor_results,
- data.table::data.table(
- volume = 10,
- occupancy = 12,
- date = Sys.Date(),
- sensor = config_sample$detector_name,
- hour = 0,
- min = 30
+ # test argument checks--------------------------------------------------------
+ testthat::expect_error(replace_impossible(sensor_results,
+ interval_length = 48
+ ))
+
+ testthat::expect_error(
+ replace_impossible(
+ rbind(
+ sensor_results,
+ data.table::data.table(
+ volume = 10,
+ occupancy = 12,
+ date = Sys.Date(),
+ sensor = config_sample$detector_name,
+ hour = 0,
+ min = 30
+ )
+ ),
+ interval_length = 24
)
- ),
- interval_length = 24
- )
+ )
+ }
)
})
diff --git a/vignettes/calculate_speed_delay.Rmd b/vignettes/calculate_speed_delay.Rmd
index 10354f0..88de21b 100644
--- a/vignettes/calculate_speed_delay.Rmd
+++ b/vignettes/calculate_speed_delay.Rmd
@@ -10,7 +10,6 @@ vignette: >
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
-
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
@@ -22,15 +21,18 @@ knitr::opts_chunk$set(
)
options(scipen = 9999)
+sensor_year <- 2019
```
-**Warning: the following analysis has not yet been peer reviewed and is subject to change at any time without notice. The analysis pulls data from a single corridor over a two week period, and is intended to illustrate this package's features. This analysis cannot be used to draw conclusions on the corridor's overall performance or the performance of the system as a whole.**
+**Warning: the following analysis has not been peer reviewed and is subject to change at any time without notice. The analysis pulls data from a single corridor over a two week period, and is intended to illustrate this package's features. This analysis cannot be used to draw conclusions on the corridor's overall performance or the performance of the system as a whole.**
+
+This vignette walks through steps to fetch data for a select number of sensors, clean the sensor data, and then calculate speed and delay.
## Pull sensor data
-First, we will pull the data for three random sensors from March 3 to March 16, 2019.
+First, we will pull the data for sensors on TH 610 from March 3 to March 16, `r sensor_year`.
-```{r}
+```{r message=FALSE}
library(tc.sensors)
library(tictoc)
library(dplyr)
@@ -45,20 +47,27 @@ library(scales)
```
-```{r, cache=TRUE}
+```{r pull-sensors, warning=FALSE}
sensor_config <- pull_configuration() %>%
dplyr::filter(
+ detector_abandoned == "f", # detectors that are NOT abandoned
+ # abandoned indicates that the detector is no longer in use
corridor_route == "T.H.610", # single out one corridor
r_node_n_type == "Station", # only station node types
detector_category == "" # only mainline detectors
)
-date_range <- seq(as.Date("2019/03/03"), as.Date("2019/03/16"), by = "days")
+date_range <- seq(
+ as.Date(paste0(sensor_year, "/03/03")),
+ as.Date(paste0(sensor_year, "/03/16")),
+ by = "days"
+)
-plan(multiprocess)
+plan(future::multisession)
-tic() # Start the timer
-sensor_data <- future_map(sensor_config$detector_name,
+tic()
+sensor_data <- future_map(
+ sensor_config$detector_name,
.f = function(det_name) {
future_map_dfr(seq(1:length(date_range)), .f = function(x) {
tc.sensors::pull_sensor(det_name, date_range[[x]])
@@ -66,26 +75,36 @@ sensor_data <- future_map(sensor_config$detector_name,
}
)
toc()
-
-#> 11.92 sec elapsed
```
-Clean data
+## Clean data
+
+The following data cleaning steps and checks are taken in `scrub_sensor()`; a toy example follows the list.
+
+- Data is complete for all sensors (2,880 observations per sensor)
+- Volume and occupancy values are marked `NA` if
+ - Vehicle volume >= 20 vehicles, or 2,300 vehicles per hour
+ - Occupancy >= 1,800 scans, or 216,000 scans per hour
+- The percentage of `NA` values is then calculated for occupancy and volume. If the `NA` percentage is greater than or equal to 10%, the values are replaced with `NA`
+- When calculating speed, if volume or occupancy is `NA`, speed is set to `NA`
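+
+As a quick, hypothetical illustration of the 30-second rules, `replace_impossible()`
+(the helper behind these checks) can be called directly on a toy day of data; it
+expects 2,880 rows per date:
+
+```{r replace-impossible-toy}
+toy <- data.table::data.table(
+  volume = c(25, rep(5, 2879)), # 25 vehicles in 30 seconds is impossible
+  occupancy = c(2000, rep(100, 2879)), # 2,000 scans exceeds the 1,800 maximum
+  date = as.Date("2019-03-03"),
+  sensor = "100",
+  hour = rep(0:23, each = 120),
+  min = rep(seq(0, 59.5, by = 0.5), 24)
+)
+
+replace_impossible(toy, interval_length = NA)[1, .(volume, occupancy)]
+```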
-```{r}
-clean_sensor_data <- purrr::map(sensor_data,
+There is ongoing discussion about whether a sensor can report valid occupancy alongside invalid volume, and vice versa.
+
+```{r clean-sensors}
+clean_sensor_data <- purrr::map(
+ sensor_data,
scrub_sensor,
interval_length = NA
-) # replace
+)
```
-## Aggregate sensor data and calculate speed
+## Aggregate and calculate speed
To calculate speed and evaluate our speed calculation, we are going to look at two different aggregation intervals: 10 minutes and 60 minutes (or 1 hour).
To avoid getting unreasonable speed values, we will set an occupancy percentage threshold at 0.002, or 0.2%. This functions somewhat like a sample size cut-off; if the occupancy is too low, it's more likely that the speed value returned will be invalid.
-```{r}
+```{r agg-sensors}
agg_ten <- purrr::map(clean_sensor_data,
aggregate_sensor,
config = sensor_config,
@@ -107,32 +126,24 @@ agg_hour <- purrr::map(clean_sensor_data,
The NA values here aren't evenly distributed across all starting hours and day types. There is a greater percentage of null values in the early morning on both weekdays and weekends, which aligns with our knowledge that there is relatively low traffic during these hours.
-```{r}
+```{r sensor-summary}
#> 10min speed summary
summary(agg_ten$speed)
-#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-#> 0.35 54.93 61.98 60.79 68.22 166.91 37879
#> 10 minute speed NA percentage
summary(agg_ten$speed)[7] / nrow(agg_ten)
#> NA's
-#> 0.3296349
-
#> 1 hour speed summary
summary(agg_hour$speed)
-#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-#> 1.716 54.950 61.352 60.436 67.139 92.156 6088
#> 1 hour minute speed NA percentage
summary(agg_hour$speed)[7] / nrow(agg_hour)
-#> NA's
-#> 0.317878
```
We also see this by plotting.
-```{r}
+```{r plot-null-obs}
cowplot::plot_grid(
agg_ten %>%
group_by(day_type, hour) %>%
@@ -194,7 +205,7 @@ ggsave(device = "png", dpi = 72, filename = "plot_speed_nulls.png")
Next, we will plot the relationship between speed, volume, and occupancy for both aggregation levels. The plots below show that the larger interval size seems to stabilize speed values.
-```{r}
+```{r plot-ten-hour-agg}
plot_ten <- ggplot() +
geom_point(
data = agg_ten %>%
@@ -235,10 +246,12 @@ plot_hour <- ggplot() +
alpha = 0.5,
size = 3
) +
- scale_fill_viridis_c(stringr::str_wrap("Relative volume", 5),
+ scale_fill_viridis_c(
+ stringr::str_wrap("Relative volume", 5),
option = "magma", aesthetics = c("fill", "color")
) +
- scale_y_continuous("",
+ scale_y_continuous(
+ "",
labels = scales::percent,
limits = range(agg_ten$occupancy.pct)
) +
@@ -256,13 +269,15 @@ ggsave(device = "png", dpi = 72, filename = "plot_occupancy_speed_volume.png")
## Impute missing speeds
-One option for handling NA values is imputation. We can use the `{mice}` package with a random forest method. A random forest model is relatively easy to understand, handles outliers well, and works with non-linear data well. The random forest also fits with what we know about how speed is dependent on factors like day type and time of day.
+One option for handling `NA` values is imputation. We can use the `{mice}` package with a random forest method. A random forest model is relatively easy to understand, handles outliers well, and works with non-linear data well. The random forest also fits with what we know about how speed is dependent on factors like day type and time of day.
-```{r}
+```{r impute-random-forest}
library(mice)
tictoc::tic()
-rf_imp_day_type <- mice(agg_hour,
+rf_imp_day_type <- mice(
+ agg_hour %>%
+ filter(!is.na(speed)),
method = "rf",
formulas = c(
as.formula("speed ~ volume.sum + occupancy.sum + interval_bin + day_type")
@@ -275,7 +290,7 @@ tictoc::toc()
After we run our random forest, we are going to fetch the complete imputed data and plot the results. By fetching all the imputations, we are able to assess how the model behaves over time.
-```{r}
+```{r compile-imps}
all_imps <- complete(rf_imp_day_type, "all", include = T)
agg_hour_imputed <- map_dfr(seq(1, 6), .f = function(x) {
@@ -287,7 +302,7 @@ agg_hour_imputed <- map_dfr(seq(1, 6), .f = function(x) {
First, we will plot the speed density, taking care to examine weekdays and weekends.
-```{r}
+```{r plot-imputed}
ggplot() +
geom_density(
data = agg_hour_imputed,
@@ -315,7 +330,7 @@ ggplot() +
title = "Imputed and original speed density by day type",
caption = stringr::str_wrap(
paste0(
- "Mainline sensors on T.H.610 corridor, March 3-16, 2019.
+ "Mainline sensors on T.H.610 corridor, March 3-16, ", sensor_year, ".
Random forest speed imputation at one hour interval. "
),
width = 60
@@ -327,14 +342,16 @@ ggplot() +
```{r, include=FALSE}
ggsave(device = "png", dpi = 72, filename = "plot_speed_imputation_density.png")
```
+
Next, we will look at a scatter plot of the imputed speed and hour, again, by weekday and weekend. Here, the color indicates the imputation.
-```{r}
+```{r plot-speed-imputed}
ggplot() +
geom_jitter(
- data = agg_hour_imputed,
+ data = agg_hour_imputed %>%
+ filter(speed != 0),
mapping = aes(
- x = reorder(interval_bin, imp_n),
+ x = interval_bin,
y = speed,
color = imp_n
),
@@ -348,21 +365,23 @@ ggplot() +
y = "Speed",
color = "Imputation",
caption = stringr::str_wrap(paste0(
- "Mainline sensors on T.H.610 corridor, March 3-16, 2019.
+ "Mainline sensors on T.H.610 corridor, March 3-16, ", sensor_year, ".
Random forest speed imputation at one hour interval. "
), width = 60)
) +
theme_minimal()
```
+
![](plot_speed_imputation_hourly.png)
```{r, include=FALSE}
ggsave(device = "png", dpi = 72, filename = "plot_speed_imputation_hourly.png")
```
+
## Distance
To calculate delay, we need the distance between the upstream sensor and the sensor of interest. We can accomplish this using the sensor configuration table and `add_distance()`.
-```{r}
+```{r config-distance}
sensor_config_distance <- tc.sensors::add_distance(sensor_config, interpolate_missing = TRUE)
```
@@ -377,8 +396,7 @@ To calculate the amount of time (in hours) it takes to traverse a given distance
$Time = \frac{Distance}{Speed}$
-
-```{r}
+```{r time-test}
time_test <- complete(rf_imp_day_type) %>%
# sample_n(1000) %>%
left_join(sensor_config_distance %>%
@@ -406,7 +424,7 @@ ref_speed <- time_test %>%
unique()
```
-```{r}
+```{r time-compare}
time_compare <- left_join(time_test, ref_speed, by = c(
"sensor",
"day_type"
@@ -419,9 +437,7 @@ time_compare <- left_join(time_test, ref_speed, by = c(
)
```
-### Plot delay
-
-```{r}
+```{r plot-delay}
ggplot() +
geom_jitter(
data = time_compare,
@@ -432,15 +448,16 @@ ggplot() +
)
) +
scale_color_viridis_c(option = "magma", direction = -1) +
- # facet_wrap(~day_type, nrow = 2) +
labs(
title = "Speed and median speed delay by hour",
x = "Hour",
y = "Delay (minutes)",
color = "Speed (mph)",
caption = stringr::str_wrap(
- "Mainline sensors on T.H.610 corridor, March 3-16, 2019.
- Random forest speed imputation at one hour interval. ",
+ paste0(
+ "Mainline sensors on T.H.610 corridor, March 3-16, ", sensor_year,
+ ". Random forest speed imputation at one hour interval. "
+ ),
width = 60
)
) +
@@ -450,6 +467,7 @@ ggplot() +
```{r, include=FALSE}
ggsave(device = "png", dpi = 72, filename = "plot_delay_hour_speed.png")
```
+
## Relevant reports and references
diff --git a/vignettes/plot_delay_hour_speed.png b/vignettes/plot_delay_hour_speed.png
index d2c45a1..77fdfb9 100644
Binary files a/vignettes/plot_delay_hour_speed.png and b/vignettes/plot_delay_hour_speed.png differ
diff --git a/vignettes/plot_occupancy_speed_volume.png b/vignettes/plot_occupancy_speed_volume.png
index 75c364e..e3d5c1c 100644
Binary files a/vignettes/plot_occupancy_speed_volume.png and b/vignettes/plot_occupancy_speed_volume.png differ
diff --git a/vignettes/plot_speed_imputation_density.png b/vignettes/plot_speed_imputation_density.png
index 014846b..7b9e319 100644
Binary files a/vignettes/plot_speed_imputation_density.png and b/vignettes/plot_speed_imputation_density.png differ
diff --git a/vignettes/plot_speed_imputation_hourly.png b/vignettes/plot_speed_imputation_hourly.png
index de22ae1..d25ba19 100644
Binary files a/vignettes/plot_speed_imputation_hourly.png and b/vignettes/plot_speed_imputation_hourly.png differ
diff --git a/vignettes/plot_speed_nulls.png b/vignettes/plot_speed_nulls.png
index 28fc0fb..90dcd53 100644
Binary files a/vignettes/plot_speed_nulls.png and b/vignettes/plot_speed_nulls.png differ
diff --git a/vignettes/pulling_sensors_in_parallel.Rmd b/vignettes/pulling_sensors_in_parallel.Rmd
index 3fc57e2..b9fe996 100644
--- a/vignettes/pulling_sensors_in_parallel.Rmd
+++ b/vignettes/pulling_sensors_in_parallel.Rmd
@@ -9,6 +9,8 @@ vignette: >
## Pull data for select sensors for all 2019
+Functions in this package are designed to pull data for a single sensor on a single date. To get data spanning multiple sensors and dates efficiently, we can use vectorized and parallel processing to access data simultaneously.
+
```{r, eval = FALSE}
library(tc.sensors)
library(dplyr)
@@ -20,7 +22,8 @@ library(tictoc)
sensor_config <- pull_configuration()
sensor_ids <- pull_sensor_ids()
-tic() # Start the timer
+# start the timer
+tic()
# set up parallel processing
future::plan(multisession)
diff --git a/vignettes/tc.sensors.Rmd b/vignettes/tc.sensors.Rmd
index b063aa6..d13be54 100644
--- a/vignettes/tc.sensors.Rmd
+++ b/vignettes/tc.sensors.Rmd
@@ -20,7 +20,7 @@ library(tc.sensors)
## Introduction
-`{tc.sensors}` is an R package for processing data collected from Minnesota Department of Transportation (MnDOT) loop detectors installed on the Minnesota Freeway system in 30-second interval measurements of occupancy and volume, data which are pushed daily to a public JSON feed. Occupancy and volume data can be used to calculate speed and delay.
+`{tc.sensors}` is an R package for processing data collected from Minnesota Department of Transportation (MnDOT) loop detectors installed on the Minnesota Freeway system. Data are published to a public JSON feed in 30-second interval measurements of occupancy and volume. Occupancy and volume data can be used to calculate speed and delay.
## Primary terms and definitions
@@ -35,14 +35,25 @@ Volume represents the number of vehicles that pass through a detector in a given
The term 'occupancy' does not here refer to the occupants of a vehicle but rather the occupancy of the sensor, or how long the sensor was 'occupied'. In a 30-second time period, 1800 scans are produced (60 per second), and each scan is binary: either the sensor is occupied or not. Therefore, a sensor occupied for 1 second within the 30-second time period would have a value of 60. Raw occupancy values can be converted to percentages:
-$$\frac{Occupancy}{1800}*100% $$
+$$\frac{Occupancy}{1800}\times 100\% $$
The resulting percentage is the percentage of time in that 30 seconds that the sensor was 'occupied'.
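+
+As a quick illustration (plain arithmetic, not a package function), the one-second example above works out as follows:
+
+```{r occupancy-pct-example}
+# a sensor occupied for 1 second registers 60 scans (60 scans per second)
+raw_occupancy <- 60
+
+# share of the 1,800 scans in a 30-second period, as a percentage
+raw_occupancy / 1800 * 100
+#> [1] 3.333333
+```
+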
-### Interpolating null values
+### Data cleaning and interpolation
-Where nulls exist (vs. a zero measurement), it is assumed the connection was disrupted and no measurement was taken. For 15-minute intervals where other values exist, the nulls are interpolated with the **average** of the other values within the interval. Impossible values are also interpolated with the mean of the interval. Impossible values are raw occupancy values greater than 1800 (given that only 1800 scans are taken in a 30-second period). If an entire interval contains only nulls, it is converted to 'NA' and no values within the interval are interpolated.
+Where nulls exist (vs. a zero measurement), it is assumed the connection was disrupted and no measurement was taken. For aggregated intervals where other values exist, the nulls are interpolated with a rolling **average** of the other values within the interval. Impossible values are also interpolated with the mean of the interval. Impossible values are raw occupancy values greater than 1800 (given that only 1800 scans are taken in a 30-second period). If an entire interval contains only nulls, it is converted to `NA` and no values within the interval are interpolated.
-Note that a variable is created containing the percentage of nulls/impossible values in that interval; therefore, one can choose to exclude intervals with interpolation rates above a certain threshold of choice (eg if more than, say, 30% of the data is missing).
+Note that a variable is created containing the percentage of nulls/impossible values in that interval; therefore, one can choose to exclude intervals with interpolation rates above a certain threshold of choice (e.g. if more than, say, 30% of the data is missing).
+
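+A minimal sketch of the interval interpolation, using a simple interval mean in base R (illustrative only; the package's internal implementation differs):
+
+```{r interpolation-sketch}
+# toy 30-second occupancy values for two aggregation intervals
+interval_id <- c(1, 1, 1, 2, 2, 2)
+occupancy <- c(100, NA, 140, NA, NA, NA)
+
+# replace nulls with the mean of the other values in the same interval;
+# an interval that is entirely null stays NA
+ave(occupancy, interval_id, FUN = function(x) {
+  if (all(is.na(x))) return(x)
+  x[is.na(x)] <- mean(x, na.rm = TRUE)
+  x
+})
+#> [1] 100 120 140  NA  NA  NA
+```
+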
+The following data cleaning steps and checks are taken in `scrub_sensor()` (a sketch of the thresholding step follows this list):
+
+- Data are complete for all sensors (2,880 observations per sensor per day)
+- Make volume and occupancy values `NA` if
+  - vehicle volume >= 20 vehicles per 30-second interval, or 2,400 vehicles per hour
+  - occupancy >= 1,800 scans per 30-second interval, or 216,000 scans per hour^[60 scans per second \* 60 seconds per minute \* 60 minutes per hour = 216,000 scans per hour]
+- The percentage of `NA` values is calculated for occupancy and volume. If the `NA` percentage is greater than a given threshold, all volume and occupancy values for a sensor are replaced with `NA`
+  - If volume or occupancy is `NA`, speed is changed to `NA`
+
+There is ongoing discussion about whether you can have valid occupancy with invalid volume, and vice versa.
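+
+The thresholding step can be sketched like this (a toy example with made-up values; `scrub_sensor()` implements the real checks):
+
+```{r scrub-sketch}
+library(dplyr)
+
+# toy 30-second observations; 25 vehicles and 2,000 scans are impossible values
+raw_obs <- tibble::tibble(
+  volume = c(12, 25, 3),
+  occupancy = c(300, 90, 2000)
+)
+
+raw_obs %>%
+  mutate(
+    volume = if_else(volume >= 20, NA_real_, volume),
+    occupancy = if_else(occupancy >= 1800, NA_real_, occupancy)
+  )
+```
+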
## Calculations
@@ -50,29 +61,33 @@ Calculations for speed, flow, headway, density and lost/spare capacity must be c
### Density
-The number of vehicles per mile (`Flow / Speed`). See [full calculation method](http://data.dot.state.mn.us/datatools/Density.html) for additional context.
-
+Density is the number of vehicles per mile calculated from Flow and Speed (`Flow / Speed`). See [full calculation method](http://data.dot.state.mn.us/datatools/Density.html) for additional context.
### Speed
-Speed is calculated
+Speed must be an aggregated measure. We do not recommend going below a 15-minute interval for calculating speed.
+
+Speed, in miles per hour, is calculated by multiplying the number of vehicles per hour by the field length in miles, then dividing by the occupancy percentage.
-$$\frac{Vehicles Per Hour*Vehicle Length}{5280*Occupancy Percentage} $$
+$$\frac{\text{Vehicles Per Hour} \times \text{Field Length}}{\text{Occupancy Percentage}} $$
-or
+Speed can also be calculated with Flow and Density.
$$\frac{Flow}{Density} $$
-'Vehicles Per Hour' is calculated by summing all the vehicles over the 15-minute interval, and then multiplying that by four. 'Vehicle Length' is a static field in the sensor configuration dataset. 'Occupancy Percentage' is calculated by summing all the occupancy values over the 15-minute interval, and then dividing by 54,000 (1,800 scans in 30 seconds*30 periods in 15-minute interval).
+'Vehicles Per Hour' is calculated by summing all the vehicles over the given interval, then multiplying by the number of intervals in an hour (there are four 15-minute intervals in an hour, so the multiplier is 4). 'Field Length' is a static field in the sensor configuration dataset. 'Occupancy Percentage' is calculated by summing all the occupancy values over the given interval, then dividing by the total number of possible scans in the interval (54,000 = 1,800 scans per 30 seconds \* 30 periods in a 15-minute interval).
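+
+As a worked example with made-up numbers (a sketch, not package code): a sensor with a 20-foot field length records 500 vehicles and 6,480 occupancy scans over a 15-minute interval.
+
+```{r speed-example}
+field_length_ft <- 20 # static field from the sensor configuration
+
+vehicles_per_hour <- 500 * 4 # 500 vehicles in 15 minutes, 4 intervals per hour
+occupancy_pct <- 6480 / 54000 # proportion of the 54,000 possible scans
+
+# speed in miles per hour; field length converted from feet to miles
+(vehicles_per_hour * (field_length_ft / 5280)) / occupancy_pct
+#> [1] 63.13131
+```
+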
### Lost or Spare Capacity
-The average flow that a roadway is losing, either due to low traffic or high congestion, throughout the sampling period.
+The average flow that a roadway is losing, either due to low traffic or high congestion, throughout the sampling period. Capacity is calculated using Flow and Density.
+If flow exceeds 1,800 vehicles per hour, the sensor is considered well performing. If flow is under 1,800 vehicles per hour and vehicle density is greater than 43 vehicles per mile, the sensor is at lost capacity: there is more traffic than the road can accommodate at free-flow speed. If flow is under 1,800 vehicles per hour and vehicle density is less than 43 vehicles per mile, the sensor has spare capacity: there is less traffic than the road is built to handle. In short (see the sketch after this list):
-### Vehicle Miles Traveled (VMT)
+
+- Flow > 1,800: 0; the roadway is operating at appropriate capacity
+- Flow ≤ 1,800 and Density > 43: lost capacity = Flow - 1800
+- Flow ≤ 1,800 and Density ≤ 43: spare capacity = 1800 - Flow
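+
+A sketch of this decision rule with `dplyr::case_when()` (illustrative values, not the package's internal code):
+
+```{r capacity-sketch}
+library(dplyr)
+
+capacity_example <- tibble::tibble(
+  flow = c(2100, 1200, 900),
+  density = c(50, 60, 20)
+)
+
+capacity_example %>%
+  mutate(
+    lost_spare_capacity = case_when(
+      flow > 1800 ~ 0, # operating at appropriate capacity
+      density > 43 ~ flow - 1800, # lost capacity: congestion
+      TRUE ~ 1800 - flow # spare capacity: low traffic
+    )
+  )
+```
+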
## Additional documentation