Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cue_rate calls db unnecessarily #4

Open
see24 opened this issue Aug 2, 2023 · 2 comments
Open

cue_rate calls db unnecessarily #4

see24 opened this issue Aug 2, 2023 · 2 comments

Comments

@see24
Copy link

see24 commented Aug 2, 2023

cue_rate has 3 calls to covariates_removal but only really needs one. Replacing the later 2 calls with the sp_covars object created from the first call leads to >2.5x faster execution.

Sorry not to make a pull request but it is such a quick fix I figure it is not worth the overhead.

Thanks

cue_rate2 <- function (species = NULL, model = NULL, od = NULL, tssr = NULL, 
          pairwise = FALSE, quantiles = NULL, samples = 1000) 
{
  rem_vcv_list <- NULL
  rm(rem_vcv_list)
  napops:::check_data_exists()
  if (!is.null(species)) {
    napops:::check_valid_species(species = species, mod = "rem")
  }
  if (!is.null(model)) {
    napops:::check_valid_model(model = model, mod = "rem")
  }
  if (pairwise) {
    if (length(od) != length(tssr)) {
      stop("Pairwise set to TRUE but OD and TSSR are not the same length.")
    }
  }
  sp_covars <- covariates_removal(species = species)
  if (!is.null(od)) {
    od_range <- range(sp_covars$OD)
    if (any(od < od_range[1]) || any(od > od_range[2])) {
      warning(paste0("You are providing some OD values that are outside the training values of [", 
                     od_range[1], ",", od_range[2], "] for species ", 
                     species))
    }
  }
  if (!is.null(tssr)) {
    tssr_range <- range(sp_covars$TSSR)
    if (any(tssr < tssr_range[1]) || any(tssr > tssr_range[2])) {
      warning(paste0("You are providing some TSSR values that are outside the training values of [", 
                     tssr_range[1], ",", tssr_range[2], "] for species ", 
                     species))
    }
  }
  if (isFALSE(pairwise)) {
    tssr_values <- rep(tssr, each = length(od))
    sim_data <- data.frame(Intercept = rep(1, times = length(tssr_values)), 
                           TSSR = tssr_values, OD = rep(od, length(tssr)))
  }
  else {
    sim_data <- data.frame(Intercept = rep(1, times = length(tssr)), 
                           TSSR = tssr, OD = od)
  }
  design <- sim_data
  tssr_median <- stats::median(sp_covars$TSSR)
  design$TSSR <- (design$TSSR - tssr_median)/24
  design$TSSR2 <- design$TSSR^2
  od_sp_median <- stats::median(sp_covars$OD)
  design$OD <- (design$OD - od_sp_median)/365
  design$OD2 <- design$OD^2
  design <- design[, c("Intercept", "TSSR", "TSSR2", "OD", 
                       "OD2")]
  coefficients <- coef_removal(species = species, model = model)
  coefficients <- as.numeric(coefficients[, c("Intercept", 
                                              "TSSR", "TSSR2", "OD", "OD2")])
  if (is.null(quantiles)) {
    coefficients[which(is.na(coefficients))] <- 0
    phi <- exp(as.matrix(design) %*% coefficients)
    sim_data$CR_est <- phi
    sim_data <- sim_data[, c("TSSR", "OD", "CR_est")]
    return(sim_data)
  }
  else {
    load(paste0(rappdirs::app_dir(appname = "napops")$data(), 
                "/rem_vcv.rda"))
    vcv <- rem_vcv_list[[model]][[species]]
    bootstrap_df <- bootstrap(vcv = vcv, coefficients = coefficients, 
                              design = design, quantiles = quantiles, samples = samples, 
                              model = "rem")
    return(cbind(sim_data[, c("TSSR", "OD")], bootstrap_df))
  }
}
bird_obs <- data.frame(species = c("AMRO", "AMGO", "BCCH", "SCTA"),
                        od = seq(85, 180, by = 5),
                        tssr = seq(85, 180, by = 5)/90)

cue_rate(species = bird_obs$species[1],
         model = "best",
         od = bird_obs$od,
         tssr = bird_obs$tssr,
         pairwise = TRUE)

bm <- bench::mark(old = cue_rate(species = bird_obs$species[1],
                           model = "best",
                           od = bird_obs$od,
                           tssr = bird_obs$tssr,
                           pairwise = TRUE),
                  new = cue_rate2(species = bird_obs$species[1],
                           model = "best",
                           od = bird_obs$od,
                           tssr = bird_obs$tssr,
                           pairwise = TRUE), 
                  filter_gc = FALSE, min_iterations = 10, 
                  relative = TRUE)
@see24
Copy link
Author

see24 commented Aug 3, 2023

Another potential speed up idea for you. It seems like cue_rate only needs the min, max and median of tssr and od for each species. Instead of loading all the observations for each species from the database on the fly and summarizing each time, why not include in the package a table with the summary values pre-calculated.

library(napops)
library(dplyr)

# preloading a table of species min, max median for each model type
sp_tbl <- list_species()

sum_covs <- function(df){
  df %>% 
    summarise(across(-c(Species, Survey_Method), lst(min, max, median)))
}

all_cov_tbl <- sp_tbl %>% rowwise() %>% 
  mutate(sum_covs(covariates_removal(Species)), 
         sum_covs(covariates_distance(Species)))

This table could be kept in the package, or stored in the db and loaded on attaching the package.

Then cue_rate could look like this:

# Version that uses loaded table to get species covariates
cue_rate3 <- function (species = NULL, model = NULL, od = NULL, tssr = NULL, 
                       pairwise = FALSE, quantiles = NULL, samples = 1000) 
{
  rem_vcv_list <- NULL
  rm(rem_vcv_list)
  napops:::check_data_exists()
  if (!is.null(species)) {
    napops:::check_valid_species(species = species, mod = "rem")
  }
  if (!is.null(model)) {
    napops:::check_valid_model(model = model, mod = "rem")
  }
  if (pairwise) {
    if (length(od) != length(tssr)) {
      stop("Pairwise set to TRUE but OD and TSSR are not the same length.")
    }
  }
  sp_covars <- all_cov_tbl[all_cov_tbl$Species == species, ]
  if (!is.null(od)) {
    if (any(od < sp_covars$OD_min) || any(od >  sp_covars$OD_max)) {
      warning(paste0("You are providing some OD values that are outside the training values of [", 
                     sp_covars$OD_min, ",",  sp_covars$OD_max, "] for species ", 
                     species))
    }
  }
  if (!is.null(tssr)) {
    if (any(tssr <  sp_covars$TSSR_min) || any(tssr > sp_covars$TSSR_max)) {
      warning(paste0("You are providing some TSSR values that are outside the training values of [", 
                     sp_covars$TSSR_min, ",", sp_covars$TSSR_max, "] for species ", 
                     species))
    }
  }
  if (isFALSE(pairwise)) {
    tssr_values <- rep(tssr, each = length(od))
    sim_data <- data.frame(Intercept = rep(1, times = length(tssr_values)), 
                           TSSR = tssr_values, OD = rep(od, length(tssr)))
  }
  else {
    sim_data <- data.frame(Intercept = rep(1, times = length(tssr)), 
                           TSSR = tssr, OD = od)
  }
  design <- sim_data
  tssr_median <- sp_covars$TSSR_median
  design$TSSR <- (design$TSSR - tssr_median)/24
  design$TSSR2 <- design$TSSR^2
  od_sp_median <- sp_covars$OD_median
  design$OD <- (design$OD - od_sp_median)/365
  design$OD2 <- design$OD^2
  design <- design[, c("Intercept", "TSSR", "TSSR2", "OD", 
                       "OD2")]
  coefficients <- coef_removal(species = species, model = model)
  coefficients <- as.numeric(coefficients[, c("Intercept", 
                                              "TSSR", "TSSR2", "OD", "OD2")])
  if (is.null(quantiles)) {
    coefficients[which(is.na(coefficients))] <- 0
    phi <- exp(as.matrix(design) %*% coefficients)
    sim_data$CR_est <- phi
    sim_data <- sim_data[, c("TSSR", "OD", "CR_est")]
    return(sim_data)
  }
  else {
    load(paste0(rappdirs::app_dir(appname = "napops")$data(), 
                "/rem_vcv.rda"))
    vcv <- rem_vcv_list[[model]][[species]]
    bootstrap_df <- bootstrap(vcv = vcv, coefficients = coefficients, 
                              design = design, quantiles = quantiles, samples = samples, 
                              model = "rem")
    return(cbind(sim_data[, c("TSSR", "OD")], bootstrap_df))
  }
}

A benchmark shows that this greatly increases the speed by ~200x the original and 70x my previous suggestion

bm <- bench::mark(old = cue_rate(species = bird_obs$species[1],
                           model = "best",
                           od = bird_obs$od,
                           tssr = bird_obs$tssr,
                           pairwise = TRUE),
                  new = cue_rate2(species = bird_obs$species[1],
                           model = "best",
                           od = bird_obs$od,
                           tssr = bird_obs$tssr,
                           pairwise = TRUE), 
                  new3 = cue_rate3(species = bird_obs$species[1],
                                   model = "best",
                                   od = bird_obs$od,
                                   tssr = bird_obs$tssr,
                                   pairwise = TRUE),
                  filter_gc = FALSE, min_iterations = 10, 
                  relative = TRUE)
bm
expression min median itr/sec mem_alloc gc/sec n_itr n_gc total_time
old 211.17765 208.05791 1.000000 39.94229 NaN 10 0 9.12s
new 70.96604 71.08165 2.936756 18.77249 Inf 10 1 3.11s
new3 1.00000 1.00000 197.796045 1.00000 NaN 109 0 502.67ms

I hope these ideas are helpful! I am currently using napops to calculate offsets for each stop in the BBS data for northern Ontario so speed is of the essence.

@see24
Copy link
Author

see24 commented Oct 11, 2023

I made a few other modifications to cue_rate and did the same for edr, so I figured I would share. The changes use the table all_cov_tbl instead of summarizing from the full data base each time, return a dataframe of NA if species is not in NA-POPS and return a numeric rather than an array. I have not tested it with quantiles but this just made it easier for my workflow. Apologies for not making a PR but I am not sure my changes will actually make sense for all users so I will let you decide.

# Version that uses loaded table to get species covariates
# also returns dataframe with all NA if species not valid
# and changes result so the CR_est is a numeric not an array
# Note: not tested with quantiles
cue_rate3 <- function (species = NULL, model = NULL, od = NULL, tssr = NULL,
                       pairwise = FALSE, quantiles = NULL, samples = 1000)
{
  rem_vcv_list <- NULL
  rm(rem_vcv_list)
  napops:::check_data_exists()
  if (!is.null(species)) {
    val <- check_valid_species(species = species, mod = "rem")
    if(!val){
      return(data.frame(TSSR = NA_real_, OD = NA_real_, CR_est = NA_real_))
    }
  }
  if (!is.null(model)) {
    napops:::check_valid_model(model = model, mod = "rem")
  }
  if (pairwise) {
    if (length(od) != length(tssr)) {
      stop("Pairwise set to TRUE but OD and TSSR are not the same length.")
    }
  }
  sp_covars <- all_cov_tbl[all_cov_tbl$Species == species, ]
  if (!is.null(od)) {
    if (any(od < sp_covars$OD_min) || any(od >  sp_covars$OD_max)) {
      warning(paste0("You are providing some OD values that are outside the training values of [",
                     sp_covars$OD_min, ",",  sp_covars$OD_max, "] for species ",
                     species))
    }
  }
  if (!is.null(tssr)) {
    if (any(tssr <  sp_covars$TSSR_min) || any(tssr > sp_covars$TSSR_max)) {
      warning(paste0("You are providing some TSSR values that are outside the training values of [",
                     sp_covars$TSSR_min, ",", sp_covars$TSSR_max, "] for species ",
                     species))
    }
  }
  if (isFALSE(pairwise)) {
    tssr_values <- rep(tssr, each = length(od))
    sim_data <- data.frame(Intercept = rep(1, times = length(tssr_values)),
                           TSSR = tssr_values, OD = rep(od, length(tssr)))
  }
  else {
    sim_data <- data.frame(Intercept = rep(1, times = length(tssr)),
                           TSSR = tssr, OD = od)
  }
  design <- sim_data
  tssr_median <- sp_covars$TSSR_median
  design$TSSR <- (design$TSSR - tssr_median)/24
  design$TSSR2 <- design$TSSR^2
  od_sp_median <- sp_covars$OD_median
  design$OD <- (design$OD - od_sp_median)/365
  design$OD2 <- design$OD^2
  design <- design[, c("Intercept", "TSSR", "TSSR2", "OD",
                       "OD2")]
  coefficients <- coef_removal(species = species, model = model)
  coefficients <- as.numeric(coefficients[, c("Intercept",
                                              "TSSR", "TSSR2", "OD", "OD2")])
  if (is.null(quantiles)) {
    coefficients[which(is.na(coefficients))] <- 0
    phi <- exp(as.matrix(design) %*% coefficients)
    sim_data <- cbind(sim_data, CR_est = phi)
    sim_data <- sim_data[, c("TSSR", "OD", "CR_est")]
    return(sim_data)
  }
  else {
    load(paste0(rappdirs::app_dir(appname = "napops")$data(),
                "/rem_vcv.rda"))
    vcv <- rem_vcv_list[[model]][[species]]
    bootstrap_df <- bootstrap(vcv = vcv, coefficients = coefficients,
                              design = design, quantiles = quantiles, samples = samples,
                              model = "rem")
    return(cbind(sim_data[, c("TSSR", "OD")], bootstrap_df))
  }
}

# Version using loaded table
# also returns dataframe with all NA if species not valid
# and changes result so is a numeric not an array
# Note: not tested with quantiles
edr2 <- function (species = NULL, model = NULL, road = NULL, forest = NULL,
                  pairwise = FALSE, quantiles = NULL, samples = 1000) {
  dis_vcv_list <- NULL
  rm(dis_vcv_list)
  napops_dir <- NULL
  rm(napops_dir)
  roadside <- NULL
  rm(roadside)
  napops:::check_data_exists()
  if (!is.null(species)) {
    val <- check_valid_species(species = species, mod = "dis")
    if(!val){
      return(data.frame(Road = NA_real_, Forest = NA_real_, EDR_est = NA_real_))
    }
  }
  if (!is.null(model)) {
    napops:::check_valid_model(model = model, mod = "dis")
  }
  if (pairwise) {
    if (length(road) != length(forest)) {
      stop("Pairwise set to TRUE but Road and Forest are not the same length.")
    }
  }
  sp_covars <- all_cov_tbl[all_cov_tbl$Species == species, ]
  if (!is.null(forest)) {
    if (any(forest > 1) || any(forest < 0)) {
      stop("Forest coverage values must be between 0 and 1.")
    }
    if (any(forest < sp_covars$Forest_min) || any(forest > sp_covars$Forest_max)) {
      warning(paste0("You are providing some Forest Coverage values that are outside the training values of [",
                     sp_covars$Forest_min, ",", sp_covars$Forest_max, "] for species ",
                     species))
    }
  }
  if (isFALSE(pairwise)) {
    forest_values <- rep(forest, each = length(road))
    sim_data <- data.frame(Intercept = rep(1, times = length(forest_values)),
                           Forest = forest_values, Road = as.integer(rep(road,
                                                                         length(forest))))
  }
  else {
    sim_data <- data.frame(Intercept = rep(1, times = length(forest)),
                           Forest = forest, Road = as.integer(road))
  }
  sim_data$RoadForest <- sim_data$Road * sim_data$Forest
  design <- sim_data
  coefficients <- coef_distance(species = species, model = model)
  coefficients <- as.numeric(coefficients[, c("Intercept",
                                              "Forest", "Road", "RoadForest")])
  if (is.null(quantiles)) {
    coefficients[which(is.na(coefficients))] <- 0
    tau <- exp(as.matrix(design) %*% coefficients)
    sim_data <- cbind(sim_data, EDR_est = tau)
    sim_data <- sim_data[, c("Road", "Forest", "EDR_est")]
    return(sim_data)
  }
  else {
    load(paste0(rappdirs::app_dir(appname = "napops")$data(),
                "/dis_vcv.rda"))
    vcv <- dis_vcv_list[[model]][[species]]
    bootstrap_df <- bootstrap(vcv = vcv, coefficients = coefficients,
                              design = design, quantiles = quantiles, samples = samples,
                              model = "dis")
    return(cbind(sim_data[, c("Road", "Forest")], bootstrap_df))
  }
  if (is.null(species)) {
    stop("No argument passed for species\n")
  }
  distance <- read.csv(file = paste0(napops_dir$data(), "/distance.csv"))
  species <- toupper(species)
  if (isFALSE(species %in% distance$Species)) {
    stop(paste0("Species ", species, " does not exist in NA-POPS database.\n"))
  }
  dis_sp <- distance[which(distance$Species == species), ]
  model_number <- NULL
  if (model == "best") {
    model_number <- which(dis_sp$aic == min(dis_sp$aic))
    dis_sp <- dis_sp[which(dis_sp$model == model_number),
    ]
  }
  else if (model == "full") {
    model_number <- 5
    dis_sp <- dis_sp[which(dis_sp$model == 5), ]
  }
  else if (is.numeric(model)) {
    if (model < 1 || model > 5) {
      stop(paste0(model, " is an invalid model."))
    }
    else {
      model_number <- model
      dis_sp <- dis_sp[which(dis_sp$model == model), ]
    }
  }
  else {
    stop(paste0(model, " is an invalid model."))
  }
  if (model_number == 1) {
    return(exp(dis_sp$intercept))
  }
  else if (model_number == 2) {
    if (is.null(roadside)) {
      stop("No argument supplied for roadside status.")
    }
    else if (isFALSE(is.logical(roadside))) {
      stop("Invalid argument for roadside status. Must be TRUE or FALSE.")
    }
    return(exp(dis_sp$intercept + as.numeric(roadside) *
                 dis_sp$road))
  }
  else if (model_number == 3) {
    if (is.null(forest)) {
      stop("No argument supplied for forest coverage.")
    }
    else if (forest < 0 || forest > 1) {
      stop("Invalid argument for forest. Must be a proportion between 0 and 1.")
    }
    return(exp(dis_sp$intercept + forest * dis_sp$forest))
  }
  else if (model_number == 4) {
    if (is.null(roadside)) {
      stop("No argument supplied for roadside status.")
    }
    else if (isFALSE(is.logical(roadside))) {
      stop("Invalid argument for roadside status. Must be TRUE or FALSE.")
    }
    if (is.null(forest)) {
      stop("No argument supplied for forest coverage.")
    }
    else if (forest < 0 || forest > 1) {
      stop("Invalid argument for forest. Must be a proportion between 0 and 1.")
    }
    return(exp(dis_sp$intercept + as.numeric(roadside) *
                 dis_sp$road + forest * dis_sp$forest))
  }
  else if (model_number == 5) {
    if (is.null(roadside)) {
      stop("No argument supplied for roadside status.")
    }
    else if (isFALSE(is.logical(roadside))) {
      stop("Invalid argument for roadside status. Must be TRUE or FALSE.")
    }
    if (is.null(forest)) {
      stop("No argument supplied for forest coverage.")
    }
    else if (forest < 0 || forest > 1) {
      stop("Invalid argument for forest. Must be a proportion between 0 and 1.")
    }
    return(exp(dis_sp$intercept + as.numeric(roadside) *
                 dis_sp$road + forest * dis_sp$forest + (as.numeric(roadside) *
                                                           forest * dis_sp$roadforest)))
  }
}

# would be better if invalid species returned a consistent result that indicated that.
check_valid_species <- function (species = NULL, mod = NULL) {
  if(mod == "rem"){
    mod_cov_tbl <- all_cov_tbl[all_cov_tbl$Removal == 1,]
  } else if(mod == "dis"){
    mod_cov_tbl <- all_cov_tbl[all_cov_tbl$Distance == 1,]
  } else {
    stop("mod must be 'rem' or 'dis', not ", mod)
  }

  not_in <- setdiff(species, mod_cov_tbl$Species)
  if (length(not_in) > 0) {
    warning(paste0("The following species do not exist for the selected model type in NA-POPS:\n",
                   not_in), call. = FALSE)
    return(FALSE)
  } else {
    return(TRUE)
  }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant