Skip to content

Commit

Permalink
fix xgboost thread usage
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffreyhanson committed Mar 7, 2022
1 parent 727be71 commit 3418629
Show file tree
Hide file tree
Showing 53 changed files with 180 additions and 199 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: surveyvoi
Type: Package
Version: 1.0.3.2
Version: 1.0.3.3
Title: Survey Value of Information
Description: Decision support tool for prioritizing sites for ecological
surveys based on their potential to improve plans for conserving
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# surveyvoi 1.0.3.3

- Fix issue with `fit_xgb_occupancy_models()` using more than specified number
of threads for parallel processing.
- Ensure that PSOCK and FORK clusters used for parallel processing are
terminated correctly, even when processing is interrupted.

# surveyvoi 1.0.3.2

- Fix compatibility issues with updates to the xgboost package (version 1.5.0).
Expand Down
1 change: 1 addition & 0 deletions R/approx_near_optimal_survey_scheme.R
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ approx_near_optimal_survey_scheme <- function(
"n_approx_replicates",
"n_approx_outcomes_per_replicate",
"rcpp_approx_expected_value_of_decision_given_survey_scheme"))
on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
}
## run calculations
curr_sites_approx_evsdi <- plyr::laply(
Expand Down
33 changes: 17 additions & 16 deletions R/approx_optimal_survey_scheme.R
Original file line number Diff line number Diff line change
Expand Up @@ -339,22 +339,23 @@ approx_optimal_survey_scheme <- function(
# calculate expected value of decision given schemes that survey sites
## initialize cluster
if (n_threads > 1) {
cl <- start_cluster(n_threads,
c("pij", "all_feasible_schemes", "new_info_idx",
"site_data", "feature_data",
"feature_survey_column",
"feature_survey_sensitivity_column",
"feature_survey_specificity_column",
"site_survey_cost_column",
"site_management_cost_column",
"site_management_locked_in",
"site_management_locked_out",
"feature_target_column",
"n_approx_replicates",
"n_approx_outcomes_per_replicate",
"total_budget",
"seed",
"rcpp_approx_expected_value_of_decision_given_survey_scheme"))
cl <- start_cluster(n_threads,
c("pij", "all_feasible_schemes", "new_info_idx",
"site_data", "feature_data",
"feature_survey_column",
"feature_survey_sensitivity_column",
"feature_survey_specificity_column",
"site_survey_cost_column",
"site_management_cost_column",
"site_management_locked_in",
"site_management_locked_out",
"feature_target_column",
"n_approx_replicates",
"n_approx_outcomes_per_replicate",
"total_budget",
"seed",
"rcpp_approx_expected_value_of_decision_given_survey_scheme"))
on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
}
## run calculations
evd_new_info <- plyr::laply(
Expand Down
1 change: 1 addition & 0 deletions R/fit_hglm_occupancy_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ fit_hglm_occupancy_models <- function(
c("model_cmbs", "d", "feature_data", "seed",
"jags_n_samples", "jags_n_burnin", "jags_n_thin", "jags_n_adapt",
"fit_hglm_model"))
on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
}
## main processing
m_raw <- plyr::llply(
Expand Down
114 changes: 89 additions & 25 deletions R/fit_xgb_occupancy_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -218,9 +218,11 @@ NULL
#' @export
fit_xgb_occupancy_models <- function(
site_data, feature_data,
site_detection_columns, site_n_surveys_columns,
site_detection_columns,
site_n_surveys_columns,
site_env_vars_columns,
feature_survey_sensitivity_column, feature_survey_specificity_column,
feature_survey_sensitivity_column,
feature_survey_specificity_column,
xgb_tuning_parameters,
xgb_early_stopping_rounds = rep(20, length(site_detection_columns)),
xgb_n_rounds = rep(100, length(site_detection_columns)),
Expand Down Expand Up @@ -372,13 +374,18 @@ fit_xgb_occupancy_models <- function(
feature_data[[feature_survey_specificity_column]][i],
parameters = xgb_tuning_parameters,
early_stopping_rounds = xgb_early_stopping_rounds[i],
n_rounds = xgb_n_rounds[i], n_folds = n_folds[i],
n_threads = n_threads, verbose = verbose, seed = seed)
n_rounds = xgb_n_rounds[i],
n_folds = n_folds[i],
n_threads = n_threads,
verbose = verbose,
seed = seed)
})

# assess models
perf <- plyr::ldply(seq_len(nrow(feature_data)), function(i) {
out <- plyr::ldply(seq_len(n_folds[i]), function(k) {
perf <- plyr::ldply(
seq_len(nrow(feature_data)), .parallel = FALSE, function(i) {
out <- plyr::ldply(
seq_len(n_folds[i]), .parallel = FALSE, function(k) {
## extract fold training and test data
m_k <- m[[i]]$models[[k]]
nround_k <- m[[i]]$models[[k]]$best_iteration
Expand All @@ -392,9 +399,15 @@ fit_xgb_occupancy_models <- function(
survey_spec <- feature_data[[feature_survey_specificity_column]][[i]]
## make predictions
p_train_k <- c(withr::with_package("xgboost",
stats::predict(m_k, x_train_k, iterationrange = c(1, nround_k + 1))))
stats::predict(
m_k, xgboost::xgb.DMatrix(x_train_k, nthread = 1),
iterationrange = c(1, nround_k + 1),
nthread = 1)))
p_test_k <- c(withr::with_package("xgboost",
stats::predict(m_k, x_test_k, iterationrange = c(1, nround_k + 1))))
stats::predict(
m_k, xgboost::xgb.DMatrix(x_test_k, nthread = 1),
iterationrange = c(1, nround_k + 1),
nthread = 1)))
## validate predictions
assertthat::assert_that(all(p_train_k >= 0), all(p_train_k <= 1),
msg = "xgboost predictions are not between zero and one")
Expand Down Expand Up @@ -434,15 +447,18 @@ fit_xgb_occupancy_models <- function(
# make model predictions
pred <- vapply(seq_len(nrow(feature_data)),
FUN.VALUE = numeric(nrow(site_env_data)), function(i) {
nr <- m[[i]]$parameters$nround
rowMeans(
vapply(m[[i]]$models, FUN.VALUE = numeric(nrow(site_env_data)),
function(x) {
## generate predictions
out <- c(withr::with_package("xgboost", stats::predict(
x, site_env_data, iterationrange = c(1, x$best_iteration + 1))))
x, xgboost::xgb.DMatrix(site_env_data, nthread = 1),
iterationrange = c(1, x$best_iteration + 1),
nthread = 1
)))
## validate predictions
assertthat::assert_that(all(out >= 0), all(out <= 1),
assertthat::assert_that(
all(out >= 0), all(out <= 1),
msg = "xgboost predictions are not between zero and one")
## return result
out
Expand Down Expand Up @@ -486,7 +502,6 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
x
})
full_parameters <- data.frame(full_parameters, stringsAsFactors = FALSE)

## add objective if missing
if (is.null(full_parameters$objective)) {
full_parameters$objective <- "binary:logistic"
Expand All @@ -503,15 +518,54 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
})
assertthat::assert_that(all(is.finite(spw)))

# precompute xgboost matrices
# N.B. this is because xgboost has a bug when creating DMatrix objects
# from R matrix objects, wherein it will use all available threads
# on the system, if done within a child R process
dtrain_paths <- vapply(
seq_len(n_folds), FUN.VALUE = character(1), function(k) {
f <- tempfile(fileext = ".dmat")
xgboost::xgb.DMatrix.save(
xgboost::xgb.DMatrix(
data[[k]]$fit$x,
label = data[[k]]$fit$y,
weight = data[[k]]$fit$w,
nthread = 1
),
f
)
f
})
dtest_paths <- vapply(
seq_len(n_folds), FUN.VALUE = character(1), function(k) {
f <- tempfile(fileext = ".dmat")
xgboost::xgb.DMatrix.save(
xgboost::xgb.DMatrix(
data[[k]]$test$x,
label = data[[k]]$test$y,
weight = data[[k]]$test$w,
nthread = 1
),
f
)
f
})
assertthat::assert_that(
all(file.exists(dtrain_paths)),
all(file.exists(dtest_paths)),
msg = "could not save data to temporary directory to faciliate analysis")

# find best tuning parameters using k-fold cross validation
## fit models using all parameters combinations
is_parallel <- (n_threads > 1) && (nrow(full_parameters) > 1)
if (is_parallel) {
cl <- start_cluster(
n_threads,
c("full_parameters", "data", "survey_sensitivity", "survey_specificity",
c("full_parameters", "data", "dtest_paths", "dtest_paths",
"survey_sensitivity", "survey_specificity",
"spw", "n_rounds", "early_stopping_rounds", "seed",
"rcpp_model_performance", "make_feval_tss"))
on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
}
cv <- plyr::ldply(
seq_len(nrow(full_parameters)), .parallel = is_parallel,
Expand All @@ -522,17 +576,20 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
cv <- lapply(seq_len(n_folds), function(k) {
### prepare data for xgboost (note we use the fit data not the train data)
dtrain <- xgboost::xgb.DMatrix(
data[[k]]$fit$x, label = data[[k]]$fit$y,
weight = data[[k]]$fit$w)
dtrain_paths[k], nthread = 1, silent = TRUE)
dtest <- xgboost::xgb.DMatrix(
data[[k]]$test$x, label = data[[k]]$test$y,
weight = data[[k]]$test$w)
dtest_paths[k], nthread = 1, silent = TRUE)
### prepare evaluation function
curr_feval_tss <- make_feval_tss(survey_sensitivity, survey_specificity)
### prepare arguments for xgboost call
args <- list(data = dtrain, verbose = FALSE, scale_pos_weight = spw[k],
watchlist = list(test = dtest), eval_metric = curr_feval_tss,
maximize = TRUE, nrounds = n_rounds, nthread = 1,
args <- list(data = dtrain,
verbose = FALSE,
scale_pos_weight = spw[k],
watchlist = list(test = dtest),
eval_metric = curr_feval_tss,
maximize = TRUE,
nrounds = n_rounds,
nthread = 1,
early_stopping_rounds = early_stopping_rounds)
args <- append(args, p)
### fit model
Expand All @@ -541,10 +598,11 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
})
### generate predictions
yhat_test <- c(withr::with_package("xgboost",
stats::predict(
model, data[[k]]$test$x,
iterationrange = c(1, model$best_iteration + 1)
)))
stats::predict(
model, dtest,
iterationrange = c(1, model$best_iteration + 1),
nthread = 1
)))
### validate predictions
assertthat::assert_that(all(yhat_test >= 0), all(yhat_test <= 1),
msg = "xgboost predictions are not between zero and one")
Expand All @@ -565,9 +623,15 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
if (is_parallel) {
cl <- stop_cluster(cl)
}
## determine best parameters for i'th species

# cleanup
unlink(dtest_paths, force = TRUE)
unlink(dtrain_paths, force = TRUE)

# determine best parameters for i'th species
cv <- tibble::as_tibble(cv)
j <- which.max(cv$eval)

# return best parameters and models
best_params <- as.list(full_parameters[j, , drop = FALSE])
best_params$scale_pos_weight <- list(spw)
Expand Down
1 change: 1 addition & 0 deletions R/optimal_survey_scheme.R
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ optimal_survey_scheme <- function(
"feature_target_column",
"total_budget",
"rcpp_expected_value_of_decision_given_survey_scheme"))
on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
}
## run calculations
evd_new_info <- plyr::laply(
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ Please cite the *surveyvoi R* package when using it in publications. To
cite the developmental version, please use:

> Hanson JO, Chadès I, Hudgins EJ, Bennett J (2021). surveyvoi: Survey
> Value of Information. R package version 1.0.3.2. Available at
> Value of Information. R package version 1.0.3.3. Available at
> <https://github.com/jeffreyhanson/surveyvoi>.
## Usage
Expand Down
2 changes: 1 addition & 1 deletion docs/404.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion docs/articles/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions docs/articles/surveyvoi.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions docs/authors.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions docs/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion docs/news/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3418629

Please sign in to comment.