fix xgboost thread usage

prioritizr · Mar 7, 2022 · 3418629 · 3418629
1 parent 727be71
commit 3418629
Show file tree

Hide file tree

Showing 53 changed files with 180 additions and 199 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: surveyvoi
 Type: Package
-Version: 1.0.3.2
+Version: 1.0.3.3
 Title: Survey Value of Information
 Description: Decision support tool for prioritizing sites for ecological
     surveys based on their potential to improve plans for conserving

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# surveyvoi 1.0.3.3
+
+- Fix issue with `fit_xgb_occupancy_models()` using more than specified number
+  of threads for parallel processing.
+- Ensure that PSOCK and FORK clusters used for parallel processing are
+  terminated correctly, even when processing is interrupted.
+
 # surveyvoi 1.0.3.2
 
 - Fix compatibility issues with updates to the xgboost package (version 1.5.0).

diff --git a/R/approx_near_optimal_survey_scheme.R b/R/approx_near_optimal_survey_scheme.R
@@ -412,6 +412,7 @@ approx_near_optimal_survey_scheme <- function(
           "n_approx_replicates",
           "n_approx_outcomes_per_replicate",
           "rcpp_approx_expected_value_of_decision_given_survey_scheme"))
+      on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
     }
     ## run calculations
     curr_sites_approx_evsdi <- plyr::laply(

diff --git a/R/approx_optimal_survey_scheme.R b/R/approx_optimal_survey_scheme.R
@@ -339,22 +339,23 @@ approx_optimal_survey_scheme <- function(
   # calculate expected value of decision given schemes that survey sites
   ## initialize cluster
   if (n_threads > 1) {
-      cl <- start_cluster(n_threads,
-        c("pij", "all_feasible_schemes", "new_info_idx",
-          "site_data", "feature_data",
-          "feature_survey_column",
-          "feature_survey_sensitivity_column",
-          "feature_survey_specificity_column",
-          "site_survey_cost_column",
-          "site_management_cost_column",
-          "site_management_locked_in",
-          "site_management_locked_out",
-          "feature_target_column",
-          "n_approx_replicates",
-          "n_approx_outcomes_per_replicate",
-          "total_budget",
-          "seed",
-          "rcpp_approx_expected_value_of_decision_given_survey_scheme"))
+    cl <- start_cluster(n_threads,
+      c("pij", "all_feasible_schemes", "new_info_idx",
+        "site_data", "feature_data",
+        "feature_survey_column",
+        "feature_survey_sensitivity_column",
+        "feature_survey_specificity_column",
+        "site_survey_cost_column",
+        "site_management_cost_column",
+        "site_management_locked_in",
+        "site_management_locked_out",
+        "feature_target_column",
+        "n_approx_replicates",
+        "n_approx_outcomes_per_replicate",
+        "total_budget",
+        "seed",
+        "rcpp_approx_expected_value_of_decision_given_survey_scheme"))
+    on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
   }
   ## run calculations
   evd_new_info <- plyr::laply(

diff --git a/R/fit_hglm_occupancy_models.R b/R/fit_hglm_occupancy_models.R
@@ -333,6 +333,7 @@ fit_hglm_occupancy_models <- function(
       c("model_cmbs", "d", "feature_data", "seed",
         "jags_n_samples", "jags_n_burnin", "jags_n_thin", "jags_n_adapt",
         "fit_hglm_model"))
+    on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
   }
   ## main processing
   m_raw <- plyr::llply(

diff --git a/R/fit_xgb_occupancy_models.R b/R/fit_xgb_occupancy_models.R
@@ -218,9 +218,11 @@ NULL
 #' @export
 fit_xgb_occupancy_models <- function(
   site_data, feature_data,
-  site_detection_columns, site_n_surveys_columns,
+  site_detection_columns,
+  site_n_surveys_columns,
   site_env_vars_columns,
-  feature_survey_sensitivity_column, feature_survey_specificity_column,
+  feature_survey_sensitivity_column,
+  feature_survey_specificity_column,
   xgb_tuning_parameters,
   xgb_early_stopping_rounds = rep(20, length(site_detection_columns)),
   xgb_n_rounds = rep(100, length(site_detection_columns)),
@@ -372,13 +374,18 @@ fit_xgb_occupancy_models <- function(
         feature_data[[feature_survey_specificity_column]][i],
       parameters = xgb_tuning_parameters,
       early_stopping_rounds = xgb_early_stopping_rounds[i],
-      n_rounds = xgb_n_rounds[i], n_folds = n_folds[i],
-      n_threads = n_threads, verbose = verbose, seed = seed)
+      n_rounds = xgb_n_rounds[i],
+      n_folds = n_folds[i],
+      n_threads = n_threads,
+      verbose = verbose,
+      seed = seed)
   })
 
   # assess models
-  perf <- plyr::ldply(seq_len(nrow(feature_data)), function(i) {
-    out <- plyr::ldply(seq_len(n_folds[i]), function(k) {
+  perf <- plyr::ldply(
+    seq_len(nrow(feature_data)), .parallel = FALSE, function(i) {
+    out <- plyr::ldply(
+      seq_len(n_folds[i]), .parallel = FALSE, function(k) {
       ## extract fold training and test data
       m_k <- m[[i]]$models[[k]]
       nround_k <- m[[i]]$models[[k]]$best_iteration
@@ -392,9 +399,15 @@ fit_xgb_occupancy_models <- function(
       survey_spec <- feature_data[[feature_survey_specificity_column]][[i]]
       ## make predictions
       p_train_k <- c(withr::with_package("xgboost",
-        stats::predict(m_k, x_train_k, iterationrange = c(1, nround_k + 1))))
+        stats::predict(
+          m_k, xgboost::xgb.DMatrix(x_train_k, nthread = 1),
+          iterationrange = c(1, nround_k + 1),
+          nthread = 1)))
       p_test_k <- c(withr::with_package("xgboost",
-        stats::predict(m_k, x_test_k, iterationrange = c(1, nround_k + 1))))
+        stats::predict(
+          m_k, xgboost::xgb.DMatrix(x_test_k, nthread = 1),
+          iterationrange = c(1, nround_k + 1),
+          nthread = 1)))
       ## validate predictions
       assertthat::assert_that(all(p_train_k >= 0), all(p_train_k <= 1),
         msg = "xgboost predictions are not between zero and one")
@@ -434,15 +447,18 @@ fit_xgb_occupancy_models <- function(
   # make model predictions
   pred <- vapply(seq_len(nrow(feature_data)),
                  FUN.VALUE = numeric(nrow(site_env_data)), function(i) {
-    nr <- m[[i]]$parameters$nround
     rowMeans(
       vapply(m[[i]]$models, FUN.VALUE = numeric(nrow(site_env_data)),
              function(x) {
       ## generate predictions
       out <- c(withr::with_package("xgboost", stats::predict(
-        x, site_env_data, iterationrange = c(1, x$best_iteration + 1))))
+        x, xgboost::xgb.DMatrix(site_env_data, nthread = 1),
+        iterationrange = c(1, x$best_iteration + 1),
+        nthread = 1
+      )))
       ## validate predictions
-      assertthat::assert_that(all(out >= 0), all(out <= 1),
+      assertthat::assert_that(
+        all(out >= 0), all(out <= 1),
         msg = "xgboost predictions are not between zero and one")
       ## return result
       out
@@ -486,7 +502,6 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
     x
   })
   full_parameters <- data.frame(full_parameters, stringsAsFactors = FALSE)
-
   ## add objective if missing
   if (is.null(full_parameters$objective)) {
     full_parameters$objective <- "binary:logistic"
@@ -503,15 +518,54 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
   })
   assertthat::assert_that(all(is.finite(spw)))
 
+  # precompute xgboost matrices
+  # N.B. this is because xgboost has a bug when creating DMatrix objects
+  # from R matrix objects, wherein it will use all available threads
+  # on the system, if done within a child R process
+  dtrain_paths <- vapply(
+    seq_len(n_folds), FUN.VALUE = character(1), function(k) {
+    f <- tempfile(fileext = ".dmat")
+    xgboost::xgb.DMatrix.save(
+      xgboost::xgb.DMatrix(
+        data[[k]]$fit$x,
+        label = data[[k]]$fit$y,
+        weight = data[[k]]$fit$w,
+        nthread = 1
+      ),
+      f
+    )
+    f
+  })
+  dtest_paths <- vapply(
+    seq_len(n_folds), FUN.VALUE = character(1), function(k) {
+    f <- tempfile(fileext = ".dmat")
+    xgboost::xgb.DMatrix.save(
+      xgboost::xgb.DMatrix(
+        data[[k]]$test$x,
+        label = data[[k]]$test$y,
+        weight = data[[k]]$test$w,
+        nthread = 1
+      ),
+      f
+    )
+    f
+  })
+  assertthat::assert_that(
+    all(file.exists(dtrain_paths)),
+    all(file.exists(dtest_paths)),
+    msg = "could not save data to temporary directory to faciliate analysis")
+
   # find best tuning parameters using k-fold cross validation
   ## fit models using all parameters combinations
   is_parallel <- (n_threads > 1) && (nrow(full_parameters) > 1)
   if (is_parallel) {
     cl <- start_cluster(
       n_threads,
-      c("full_parameters", "data", "survey_sensitivity", "survey_specificity",
+      c("full_parameters", "data", "dtest_paths", "dtest_paths",
+        "survey_sensitivity", "survey_specificity",
         "spw", "n_rounds", "early_stopping_rounds", "seed",
         "rcpp_model_performance", "make_feval_tss"))
+    on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
   }
   cv <- plyr::ldply(
     seq_len(nrow(full_parameters)), .parallel = is_parallel,
@@ -522,17 +576,20 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
     cv <- lapply(seq_len(n_folds), function(k) {
       ### prepare data for xgboost (note we use the fit data not the train data)
       dtrain <- xgboost::xgb.DMatrix(
-        data[[k]]$fit$x, label = data[[k]]$fit$y,
-        weight = data[[k]]$fit$w)
+        dtrain_paths[k], nthread = 1, silent = TRUE)
       dtest <- xgboost::xgb.DMatrix(
-        data[[k]]$test$x, label = data[[k]]$test$y,
-        weight = data[[k]]$test$w)
+        dtest_paths[k], nthread = 1, silent = TRUE)
       ### prepare evaluation function
       curr_feval_tss <- make_feval_tss(survey_sensitivity, survey_specificity)
       ### prepare arguments for xgboost call
-      args <- list(data = dtrain, verbose = FALSE, scale_pos_weight = spw[k],
-                   watchlist = list(test = dtest), eval_metric = curr_feval_tss,
-                   maximize = TRUE, nrounds = n_rounds, nthread = 1,
+      args <- list(data = dtrain,
+                   verbose = FALSE,
+                   scale_pos_weight = spw[k],
+                   watchlist = list(test = dtest),
+                   eval_metric = curr_feval_tss,
+                   maximize = TRUE,
+                   nrounds = n_rounds,
+                   nthread = 1,
                    early_stopping_rounds = early_stopping_rounds)
       args <- append(args, p)
       ### fit model
@@ -541,10 +598,11 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
       })
       ### generate predictions
       yhat_test <- c(withr::with_package("xgboost",
-          stats::predict(
-            model, data[[k]]$test$x,
-            iterationrange = c(1, model$best_iteration + 1)
-          )))
+        stats::predict(
+          model, dtest,
+          iterationrange = c(1, model$best_iteration + 1),
+          nthread = 1
+        )))
       ### validate predictions
       assertthat::assert_that(all(yhat_test >= 0), all(yhat_test <= 1),
         msg = "xgboost predictions are not between zero and one")
@@ -565,9 +623,15 @@ tune_model <- function(data, folds, survey_sensitivity, survey_specificity,
   if (is_parallel) {
     cl <- stop_cluster(cl)
   }
-  ## determine best parameters for i'th species
+
+  # cleanup
+  unlink(dtest_paths, force = TRUE)
+  unlink(dtrain_paths, force = TRUE)
+
+  # determine best parameters for i'th species
   cv <- tibble::as_tibble(cv)
   j <- which.max(cv$eval)
+
   # return best parameters and models
   best_params <- as.list(full_parameters[j, , drop = FALSE])
   best_params$scale_pos_weight <- list(spw)

diff --git a/R/optimal_survey_scheme.R b/R/optimal_survey_scheme.R
@@ -332,6 +332,7 @@ optimal_survey_scheme <- function(
         "feature_target_column",
         "total_budget",
         "rcpp_expected_value_of_decision_given_survey_scheme"))
+    on.exit(try(stop_cluster(cl), silent = TRUE), add = TRUE)
   }
   ## run calculations
   evd_new_info <- plyr::laply(

diff --git a/README.md b/README.md
@@ -80,7 +80,7 @@ Please cite the *surveyvoi R* package when using it in publications. To
 cite the developmental version, please use:
 
 > Hanson JO, Chadès I, Hudgins EJ, Bennett J (2021). surveyvoi: Survey
-> Value of Information. R package version 1.0.3.2. Available at
+> Value of Information. R package version 1.0.3.3. Available at
 > <https://github.com/jeffreyhanson/surveyvoi>.
 
 ## Usage

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/articles/index.html b/docs/articles/index.html
diff --git a/docs/articles/surveyvoi.html b/docs/articles/surveyvoi.html
diff --git a/docs/authors.html b/docs/authors.html
diff --git a/docs/index.html b/docs/index.html
diff --git a/docs/news/index.html b/docs/news/index.html