From f2e8696c85d7230ee9c95542a04efb178ed769c1 Mon Sep 17 00:00:00 2001 From: Benjamin Christoffersen Date: Sat, 1 Jun 2019 16:02:11 -0700 Subject: [PATCH] fix https://github.com/boennecd/parglm/issues/8 and https://github.com/boennecd/parglm/issues/7 --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ R/parglm.R | 16 ++++++++++++++-- man/parglm.Rd | 13 +++++++++++-- src/arma_n_rcpp.h | 7 ------- src/parallel_qr.cpp | 5 ++++- tests/testthat/test_parglm_fams.R | 24 +++++++++++++++++++++++- 7 files changed, 57 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c3bb2e3..944b9ae 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: parglm Type: Package Title: Parallel GLM -Version: 0.1.3 +Version: 0.1.4 Authors@R: c( person("Benjamin", "Christoffersen", email = "boennecd@gmail.com", diff --git a/NEWS.md b/NEWS.md index 86416c6..cbabb60 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# pargm 0.1.4 +* `stop`s when there are more variables than observations. Previously, this + caused a crash. + # pargm 0.1.3 * Fix bug found with Valgrind. diff --git a/R/parglm.R b/R/parglm.R index c253cb2..c7b9dcc 100644 --- a/R/parglm.R +++ b/R/parglm.R @@ -46,6 +46,15 @@ NULL #' element. The \code{qr} element in the object returned by \code{parglm}(\code{.fit}) only has the \eqn{R} #' matrix from the QR decomposition. #' +#' @details +#' The current implementation uses \code{min(as.integer(n / p), nthreads)} +#' threads where \code{n} is the number observations, \code{p} is the +#' number of covariates, and \code{nthreads} is the \code{nthreads} element of +#' the list +#' returned by \code{\link{parglm.control}}. Thus, there is likely little (if +#' any) reduction in computation time if \code{p} is almost equal to \code{n}. +#' The current implementation cannot handle \code{p > n}. +#' #' @examples #' # small example from `help('glm')`. Fitting this model in parallel does #' # not matter as the data set is small @@ -140,11 +149,13 @@ parglm.control <- function( #' @importFrom stats gaussian binomial Gamma inverse.gaussian poisson #' @export parglm.fit <- function( - x, y, weights = rep(1, nobs), start = NULL, etastart = NULL, - mustart = NULL, offset = rep(0, nobs), family = gaussian(), + x, y, weights = rep(1, NROW(x)), start = NULL, etastart = NULL, + mustart = NULL, offset = rep(0, NROW(x)), family = gaussian(), control = list(), intercept = TRUE, ...){ .check_fam(family) stopifnot(nrow(x) == length(y)) + if(NCOL(x) > NROW(x)) + stop("not implemented with more variables than observations") if(!is.null(mustart)) warning(sQuote("mustart"), " will not be used") @@ -193,6 +204,7 @@ parglm.fit <- function( if(control$nthreads > 1L) max(nrow(x) / control$nthreads, control$nthreads) else nrow(x) + block_size <- max(block_size, NCOL(x)) use_start <- !is.null(start) fit <- parallelglm( diff --git a/man/parglm.Rd b/man/parglm.Rd index 4716bd4..05edd3d 100644 --- a/man/parglm.Rd +++ b/man/parglm.Rd @@ -9,8 +9,8 @@ parglm(formula, family = gaussian, data, weights, subset, na.action, start = NULL, offset, control = list(...), contrasts = NULL, model = TRUE, x = FALSE, y = TRUE, ...) -parglm.fit(x, y, weights = rep(1, nobs), start = NULL, - etastart = NULL, mustart = NULL, offset = rep(0, nobs), +parglm.fit(x, y, weights = rep(1, NROW(x)), start = NULL, + etastart = NULL, mustart = NULL, offset = rep(0, NROW(x)), family = gaussian(), control = list(), intercept = TRUE, ...) } \arguments{ @@ -71,6 +71,15 @@ Function like \code{\link{glm}} which can make the computation in parallel. The function supports most families listed in \code{\link{family}}. See "\code{vignette("parglm", "parglm")}" for run time examples. } +\details{ +The current implementation uses \code{min(as.integer(n / p), nthreads)} +threads where \code{n} is the number observations, \code{p} is the +number of covariates, and \code{nthreads} is the \code{nthreads} element of +the list +returned by \code{\link{parglm.control}}. Thus, there is likely little (if +any) reduction in computation time if \code{p} is almost equal to \code{n}. +The current implementation cannot handle \code{p > n}. +} \examples{ # small example from `help('glm')`. Fitting this model in parallel does # not matter as the data set is small diff --git a/src/arma_n_rcpp.h b/src/arma_n_rcpp.h index 33cc04f..ce848c9 100644 --- a/src/arma_n_rcpp.h +++ b/src/arma_n_rcpp.h @@ -11,13 +11,6 @@ #endif #define ARMA_NO_DEBUG -// Note: This also disables the check in inv(A, B) for whether inversion is succesfull it seems http://arma.sourceforge.net/docs.html#inv -// from armadillo config.hpp -//// Uncomment the above line if you want to disable all run-time checks. -//// This will result in faster code, but you first need to make sure that your code runs correctly! -//// We strongly recommend to have the run-time checks enabled during development, -//// as this greatly aids in finding mistakes in your code, and hence speeds up development. -//// We recommend that run-time checks be disabled _only_ for the shipped version of your program. #include diff --git a/src/parallel_qr.cpp b/src/parallel_qr.cpp index e1b5c29..b1dd4c7 100644 --- a/src/parallel_qr.cpp +++ b/src/parallel_qr.cpp @@ -1,5 +1,6 @@ #include "parallel_qr.h" #include "LAPACK_wrappers.h" +#include arma::mat R_F::R_rev_piv() const { arma::uvec piv = pivot; @@ -14,7 +15,9 @@ qr_parallel::worker::worker R_F qr_parallel::worker::operator()(){ qr_work_chunk my_chunk = my_generator->get_chunk(); QR_factorization qr(my_chunk.X); - arma::mat F = qr.qy(my_chunk.Y, true).rows(0, my_chunk.X.n_cols - 1); + const unsigned n_rows = + std::min(my_chunk.X.n_cols - 1, my_chunk.Y.n_rows - 1); + arma::mat F = qr.qy(my_chunk.Y, true).rows(0, n_rows); return R_F { qr.R(), qr.pivot(), std::move(F), my_chunk.dev }; } diff --git a/tests/testthat/test_parglm_fams.R b/tests/testthat/test_parglm_fams.R index e7f8792..f2b1997 100644 --- a/tests/testthat/test_parglm_fams.R +++ b/tests/testthat/test_parglm_fams.R @@ -264,7 +264,7 @@ test_that("'FASTs' fail when design matrix is singular", { f2 <- parglm(y ~ X, control = parglm.control(method = "FAST")))) }) -test_that("'parglm' yields the same as 'glm' also when one observations is not 'good'",{ +test_that("'parglm' yields the same as 'glm' also when one observations is not 'good'", { phat <- seq(.01, .99, by = .01) X <- log(phat / (1 - phat)) - 2 set.seed(47313714) @@ -285,3 +285,25 @@ test_that("'parglm' yields the same as 'glm' also when one observations is not ' control = parglm.control(nthreads = 2)) expect_equal(fit[to_check], pfit[to_check]) }) + +test_that("'stop's when there are more variables than observations", { + set.seed(1) + n <- 20L + dframe <- cbind(data.frame(y = 1:n), replicate(n, rnorm(n))) + + expect_error( + parglm(y ~ ., gaussian(), dframe), + "not implemented with more variables than observations", fixed = TRUE) + + # check that it works with same number of observations as variables + dframe <- dframe[, 1:n] + fpar <- parglm(y ~ ., gaussian(), dframe, nthreads = 2) + fglm <- glm(y ~ ., gaussian(), dframe) + expect_equal(coef(fpar), coef(fglm)) + + # and with almost the same number of variables as observations + dframe <- dframe[, 1:(n - 3L)] + fpar <- parglm(y ~ ., gaussian(), dframe, nthreads = 2) + fglm <- glm(y ~ ., gaussian(), dframe) + expect_equal(coef(fpar), coef(fglm)) +})