From f2e8696c85d7230ee9c95542a04efb178ed769c1 Mon Sep 17 00:00:00 2001
From: Benjamin Christoffersen <boennecd@gmail.com>
Date: Sat, 1 Jun 2019 16:02:11 -0700
Subject: [PATCH] fix https://github.com/boennecd/parglm/issues/8 and
 https://github.com/boennecd/parglm/issues/7

---
 DESCRIPTION                       |  2 +-
 NEWS.md                           |  4 ++++
 R/parglm.R                        | 16 ++++++++++++++--
 man/parglm.Rd                     | 13 +++++++++++--
 src/arma_n_rcpp.h                 |  7 -------
 src/parallel_qr.cpp               |  5 ++++-
 tests/testthat/test_parglm_fams.R | 24 +++++++++++++++++++++++-
 7 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c3bb2e3..944b9ae 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: parglm
 Type: Package
 Title: Parallel GLM
-Version: 0.1.3
+Version: 0.1.4
 Authors@R: c(
   person("Benjamin", "Christoffersen", 
          email = "boennecd@gmail.com", 
diff --git a/NEWS.md b/NEWS.md
index 86416c6..cbabb60 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,7 @@
+# pargm 0.1.4
+* `stop`s when there are more variables than observations. Previously, this 
+  caused a crash.
+
 # pargm 0.1.3
 * Fix bug found with Valgrind.
 
diff --git a/R/parglm.R b/R/parglm.R
index c253cb2..c7b9dcc 100644
--- a/R/parglm.R
+++ b/R/parglm.R
@@ -46,6 +46,15 @@ NULL
 #' element. The \code{qr} element in the object returned by \code{parglm}(\code{.fit}) only has the \eqn{R}
 #' matrix from the QR decomposition.
 #'
+#' @details
+#' The current implementation uses \code{min(as.integer(n / p), nthreads)}
+#' threads where \code{n} is the number observations, \code{p} is the
+#' number of covariates, and \code{nthreads} is the \code{nthreads} element of
+#' the list
+#' returned by \code{\link{parglm.control}}. Thus, there is likely little (if
+#' any) reduction in computation time if \code{p} is almost equal to \code{n}.
+#' The current implementation cannot handle \code{p > n}.
+#'
 #' @examples
 #' # small example from `help('glm')`. Fitting this model in parallel does
 #' # not matter as the data set is small
@@ -140,11 +149,13 @@ parglm.control <- function(
 #' @importFrom stats gaussian binomial Gamma inverse.gaussian poisson
 #' @export
 parglm.fit <- function(
-  x, y, weights = rep(1, nobs), start = NULL, etastart = NULL,
-  mustart = NULL, offset = rep(0, nobs), family = gaussian(),
+  x, y, weights = rep(1, NROW(x)), start = NULL, etastart = NULL,
+  mustart = NULL, offset = rep(0, NROW(x)), family = gaussian(),
   control = list(), intercept = TRUE, ...){
   .check_fam(family)
   stopifnot(nrow(x) == length(y))
+  if(NCOL(x) > NROW(x))
+    stop("not implemented with more variables than observations")
 
   if(!is.null(mustart))
     warning(sQuote("mustart"), " will not be used")
@@ -193,6 +204,7 @@ parglm.fit <- function(
       if(control$nthreads > 1L)
         max(nrow(x) / control$nthreads, control$nthreads) else
           nrow(x)
+  block_size <- max(block_size, NCOL(x))
 
   use_start <- !is.null(start)
   fit <- parallelglm(
diff --git a/man/parglm.Rd b/man/parglm.Rd
index 4716bd4..05edd3d 100644
--- a/man/parglm.Rd
+++ b/man/parglm.Rd
@@ -9,8 +9,8 @@ parglm(formula, family = gaussian, data, weights, subset, na.action,
   start = NULL, offset, control = list(...), contrasts = NULL,
   model = TRUE, x = FALSE, y = TRUE, ...)
 
-parglm.fit(x, y, weights = rep(1, nobs), start = NULL,
-  etastart = NULL, mustart = NULL, offset = rep(0, nobs),
+parglm.fit(x, y, weights = rep(1, NROW(x)), start = NULL,
+  etastart = NULL, mustart = NULL, offset = rep(0, NROW(x)),
   family = gaussian(), control = list(), intercept = TRUE, ...)
 }
 \arguments{
@@ -71,6 +71,15 @@ Function like \code{\link{glm}} which can make the computation
 in parallel. The function supports most families listed in \code{\link{family}}.
 See "\code{vignette("parglm", "parglm")}" for run time examples.
 }
+\details{
+The current implementation uses \code{min(as.integer(n / p), nthreads)}
+threads where \code{n} is the number observations, \code{p} is the
+number of covariates, and \code{nthreads} is the \code{nthreads} element of
+the list
+returned by \code{\link{parglm.control}}. Thus, there is likely little (if
+any) reduction in computation time if \code{p} is almost equal to \code{n}.
+The current implementation cannot handle \code{p > n}.
+}
 \examples{
 # small example from `help('glm')`. Fitting this model in parallel does
 # not matter as the data set is small
diff --git a/src/arma_n_rcpp.h b/src/arma_n_rcpp.h
index 33cc04f..ce848c9 100644
--- a/src/arma_n_rcpp.h
+++ b/src/arma_n_rcpp.h
@@ -11,13 +11,6 @@
 #endif
 
 #define ARMA_NO_DEBUG
-// Note: This also disables the check in inv(A, B) for whether inversion is succesfull it seems http://arma.sourceforge.net/docs.html#inv
-// from armadillo config.hpp
-//// Uncomment the above line if you want to disable all run-time checks.
-//// This will result in faster code, but you first need to make sure that your code runs correctly!
-//// We strongly recommend to have the run-time checks enabled during development,
-//// as this greatly aids in finding mistakes in your code, and hence speeds up development.
-//// We recommend that run-time checks be disabled _only_ for the shipped version of your program.
 
 #include <RcppArmadillo.h>
 
diff --git a/src/parallel_qr.cpp b/src/parallel_qr.cpp
index e1b5c29..b1dd4c7 100644
--- a/src/parallel_qr.cpp
+++ b/src/parallel_qr.cpp
@@ -1,5 +1,6 @@
 #include "parallel_qr.h"
 #include "LAPACK_wrappers.h"
+#include <algorithm>
 
 arma::mat R_F::R_rev_piv() const {
   arma::uvec piv = pivot;
@@ -14,7 +15,9 @@ qr_parallel::worker::worker
 R_F qr_parallel::worker::operator()(){
   qr_work_chunk my_chunk = my_generator->get_chunk();
   QR_factorization qr(my_chunk.X);
-  arma::mat F = qr.qy(my_chunk.Y, true).rows(0, my_chunk.X.n_cols - 1);
+  const unsigned n_rows =
+    std::min(my_chunk.X.n_cols - 1, my_chunk.Y.n_rows - 1);
+  arma::mat F = qr.qy(my_chunk.Y, true).rows(0, n_rows);
 
   return R_F { qr.R(), qr.pivot(), std::move(F), my_chunk.dev };
 }
diff --git a/tests/testthat/test_parglm_fams.R b/tests/testthat/test_parglm_fams.R
index e7f8792..f2b1997 100644
--- a/tests/testthat/test_parglm_fams.R
+++ b/tests/testthat/test_parglm_fams.R
@@ -264,7 +264,7 @@ test_that("'FASTs' fail when design matrix is singular", {
     f2 <- parglm(y ~ X, control = parglm.control(method = "FAST"))))
 })
 
-test_that("'parglm' yields the same as 'glm' also when one observations is not 'good'",{
+test_that("'parglm' yields the same as 'glm' also when one observations is not 'good'", {
   phat <- seq(.01, .99, by = .01)
   X <- log(phat / (1 - phat)) - 2
   set.seed(47313714)
@@ -285,3 +285,25 @@ test_that("'parglm' yields the same as 'glm' also when one observations is not '
                  control = parglm.control(nthreads = 2))
   expect_equal(fit[to_check], pfit[to_check])
 })
+
+test_that("'stop's when there are more variables than observations", {
+  set.seed(1)
+  n <- 20L
+  dframe <- cbind(data.frame(y = 1:n), replicate(n, rnorm(n)))
+
+  expect_error(
+    parglm(y ~ ., gaussian(), dframe),
+    "not implemented with more variables than observations", fixed = TRUE)
+
+  # check that it works with same number of observations as variables
+  dframe <- dframe[, 1:n]
+  fpar <- parglm(y ~ ., gaussian(), dframe, nthreads = 2)
+  fglm <-    glm(y ~ ., gaussian(), dframe)
+  expect_equal(coef(fpar), coef(fglm))
+
+  # and with almost the same number of variables as observations
+  dframe <- dframe[, 1:(n - 3L)]
+  fpar <- parglm(y ~ ., gaussian(), dframe, nthreads = 2)
+  fglm <-    glm(y ~ ., gaussian(), dframe)
+  expect_equal(coef(fpar), coef(fglm))
+})