jacobbien · viettr · Nov 10, 2020 · Nov 10, 2020 · Nov 10, 2020 · Nov 10, 2020
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,3 +4,4 @@
 ^_pkgdown\.yml$
 ^docs$
 ^pkgdown$
+^\.github$
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 .Rproj.user
 inst/doc
+.Rhistory
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: trac
 Title: Tree-based Aggregation of Compositional Data
-Version: 0.0.1
+Version: 0.0.2
 Authors@R: 
     c(person(given = "Jacob",
              family = "Bien",
@@ -18,7 +18,7 @@ License: GPL-3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.0
 Remotes: jacobbien/ggb
 Imports: 
     reticulate,
@@ -33,11 +33,15 @@ Imports:
     dplyr,
     tidyr,
     magrittr,
-    rlang
+    rlang,
+    glmnet
 Suggests: 
     rmarkdown,
     knitr,
-    testthat
+    testthat,
+    phyloseq,
+    kableExtra,
+    tidyverse
 VignetteBuilder: knitr
 Depends: 
     R (>= 2.10)
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,16 +1,27 @@
 # Generated by roxygen2: do not edit by hand
 
 export(aggregate_to_level)
+export(check_additional_covariates)
+export(check_method)
 export(cv_sparse_log_contrast)
 export(cv_trac)
+export(get_probability_cv)
+export(get_probability_platt)
 export(phylo_to_A)
 export(plot_cv_trac)
 export(plot_trac_path)
+export(predict_second_stage)
+export(predict_sparse_log_contrast)
 export(predict_trac)
+export(probability_transform)
 export(refit_sparse_log_contrast)
+export(refit_sparse_log_contrast_classif)
 export(refit_trac)
+export(rescale_betas)
+export(second_stage)
 export(sparse_log_contrast)
 export(tax_table_to_phylo)
 export(trac)
 importFrom(magrittr,"%>%")
 importFrom(rlang,.data)
+importFrom(stats,predict)
diff --git a/R/cv_log_contrast.R b/R/cv_log_contrast.R
@@ -4,33 +4,67 @@
 #' It performs \code{nfold}-fold cross validation.
 #'
 #' @param fit output of \code{\link{sparse_log_contrast}} function.
-#' @param Z,y same arguments as passed to \code{\link{sparse_log_contrast}}. C
-#' that is used will be taken from fit object.
+#' @param Z,y,additional_covariates same arguments as passed to
+#'   \code{\link{sparse_log_contrast}}. C will be taken from fit object.
 #' @param folds a partition of \code{1:nrow(Z)}.
 #' @param nfolds number of folds for cross-validation
 #' @param summary_function how to combine the errors calculated on each
 #' observation within a fold (e.g. mean or median)
 #' @export
-cv_sparse_log_contrast <- function(fit, Z, y, folds = NULL, nfolds = 5, summary_function = stats::median) {
+cv_sparse_log_contrast <- function(fit, Z, y, folds = NULL, nfolds = 5,
+                                   summary_function = stats::median,
+                                   additional_covariates = NULL) {
   n <- nrow(Z)
   p <- ncol(Z)
   stopifnot(length(y) == n)
-  if(is.null(folds)) folds <- ggb:::make_folds(n, nfolds)
+  if (!is.null(additional_covariates) & !is.data.frame(additional_covariates)) {
+    additional_covariates <- data.frame(additional_covariates)
+  }
+  if (is.null(folds)) folds <- ggb:::make_folds(n, nfolds)
   else
     nfolds <- length(folds)
   cv <- list()
   fit_folds <- list() # save this to reuse by log-ratio's cv function
   errs <- matrix(NA, ncol(fit$beta), nfolds)
   for (i in seq(nfolds)) {
     cat("fold", i, fill = TRUE)
+    # add for backward compatibility
+    if (is.null(fit$method)) fit$method <- "regr"
+    if (is.null(fit$w_additional_covariates)) {
+      fit$w_additional_covariates <- NULL
+    }
+    if (is.null(fit$rho)) fit$rho <- 0
+    if (is.null(fit$normalized)) fit$normalized <- FALSE
     # train on all but i-th fold (and use settings from fit):
     fit_folds[[i]] <- sparse_log_contrast(Z[-folds[[i]], ],
                                           y[-folds[[i]]],
+                                          additional_covariates[-folds[[i]], ],
                                           fit$C,
-                                          fraclist = fit$fraclist)
+                                          fraclist = fit$fraclist,
+                                          w_additional_covariates =
+                                            fit$w_additional_covariates,
+                                          method = fit$method,
+                                          rho = fit$rho,
+                                          normalized = fit$normalized)
     if (fit$refit) stop("Not yet supported.")
-    errs[, i] <- apply((predict_trac(list(fit_folds[[i]]), Z[folds[[i]], ])[[1]] - y[folds[[i]]])^2,
-                       2, summary_function)
+    if (fit$method == "regr" | is.null(fit$method)) {
+      errs[, i] <- apply((predict_trac(
+        list(fit_folds[[i]]),
+          Z[folds[[i]], ],
+          additional_covariates[folds[[i]], ])[[1]] - y[folds[[i]]])^2,
+        2, summary_function
+      )
+    }
+
+    if (fit$method == "classif" |
+        fit$method == "classif_huber") {
+      # loss: max(0, 1 - y_hat * y)^2
+      er <- sign(predict_trac(list(fit_folds[[i]]),
+                              Z[folds[[i]],],
+                              additional_covariates[folds[[i]],])[[1]]) !=
+        c(y[folds[[i]]])
+      errs[, i] <- colMeans(er)
+    }
   }
   m <- rowMeans(errs)
   se <- apply(errs, 1, stats::sd) / sqrt(nfolds)

diff --git a/R/cv_trac.R b/R/cv_trac.R
@@ -1,53 +1,138 @@
 #' Perform cross validation for tuning parameter selection
 #'
 #' This function is to be called after calling \code{\link{trac}}.  It performs
-#' \code{nfold}-fold cross validation.
+#' \code{nfold}-fold cross validation. For classification the metric is
+#' misclassification error.
 #'
 #' @param fit output of \code{\link{trac}} function.
-#' @param Z,y,A same arguments as passed to \code{\link{trac}}
+#' @param Z,y,A,additional_covariates same arguments as passed to
+#'    \code{\link{trac}}
 #' @param folds a partition of \code{1:nrow(Z)}.
 #' @param nfolds number of folds for cross-validation
 #' @param summary_function how to combine the errors calculated on each
-#' observation within a fold (e.g. mean or median)
+#' observation within a fold (e.g. mean or median) (only for regression task)
+#' @param stratified if `TRUE` use stratified folds based on target variable
+#'   only for classification. Default set to FALSE.
 #' @export
-cv_trac <- function(fit, Z, y, A, folds = NULL, nfolds = 5, summary_function = stats::median) {
+cv_trac <- function(fit, Z, y, A, additional_covariates = NULL, folds = NULL,
+                    nfolds = 5, summary_function = stats::median,
+                    stratified = FALSE) {
   n <- nrow(Z)
   p <- ncol(Z)
+  if (!is.null(additional_covariates) & !is.data.frame(additional_covariates)) {
+    additional_covariates <- data.frame(additional_covariates)
+  }
   stopifnot(length(y) == n)
-  if(is.null(folds)) folds <- ggb:::make_folds(n, nfolds)
-  else
+  if (is.null(folds)) {
+    if (stratified) {
+      folds <- make_folds_stratified(n, nfolds, y)
+    } else {
+      folds <- ggb:::make_folds(n, nfolds)
+    }
+  } else {
     nfolds <- length(folds)
+  }
+
   cv <- list()
   fit_folds <- list() # save this to reuse by log-ratio's cv function
   for (iw in seq_along(fit)) {
     if (length(fit) > 1) cat("CV for weight sequence #", iw, fill = TRUE)
     errs <- matrix(NA, ncol(fit[[iw]]$beta), nfolds)
     for (i in seq(nfolds)) {
       cat("fold", i, fill = TRUE)
+      # add for backward compatibility
+      if (is.null(fit[[iw]]$method)) fit[[iw]]$method <- "regr"
+      if (is.null(fit[[iw]]$w_additional_covariates)) {
+        fit[[iw]]$w_additional_covariates <- NULL
+      }
+      if (is.null(fit[[iw]]$rho)) fit[[iw]]$rho <- 0
+
+
       # train on all but i-th fold (and use settings from fit):
-      fit_folds[[i]] <- trac(Z[-folds[[i]], ],
-                      y[-folds[[i]]],
-                      A, fraclist = fit[[iw]]$fraclist, w = fit[[iw]]$w)
+      fit_folds[[i]] <- trac(Z = Z[-folds[[i]], ],
+                             y = y[-folds[[i]]],
+                             A = A,
+                             additional_covariates =
+                               additional_covariates[-folds[[i]], ],
+                             fraclist = fit[[iw]]$fraclist,
+                             w = fit[[iw]]$w,
+                             w_additional_covariates =
+                               fit[[iw]]$w_additional_covariates,
+                             method = fit[[iw]]$method,
+                             rho = fit[[iw]]$rho,
+                             normalized = fit[[iw]]$normalized)
+
       if (fit[[iw]]$refit) {
         fit_folds[[i]] <- refit_trac(fit_folds[[i]], Z[-folds[[i]], ],
-                              y[-folds[[i]]], A)
+                                     y[-folds[[i]]], A)
+      }
+      if (fit[[iw]]$method == "regr" | is.null(fit[[iw]]$method)) {
+        errs[, i] <- apply(
+          (predict_trac(
+            fit_folds[[i]],
+            Z[folds[[i]], ],
+            additional_covariates[folds[[i]], ])[[1]] - y[folds[[i]]])^2,
+          2, summary_function
+        )
+      }
+
+      if (fit[[iw]]$method == "classif" |
+          fit[[iw]]$method == "classif_huber") {
+        # loss: max(0, 1 - y_hat * y)^2
+        er <- sign(predict_trac(fit_folds[[i]],
+                                Z[folds[[i]],],
+                                additional_covariates[folds[[i]],])[[1]]) !=
+          c(y[folds[[i]]])
+        errs[, i] <- colMeans(er)
       }
-      errs[, i] <- apply((predict_trac(fit_folds[[i]],
-                                         Z[folds[[i]], ])[[1]] - y[folds[[i]]])^2, 2, summary_function)
     }
     m <- rowMeans(errs)
     se <- apply(errs, 1, stats::sd) / sqrt(nfolds)
     ibest <- which.min(m)
-    i1se <- min(which(m < m[ibest] + se[ibest]))
-    cv[[iw]] <- list(errs = errs, m = m, se = se,
-                     lambda_best = fit[[iw]]$fraclist[ibest], ibest = ibest,
-                     lambda_1se = fit[[iw]]$fraclist[i1se], i1se = i1se,
-                     fraclist = fit[[iw]]$fraclist, w = fit[[iw]]$w,
-                     nonzeros = colSums(abs(fit[[iw]]$gamma) > 1e-5),
-                     fit_folds = fit_folds)
+    i1se <- min(which(m <= m[ibest] + se[ibest]))
+    cv[[iw]] <- list(
+      errs = errs, m = m, se = se,
+      lambda_best = fit[[iw]]$fraclist[ibest], ibest = ibest,
+      lambda_1se = fit[[iw]]$fraclist[i1se], i1se = i1se,
+      fraclist = fit[[iw]]$fraclist, w = fit[[iw]]$w,
+      nonzeros = colSums(abs(fit[[iw]]$gamma) > 1e-5),
+      fit_folds = fit_folds
+    )
+  }
+  list(
+    cv = cv,
+    iw_best = which.min(lapply(cv, function(cvv) cvv$m[cvv$ibest])),
+    iw_1se = which.min(lapply(cv, function(cvv) cvv$m[cvv$i1se])),
+    folds = folds
+  )
+}
+
+#' This function creates stratified folds for cross validation for unbalanced
+#' data. The code is adopted from ggb:::make_folds
+#'
+#' @param n number of observations
+#' @param nfolds number of folds
+#' @param y variable with group assignment.
+
+make_folds_stratified <- function(n, nfolds, y) {
+  # Check if number of folds is greater than the max n of observations
+  # per group. If the number is greater at least one fold will not contain
+  # any observations of group of interest.
+  max_n_y <- max(table(y))
+  nfolds <- min(nfolds, max_n_y)
+  # Initiate the list in advance
+  folds <- vector(mode = "list", length = nfolds)
+  for (j in unique(y)) {
+    ixs <- which(y == j)
+    nn <- round(length(ixs) / nfolds)
+    sizes <- rep(nn, nfolds)
+    sizes[nfolds] <- sizes[nfolds] + length(ixs) - nn * nfolds
+    b <- c(0, cumsum(sizes))
+    ii <- sample(length(ixs))
+    ii <- ixs[ii]
+    for (i in seq(nfolds)) {
+      folds[[i]] <- c(folds[[i]], ii[seq(b[i] + 1, b[i + 1])])
+    }
   }
-  list(cv = cv,
-       iw_best = which.min(lapply(cv, function(cvv) cvv$m[cvv$ibest])),
-       iw_1se = which.min(lapply(cv, function(cvv) cvv$m[cvv$i1se])),
-       folds = folds)
+  folds
 }
diff --git a/R/data.R b/R/data.R
@@ -23,3 +23,38 @@
 #' Data](https://www.biorxiv.org/content/10.1101/2020.09.01.277632v1.full) for
 #' the definition of A.} }
 "sCD14"
+
+
+#' malawi vs venezuela, adults only
+#'
+#' This is a subset of the "malawi vs venezuela, adults only" task from the
+#' [Microbiome Learning
+#' Repo](
+#' https://knights-lab.github.io/MLRepo/docs/yatsunenko_malawi_venezuela.html).
+#' The task is to predict whether the individuals live in Malawi or Venezuela
+#' based on the OTU count table.
+#' OTUs that occur in less than 10% of the samples are excluded. Only
+#' Bacteria are included. The
+#' taxonomic table for the OTUs was obtained by the [greengenes reference
+#' database](https://www.nature.com/articles/ismej2011139?report=reader).
+#' The data was originally published in [Yatsunenko, T.,
+#' Rey, F. E., Manary, M. J., Trehan, I., Dominguez-Bello, M. G., Contreras, M.,
+#'  ... & Gordon, J. I. (2012).
+#'  Human gut microbiome viewed across age and geography.
+#'  nature, 486(7402),
+#'  222-227.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3376388/).
+#'
+#'
+#' @format A phyloseq object:
+#' \describe{
+#'   \item{otu_table}{OTU count table with 54 rows corresponding to subjects
+#'     and 5008 columns corresponding to OTUs}
+#'   \item{tax_table}{Taxonomic table with the taxonomic assignment on different
+#'     levels for the OTUs. The names of the OTUs are saved in the rownames.
+#'     The 5008 rows correspond to the different OTUS and the 7 columns to
+#'     the different levels of the taxonomic tree (Kingdom, Phylum, Class,
+#'     Order, Family, Genus, Species)}
+#'   \item{sam_data}{Meta data about the different observations. Contains
+#'     the labels Vars and additional covariates sex (binary) and age (numeric)}
+#' }
+"malawi"