deprecate f in favor of group (#638)

Signed-off-by: Daena Rys <[email protected]> Co-authored-by: TuomasBorman <[email protected]> Co-authored-by: Tuomas Borman <[email protected]>
microbiome · Oct 1, 2024 · 8cace1b · 8cace1b
1 parent f7bcc66
commit 8cace1b
Show file tree

Hide file tree

Showing 10 changed files with 290 additions and 224 deletions.
diff --git a/NEWS b/NEWS
@@ -155,3 +155,4 @@ computation
 + If missing values, give informative error in *RDA/*CCA functions
 + transformAssay can apply transformation to altExp
 + Added CSS transformation
++ In agglomerateByVariable, splitOn and getDominant, use 'group' to specify grouping variable.
diff --git a/R/agglomerate.R b/R/agglomerate.R
@@ -45,7 +45,8 @@
 #'        \item \code{detection}: The threshold value for determining presence
 #'        or absence. A value in \code{x} must exceed this threshold to be
 #'        considered present.
-#'        \item \code{assay.type}: \code{Character scalar}. Specifies the assay used to
+#'        \item \code{assay.type}: \code{Character scalar}. Specifies the assay
+#'        used to
 #'        calculate prevalence. (Default: \code{"counts"})
 #'        \item \code{prevalence}: Prevalence threshold (in 0 to 1). The
 #'        required prevalence is strictly greater by default. To include the
@@ -56,10 +57,11 @@
 #'        from
 #'        \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}}
 #'        is returned. (Default: \code{FALSE})
-#'        \item \code{archetype} Of each level of \code{f}, which element should
+#'        \item \code{archetype} Of each level of \code{group}, which element
+#'        should
 #'        be regarded as the archetype and metadata in the columns or rows kept,
 #'        while merging? This can be single integer value or an integer vector
-#'        of the same length as \code{levels(f)}. (Default:
+#'        of the same length as \code{levels(group)}. (Default:
 #'        \code{1L}, which means the first element encountered per
 #'        factor level will be kept)
 #'    }
@@ -78,10 +80,15 @@
 #'   row-wise / for features ('rows') or column-wise / for samples ('cols').
 #'   Must be \code{'rows'} or \code{'cols'}.
 #'
-#' @param f A factor for merging. Must be the same length as
-#'   \code{nrow(x)/ncol(x)}. Rows/Cols corresponding to the same level will be
-#'   merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be
-#'   returned unchanged.
+#' @param group \code{Character scalar}, \code{character vector} or
+#' \code{factor vector}. A column name from \code{rowData(x)} or
+#' \code{colData(x)} or alternatively a vector specifying how the merging is
+#' performed. If vector, the value must be the same length as
+#' \code{nrow(x)/ncol(x)}. Rows/Cols corresponding to the same level will be
+#' merged. If \code{length(levels(group)) == nrow(x)/ncol(x)}, \code{x} will be
+#' returned unchanged.
+#'  
+#' @param f Deprecated. Use \code{group} instead.
 #'
 #' @param update.tree \code{Logical scalar}. Should
 #'   \code{rowTree()} also be merged? (Default: \code{FALSE})
@@ -90,9 +97,12 @@
 #'
 #' @details
 #' 
-#' Agglomeration sums up the values of assays at the specified taxonomic level. With
-#' certain assays, e.g. those that include binary or negative values, this summing
-#' can produce meaningless values. In those cases, consider performing agglomeration
+#' Agglomeration sums up the values of assays at the specified taxonomic level.
+#' With
+#' certain assays, e.g. those that include binary or negative values, this
+#' summing
+#' can produce meaningless values. In those cases, consider performing
+#' agglomeration
 #' first, and then applying the transformation afterwards.
 #'
 #' \code{agglomerateByVariable} works similarly to
@@ -275,7 +285,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
 
         # merge taxa
         x <- agglomerateByVariable(
-            x, by = "rows", f = tax_factors, na.rm = TRUE, ...)
+            x, by = "rows", group = tax_factors, na.rm = TRUE, ...)
 
         # "Empty" the values to the right of the rank, using NA_character_.
         if( col < length(taxonomyRanks(x)) ){
@@ -303,10 +313,10 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
 #' @aliases agglomerateByVariable
 #' @export
 setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"),
-            function(x, by, f, ...){
+            function(x, by, group = f, f, ...){
                 by <- .check_MARGIN(by)
                 FUN <- switch(by, .merge_rows, .merge_cols)
-                x <- FUN(x, f, ...)
+                x <- FUN(x, group, ...)
                 return(x)
             }
 )
@@ -316,13 +326,14 @@ setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"),
 #' @export
 setMethod("agglomerateByVariable",
             signature = c(x = "TreeSummarizedExperiment"),
-            function(x, by, f, update.tree = mergeTree, mergeTree = FALSE, ...){
+            function(x, by, group = f, f, update.tree = mergeTree,
+                    mergeTree = FALSE, ...){
                 # Check by
                 by <- .check_MARGIN(by)
                 # Get function based on by
                 FUN <- switch(by, .merge_rows_TSE, .merge_cols_TSE)
                 # Agglomerate
-                x <- FUN(x, f, update.tree = update.tree, ...)
+                x <- FUN(x, group, update.tree = update.tree, ...)
                 return(x)
             }
 )
@@ -331,7 +342,8 @@ setMethod("agglomerateByVariable",
 #' @importFrom SingleCellExperiment altExp altExp<- altExps<-
 #' @export
 setMethod("agglomerateByRank", signature = c(x = "SingleCellExperiment"),
-    function(x, ..., altexp = NULL, altexp.rm = strip_altexp, strip_altexp = TRUE){
+    function(x, ..., altexp = NULL, altexp.rm = strip_altexp,
+            strip_altexp = TRUE){
         # input check
         if(!.is_a_bool(altexp.rm)){
             stop("'altexp.rm' mus be TRUE or FALSE.", call. = FALSE)
@@ -352,8 +364,8 @@ setMethod("agglomerateByRank", signature = c(x = "SingleCellExperiment"),
 setMethod(
     "agglomerateByRank", signature = c(x = "TreeSummarizedExperiment"),
     function(
-        x, ..., update.tree = agglomerateTree, agglomerate.tree = agglomerateTree, 
-        agglomerateTree = FALSE){
+        x, ..., update.tree = agglomerateTree,
+        agglomerate.tree = agglomerateTree, agglomerateTree = FALSE){
                 # input check
                 if(!.is_a_bool(update.tree)){
                     stop("'update.tree' must be TRUE or FALSE.",
@@ -388,7 +400,7 @@ setMethod(
 
 # This function removes empty rank columns from rowdata. (Those that include
 # only NA values)
-.remove_NA_cols_from_rowdata <- function(x, empty.ranks.rm = remove_empty_ranks, 
+.remove_NA_cols_from_rowdata <- function(x, empty.ranks.rm = remove_empty_ranks,
     remove_empty_ranks = FALSE, ...){
     # Check empty.ranks.rm
     if( !.is_a_bool(empty.ranks.rm) ){

diff --git a/R/getDominant.R b/R/getDominant.R
@@ -8,18 +8,28 @@
 #' @inheritParams getPrevalence
 #'
 #' @param name \code{Character scalar}. A name for the column of the 
-#' \code{colData} where results will be stored. (Default: \code{"dominant_taxa"})
+#' \code{colData} where results will be stored.
+#' (Default: \code{"dominant_taxa"})
 #'   
 #' @param other.name \code{Character scalar}. A name for features that are not 
-#' included in n the most frequent dominant features in the data. (Default: \code{"Other"})
+#' included in n the most frequent dominant features in the data.
+#' (Default: \code{"Other"})
 #' 
-#' @param n \code{Numeric scalar}. The number of features that are the most frequent 
-#' dominant features. Default is NULL, which defaults that each sample is assigned 
+#' @param group \code{Character scalar}. Defines a group. Must be one of the
+#' columns from \code{rowData(x)}. (Default: \code{NULL})
+#' 
+#' @param rank Deprecated. Use \code{group} instead.
+#' 
+#' @param n \code{Numeric scalar}. The number of features that are the most
+#' frequent 
+#' dominant features. Default is NULL, which defaults that each sample is
+#' assigned 
 #' a dominant taxon. (Default: \code{NULL})
 #' 
-#' @param complete \code{Logical scalar}. A value to manage multiple dominant taxa for a sample.
+#' @param complete \code{Logical scalar}. A value to manage multiple dominant
+#' taxa for a sample.
 #' Default for getDominant is TRUE to include all equally dominant taxa
-#' for each sample. complete = FALSE samples one taxa for the samples that have 
+#' for each sample. complete = FALSE samples one taxa for the samples that have
 #' multiple. 
 #' Default for addDominant is FALSE to add a column with only one 
 #' dominant taxon assigned for each sample into colData. complete = TRUE adds a
@@ -34,14 +44,14 @@
 #' object, and stores the information in the \code{colData}. It is a wrapper for
 #' \code{getDominant}.
 #'
-#' With \code{rank} parameter, it is possible to agglomerate taxa based on
-#' taxonomic ranks. E.g. if 'Genus' rank is used, all abundances of same Genus
-#' are added together, and those families are returned.
-#' See \code{agglomerateByRank()} for additional arguments to deal with
+#' With \code{group} parameter, it is possible to agglomerate rows based on
+#' groups. If the value is one of the columns in \code{taxonomyRanks()},
+#' \code{agglomerateByRank()} is applied. Otherwise,
+#' \code{agglomerateByVariable()} is utilized.
+#' E.g. if 'Genus' rank is used, all abundances of same Genus
+#' are added together, and agglomerated features are returned.
+#' See corresponding functions for additional arguments to deal with
 #' missing values or special characters.
-#' If the \code{rank} is not specifying a taxonomy rank from
-#' \code{taxonomyRanks(x)}, the function
-#' agglomerates rows with \code{agglomerateByVariable()}.
 #'
 #' @return \code{getDominant} returns a named character vector \code{x}
 #' while \code{addDominant} returns
@@ -56,44 +66,44 @@
 #' x <- GlobalPatterns
 #'
 #' # Finds the dominant taxa.
-#' sim.dom <- getDominant(x, rank="Genus")
+#' sim.dom <- getDominant(x, group = "Genus")
 #'
 #' # Add information to colData
-#' x <- addDominant(x, rank = "Genus", name="dominant_genera")
+#' x <- addDominant(x, group = "Genus", name ="dominant_genera")
 #' colData(x)
 NULL
 
 #' @rdname getDominant
 #' @export
 setGeneric("getDominant",signature = c("x"),
-           function(x, assay.type = assay_name, assay_name = "counts", 
-                    rank = NULL, other.name = "Other", n = NULL, 
-                    complete = TRUE, ...)
-               standardGeneric("getDominant"))
+    function(x, assay.type = assay_name, assay_name = "counts",
+            group = rank, rank = NULL, other.name = "Other", n = NULL, 
+            complete = TRUE, ...)
+    standardGeneric("getDominant"))
 
 #' @rdname getDominant
 #' @importFrom IRanges relist
 #' @export
 setMethod("getDominant", signature = c(x = "SummarizedExperiment"),
-    function(x, assay.type = assay_name, assay_name = "counts", 
-             rank = NULL, other.name = "Other", n = NULL, complete = TRUE, ...){
+    function(x, assay.type = assay_name, assay_name = "counts", group = rank,
+            rank = NULL, other.name = "Other", n = NULL, complete = TRUE, ...){
         # Input check
         # Check assay.type
         .check_assay_present(assay.type, x)
-        # rank check
-        if(!is.null(rank)){
-            if(!.is_a_string(rank)){
-                stop("'rank' must be an single character value.",
-                     call. = FALSE)
+        # group check
+        if(!is.null(group)){
+            if(!.is_a_string(group)){
+                stop("'group' must be an single character value.",
+                    call. = FALSE)
             }
         } 
-        # If "rank" is not NULL, species are aggregated according to the
+        # If "group" is not NULL, species are aggregated according to the
         # taxonomic rank that is specified by user.
-        if (!is.null(rank) && rank %in% taxonomyRanks(x)) {
-            x <- agglomerateByRank(x, rank, ...)
+        if (!is.null(group) && group %in% taxonomyRanks(x)) {
+            x <- agglomerateByRank(x, rank = group, ...)
         # or factor that is specified by user
-        } else if (!is.null(rank)) {
-            x <- agglomerateByVariable(x, by = "rows", f = rank, ...)
+        } else if (!is.null(group)) {
+            x <- agglomerateByVariable(x, by = "rows", group = group, ...)
         }
         # Get assay
         mat <- assay(x, assay.type)
@@ -104,17 +114,20 @@ setMethod("getDominant", signature = c(x = "SummarizedExperiment"),
         # Get rownames based on indices
         taxa <- rownames(mat)[unlist(idx)]
 
-        # If multiple dominant taxa were found, names contain taxa in addition to 
-        # sample name. Names are converted so that they include only sample names.
+        # If multiple dominant taxa were found, names contain taxa in addition
+        # to 
+        # sample name. Names are converted so that they include only sample
+        # names.
         names(taxa) <- rep( names(idx), times = lengths(idx) )
 
         # If individual sample contains multiple dominant taxa (they have equal 
-        # counts) and if complete is FALSE, the an arbitrarily chosen dominant 
+        # counts) and if complete is FALSE, the an arbitrarily chosen dominant
         # taxa is returned
         if( length(taxa)>ncol(x) && !complete){
             # Store order
             order <- unique(names(taxa))
-            # there are multiple dominant taxa in one sample (counts are equal), length
+            # there are multiple dominant taxa in one sample (counts are equal),
+            # length
             # of dominant is greater than rows in colData.
             taxa <- split(taxa, rep(names(taxa), lengths(taxa)) )
             # Order the data
@@ -124,13 +137,15 @@ setMethod("getDominant", signature = c(x = "SummarizedExperiment"),
             # one of them is arbitrarily chosen
             taxa <- lapply(taxa, function(item) {
                         return(sample(item, 1)) })
-            taxa <- unname(sapply(taxa, function (x) {
-                        unlist(x)}))
+            taxa <- unname(unlist(lapply(taxa, function (x) {
+                        unlist(x)})))
             names(taxa) <- names
-            warning("Multiple dominant taxa were found for some samples. Use complete = TRUE for details.", call. = FALSE)
+            warning(
+                "Multiple dominant taxa were found for some samples. ",
+                "Use complete = TRUE for details.", call. = FALSE)
         }
 
-        # Name "Other" the features that are not included in n the most abundant 
+        # Name "Other" the features that are not included in n the most abundant
         # in the data
         if(!is.null(n)){
             flat_taxa <- unlist(taxa, recursive = TRUE)
@@ -157,26 +172,26 @@ setMethod("getDominant", signature = c(x = "SummarizedExperiment"),
 #' @rdname getDominant
 #' @export
 setGeneric("addDominant", signature = c("x"),
-           function(x, name = "dominant_taxa", other.name = "Other", n = NULL, ...)
-               standardGeneric("addDominant"))
+    function(x, name = "dominant_taxa", other.name = "Other", n = NULL, ...)
+    standardGeneric("addDominant"))
 
 #' @rdname getDominant
 #' @export
 setMethod("addDominant", signature = c(x = "SummarizedExperiment"),
     function(x, name = "dominant_taxa", other.name = "Other", n = NULL, 
-             complete = FALSE, ...) {
+            complete = FALSE, ...) {
         # name check
         if(!.is_non_empty_string(name)){
             stop("'name' must be a non-empty single character value.",
-                 call. = FALSE)
+                call. = FALSE)
         }
         # other.name check
         if(!.is_non_empty_string(other.name)){
             stop("'other.name' must be a non-empty single character value.",
-                 call. = FALSE)
+                call. = FALSE)
         }
-        dom.taxa <- getDominant(x, other.name = other.name, n = n, 
-                                              complete = complete, ...)
+        dom.taxa <- getDominant(
+            x, other.name = other.name, n = n, complete = complete, ...)
         # Add list into colData if there are multiple dominant taxa
         if(length(unique(names(dom.taxa))) < length(names(dom.taxa))) {
             # Store order
@@ -200,12 +215,13 @@ setMethod("addDominant", signature = c(x = "SummarizedExperiment"),
         inds <- which(x == "NA")
         if (length(inds) > 0){
             x[inds] <- NA
-            warning(paste("Interpreting NA string as missing value NA. 
-        Removing", length(inds), "entries"), call. = FALSE)
+            warning("Interpreting NA string as missing value NA. Removing",
+                length(inds), "entries", call. = FALSE)
         }
         x <- x[!is.na(x)]
     }
-    # Create a frequency table of unique values of the dominant taxa for each sample
+    # Create a frequency table of unique values of the dominant taxa for each
+    # sample
     s <- rev(sort(table(x)))
     # Include only n the most frequent taxa
     if (!is.null(n)){