Skip to content

Commit

Permalink
deprecate f in favor of group (#638)
Browse files Browse the repository at this point in the history
Signed-off-by: Daena Rys <[email protected]>
Co-authored-by: TuomasBorman <[email protected]>
Co-authored-by: Tuomas Borman <[email protected]>
  • Loading branch information
3 people authored Oct 1, 2024
1 parent f7bcc66 commit 8cace1b
Show file tree
Hide file tree
Showing 10 changed files with 290 additions and 224 deletions.
1 change: 1 addition & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,4 @@ computation
+ If missing values, give informative error in *RDA/*CCA functions
+ transformAssay can apply transformation to altExp
+ Added CSS transformation
+ In agglomerateByVariable, splitOn and getDominant, use 'group' to specify grouping variable.
50 changes: 31 additions & 19 deletions R/agglomerate.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
#' \item \code{detection}: The threshold value for determining presence
#' or absence. A value in \code{x} must exceed this threshold to be
#' considered present.
#' \item \code{assay.type}: \code{Character scalar}. Specifies the assay used to
#' \item \code{assay.type}: \code{Character scalar}. Specifies the assay
#' used to
#' calculate prevalence. (Default: \code{"counts"})
#' \item \code{prevalence}: Prevalence threshold (in 0 to 1). The
#' required prevalence is strictly greater by default. To include the
Expand All @@ -56,10 +57,11 @@
#' from
#' \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}}
#' is returned. (Default: \code{FALSE})
#' \item \code{archetype} Of each level of \code{f}, which element should
#' \item \code{archetype} Of each level of \code{group}, which element
#' should
#' be regarded as the archetype and metadata in the columns or rows kept,
#' while merging? This can be single integer value or an integer vector
#' of the same length as \code{levels(f)}. (Default:
#' of the same length as \code{levels(group)}. (Default:
#' \code{1L}, which means the first element encountered per
#' factor level will be kept)
#' }
Expand All @@ -78,10 +80,15 @@
#' row-wise / for features ('rows') or column-wise / for samples ('cols').
#' Must be \code{'rows'} or \code{'cols'}.
#'
#' @param f A factor for merging. Must be the same length as
#' \code{nrow(x)/ncol(x)}. Rows/Cols corresponding to the same level will be
#' merged. If \code{length(levels(f)) == nrow(x)/ncol(x)}, \code{x} will be
#' returned unchanged.
#' @param group \code{Character scalar}, \code{character vector} or
#' \code{factor vector}. A column name from \code{rowData(x)} or
#' \code{colData(x)} or alternatively a vector specifying how the merging is
#' performed. If vector, the value must be the same length as
#' \code{nrow(x)/ncol(x)}. Rows/Cols corresponding to the same level will be
#' merged. If \code{length(levels(group)) == nrow(x)/ncol(x)}, \code{x} will be
#' returned unchanged.
#'
#' @param f Deprecated. Use \code{group} instead.
#'
#' @param update.tree \code{Logical scalar}. Should
#' \code{rowTree()} also be merged? (Default: \code{FALSE})
Expand All @@ -90,9 +97,12 @@
#'
#' @details
#'
#' Agglomeration sums up the values of assays at the specified taxonomic level. With
#' certain assays, e.g. those that include binary or negative values, this summing
#' can produce meaningless values. In those cases, consider performing agglomeration
#' Agglomeration sums up the values of assays at the specified taxonomic level.
#' With
#' certain assays, e.g. those that include binary or negative values, this
#' summing
#' can produce meaningless values. In those cases, consider performing
#' agglomeration
#' first, and then applying the transformation afterwards.
#'
#' \code{agglomerateByVariable} works similarly to
Expand Down Expand Up @@ -275,7 +285,7 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),

# merge taxa
x <- agglomerateByVariable(
x, by = "rows", f = tax_factors, na.rm = TRUE, ...)
x, by = "rows", group = tax_factors, na.rm = TRUE, ...)

# "Empty" the values to the right of the rank, using NA_character_.
if( col < length(taxonomyRanks(x)) ){
Expand Down Expand Up @@ -303,10 +313,10 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"),
#' @aliases agglomerateByVariable
#' @export
setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"),
function(x, by, f, ...){
function(x, by, group = f, f, ...){
by <- .check_MARGIN(by)
FUN <- switch(by, .merge_rows, .merge_cols)
x <- FUN(x, f, ...)
x <- FUN(x, group, ...)
return(x)
}
)
Expand All @@ -316,13 +326,14 @@ setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"),
#' @export
setMethod("agglomerateByVariable",
signature = c(x = "TreeSummarizedExperiment"),
function(x, by, f, update.tree = mergeTree, mergeTree = FALSE, ...){
function(x, by, group = f, f, update.tree = mergeTree,
mergeTree = FALSE, ...){
# Check by
by <- .check_MARGIN(by)
# Get function based on by
FUN <- switch(by, .merge_rows_TSE, .merge_cols_TSE)
# Agglomerate
x <- FUN(x, f, update.tree = update.tree, ...)
x <- FUN(x, group, update.tree = update.tree, ...)
return(x)
}
)
Expand All @@ -331,7 +342,8 @@ setMethod("agglomerateByVariable",
#' @importFrom SingleCellExperiment altExp altExp<- altExps<-
#' @export
setMethod("agglomerateByRank", signature = c(x = "SingleCellExperiment"),
function(x, ..., altexp = NULL, altexp.rm = strip_altexp, strip_altexp = TRUE){
function(x, ..., altexp = NULL, altexp.rm = strip_altexp,
strip_altexp = TRUE){
# input check
if(!.is_a_bool(altexp.rm)){
stop("'altexp.rm' mus be TRUE or FALSE.", call. = FALSE)
Expand All @@ -352,8 +364,8 @@ setMethod("agglomerateByRank", signature = c(x = "SingleCellExperiment"),
setMethod(
"agglomerateByRank", signature = c(x = "TreeSummarizedExperiment"),
function(
x, ..., update.tree = agglomerateTree, agglomerate.tree = agglomerateTree,
agglomerateTree = FALSE){
x, ..., update.tree = agglomerateTree,
agglomerate.tree = agglomerateTree, agglomerateTree = FALSE){
# input check
if(!.is_a_bool(update.tree)){
stop("'update.tree' must be TRUE or FALSE.",
Expand Down Expand Up @@ -388,7 +400,7 @@ setMethod(

# This function removes empty rank columns from rowdata. (Those that include
# only NA values)
.remove_NA_cols_from_rowdata <- function(x, empty.ranks.rm = remove_empty_ranks,
.remove_NA_cols_from_rowdata <- function(x, empty.ranks.rm = remove_empty_ranks,
remove_empty_ranks = FALSE, ...){
# Check empty.ranks.rm
if( !.is_a_bool(empty.ranks.rm) ){
Expand Down
114 changes: 65 additions & 49 deletions R/getDominant.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,28 @@
#' @inheritParams getPrevalence
#'
#' @param name \code{Character scalar}. A name for the column of the
#' \code{colData} where results will be stored. (Default: \code{"dominant_taxa"})
#' \code{colData} where results will be stored.
#' (Default: \code{"dominant_taxa"})
#'
#' @param other.name \code{Character scalar}. A name for features that are not
#' included in n the most frequent dominant features in the data. (Default: \code{"Other"})
#' included in n the most frequent dominant features in the data.
#' (Default: \code{"Other"})
#'
#' @param n \code{Numeric scalar}. The number of features that are the most frequent
#' dominant features. Default is NULL, which defaults that each sample is assigned
#' @param group \code{Character scalar}. Defines a group. Must be one of the
#' columns from \code{rowData(x)}. (Default: \code{NULL})
#'
#' @param rank Deprecated. Use \code{group} instead.
#'
#' @param n \code{Numeric scalar}. The number of features that are the most
#' frequent
#' dominant features. Default is NULL, which defaults that each sample is
#' assigned
#' a dominant taxon. (Default: \code{NULL})
#'
#' @param complete \code{Logical scalar}. A value to manage multiple dominant taxa for a sample.
#' @param complete \code{Logical scalar}. A value to manage multiple dominant
#' taxa for a sample.
#' Default for getDominant is TRUE to include all equally dominant taxa
#' for each sample. complete = FALSE samples one taxa for the samples that have
#' for each sample. complete = FALSE samples one taxa for the samples that have
#' multiple.
#' Default for addDominant is FALSE to add a column with only one
#' dominant taxon assigned for each sample into colData. complete = TRUE adds a
Expand All @@ -34,14 +44,14 @@
#' object, and stores the information in the \code{colData}. It is a wrapper for
#' \code{getDominant}.
#'
#' With \code{rank} parameter, it is possible to agglomerate taxa based on
#' taxonomic ranks. E.g. if 'Genus' rank is used, all abundances of same Genus
#' are added together, and those families are returned.
#' See \code{agglomerateByRank()} for additional arguments to deal with
#' With \code{group} parameter, it is possible to agglomerate rows based on
#' groups. If the value is one of the columns in \code{taxonomyRanks()},
#' \code{agglomerateByRank()} is applied. Otherwise,
#' \code{agglomerateByVariable()} is utilized.
#' E.g. if 'Genus' rank is used, all abundances of same Genus
#' are added together, and agglomerated features are returned.
#' See corresponding functions for additional arguments to deal with
#' missing values or special characters.
#' If the \code{rank} is not specifying a taxonomy rank from
#' \code{taxonomyRanks(x)}, the function
#' agglomerates rows with \code{agglomerateByVariable()}.
#'
#' @return \code{getDominant} returns a named character vector \code{x}
#' while \code{addDominant} returns
Expand All @@ -56,44 +66,44 @@
#' x <- GlobalPatterns
#'
#' # Finds the dominant taxa.
#' sim.dom <- getDominant(x, rank="Genus")
#' sim.dom <- getDominant(x, group = "Genus")
#'
#' # Add information to colData
#' x <- addDominant(x, rank = "Genus", name="dominant_genera")
#' x <- addDominant(x, group = "Genus", name ="dominant_genera")
#' colData(x)
NULL

#' @rdname getDominant
#' @export
setGeneric("getDominant",signature = c("x"),
function(x, assay.type = assay_name, assay_name = "counts",
rank = NULL, other.name = "Other", n = NULL,
complete = TRUE, ...)
standardGeneric("getDominant"))
function(x, assay.type = assay_name, assay_name = "counts",
group = rank, rank = NULL, other.name = "Other", n = NULL,
complete = TRUE, ...)
standardGeneric("getDominant"))

#' @rdname getDominant
#' @importFrom IRanges relist
#' @export
setMethod("getDominant", signature = c(x = "SummarizedExperiment"),
function(x, assay.type = assay_name, assay_name = "counts",
rank = NULL, other.name = "Other", n = NULL, complete = TRUE, ...){
function(x, assay.type = assay_name, assay_name = "counts", group = rank,
rank = NULL, other.name = "Other", n = NULL, complete = TRUE, ...){
# Input check
# Check assay.type
.check_assay_present(assay.type, x)
# rank check
if(!is.null(rank)){
if(!.is_a_string(rank)){
stop("'rank' must be an single character value.",
call. = FALSE)
# group check
if(!is.null(group)){
if(!.is_a_string(group)){
stop("'group' must be an single character value.",
call. = FALSE)
}
}
# If "rank" is not NULL, species are aggregated according to the
# If "group" is not NULL, species are aggregated according to the
# taxonomic rank that is specified by user.
if (!is.null(rank) && rank %in% taxonomyRanks(x)) {
x <- agglomerateByRank(x, rank, ...)
if (!is.null(group) && group %in% taxonomyRanks(x)) {
x <- agglomerateByRank(x, rank = group, ...)
# or factor that is specified by user
} else if (!is.null(rank)) {
x <- agglomerateByVariable(x, by = "rows", f = rank, ...)
} else if (!is.null(group)) {
x <- agglomerateByVariable(x, by = "rows", group = group, ...)
}
# Get assay
mat <- assay(x, assay.type)
Expand All @@ -104,17 +114,20 @@ setMethod("getDominant", signature = c(x = "SummarizedExperiment"),
# Get rownames based on indices
taxa <- rownames(mat)[unlist(idx)]

# If multiple dominant taxa were found, names contain taxa in addition to
# sample name. Names are converted so that they include only sample names.
# If multiple dominant taxa were found, names contain taxa in addition
# to
# sample name. Names are converted so that they include only sample
# names.
names(taxa) <- rep( names(idx), times = lengths(idx) )

# If individual sample contains multiple dominant taxa (they have equal
# counts) and if complete is FALSE, the an arbitrarily chosen dominant
# counts) and if complete is FALSE, the an arbitrarily chosen dominant
# taxa is returned
if( length(taxa)>ncol(x) && !complete){
# Store order
order <- unique(names(taxa))
# there are multiple dominant taxa in one sample (counts are equal), length
# there are multiple dominant taxa in one sample (counts are equal),
# length
# of dominant is greater than rows in colData.
taxa <- split(taxa, rep(names(taxa), lengths(taxa)) )
# Order the data
Expand All @@ -124,13 +137,15 @@ setMethod("getDominant", signature = c(x = "SummarizedExperiment"),
# one of them is arbitrarily chosen
taxa <- lapply(taxa, function(item) {
return(sample(item, 1)) })
taxa <- unname(sapply(taxa, function (x) {
unlist(x)}))
taxa <- unname(unlist(lapply(taxa, function (x) {
unlist(x)})))
names(taxa) <- names
warning("Multiple dominant taxa were found for some samples. Use complete = TRUE for details.", call. = FALSE)
warning(
"Multiple dominant taxa were found for some samples. ",
"Use complete = TRUE for details.", call. = FALSE)
}

# Name "Other" the features that are not included in n the most abundant
# Name "Other" the features that are not included in n the most abundant
# in the data
if(!is.null(n)){
flat_taxa <- unlist(taxa, recursive = TRUE)
Expand All @@ -157,26 +172,26 @@ setMethod("getDominant", signature = c(x = "SummarizedExperiment"),
#' @rdname getDominant
#' @export
setGeneric("addDominant", signature = c("x"),
function(x, name = "dominant_taxa", other.name = "Other", n = NULL, ...)
standardGeneric("addDominant"))
function(x, name = "dominant_taxa", other.name = "Other", n = NULL, ...)
standardGeneric("addDominant"))

#' @rdname getDominant
#' @export
setMethod("addDominant", signature = c(x = "SummarizedExperiment"),
function(x, name = "dominant_taxa", other.name = "Other", n = NULL,
complete = FALSE, ...) {
complete = FALSE, ...) {
# name check
if(!.is_non_empty_string(name)){
stop("'name' must be a non-empty single character value.",
call. = FALSE)
call. = FALSE)
}
# other.name check
if(!.is_non_empty_string(other.name)){
stop("'other.name' must be a non-empty single character value.",
call. = FALSE)
call. = FALSE)
}
dom.taxa <- getDominant(x, other.name = other.name, n = n,
complete = complete, ...)
dom.taxa <- getDominant(
x, other.name = other.name, n = n, complete = complete, ...)
# Add list into colData if there are multiple dominant taxa
if(length(unique(names(dom.taxa))) < length(names(dom.taxa))) {
# Store order
Expand All @@ -200,12 +215,13 @@ setMethod("addDominant", signature = c(x = "SummarizedExperiment"),
inds <- which(x == "NA")
if (length(inds) > 0){
x[inds] <- NA
warning(paste("Interpreting NA string as missing value NA.
Removing", length(inds), "entries"), call. = FALSE)
warning("Interpreting NA string as missing value NA. Removing",
length(inds), "entries", call. = FALSE)
}
x <- x[!is.na(x)]
}
# Create a frequency table of unique values of the dominant taxa for each sample
# Create a frequency table of unique values of the dominant taxa for each
# sample
s <- rev(sort(table(x)))
# Include only n the most frequent taxa
if (!is.null(n)){
Expand Down
Loading

0 comments on commit 8cace1b

Please sign in to comment.