From 56cce49dd63ff619274a6cdbaa59e2be3c44e2e8 Mon Sep 17 00:00:00 2001
From: Tuomas Borman <60338854+TuomasBorman@users.noreply.github.com>
Date: Tue, 5 Nov 2024 10:06:58 +0200
Subject: [PATCH] Update tree agglomeration (#656)

---
 DESCRIPTION                       |  2 +-
 NAMESPACE                         |  1 +
 NEWS                              |  3 +++
 R/agglomerate.R                   | 33 +++++++++++++++-----------
 R/getPrevalence.R                 | 39 +++++++++++++++++++++++++++++++
 man/getPrevalence.Rd              |  9 +++++++
 tests/testthat/test-5prevalence.R | 15 +++++++++++-
 7 files changed, 87 insertions(+), 15 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 055ef6702..eeff2a4e9 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: mia
 Type: Package
-Version: 1.15.3
+Version: 1.15.4
 Authors@R:
     c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
              email = "tuomas.v.borman@utu.fi",
diff --git a/NAMESPACE b/NAMESPACE
index 1ad38d083..b8c444f21 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -347,6 +347,7 @@ importFrom(SummarizedExperiment,colData)
 importFrom(SummarizedExperiment,rowData)
 importFrom(SummarizedExperiment,rowRanges)
 importFrom(TreeSummarizedExperiment,changeTree)
+importFrom(TreeSummarizedExperiment,subsetByLeaf)
 importFrom(ape,as.phylo)
 importFrom(ape,bind.tree)
 importFrom(ape,collapse.singles)
diff --git a/NEWS b/NEWS
index 46361a3e8..053a02399 100644
--- a/NEWS
+++ b/NEWS
@@ -156,3 +156,6 @@ computation
 + transformAssay can apply transformation to altExp
 + Added CSS transformation
 + In agglomerateByVariable, splitOn and getDominant, use 'group' to specify grouping variable.
+
+Changes in version 1.15.x
++ subsetBy*: added update.tree argument
diff --git a/R/agglomerate.R b/R/agglomerate.R
index 1527be4ce..bb95f9fe5 100644
--- a/R/agglomerate.R
+++ b/R/agglomerate.R
@@ -23,9 +23,6 @@
 #'   regarded as empty. (Default: \code{c(NA, "", " ", "\t")}). They will be
 #'   removed if \code{na.rm = TRUE} before agglomeration.
 #'
-#' @param update.tree \code{Logical scalar}. Should
-#'   \code{rowTree()} also be agglomerated? (Default: \code{FALSE})
-#'
 #' @param agglomerateTree Deprecated. Use \code{update.tree} instead.
 #' 
 #' @param agglomerate.tree Deprecated. Use \code{update.tree} instead.
@@ -450,16 +447,18 @@ setMethod(
 
 # Agglomerate all rowTrees found in TreeSE object. Get tips that represent
 # rows and remove all others.
+#' @importFrom TreeSummarizedExperiment subsetByLeaf
 .agglomerate_trees <- function(x, by = 1, ...){
     # Get right functions based on direction
     tree_names_FUN <- switch(
         by, "1" = rowTreeNames, "2" = colTreeNames, stop("."))
     links_FUN <- switch(by, "1" = rowLinks, "2" = colLinks, stop("."))
     tree_FUN <- switch(by, "1" = rowTree, "2" = colTree, stop("."))
-    # Get right argument names for changeTree call
+    # Get right argument names for subsetByLeaf call
     args_names <- switch(
-        by, "1" = c("x", "rowTree", "rowNodeLab", "whichRowTree"),
-        "2" = c("x", "colTree", "colNodeLab", "whichColTree"),
+        by,
+        "1" = c("x", "rowLeaf", "whichRowTree", "updateTree"),
+        "2" = c("x", "colLeaf", "whichColTree", "updateTree"),
         stop("."))
     # Get names of trees and links between trees and rows
     tree_names <- tree_names_FUN(x)
@@ -475,11 +474,9 @@ setMethod(
             # Get names of nodes that are preserved
             links_temp <- links_temp[["nodeLab"]]
             # Agglomerate the tree
-            tree <- .prune_tree(tree, links_temp, ...)
-            # Change the tree with agglomerated version
-            args <- list(x, tree, links_temp, name)
+            args <- list(x, links_temp, name, TRUE)
             names(args) <- args_names
-            x <- do.call(changeTree, args)
+            x <- do.call(subsetByLeaf, args)
         }
     }
     return(x)
@@ -507,12 +504,22 @@ setMethod(
         # even after pruning; these rows have still child-nodes that represent
         # other rows.
         # Suppress warning: drop all tips of the tree: returning NULL
-        suppressWarnings(
-            tree <- drop.tip(
+        tree <- tryCatch({
+            drop.tip(
                 tree, remove_tips,
                 trim.internal = FALSE,
                 collapse.singles = FALSE)
-        )
+        }, warning = function(w) {
+            # Do nothing on warning
+        }, error = function(e) {
+            # Try to prune by also pruning internal nodes. Sometimes that is the
+            # case; we need to trim also internal nodes in order to prune
+            # leaf.
+            drop.tip(
+                tree, remove_tips,
+                trim.internal = TRUE,
+                collapse.singles = FALSE)
+        })
         # If all tips were dropped, the result is NULL --> stop loop
         if( is.null(tree) ){
             warning("Pruning resulted to empty tree.", call. = FALSE)
diff --git a/R/getPrevalence.R b/R/getPrevalence.R
index 0496db069..fca790187 100644
--- a/R/getPrevalence.R
+++ b/R/getPrevalence.R
@@ -28,6 +28,9 @@
 #'
 #' @param na.rm \code{Logical scalar}. Should NA values be omitted when calculating
 #' prevalence? (Default: \code{TRUE})
+#' 
+#' @param update.tree \code{Logical scalar}. Should
+#' \code{rowTree()} also be agglomerated? (Default: \code{FALSE})
 #'
 #' @param ... additional arguments
 #' \itemize{
@@ -444,6 +447,24 @@ setMethod("subsetByPrevalent", signature = c(x = "SummarizedExperiment"),
     }
 )
 
+#' @rdname getPrevalence
+#' @export
+setMethod("subsetByPrevalent", signature = c(x = "TreeSummarizedExperiment"),
+    function(x, update.tree = FALSE, ...){
+        # Check that update.tree is logical value
+        if( !.is_a_bool(update.tree) ){
+            stop("'update.tree' must be TRUE or FALSE.", call. = FALSE)
+        }
+        #
+        x <- callNextMethod(x, ...)
+        # Agglomerate tree if specified
+        if( update.tree ){
+            x <- .agglomerate_trees(x, ...)
+        }
+    return(x)
+    }
+)
+
 ############################# subsetByRare #################################
 
 #' @rdname getPrevalence
@@ -462,6 +483,24 @@ setMethod("subsetByRare", signature = c(x = "SummarizedExperiment"),
     }
 )
 
+#' @rdname getPrevalence
+#' @export
+setMethod("subsetByRare", signature = c(x = "TreeSummarizedExperiment"),
+    function(x, update.tree = FALSE, ...){
+        # Check that update.tree is logical value
+        if( !.is_a_bool(update.tree) ){
+            stop("'update.tree' must be TRUE or FALSE.", call. = FALSE)
+        }
+        #
+        x <- callNextMethod(x, ...)
+        # Agglomerate tree if specified
+        if( update.tree ){
+            x <- .agglomerate_trees(x, ...)
+        }
+        return(x)
+    }
+)
+
 ############################# getPrevalentAbundance ############################
 
 #' @rdname getPrevalence
diff --git a/man/getPrevalence.Rd b/man/getPrevalence.Rd
index 11c42860f..370cb3a6d 100644
--- a/man/getPrevalence.Rd
+++ b/man/getPrevalence.Rd
@@ -12,8 +12,10 @@
 \alias{getRare,SummarizedExperiment-method}
 \alias{subsetByPrevalent}
 \alias{subsetByPrevalent,SummarizedExperiment-method}
+\alias{subsetByPrevalent,TreeSummarizedExperiment-method}
 \alias{subsetByRare}
 \alias{subsetByRare,SummarizedExperiment-method}
+\alias{subsetByRare,TreeSummarizedExperiment-method}
 \alias{getPrevalentAbundance}
 \alias{getPrevalentAbundance,ANY-method}
 \alias{getPrevalentAbundance,SummarizedExperiment-method}
@@ -81,10 +83,14 @@ subsetByPrevalent(x, ...)
 
 \S4method{subsetByPrevalent}{SummarizedExperiment}(x, rank = NULL, ...)
 
+\S4method{subsetByPrevalent}{TreeSummarizedExperiment}(x, update.tree = FALSE, ...)
+
 subsetByRare(x, ...)
 
 \S4method{subsetByRare}{SummarizedExperiment}(x, rank = NULL, ...)
 
+\S4method{subsetByRare}{TreeSummarizedExperiment}(x, update.tree = FALSE, ...)
+
 getPrevalentAbundance(
   x,
   assay.type = assay_name,
@@ -148,6 +154,9 @@ calculation. (Default: \code{"counts"})}
 \item{prevalence}{Prevalence threshold (in 0 to 1). The
 required prevalence is strictly greater by default. To include the
 limit, set \code{include.lowest} to \code{TRUE}.}
+
+\item{update.tree}{\code{Logical scalar}. Should
+\code{rowTree()} also be agglomerated? (Default: \code{FALSE})}
 }
 \value{
 \code{subsetPrevalent} and \code{subsetRareFeatures} return subset of
diff --git a/tests/testthat/test-5prevalence.R b/tests/testthat/test-5prevalence.R
index 8d977fd59..a77ea05d2 100644
--- a/tests/testthat/test-5prevalence.R
+++ b/tests/testthat/test-5prevalence.R
@@ -310,6 +310,13 @@ test_that("subsetByPrevalent", {
     alias <- subsetByPrevalent(gp_null, detection=5, prevalence = 0.33, rank = "Phylum")
     alias <- unname(assay(alias, "counts"))
     expect_equal(alias, pr2)
+    
+    # Check that tree subsetting works
+    expect_error(subsetByPrevalent(GlobalPatterns, update.tree = 1))
+    expect_error(subsetByPrevalent(GlobalPatterns, update.tree = NULL))
+    expect_error(subsetByPrevalent(GlobalPatterns, update.tree = c(TRUE, FALSE)))
+    tse_sub <- subsetByPrevalent(GlobalPatterns, prevalence = 0.4, rank = "Genus", update.tree = TRUE)
+    expect_equal(length(rowTree(tse_sub)$tip.label), nrow(tse_sub))
 
 })
 
@@ -371,7 +378,13 @@ test_that("subsetByRare", {
     alias <- subsetByRare(gp_null, detection=5, prevalence = 0.33, rank = "Phylum")
     alias <- unname(assay(alias, "counts"))
     expect_equal(alias, pr2)
-
+    
+    # Check that tree subsetting works
+    expect_error(subsetByRare(GlobalPatterns, update.tree = 1))
+    expect_error(subsetByRare(GlobalPatterns, update.tree = NULL))
+    expect_error(subsetByRare(GlobalPatterns, update.tree = c(TRUE, FALSE)))
+    tse_sub <- subsetByRare(GlobalPatterns, rank = "Class", update.tree = TRUE)
+    expect_equal(length(rowTree(tse_sub)$tip.label), nrow(tse_sub))
 })
 
 test_that("agglomerateByPrevalence", {