From a455b92872393119817d10e17f450238925fea26 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Tue, 23 Apr 2024 10:09:29 +0100 Subject: [PATCH] Improve normalization documentation --- DESCRIPTION | 2 +- NEWS.md | 6 ++++-- R/tree_distance_info.R | 14 ++++++++++---- R/tree_distance_utilities.R | 32 ++++++++++++++++++++++++++++++++ man/NormalizeInfo.Rd | 35 +++++++++++++++++++++++++++++++++++ man/TreeDistance.Rd | 13 +++++++++---- 6 files changed, 91 insertions(+), 11 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8db4a4d04..7f14796b1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: TreeDist Type: Package Title: Calculate and Map Distances Between Phylogenetic Trees -Version: 2.7.0.9001 +Version: 2.7.0.9002 Authors@R: c(person("Martin R.", "Smith", email = "martin.smith@durham.ac.uk", role = c("aut", "cre", "cph", "prg"), diff --git a/NEWS.md b/NEWS.md index bbce4f2c4..e35fb0fd6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,8 @@ -# TreeDist 2.7.0.9001 (development) +# TreeDist 2.7.0.9002 (development) -- Fix dead links in documentation +- Improve documentation of normalization. + +- Fix dead links in documentation. - Fix `KCDiameter.multiPhylo()` for multiple trees. diff --git a/R/tree_distance_info.R b/R/tree_distance_info.R index 8d906e60b..5e3dc841f 100644 --- a/R/tree_distance_info.R +++ b/R/tree_distance_info.R @@ -82,12 +82,16 @@ #' for a compilation of expected values under different metrics for trees with #' up to 200 leaves. #' -#' Alternatively, to scale against the information content or entropy of all -#' splits in the most or least informative tree, use `normalize = `[`pmax`] or -#' [`pmin`] respectively. +#' Alternatively, use `normalize = `[`pmax`] or [`pmin`] to scale against the +#' information content or entropy of all splits in the most (`pmax`) or +#' least (`pmin`) informative tree in each pair. 
#' To calculate the relative similarity against a reference tree that is known #' to be "correct", use `normalize = SplitwiseInfo(trueTree)` (SPI, MSI) or #' `ClusteringEntropy(trueTree)` (MCI). +#' For worked examples, see the internal function [`NormalizeInfo()`], which is +#' called from distance functions with the parameter `how = normalize`. +#' +#' #' #' # Distances between large trees #' @@ -171,7 +175,9 @@ #' SharedPhylogeneticInfo(tree1, tree3) # = 0 #' MutualClusteringInfo(tree1, tree3) # > 0 #' -#' # Converting trees to Splits objects can speed up multiple comparisons +#' # Distance functions internally convert trees to Splits objects. +#' # Pre-conversion can reduce run time if the same trees will feature in +#' # multiple comparisons. #' splits1 <- TreeTools::as.Splits(tree1) #' splits2 <- TreeTools::as.Splits(tree2) #' diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index 7ab16d336..e175146ee 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -409,6 +409,38 @@ CompareAll <- function(x, Func, FUN.VALUE = Func(x[[1]], x[[1]], ...), #' independently (optional) #' @param how Method for normalization #' @param \dots Additional parameters to `InfoInTree()` or `how`. +#' @returns `NormalizeInfo()` returns an object corresponding to the normalized +#' values of `unnormalized`. 
+#' @examples +#' library("TreeTools", quietly = TRUE) +#' pair1 <- c(BalancedTree(9), StarTree(9)) +#' pair2 <- c(BalancedTree(9), PectinateTree(9)) +#' +#' # We'll let the number of nodes define the total information in a tree +#' Nnode(pair1) +#' Nnode(pair2) +#' +#' # Let's normalize a unit distance +#' rawDist <- cbind(c(1, 1), c(1, 1)) +#' +#' # With `Combine = "+"`, the maximum distance is the sum of +#' # the information in each tree +#' denominator <- outer(Nnode(pair1), Nnode(pair2), "+") +#' +#' NormalizeInfo(rawDist, pair1, pair2, InfoInTree = ape::Nnode, Combine = "+") +#' rawDist / denominator +#' +#' +#' # A denominator can be specified manually using `how`: +#' NormalizeInfo(rawDist, pair1, pair2, InfoInTree = ape::Nnode, how = 16) +#' rawDist / 16 +#' +#' +#' # `how` also allows the denominator to be computed from trees: +#' outer(Nnode(pair1), Nnode(pair2), pmin) +#' NormalizeInfo(rawDist, pair1, pair2, InfoInTree = ape::Nnode, how = pmin) +#' rawDist / outer(Nnode(pair1), Nnode(pair2), pmin) +#' #' @keywords internal #' @template MRS #' @importFrom TreeTools KeepTip TipLabels diff --git a/man/NormalizeInfo.Rd b/man/NormalizeInfo.Rd index e9c9903ae..c40d0b587 100644 --- a/man/NormalizeInfo.Rd +++ b/man/NormalizeInfo.Rd @@ -29,8 +29,43 @@ independently (optional)} \item{\dots}{Additional parameters to \code{InfoInTree()} or \code{how}.} } +\value{ +\code{NormalizeInfo()} returns an object corresponding to the normalized +values of \code{unnormalized}. 
+} \description{ Normalize information against total present in both starting trees +} +\examples{ +library("TreeTools", quietly = TRUE) +pair1 <- c(BalancedTree(9), StarTree(9)) +pair2 <- c(BalancedTree(9), PectinateTree(9)) + +# We'll let the number of nodes define the total information in a tree +Nnode(pair1) +Nnode(pair2) + +# Let's normalize a unit distance +rawDist <- cbind(c(1, 1), c(1, 1)) + +# With `Combine = "+"`, the maximum distance is the sum of +# the information in each tree +denominator <- outer(Nnode(pair1), Nnode(pair2), "+") + +NormalizeInfo(rawDist, pair1, pair2, InfoInTree = ape::Nnode, Combine = "+") +rawDist / denominator + + +# A denominator can be specified manually using `how`: +NormalizeInfo(rawDist, pair1, pair2, InfoInTree = ape::Nnode, how = 16) +rawDist / 16 + + +# `how` also allows the denominator to be computed from trees: +outer(Nnode(pair1), Nnode(pair2), pmin) +NormalizeInfo(rawDist, pair1, pair2, InfoInTree = ape::Nnode, how = pmin) +rawDist / outer(Nnode(pair1), Nnode(pair2), pmin) + } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/TreeDistance.Rd b/man/TreeDistance.Rd index b64730ad6..cc0f19646 100644 --- a/man/TreeDistance.Rd +++ b/man/TreeDistance.Rd @@ -218,12 +218,15 @@ be calculated with \code{ExpectedVariation()}; or see package for a compilation of expected values under different metrics for trees with up to 200 leaves. -Alternatively, to scale against the information content or entropy of all -splits in the most or least informative tree, use \verb{normalize = }\code{\link{pmax}} or -\code{\link{pmin}} respectively. +Alternatively, use \verb{normalize = }\code{\link{pmax}} or \code{\link{pmin}} to scale against the +information content or entropy of all splits in the most (\code{pmax}) or +least (\code{pmin}) informative tree in each pair. 
To calculate the relative similarity against a reference tree that is known to be "correct", use \code{normalize = SplitwiseInfo(trueTree)} (SPI, MSI) or \code{ClusteringEntropy(trueTree)} (MCI). +For worked examples, see the internal function \code{\link[=NormalizeInfo]{NormalizeInfo()}}, which is +called from distance functions with the parameter \code{how = normalize}. + } \section{Distances between large trees}{ @@ -282,7 +285,9 @@ ExpectedVariation(tree1, tree2, sample=12)["DifferentPhylogeneticInfo", "Estimat SharedPhylogeneticInfo(tree1, tree3) # = 0 MutualClusteringInfo(tree1, tree3) # > 0 -# Converting trees to Splits objects can speed up multiple comparisons +# Distance functions internally convert trees to Splits objects. +# Pre-conversion can reduce run time if the same trees will feature in +# multiple comparisons. splits1 <- TreeTools::as.Splits(tree1) splits2 <- TreeTools::as.Splits(tree2)