Bug fix in pseudo-bulk, added calcNMI and ligerToH5AD

mvfki · Jun 24, 2024 · 88362e0 · 88362e0
1 parent 16351ec
commit 88362e0
Show file tree

Hide file tree

Showing 10 changed files with 1,082 additions and 171 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rliger
-Version: 2.0.2
-Date: 2024-04-26
+Version: 2.0.1.9001
+Date: 2024-06-24
 Type: Package
 Title: Linked Inference of Genomic Experimental Relationships
 Description: Uses an extension of nonnegative matrix factorization to identify shared and dataset-specific factors. See Welch J, Kozareva V, et al (2019) <doi:10.1016/j.cell.2019.05.006>, and Liu J, Gao C, Sodicoff J, et al (2020) <doi:10.1038/s41596-020-0391-8> for more details.

diff --git a/NAMESPACE b/NAMESPACE
@@ -79,6 +79,7 @@ export(calcARI)
 export(calcAgreement)
 export(calcAlignment)
 export(calcDatasetSpecificity)
+export(calcNMI)
 export(calcPurity)
 export(cellMeta)
 export(closeAllH5)
@@ -114,6 +115,7 @@ export(ligerCommand)
 export(ligerMethDataset)
 export(ligerRNADataset)
 export(ligerSpatialDataset)
+export(ligerToH5AD)
 export(ligerToSeurat)
 export(linkGenesAndPeaks)
 export(louvainCluster)

diff --git a/NEWS.md b/NEWS.md
@@ -1,7 +1,7 @@
 ## rliger Next
 
 - Standardized H5 IO specification that can be shared with other platforms.
-  - Will move to use HDF5Array (TENxMatrix, H5ADMatrix)/ or BPCells for backed data representation.
+  - Will move to use HDF5Array (TENxMatrix, H5ADMatrix)/ or BPCells for backed data representation, depending on easiness, cleanliness and future-sustainability of the implementation and cross-platform interoperability.
   - Read feature metadata (e.g. id, name, ...) if available; Allow setting "id" as rownames, "name" for visualization.
   - rawData - coming from the original input, read only (qc filtering should be just stored in the object, no IO)
   - preprocessing metrics - nUMI, nGene and etc, still go "chunkApply" so the file is read only once
@@ -11,15 +11,18 @@
   - Allow doing something like `reorganize(ligerObj, variable = "somethingNotDataset")` and resulting in a new liger object with different ligerDataset grouping.
 - Ability to do downstream analysis on H5 data
   - Pseudo-bulk should be easy because we are just aggregating cells.
-  - Wilcoxon might be a bit harder because ranks are calculated per gene but the H5 sparse data is column majored. Might need to find a fast on-disk transposition method.
+  - Wilcoxon might be a bit harder because ranks are calculated per gene but the H5 sparse data is column majored. Might need to find a fast on-disk transposition method, which would also enhance RcppPlanc performance when running ANLS on H5 data.
 
-## rliger 2.0.2
+## rliger 2.0.1.9001
 
+- Added `ligerToH5AD()` allowing reticulate/Python free export of liger object to H5AD format
+- Changed `runMarkerDEG()` and `runPairwiseDEG()` default method from `"wilcoxon"` to `"pseudoBulk"`
+- Fixed `runMarkerDEG(method = "pseudobulk")` bug in creating pseudo-bulk for "the rest of cells" , and optimized error signaling.
 - Added `plotProportionBox()` for visualizing compositional analysis
 - Added `plotBarcodeRank()` for basic QC visualization
+- Added `calcNMI()` for evaluating clustering results against ground truth
 - Fixed bug in `calcAlignment()`, `subsetMemLigerDataset()`, `cellMeta()`
 - Optimized `plotVolcano()` text annotation positioning
-- Changed `runMarkerDEG()` and `runPairwiseDEG()` default method from `"wilcoxon"` to `"pseudoBulk"`
 
 ## rliger 2.0.1
 

diff --git a/R/DEG_marker.R b/R/DEG_marker.R
diff --git a/R/clustering.R b/R/clustering.R
@@ -442,6 +442,7 @@ calcPurity <- function(
 #' @export
 #' @references L. Hubert and P. Arabie (1985) Comparing Partitions, Journal of
 #' the Classification, 2, pp. 193-218.
+#' @returns A numeric scalar of the ARI value
 #' @examples
 #' # Assume the true cluster in `pbmcPlot` is "leiden_cluster"
 #' # generate fake new labeling
@@ -459,6 +460,9 @@ calcPurity <- function(
 #' # evaluated
 #' calcARI(pbmcPlot, trueCluster = "stim_true_label",
 #'         useCluster = "leiden_cluster", useDatasets = "stim")
+#'
+#' # Comparison of the same labeling should always yield 1.
+#' calcARI(pbmcPlot, trueCluster = "leiden_cluster", useCluster = "leiden_cluster")
 calcARI <- function(
         object,
         trueCluster,
@@ -472,7 +476,10 @@ calcARI <- function(
     # Ensure that trueCluster is a factor object, as long as `cellIdx`, named
     # by `cellIdx`
     if (length(trueCluster) == 1) {
-        trueCluster <- cellMeta(object, trueCluster, useDatasets = useDatasets)
+        trueCluster <- .fetchCellMetaVar(
+            object, trueCluster, cellIdx = cellIdx, checkCategorical = TRUE,
+            drop = TRUE, droplevels = TRUE, returnList = FALSE
+        )
     } else {
         if (length(trueCluster) != length(cellIdx)) {
             if (is.null(names(trueCluster))) {
@@ -490,7 +497,10 @@ calcARI <- function(
     }
 
     useCluster <- useCluster %||% defaultCluster(object)
-    useCluster <- cellMeta(object, useCluster, useDatasets = useDatasets)
+    useCluster <- .fetchCellMetaVar(
+        object, useCluster, cellIdx = cellIdx, checkCategorical = TRUE,
+        drop = TRUE, droplevels = TRUE, returnList = FALSE
+    )
 
     # Copied from mclust package
     useCluster <- as.vector(useCluster)
@@ -508,5 +518,129 @@ calcARI <- function(
     return(ARI)
 }
 
+#' Calculate Normalized Mutual Information (NMI) by comparing two cluster
+#' labeling variables
+#' @description
+#' This function aims at calculating the Normalized Mutual Information for the
+#' clustering result obtained with LIGER and the external clustering (existing
+#' "true" annotation). NMI ranges from 0 to 1, with a score of 0 indicating no
+#' agreement between clusterings and 1 indicating perfect agreement. The
+#' mathematical definition of NMI is as follows:
+#'
+#' \deqn{
+#' H(X) = -\sum_{x \in X}P(X=x)\log_2 P(X=x)
+#' }
+#' \deqn{
+#' H(X|Y) = -\sum_{y \in Y}P(Y=y)\sum_{x \in X}P(X=x|Y=y)\log_2 P(X=x|Y=y)
+#' }
+#' \deqn{
+#' I(X;Y) = H(X) - H(X|Y)
+#' }
+#' \deqn{
+#' NMI(X;Y) = \frac{I(X;Y)}{\sqrt{H(X)H(Y)}}
+#' }
+#'
+#' Where \eqn{X} is the cluster variable to be evaluated and \eqn{Y} is the true
+#' cluster variable. \eqn{x} and \eqn{y} are the cluster labels in \eqn{X} and
+#' \eqn{Y} respectively. \eqn{H} is the entropy and \eqn{I} is the mutual
+#' information.
+#'
+#' The true clustering annotation must be specified as the base line. We suggest
+#' setting it to the object cellMeta so that it can be easily used for many
+#' other visualization and evaluation functions.
+#'
+#' The NMI can be calculated for only specified datasets, since true annotation
+#' might not be available for all datasets. Evaluation for only one or a few
+#' datasets can be done by specifying \code{useDatasets}. If \code{useDatasets}
+#' is specified, the argument checking for \code{trueCluster} and
+#' \code{useCluster} will be enforced to match the cells in the specified
+#' datasets.
+#' @inheritParams calcARI
+#' @export
+#' @return A numeric scalar of the NMI value
+#' @examples
+#' # Assume the true cluster in `pbmcPlot` is "leiden_cluster"
+#' # generate fake new labeling
+#' fake <- sample(1:7, ncol(pbmcPlot), replace = TRUE)
+#' # Insert into cellMeta
+#' pbmcPlot$new <- factor(fake)
+#' calcNMI(pbmcPlot, trueCluster = "leiden_cluster", useCluster = "new")
+#'
+#' # Now assume we got existing base line annotation only for "stim" dataset
+#' nStim <- ncol(dataset(pbmcPlot, "stim"))
+#' stimTrueLabel <- factor(fake[1:nStim])
+#' # Insert into cellMeta
+#' cellMeta(pbmcPlot, "stim_true_label", useDatasets = "stim") <- stimTrueLabel
+#' # Assume "leiden_cluster" is the clustering result we got and need to be
+#' # evaluated
+#' calcNMI(pbmcPlot, trueCluster = "stim_true_label",
+#'         useCluster = "leiden_cluster", useDatasets = "stim")
+#'
+#' # Comparison of the same labeling should always yield 1.
+#' calcNMI(pbmcPlot, trueCluster = "leiden_cluster", useCluster = "leiden_cluster")
+calcNMI <- function(
+        object,
+        trueCluster,
+        useCluster = NULL,
+        useDatasets = NULL,
+        verbose = getOption("ligerVerbose", TRUE)
+) {
+    cellIdx <- rownames(cellMeta(object, useDatasets = useDatasets))
+    # Ensure that trueCluster is a factor object, as long as `cellIdx`, named
+    # by `cellIdx`
+    if (length(trueCluster) == 1) {
+        trueCluster <- .fetchCellMetaVar(
+            object, trueCluster, cellIdx = cellIdx, checkCategorical = TRUE,
+            drop = TRUE, droplevels = TRUE, returnList = FALSE
+        )
+    } else {
+        if (length(trueCluster) != length(cellIdx)) {
+            if (is.null(names(trueCluster))) {
+                cli::cli_abort("Longer/shorter {.var trueCluster} than cells considered requires {.fn names()} to identify matching.")
+            }
+        } else {
+            if (is.null(names(trueCluster))) {
+                cli::cli_alert_warning(
+                    "Assuming unnamed {.var trueCluster} matches with the cells represented by {.code rownames(cellMeta(object, useDatasets = useDatasets))}."
+                )
+                names(trueCluster) <- cellIdx
+            }
+        }
+        trueCluster <- trueCluster[cellIdx]
+    }
+
+    useCluster <- useCluster %||% defaultCluster(object)
+    useCluster <- .fetchCellMetaVar(
+        object, useCluster, cellIdx = cellIdx, checkCategorical = TRUE,
+        drop = TRUE, droplevels = TRUE, returnList = FALSE
+    )
+
+    # H(X)
+    H_X <- .calcEntropy(useCluster)
+    # H(Y)
+    H_Y <- .calcEntropy(trueCluster)
 
+    overlap <- as.matrix(table(useCluster, trueCluster))
+    # H(X|Y = y), returned a vector for each Y = y
+    H_X_Yy <- apply(overlap, 2, function(x) {
+        probs <- x / sum(x)
+        probs <- probs[probs > 0]
+        -sum(probs * log2(probs))
+    })
+    # P(Y = y), a vector for each Y = y
+    PY <- table(trueCluster) / length(trueCluster)
+    # H(X|Y), a scalar
+    H_X_Y <- sum(PY * H_X_Yy)
+    # I(X;Y)
+    I_X_Y <- H_X - H_X_Y
+    NMI <- I_X_Y / sqrt(H_X*H_Y)
+    return(NMI)
+}
 
+# x - A categorical variable
+.calcEntropy <- function(x) {
+    if (is.factor(x)) x <- droplevels(x)
+    count <- table(x)
+    prob <- count / sum(count)
+    -sum(prob * log2(prob))
+}