From 88362e0fe1e7e2d06e3714f4648fa6064e216bf0 Mon Sep 17 00:00:00 2001 From: Yichen Wang Date: Mon, 24 Jun 2024 15:59:13 -0400 Subject: [PATCH] Bug fix in pseudo-bulk, added calcNMI and ligerToH5AD --- DESCRIPTION | 4 +- NAMESPACE | 2 + NEWS.md | 11 +- R/DEG_marker.R | 446 +++++++++++++++++++++++++++++++++------------ R/clustering.R | 138 +++++++++++++- R/ligerToH5AD.R | 344 ++++++++++++++++++++++++++++++++++ man/calcARI.Rd | 5 + man/calcNMI.Rd | 93 ++++++++++ man/liger-DEG.Rd | 178 +++++++++++++----- man/ligerToH5AD.Rd | 32 ++++ 10 files changed, 1082 insertions(+), 171 deletions(-) create mode 100644 R/ligerToH5AD.R create mode 100644 man/calcNMI.Rd create mode 100644 man/ligerToH5AD.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 7e613df8..0e64d2ab 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rliger -Version: 2.0.2 -Date: 2024-04-26 +Version: 2.0.1.9001 +Date: 2024-06-24 Type: Package Title: Linked Inference of Genomic Experimental Relationships Description: Uses an extension of nonnegative matrix factorization to identify shared and dataset-specific factors. See Welch J, Kozareva V, et al (2019) , and Liu J, Gao C, Sodicoff J, et al (2020) for more details. diff --git a/NAMESPACE b/NAMESPACE index d64830d3..9fc9b6a3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -79,6 +79,7 @@ export(calcARI) export(calcAgreement) export(calcAlignment) export(calcDatasetSpecificity) +export(calcNMI) export(calcPurity) export(cellMeta) export(closeAllH5) @@ -114,6 +115,7 @@ export(ligerCommand) export(ligerMethDataset) export(ligerRNADataset) export(ligerSpatialDataset) +export(ligerToH5AD) export(ligerToSeurat) export(linkGenesAndPeaks) export(louvainCluster) diff --git a/NEWS.md b/NEWS.md index 954e4917..1b36fa93 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,7 @@ ## rliger Next - Standardized H5 IO specification that can be shared with other platforms. - - Will move to use HDF5Array (TENxMatrix, H5ADMatrix)/ or BPCells for backed data representation. + - Will move to use HDF5Array (TENxMatrix, H5ADMatrix)/ or BPCells for backed data representation, depending on easiness, cleanliness and future-sustainability of the implementation and cross-platform interoperability. - Read feature metadata (e.g. id, name, ...) if available; Allow setting "id" as rownames, "name" for visualization. - rawData - coming from the original input, read only (qc filtering should be just stored in the object, no IO) - preprocessing metrics - nUMI, nGene and etc, still go "chunkApply" so the file is read only once @@ -11,15 +11,18 @@ - Allow doing something like `reorganize(ligerObj, variable = "somethingNotDataset")` and resulting in a new liger object with different ligerDataset grouping. - Ability to do downstream analysis on H5 data - Pseudo-bulk should be easy because we are just aggregating cells. - - Wilcoxon might be a bit harder because ranks are calculated per gene but the H5 sparse data is column majored. Might need to find a fast on-disk transposition method. + - Wilcoxon might be a bit harder because ranks are calculated per gene but the H5 sparse data is column majored. Might need to find a fast on-disk transposition method, which would also enhance RcppPlanc performance when running ANLS on H5 data. -## rliger 2.0.2 +## rliger 2.0.1.9001 +- Added `ligerToH5AD()` allowing reticulate/Python free export of liger object to H5AD format +- Changed `runMarkerDEG()` and `runPairwiseDEG()` default method from `"wilcoxon"` to `"pseudoBulk"` +- Fixed `runMarkerDEG(method = "pseudobulk")` bug in creating pseudo-bulk for "the rest of cells" , and optimized error signaling. - Added `plotProportionBox()` for visualizing compositional analysis - Added `plotBarcodeRank()` for basic QC visualization +- Added `calcNMI()` for evaluating clustering results against ground truth - Fixed bug in `calcAlignment()`, `subsetMemLigerDataset()`, `cellMeta()` - Optimized `plotVolcano()` text annotation positioning -- Changed `runMarkerDEG()` and `runPairwiseDEG()` default method from `"wilcoxon"` to `"pseudoBulk"` ## rliger 2.0.1 diff --git a/R/DEG_marker.R b/R/DEG_marker.R index f148e881..7d0ae718 100644 --- a/R/DEG_marker.R +++ b/R/DEG_marker.R @@ -1,11 +1,78 @@ #' @title Find DEG between two groups #' @description Find DEG between two groups. Two methods are supported: -#' \code{"wilcoxon"} and \code{"pseudoBulk"}. Wilcoxon rank sum test is -#' performed on single-cell level, while pseudo-bulk method aggregates cells -#' basing on biological replicates and calls bulk RNAseq DE methods, DESeq2 wald -#' test. When real biological replicates are not available, pseudo replicates -#' can be generated. Please see below for detailed scenario usage. -#' @section Pairwise DEG Scenarios: +#' \code{"pseudoBulk"} and \code{"wilcoxon"}. Pseudo-bulk method aggregates +#' cells basing on biological replicates and calls bulk RNAseq DE methods, +#' DESeq2 wald test, while Wilcoxon rank sum test is performed on single-cell +#' level. +#' +#' While using pseudo-bulk method, it is generally recommended that you have +#' these variables available in your object: +#' +#' \enumerate{ +#' \item{The cell type or cluster labeling, dividing the cells by +#' functionality. This can be obtained from prior study or computed with +#' \code{\link{runCluster}}} +#' \item{The biological replicate labeling, most of the time the +#' \code{"dataset"} variable automatically generated when the +#' \linkS4class{liger} object is created. } +#' \item{The condition labeling that reflects the study design, such as the +#' treatment or disease status for each sample/dataset.} +#' } +#' +#' Please see below for detailed scenarios. +#' +#' @section Using Wilcoxon rank-sum test: +#' Wilcoxon rank-sum test works for each gene and is based on the rank of the +#' expression in each cell. LIGER provides dataset integration but does not +#' "correct" the expression values. Projects with strong batch effects or +#' integrate drastically different modalities should be cautious when using +#' this method. +#' +#' @section Comparing difference between/across cell types: +#' Most of times, people would want to know what cell types are for each cluster +#' after clustering. This can be done with a marker detection method that test +#' each cluster against all the other cells. This can be done with a command +#' like \code{runMarkerDEG(object, conditionBy = "cluster_var")}. When using +#' default pseudo-bulk method, users should additionaly determine the +#' pseudo-bulk setup parameters. If the real biological replicate variable is +#' available, it should be supplied to argument \code{useReplicate}, otherwise, +#' set \code{useReplicate = NULL} and use pseudo-replicate setup. +#' +#' @section Compare between conditions: +#' It is frequently needed to identify the difference between conditions. Users +#' can simply set \code{conditionBy = "condition_var"}. However, most of time, +#' such comparisons should be ideally done in a per-cluster manner. This can be +#' done by setting \code{splitBy = "cluster_var"}. This will run a loop for each +#' cluster, and within the group of cells, compare each condition against all +#' other cells in the cluster. +#' +#' In the scenario when users only need to compare two conditions for each +#' cluster, running \code{runPairwiseDEG(object, groupTest = "condition1", +#' groupCtrl = "condition2", variable1 = "condition_var", +#' splitBy = "cluster_var")} would address the need. +#' +#' For both use case, if pseudo-bulk (default) method is used, users should +#' determine the pseudo-bulk setup parameters as mentioned in the previous +#' section. +#' +#' @section Detailed \code{runMarkerDEG} usage: +#' Marker detection is generally performed in a one vs. rest manner. The +#' grouping of such condition is specified by \code{conditionBy}, which should +#' be a column name in \code{cellMeta}. When \code{splitBy} is specified as +#' another variable name in \code{cellMeta}, the marker detection will be +#' iteratively done for within each level of \code{splitBy} variable. +#' +#' For example, when \code{conditionBy = "celltype"} and \code{splitBy = NULL}, +#' marker detection will be performed by comparing all cells of "celltype_i" +#' against all other cells, and etc. This is analogous to the old version when +#' running \code{runWilcoxon(method = "cluster")}. +#' +#' When \code{conditionBy = "gender"} and \code{splitBy = "leiden_cluster"}, +#' marker detection will be performed by comparing "gender_i" cells from "cluster_j" +#' against other cells from "cluster_j", and etc. This is analogous to the old +#' version when running \code{runWilcoxon(method = "dataset")}. +#' +#' @section Detailed \code{runPairwiseDEG} usage: #' Users can select classes of cells from a variable in \code{cellMeta}. #' \code{variable1} and \code{variable2} are used to specify a column in #' \code{cellMeta}, and \code{groupTest} and \code{groupCtrl} are used to specify @@ -26,6 +93,8 @@ #' @param object A \linkS4class{liger} object, with normalized data available #' @param groupTest,groupCtrl,variable1,variable2 Condition specification. See #' \code{?runPairwiseDEG} section \bold{Pairwise DEG Scenarios} for detail. +#' @param splitBy Name(s) of the variable(s) in \code{cellMeta} to split the +#' comparison. See Details. Default \code{NULL}. #' @param method DEG test method to use. Choose from \code{"pseudoBulk"} or #' \code{"wilcoxon"}. Default \code{"pseudoBulk"} #' @param usePeak Logical. Whether to use peak count instead of gene count. @@ -37,37 +106,94 @@ #' \code{method = "pseudoBulk", useReplicate = NULL}. Default \code{5}. #' @param minCellPerRep Numeric, will not make pseudo-bulk for replicate with #' less than this number of cells. Default \code{10}. +#' @param printDisgnostic Logical. Whether to show more detail when +#' \code{verbose = TRUE}. Default \code{FALSE}. #' @param seed Random seed to use for pseudo-replicate generation. Default #' \code{1}. #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. -#' @return A data.frame with DEG information +#' @return A data.frame with DEG information with the following field: +#' \enumerate{ +#' \item{feature - Gene names} +#' \item{group - Test group name. Multiple tests might be present for each +#' function call. This is the main variable to distinguish the tests. For a +#' pairwise test, a row with a certain group name represents the test result +#' between the this group against the other control group; When split by a +#' variable, it would be presented in "split.group" format, meaning the stats +#' is by comparing the group in the split level against the control group in +#' the same split level. When running marker detection without splitting, +#' a row with group "a" represents the stats of the gene in group "a" against +#' all other cells. When running split marker detection, the group name would +#' be in "split.group" format, meaning the stats is by comparing the group in +#' the split level against all other cells in the same split level.} +#' \item{logFC - Log fold change} +#' \item{pval - P-value} +#' \item{padj - Adjusted p-value} +#' \item{avgExpr - Mean expression in the test group indicated by the "group" +#' field. Only available for wilcoxon tests.} +#' \item{statistic - Wilcoxon rank-sum test statistic. Only available for +#' wilcoxon tests.} +#' \item{auc - Area under the ROC curve. Only available for wilcoxon tests.} +#' \item{pct_in - Percentage of cells in the test group, indicated by the +#' "group" field, that express the feature. Only available for wilcoxon +#' tests.} +#' \item{pct_out - Percentage of cells in the control group or other cells, as +#' explained for the "group" field, that express the feature. Only available +#' for wilcoxon tests.} +#' } #' @rdname liger-DEG #' @export #' @examples +#' pbmc$leiden_cluster <- pbmcPlot$leiden_cluster +#' +#' # Identify cluster markers +#' degStats1 <- runMarkerDEG(pbmc, conditionBy = "leiden_cluster", +#' minCellPerRep = 5) +#' #' # Compare between cluster "0" and cluster "1" -#' degStats <- runPairwiseDEG(pbmcPlot, groupTest = 0, groupCtrl = 1, -#' variable1 = "leiden_cluster") -#' # Compare between all cells from cluster "5" and -#' # all cells from dataset "stim" -#' degStats <- runPairwiseDEG(pbmcPlot, groupTest = "5", groupCtrl = "stim", -#' variable1 = "leiden_cluster", -#' variable2 = "dataset") +#' degStats2 <- runPairwiseDEG(pbmc, groupTest = 0, groupCtrl = 1, +#' variable1 = "leiden_cluster") +#' +#' # Compare "stim" data against "ctrl" data within each cluster +#' degStats3 <- runPairwiseDEG(pbmc, groupTest = "stim", groupCtrl = "ctrl", +#' variable1 = "dataset", +#' splitBy = "leiden_cluster", +#' nPsdRep = 2, minCellPerRep = 4) +#' +#' # Identify dataset markers within each cluster. +#' degStats4 <- runMarkerDEG(pbmc, conditionBy = "dataset", +#' splitBy = "leiden_cluster", nPsdRep = 2, +#' minCellPerRep = 4) runPairwiseDEG <- function( object, groupTest, groupCtrl, variable1 = NULL, variable2 = NULL, + splitBy = NULL, method = c("pseudoBulk", "wilcoxon"), usePeak = FALSE, useReplicate = NULL, nPsdRep = 5, minCellPerRep = 10, + printDisgnostic = FALSE, seed = 1, verbose = getOption("ligerVerbose", TRUE) ) { method <- match.arg(method) + if (!is.null(splitBy)) { + splitVar <- .fetchCellMetaVar(object, splitBy, + checkCategorical = TRUE, drop = TRUE, + droplevels = TRUE) + splitVar <- interaction(splitVar, drop = TRUE) + splitGroup <- lapply(levels(splitVar), function(x) { + which(splitVar == x) + }) + names(splitGroup) <- levels(splitVar) + } else { + splitGroup <- list(seq(ncol(object))) + } + if (is.null(variable1) && is.null(variable2)) { # Directly using cell index groups <- list( @@ -88,8 +214,8 @@ runPairwiseDEG <- function( var2 <- var1 } else { var2 <- .fetchCellMetaVar(object, variable2, - checkCategorical = TRUE, drop = TRUE, - droplevels = TRUE) + checkCategorical = TRUE, drop = TRUE, + droplevels = TRUE) } group2Idx <- which(var2 %in% groupCtrl) group2Name <- paste(groupCtrl, collapse = ".") @@ -98,19 +224,52 @@ runPairwiseDEG <- function( } else { cli::cli_abort("Please see {.code ?runPairwiseDEG} for usage.") } - result <- .runDEG(object, groups = groups, method = method, - usePeak = usePeak, useReplicate = useReplicate, - nPsdRep = nPsdRep, - minCellPerRep = minCellPerRep, seed = seed, - verbose = verbose) - result <- result[result$group == group1Name,] - attributes(result)$meta <- list( - groupTest = groupTest, - variable1 = variable1, - groupCtrl = groupCtrl, - variable2 = variable2 - ) - return(result) + resultList <- list() + for (i in seq_along(splitGroup)) { + splitName <- names(splitGroup)[i] + splitIdx <- splitGroup[[i]] + + if (isTRUE(verbose)) { + if (length(splitGroup) > 1) { + cli::cli_alert_info( + "Running DEG within: {.val {splitName}}" + ) + } + } + + groups.sub <- lapply(groups, function(x) { + intersect(x, splitIdx) + }) + names(groups.sub) <- sapply(names(groups), function(x) { + paste(c(splitName, x), collapse = ".") + }) + if (length(groups.sub[[1]]) == 0) { + cli::cli_alert_warning("No cell selected for group {.val {names(groups.sub)[1]}} when split by {.val {splitBy}} in level {.val {splitName}}. Skipping.") + next + } + if (length(groups.sub[[2]]) == 0) { + cli::cli_alert_warning("No cell selected for group {.val {names(groups.sub)[1]}} when split by {.val {splitBy}} in level {.val {splitName}}. Skipping.") + next + } + result <- .runDEG(object, groups = groups.sub, method = method, + usePeak = usePeak, useReplicate = useReplicate, + nPsdRep = nPsdRep, + minCellPerRep = minCellPerRep, seed = seed, + printDisgnostic = printDisgnostic, + skipTwoGroup = TRUE, + verbose = verbose) + result <- result[result$group == names(groups.sub)[1],] + + # attributes(result)$meta <- list( + # groupTest = groupTest, + # variable1 = variable1, + # groupCtrl = groupCtrl, + # variable2 = variable2 + # ) + resultList[[i]] <- result + } + resultList <- Reduce(rbind, resultList) + return(resultList) } #' @rdname liger-DEG @@ -118,34 +277,8 @@ runPairwiseDEG <- function( #' @param conditionBy \code{cellMeta} variable(s). Marker detection will be #' performed for each level of this variable. Multiple variables will be #' combined. Default \code{NULL} uses default cluster. -#' @param splitBy Split data by \code{cellMeta} variable(s) here and identify -#' markers for \code{conditionBy} within each chunk. Default \code{NULL}. #' @param useDatasets Datasets to perform marker detection within. Default #' \code{NULL} will use all datasets. -#' @section Marker Detection Scenarios: -#' Marker detection is generally performed in a one vs. rest manner. The -#' grouping of such condition is specified by \code{conditionBy}, which should -#' be a column name in \code{cellMeta}. When \code{splitBy} is specified as -#' another variable name in \code{cellMeta}, the marker detection will be -#' iteratively done for within each level of \code{splitBy} variable. -#' -#' For example, when \code{conditionBy = "celltype"} and \code{splitBy = NULL}, -#' marker detection will be performed by comparing all cells of "celltype_i" -#' against all other cells, and etc. -#' -#' When \code{conditionBy = "celltype"} and \code{splitBy = "gender"}, marker -#' detection will be performed by comparing "celltype_i" cells from "gender_j" -#' against other cells from "gender_j", and etc. -#' @examples -#' # Identify markers for each cluster. Equivalent to old version -#' # `runWilcoxon(method = "cluster")` -#' pbmc$leiden_cluster <- pbmcPlot$leiden_cluster -#' markerStats <- runMarkerDEG(pbmc, conditionBy = "leiden_cluster") -#' # Identify dataset markers within each cluster. Equivalent to old version -#' # `runWilcoxon(method = "dataset")`. -#' markerStatsList <- runMarkerDEG(pbmc, conditionBy = "dataset", -#' splitBy = "leiden_cluster", -#' minCellPerRep = 2) runMarkerDEG <- function( object, conditionBy = NULL, @@ -156,6 +289,7 @@ runMarkerDEG <- function( useReplicate = NULL, nPsdRep = 5, minCellPerRep = 10, + printDisgnostic = FALSE, seed = 1, verbose = getOption("ligerVerbose", TRUE) ) { @@ -175,45 +309,35 @@ runMarkerDEG <- function( checkCategorical = TRUE, drop = FALSE, droplevels = TRUE ) splitBy <- interaction(splitBy, drop = TRUE) + groups <- split(allCellIdx, conditionBy) if (nlevels(splitBy) <= 1) { - groups <- split(allCellIdx, conditionBy) result <- .runDEG(object, groups = groups, method = method, usePeak = usePeak, useReplicate = useReplicate, nPsdRep = nPsdRep, minCellPerRep = minCellPerRep, + printDisgnostic = printDisgnostic, + skipTwoGroup = FALSE, seed = seed, verbose = verbose) } else { - result <- list() + resultList <- list() for (i in seq_along(levels(splitBy))) { if (isTRUE(verbose)) { cli::cli_alert_info( - "Running psuedo-bulk DEG on: {.val {levels(splitBy)[i]}}" + "Running DEG within: {.val {levels(splitBy)[i]}}" ) } - - subIdx <- splitBy == levels(splitBy)[i] - subCellIdx <- allCellIdx[subIdx] - groups <- split(subCellIdx, conditionBy[subIdx]) - result[[levels(splitBy)[i]]] <- tryCatch( - { - .runDEG( - object, groups = groups, method = method, - usePeak = usePeak, useReplicate = useReplicate, - nPsdRep = nPsdRep, minCellPerRep = minCellPerRep, - seed = seed, verbose = verbose - ) - }, - error = function(e) { - cli::cli_alert_danger( - "Error when computing on {.val {levels(splitBy)[i]}}: {e$message}" - ) - cli::cli_alert_warning( - "Empty result (NULL) returned for this test." - ) - return(NULL) - } + subIdx <- which(splitBy == levels(splitBy)[i]) + subGroups <- lapply(groups, intersect, subIdx) + names(subGroups) <- paste0(levels(splitBy)[i], '.', names(groups)) + resultList[[levels(splitBy)[i]]] <- .runDEG( + object, groups = subGroups, method = method, + usePeak = usePeak, useReplicate = useReplicate, + nPsdRep = nPsdRep, minCellPerRep = minCellPerRep, + printDisgnostic = printDisgnostic, skipTwoGroup = FALSE, + seed = seed, verbose = verbose ) } + result <- Reduce(rbind, resultList) } return(result) @@ -258,6 +382,8 @@ runWilcoxon <- function( useReplicate = NULL, nPsdRep = 5, minCellPerRep = 10, + printDisgnostic = FALSE, + skipTwoGroup = TRUE, seed = 1, verbose = getOption("ligerVerbose", TRUE) ) { @@ -276,10 +402,12 @@ runWilcoxon <- function( } slot <- .DE.checkDataAvail(object, datasetInvolve, method, usePeak) dataList <- getMatrix(object, slot, datasetInvolve, returnList = TRUE) - features <- Reduce(intersect, lapply(dataList, rownames)) - dataList <- lapply(dataList, function(x) x[features, , drop = FALSE]) + mat <- mergeSparseAll(dataList, mode = "intersection") + # features <- Reduce(intersect, lapply(dataList, rownames)) + # dataList <- lapply(dataList, function(x) x[features, , drop = FALSE]) + + # mat <- Reduce(cbind, dataList) - mat <- Reduce(cbind, dataList) mat <- mat[, allCellBC, drop = FALSE] if (method == "wilcoxon") { cliID <- cli::cli_process_start("Running Wilcoxon rank-sum test") @@ -287,31 +415,93 @@ runWilcoxon <- function( result <- wilcoxauc(mat, var) cli::cli_process_done(id = cliID) } else if (method == "pseudoBulk") { - if (is.null(useReplicate)) { - replicateAnn <- setupPseudoRep(var, nRep = nPsdRep, - seed = seed) - } else { - replicateAnn <- .fetchCellMetaVar( - object, useReplicate, - cellIdx = allCellIdx, - drop = FALSE, - checkCategorical = TRUE, - droplevels = TRUE - ) - replicateAnn$groups <- var + resultList <- list() + if (isTRUE(verbose)) { + if (nlevels(var) == 2) { + cliID <- cli::cli_process_start("Calling pairwise DESeq2 Wald test") + } else if (nlevels(var) > 2) { + if (isFALSE(printDisgnostic)) { + cliID_pb <- cli::cli_progress_bar(name = "DESeq2 Wald test", + total = nlevels(var)) + } + } } - pbs <- makePseudoBulk(mat, replicateAnn, - minCellPerRep = minCellPerRep, - verbose = verbose) - pb <- pbs[[1]] - replicateAnn <- pbs[[2]] - var <- sapply(levels(replicateAnn$groups), function(x) { - nlevels(interaction(replicateAnn[replicateAnn$groups == x, , drop = FALSE], - drop = TRUE)) - }) - var <- factor(rep(names(var), var), levels = names(var)) - result <- .callDESeq2(pb, var, verbose) + for (i in seq_along(levels(var))) { + testName <- levels(var)[i] + if (isTRUE(verbose) && + isTRUE(printDisgnostic) && + nlevels(var) > 2) { + cliID_perLevel <- cli::cli_process_start("Working on {.val {testName}}") + } + tryCatch({ + subVar <- factor(ifelse(var == testName, testName, "others"), + levels = c(testName, "others")) + if (is.null(useReplicate)) { + replicateAnn <- setupPseudoRep(subVar, nRep = nPsdRep, + seed = seed) + } else { + replicateAnn <- .fetchCellMetaVar( + object, useReplicate, + cellIdx = allCellIdx, + drop = FALSE, + checkCategorical = TRUE, + droplevels = TRUE + ) + replicateAnn$groups <- subVar + } + + pbs <- makePseudoBulk(mat, replicateAnn, + minCellPerRep = minCellPerRep, + verbose = printDisgnostic) + pb <- pbs[[1]] + replicateAnn <- pbs[[2]] + subVar <- sapply(levels(replicateAnn$groups), function(x) { + nlevels(interaction(replicateAnn[replicateAnn$groups == x, , drop = FALSE], + drop = TRUE)) + }) + subVar <- factor(rep(names(subVar), subVar), levels = names(subVar)) + resultList[[testName]] <- .callDESeq2(pb, subVar, printDisgnostic) + if (length(levels(var)) <= 2) { + if (isTRUE(verbose)) cli::cli_process_done(id = cliID) + } else { + if (isTRUE(verbose)) { + if (isTRUE(printDisgnostic)) { + cli::cli_process_done(id = cliID_perLevel) + } else { + cli::cli_progress_update(set = i) + } + } + } + }, error = function(e) { + cli::cli_alert_danger( + "Error when computing on {.val {testName}}: {e$message}" + ) + cli::cli_alert_warning( + "Empty result returned for this test." + ) + if (length(levels(var)) <= 2) { + if (isTRUE(verbose)) cli::cli_process_failed(id = cliID) + } else { + if (isTRUE(verbose)) { + if (isTRUE(printDisgnostic)) { + cli::cli_process_failed(id = cliID_perLevel) + } else { + cli::cli_progress_update(set = i, id = cliID_pb) + } + } + } + resultList[[testName]] <- data.frame( + feature = character(0), + group = character(0), + logFC = numeric(0), + pval = numeric(0), + padj = numeric(0) + ) + }) + if (length(levels(var)) <= 2 && isTRUE(skipTwoGroup)) break + } + result <- Reduce(rbind, resultList) } return(result) } @@ -351,11 +541,11 @@ runWilcoxon <- function( setupPseudoRep <- function(groups, nRep = 3, seed = 1) { # The output data.frame should be cell per row by variable per col set.seed(seed) - psdRep <- c() + psdRep <- rep(NA, length(groups)) for (i in seq_along(levels(groups))) { groupSize <- sum(groups == levels(groups)[i]) repVar <- sample(seq_len(groupSize) %% nRep) + 1 + (i - 1)*nRep - psdRep <- c(psdRep, repVar) + psdRep[groups == levels(groups)[i]] <- repVar } return(data.frame( groups = groups, @@ -382,19 +572,27 @@ makePseudoBulk <- function(mat, replicateAnn, minCellPerRep, verbose = TRUE) { } } splitLabel <- interaction(replicateAnn, drop = TRUE) - + repSizeTab <- table(splitLabel) + if (verbose) { + cli::cli_alert_info("Replicate sizes:") + print(repSizeTab) + } labelCounts <- table(splitLabel) ignored <- names(labelCounts)[labelCounts < minCellPerRep] + if (length(ignored) > 0) { + cli::cli_alert_warning( + "Ignoring replicates (size in bracket) with too few cells: {.val {paste0(ignored, ' (', repSizeTab[ignored], ')')}}" + ) + cli::cli_alert_info( + "Consider increase {.field minCellPerRep} to exclude less replicates or/and lower {.field nPsdRep} to generate larger pseudo-replicates." + ) + } keep <- names(labelCounts)[labelCounts >= minCellPerRep] idx <- splitLabel %in% keep splitLabel <- splitLabel[idx, drop = TRUE] mat <- mat[, idx, drop = FALSE] replicateAnn <- replicateAnn[idx, , drop = FALSE] - if (verbose) { - if (length(ignored) > 0) cli::cli_alert_warning("Ignoring replicates with too few cells: {.val {ignored}}") - cli::cli_alert_info("Replicate sizes:") - print(table(splitLabel)) - } + pseudoBulks <- colAggregateSums_sparse(mat, as.integer(splitLabel) - 1, nlevels(splitLabel)) dimnames(pseudoBulks) <- list(rownames(mat), levels(splitLabel)) @@ -405,18 +603,24 @@ makePseudoBulk <- function(mat, replicateAnn, minCellPerRep, verbose = TRUE) { .callDESeq2 <- function(pseudoBulks, groups, verbose = getOption("ligerVerbose", TRUE)) { # DESeq2 workflow - if (isTRUE(verbose)) cliID <- cli::cli_process_start("Calling DESeq2 Wald test") + ## NOTE: DESeq2 wishes that the contrast/control group is the first level ## whereas we required it as the second in upstream input. So we need to ## reverse it here. groups <- stats::relevel(groups, ref = levels(groups)[2]) + groupsDropped <- droplevels(groups) + if (nlevels(groupsDropped) < 2) { + cli::cli_abort("No enough replicates for conditions being compared.") + } ## Now levels(groups)[1] becomes control and levels(groups)[2] becomes ## the test group - des <- DESeq2::DESeqDataSetFromMatrix( - countData = pseudoBulks, - colData = data.frame(groups = groups), - design = stats::formula("~groups") - ) + suppressMessages({ + des <- DESeq2::DESeqDataSetFromMatrix( + countData = pseudoBulks, + colData = data.frame(groups = groups), + design = stats::formula("~groups") + ) + }) des <- DESeq2::DESeq(des, test = "Wald", quiet = !verbose) res <- DESeq2::results(des, contrast = c("groups", levels(groups)[2], levels(groups)[1])) @@ -426,7 +630,7 @@ makePseudoBulk <- function(mat, replicateAnn, minCellPerRep, verbose = TRUE) { res$group <- levels(groups)[2] res <- res[, c(7, 8, 2, 5, 6)] colnames(res) <- c("feature", "group", "logFC", "pval", "padj") - if (isTRUE(verbose)) cli::cli_process_done(id = cliID) + return(res) } diff --git a/R/clustering.R b/R/clustering.R index cb35be3f..263f4ecf 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -442,6 +442,7 @@ calcPurity <- function( #' @export #' @references L. Hubert and P. Arabie (1985) Comparing Partitions, Journal of #' the Classification, 2, pp. 193-218. +#' @returns A numeric scalar of the ARI value #' @examples #' # Assume the true cluster in `pbmcPlot` is "leiden_cluster" #' # generate fake new labeling @@ -459,6 +460,9 @@ calcPurity <- function( #' # evaluated #' calcARI(pbmcPlot, trueCluster = "stim_true_label", #' useCluster = "leiden_cluster", useDatasets = "stim") +#' +#' # Comparison of the same labeling should always yield 1. +#' calcARI(pbmcPlot, trueCluster = "leiden_cluster", useCluster = "leiden_cluster") calcARI <- function( object, trueCluster, @@ -472,7 +476,10 @@ calcARI <- function( # Ensure that trueCluster is a factor object, as long as `cellIdx`, named # by `cellIdx` if (length(trueCluster) == 1) { - trueCluster <- cellMeta(object, trueCluster, useDatasets = useDatasets) + trueCluster <- .fetchCellMetaVar( + object, trueCluster, cellIdx = cellIdx, checkCategorical = TRUE, + drop = TRUE, droplevels = TRUE, returnList = FALSE + ) } else { if (length(trueCluster) != length(cellIdx)) { if (is.null(names(trueCluster))) { @@ -490,7 +497,10 @@ calcARI <- function( } useCluster <- useCluster %||% defaultCluster(object) - useCluster <- cellMeta(object, useCluster, useDatasets = useDatasets) + useCluster <- .fetchCellMetaVar( + object, useCluster, cellIdx = cellIdx, checkCategorical = TRUE, + drop = TRUE, droplevels = TRUE, returnList = FALSE + ) # Copied from mclust package useCluster <- as.vector(useCluster) @@ -508,5 +518,129 @@ calcARI <- function( return(ARI) } +#' Calculate Normalized Mutual Information (NMI) by comparing two cluster +#' labeling variables +#' @description +#' This function aims at calculating the Normalized Mutual Information for the +#' clustering result obtained with LIGER and the external clustering (existing +#' "true" annotation). NMI ranges from 0 to 1, with a score of 0 indicating no +#' agreement between clusterings and 1 indicating perfect agreement. The +#' mathematical definition of NMI is as follows: +#' +#' \deqn{ +#' H(X) = -\sum_{x \in X}P(X=x)\log_2 P(X=x) +#' } +#' \deqn{ +#' H(X|Y) = -\sum_{y \in Y}P(Y=y)\sum_{x \in X}P(X=x|Y=y)\log_2 P(X=x|Y=y) +#' } +#' \deqn{ +#' I(X;Y) = H(X) - H(X|Y) +#' } +#' \deqn{ +#' NMI(X;Y) = \frac{I(X;Y)}{\sqrt{H(X)H(Y)}} +#' } +#' +#' Where \eqn{X} is the cluster variable to be evaluated and \eqn{Y} is the true +#' cluster variable. \eqn{x} and \eqn{y} are the cluster labels in \eqn{X} and +#' \eqn{Y} respectively. \eqn{H} is the entropy and \eqn{I} is the mutual +#' information. +#' +#' The true clustering annotation must be specified as the base line. We suggest +#' setting it to the object cellMeta so that it can be easily used for many +#' other visualization and evaluation functions. +#' +#' The NMI can be calculated for only specified datasets, since true annotation +#' might not be available for all datasets. Evaluation for only one or a few +#' datasets can be done by specifying \code{useDatasets}. If \code{useDatasets} +#' is specified, the argument checking for \code{trueCluster} and +#' \code{useCluster} will be enforced to match the cells in the specified +#' datasets. +#' @inheritParams calcARI +#' @export +#' @return A numeric scalar of the NMI value +#' @examples +#' # Assume the true cluster in `pbmcPlot` is "leiden_cluster" +#' # generate fake new labeling +#' fake <- sample(1:7, ncol(pbmcPlot), replace = TRUE) +#' # Insert into cellMeta +#' pbmcPlot$new <- factor(fake) +#' calcNMI(pbmcPlot, trueCluster = "leiden_cluster", useCluster = "new") +#' +#' # Now assume we got existing base line annotation only for "stim" dataset +#' nStim <- ncol(dataset(pbmcPlot, "stim")) +#' stimTrueLabel <- factor(fake[1:nStim]) +#' # Insert into cellMeta +#' cellMeta(pbmcPlot, "stim_true_label", useDatasets = "stim") <- stimTrueLabel +#' # Assume "leiden_cluster" is the clustering result we got and need to be +#' # evaluated +#' calcNMI(pbmcPlot, trueCluster = "stim_true_label", +#' useCluster = "leiden_cluster", useDatasets = "stim") +#' +#' # Comparison of the same labeling should always yield 1. +#' calcNMI(pbmcPlot, trueCluster = "leiden_cluster", useCluster = "leiden_cluster") +calcNMI <- function( + object, + trueCluster, + useCluster = NULL, + useDatasets = NULL, + verbose = getOption("ligerVerbose", TRUE) +) { + cellIdx <- rownames(cellMeta(object, useDatasets = useDatasets)) + # Ensure that trueCluster is a factor object, as long as `cellIdx`, named + # by `cellIdx` + if (length(trueCluster) == 1) { + trueCluster <- .fetchCellMetaVar( + object, trueCluster, cellIdx = cellIdx, checkCategorical = TRUE, + drop = TRUE, droplevels = TRUE, returnList = FALSE + ) + } else { + if (length(trueCluster) != length(cellIdx)) { + if (is.null(names(trueCluster))) { + cli::cli_abort("Longer/shorter {.var trueCluster} than cells considered requires {.fn names()} to identify matching.") + } + } else { + if (is.null(names(trueCluster))) { + cli::cli_alert_warning( + "Assuming unnamed {.var trueCluster} matches with the cells represented by {.code rownames(cellMeta(object, useDatasets = useDatasets))}." + ) + names(trueCluster) <- cellIdx + } + } + trueCluster <- trueCluster[cellIdx] + } + + useCluster <- useCluster %||% defaultCluster(object) + useCluster <- .fetchCellMetaVar( + object, useCluster, cellIdx = cellIdx, checkCategorical = TRUE, + drop = TRUE, droplevels = TRUE, returnList = FALSE + ) + + # H(X) + H_X <- .calcEntropy(useCluster) + # H(Y) + H_Y <- .calcEntropy(trueCluster) + overlap <- as.matrix(table(useCluster, trueCluster)) + # H(X|Y = y), returned a vector for each Y = y + H_X_Yy <- apply(overlap, 2, function(x) { + probs <- x / sum(x) + probs <- probs[probs > 0] + -sum(probs * log2(probs)) + }) + # P(Y = y), a vector for each Y = y + PY <- table(trueCluster) / length(trueCluster) + # H(X|Y), a scalar + H_X_Y <- sum(PY * H_X_Yy) + # I(X;Y) + I_X_Y <- H_X - H_X_Y + NMI <- I_X_Y / sqrt(H_X*H_Y) + return(NMI) +} +# x - A categorical variable +.calcEntropy <- function(x) { + if (is.factor(x)) x <- droplevels(x) + count <- table(x) + prob <- count / sum(count) + -sum(prob * log2(prob)) +} diff --git a/R/ligerToH5AD.R b/R/ligerToH5AD.R new file mode 100644 index 00000000..58842dd5 --- /dev/null +++ b/R/ligerToH5AD.R @@ -0,0 +1,344 @@ +#' Write liger object to H5AD files +#' @param object A \linkS4class{liger} object +#' @param filename A character string, the path to the H5AD file to be written +#' @param overwrite Logical, whether to overwrite the file if it exists. +#' @param verbose Logical. Whether to show information of the progress. Default +#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. +#' @return No return value, an H5AD file is written to disk +#' @export +#' @examples +#' ligerToH5AD(pbmc, filename = tempfile(fileext = ".h5ad")) +ligerToH5AD <- function( + object, + filename, + # X = c("scaleData", "normData", "rawData"), + # merge = TRUE, + overwrite = FALSE, + verbose = getOption("ligerVerbose", TRUE) +) { + if (file.exists(filename)) { + if (isTRUE(overwrite)) { + file.remove(filename) + } else { + cli::cli_abort("H5AD file exists at {.file {normalizePath(filename)}}") + } + } + + rownames <- "_index" + dfile <- hdf5r::H5File$new( + filename = filename, + mode = ifelse(test = overwrite, yes = "w", no = "w-") + ) + + # Work on expression matrices + Xslot <- NULL + rawSlot <- NULL + if (all(!sapply(scaleData(object), is.null))) { + Xslot <- "scaleData" + Xmat <- mergeSparseAll(scaleData(object)) + } + else if (all(!sapply(normData(object), is.null))) { + Xslot <- "normData" + Xmat <- mergeSparseAll(normData(object)) + } + else if (all(!sapply(rawData(object), is.null))) { + Xslot <- "rawData" + Xmat <- mergeSparseAll(rawData(object)) + } + else { + cli::cli_abort("No data available to be added to {.field X}") + } + if (all(!sapply(rawData(object), is.null)) && + Xslot != "rawData") { + rawSlot <- "rawData" + } + + if (isTRUE(verbose)) { + cliID <- cli::cli_process_start(sprintf("Adding %s to X", Xslot)) + } + .writeMatrixToH5AD(x = Xmat, dfile = dfile, dname = "X") + .H5AD.addEncoding(dfile = dfile, dname = 'X') + if (isTRUE(verbose)) cli::cli_process_done(cliID) + + # Work on var, as per liger object design, we don't provide feature metadata + # for merged dataset for now. + if (isTRUE(verbose)) cliID <- cli::cli_process_start("Adding var") + varDF <- data.frame(name = rownames(Xmat), row.names = rownames(Xmat)) + .writeDataFrameToH5AD(df = varDF, dfile = dfile, dname = "var") + if (isTRUE(verbose)) cli::cli_process_done(cliID) + + # Add raw + if (!is.null(rawSlot)) { + if (isTRUE(verbose)) { + cliID <- cli::cli_process_start(sprintf("Adding %s to raw", rawSlot)) + } + rawMat <- mergeSparseAll(rawData(object)) + dfile$create_group(name = 'raw') + .writeMatrixToH5AD(x = rawMat, dfile = dfile, dname = "raw/X") + .H5AD.addEncoding(dfile = dfile, dname = 'raw/X') + + # Similarly, Add meta.features + rawVarDF <- data.frame(name = rownames(rawMat), + row.names = rownames(rawMat)) + .writeDataFrameToH5AD(df = rawVarDF, dfile = dfile, dname = "raw/var") + } + + # Add cell metadata + if (isTRUE(verbose)) cliID <- cli::cli_process_start("Adding obs") + .writeDataFrameToH5AD( + df = cellMeta(object, as.data.frame = TRUE), + dfile = dfile, + dname = "obs" + ) + if (isTRUE(verbose)) cli::cli_process_done(cliID) + + + # Add dimensional reduction information + obsm <- dfile$create_group(name = 'obsm') + varm <- dfile$create_group(name = 'varm') + reductions <- object@dimReds + h <- getMatrix(object, "H") + if (all(!sapply(h, is.null))) { + H <- Reduce(cbind, h) + H <- t(H) + reductions[["H"]] <- H + } + if (!is.null(object@H.norm)) reductions[["H_norm"]] <- object@H.norm + for (reduc in names(reductions)) { + newname <- paste0("X_", tolower(reduc)) + if (isTRUE(verbose)) { + cliID <- cli::cli_process_start( + sprintf("Adding low-dim representation %s to obsm as %s", + reduc, newname) + ) + } + obsm$create_dataset( + name = newname, + robj = t(reductions[[reduc]]), + dtype = .H5AD.guessDType(reductions[[reduc]]), + chunk_dims = NULL + ) + if (isTRUE(verbose)) cli::cli_process_done(cliID) + } + + if (!is.null(object@W) && + Xslot == "scaleData") { + W <- object@W + if (!identical(rownames(W), rownames(Xmat))) { + cli::cli_alert_info( + "Feature names in {.field W} do not match those in 'X' ({.field {Xslot}}), skipping." + ) + } + if (isTRUE(verbose)) { + cliID <- cli::cli_process_start("Adding factor feature loadings to varm as 'W'") + } + varm$create_dataset( + name = "W", + robj = t(W), + dtype = .H5AD.guessDType(t(W)), + chunk_dims = NULL + ) + if (isTRUE(verbose)) cli::cli_process_done(cliID) + } + dfile$close_all() +} + +.writeMatrixToH5AD <- function( + x, + dfile, + dname +) { + dnamePaths <- unlist(strsplit(dname, '/')) + # Recursively create groups + for (i in seq_along(dnamePaths)) { + search <- paste0(dnamePaths[1:i], collapse = '/') + if (!dfile$exists(name = search)) { + dfile$create_group(name = search) + } + } + dfile[[dname]]$create_dataset( + name = 'data', + robj = x@x, + dtype = .H5AD.guessDType(x@x) + ) + dfile[[dname]]$create_dataset( + name = 'indices', + robj = x@i, + dtype = .H5AD.guessDType(x@i) + ) + dfile[[dname]]$create_dataset( + name = 'indptr', + robj = x@p, + dtype = .H5AD.guessDType(x@p) + ) + dfile[[dname]]$create_attr( + attr_name = 'shape', + robj = rev(dim(x)), + dtype = .H5AD.guessDType(dim(x)) + ) + return(invisible(NULL)) +} + +.writeDataFrameToH5AD <- function( + df, + dfile, + dname +) { + dnamePaths <- unlist(strsplit(dname, '/')) + # Recursively create groups + for (i in seq_along(dnamePaths)) { + search <- paste0(dnamePaths[1:i], collapse = '/') + if (!dfile$exists(name = search)) { + dfile$create_group(name = search) + } + } + # Add index + dfile[[dname]]$create_dataset( + name = '_index', + robj = rownames(df), + dtype = .H5AD.guessDType(rownames(df)) + ) + dfile[[dname]]$create_attr( + attr_name = '_index', + robj = '_index', + dtype = .H5AD.guessDType('index'), + space = hdf5r::H5S$new(type = "scalar") + ) + # Add columns + for (i in colnames(df)) { + if (is.factor(df[[i]])) { + dfile[[dname]]$create_dataset( + name = i, + robj = as.integer(df[[i]]) - 1L, + dtype = .H5AD.guessDType(as.integer(df[[i]]) - 1L) + ) + if (!dfile[[dname]]$exists(name = '__categories')) { + dfile[[dname]]$create_group(name = '__categories') + } + dfile[[dname]][['__categories']]$create_dataset( + name = i, + robj = levels(df[[i]]), + dtype = .H5AD.guessDType(levels(df[[i]])), + chunk_dims = NULL + ) + dfile[[dname]][['__categories']][[i]]$create_attr( + attr_name = "ordered", + robj = FALSE, + dtype = .H5AD.guessDType(FALSE) + ) + ref <- .H5.create_reference(dfile[[dname]][['__categories']][[i]]) + dfile[[dname]][[i]]$create_attr( + attr_name = 'categories', + robj = ref, + dtype = .H5AD.guessDType(ref), + space = hdf5r::H5S$new(type = "scalar") + ) + } else { + dfile[[dname]]$create_dataset( + name = i, + robj = df[[i]], + dtype = .H5AD.guessDType(df[[i]]) + ) + } + } + dfile[[dname]]$create_attr( + attr_name = 'column-order', + robj = colnames(df), + dtype = .H5AD.guessDType(colnames(df)) + ) + encoding.info <- c('type' = 'dataframe', 'version' = '0.1.0') + names(encoding.info) <- paste0('encoding-', names(encoding.info)) + for (i in seq_along(encoding.info)) { + attr.name <- names(encoding.info)[i] + attr.value <- encoding.info[i] + if (dfile[[dname]]$attr_exists(attr_name = attr.name)) { + dfile[[dname]]$attr_delete(attr_name = attr.name) + } + dfile[[dname]]$create_attr( + attr_name = attr.name, + robj = attr.value, + dtype = .H5AD.guessDType(attr.value), + space = hdf5r::H5S$new(type = "scalar") + ) + } + return(invisible(NULL)) +} + +# self - Only support an H5D object for now +.H5.create_reference <- function(self, ...) { + space <- self$get_space() + do.call("[", c(list(space), list(...))) + ref_type <- hdf5r::h5const$H5R_OBJECT + ref_obj <- hdf5r::H5R_OBJECT$new(1, self) + res <- .Call("R_H5Rcreate", ref_obj$ref, self$id, ".", ref_type, + space$id, FALSE, PACKAGE = "hdf5r") + if (res$return_val < 0) { + stop("Error creating object reference") + } + ref_obj$ref <- res$ref + return(ref_obj) +} + +.H5AD.addEncoding <- function(dfile, dname) { + encoding.info <- c('type' = 'csr_matrix', 'version' = '0.1.0') + names(encoding.info) <- paste0('encoding-', names(encoding.info)) + if (inherits(dfile[[dname]], 'H5Group')) { + for (i in seq_along(encoding.info)) { + attr.name <- names(encoding.info)[i] + attr.value <- encoding.info[i] + if (dfile[[dname]]$attr_exists(attr_name = attr.name)) { + dfile[[dname]]$attr_delete(attr_name = attr.name) + } + dfile[[dname]]$create_attr( + attr_name = attr.name, + robj = attr.value, + dtype = .H5AD.guessDType(x = attr.value), + space = hdf5r::H5S$new(type = "scalar") + ) + } + } + return(invisible(NULL)) +} + +.H5AD.guessDType <- function(x, stype = "utf8", ...) { + dtype <- hdf5r::guess_dtype(x = x, ...) + if (inherits(dtype, "H5T_STRING")) { + dtype <- .H5AD.stringType(stype = stype) + } + else if (inherits(dtype, "H5T_COMPOUND")) { + cpd.dtypes <- dtype$get_cpd_types() + for (i in seq_along(cpd.dtypes)) { + if (inherits(cpd.dtypes[[i]], "H5T_STRING")) { + cpd.dtypes[[i]] <- .H5AD.stringType(stype = stype) + } + } + dtype <- hdf5r::H5T_COMPOUND$new( + labels = dtype$get_cpd_labels(), + dtypes = cpd.dtypes, + size = dtype$get_size() + ) + } + else if (inherits(dtype, "H5T_LOGICAL")) { + dtype <- hdf5r::guess_dtype(x = .boolToInt(x = x), ...) + } + return(dtype) +} + +.H5AD.stringType <- function(stype = c("utf8", "ascii7")) +{ + stype <- match.arg(arg = stype) + switch( + EXPR = stype, + utf8 = hdf5r::H5T_STRING$new(size = Inf)$set_cset( + cset = hdf5r::h5const$H5T_CSET_UTF8 + ), + ascii7 = hdf5r::H5T_STRING$new(size = 7L) + ) +} + +.boolToInt <- function(x) { + x <- as.integer(x) + x[is.na(x)] <- 2L + return(x) +} + diff --git a/man/calcARI.Rd b/man/calcARI.Rd index 5c199317..1d47221d 100644 --- a/man/calcARI.Rd +++ b/man/calcARI.Rd @@ -36,6 +36,8 @@ calculation. Default \code{NULL} uses all datasets.} \value{ A numeric scalar, the ARI of the clustering result indicated by \code{useCluster} compared to \code{trueCluster}. + +A numeric scalar of the ARI value } \description{ This function aims at calculating the adjusted Rand index for the clustering @@ -71,6 +73,9 @@ cellMeta(pbmcPlot, "stim_true_label", useDatasets = "stim") <- stimTrueLabel # evaluated calcARI(pbmcPlot, trueCluster = "stim_true_label", useCluster = "leiden_cluster", useDatasets = "stim") + +# Comparison of the same labeling should always yield 1. +calcARI(pbmcPlot, trueCluster = "leiden_cluster", useCluster = "leiden_cluster") } \references{ L. Hubert and P. Arabie (1985) Comparing Partitions, Journal of diff --git a/man/calcNMI.Rd b/man/calcNMI.Rd new file mode 100644 index 00000000..1c1a3780 --- /dev/null +++ b/man/calcNMI.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clustering.R +\name{calcNMI} +\alias{calcNMI} +\title{Calculate Normalized Mutual Information (NMI) by comparing two cluster +labeling variables} +\usage{ +calcNMI( + object, + trueCluster, + useCluster = NULL, + useDatasets = NULL, + verbose = getOption("ligerVerbose", TRUE) +) +} +\arguments{ +\item{object}{A \linkS4class{liger} object, with the clustering result +present in cellMeta.} + +\item{trueCluster}{Either the name of one variable in \code{cellMeta(object)} +or a factor object with annotation that matches with all cells being +considered.} + +\item{useCluster}{The name of one variable in \code{cellMeta(object)}. +Default \code{NULL} uses default clusters.} + +\item{useDatasets}{A character vector of the names, a numeric or logical +vector of the index of the datasets to be considered for the purity +calculation. Default \code{NULL} uses all datasets.} + +\item{verbose}{Logical. Whether to show information of the progress. Default +\code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} +} +\value{ +A numeric scalar of the NMI value +} +\description{ +This function aims at calculating the Normalized Mutual Information for the +clustering result obtained with LIGER and the external clustering (existing +"true" annotation). NMI ranges from 0 to 1, with a score of 0 indicating no +agreement between clusterings and 1 indicating perfect agreement. The +mathematical definition of NMI is as follows: + +\deqn{ +H(X) = -\sum_{x \in X}P(X=x)\log_2 P(X=x) +} +\deqn{ +H(X|Y) = -\sum_{y \in Y}P(Y=y)\sum_{x \in X}P(X=x|Y=y)\log_2 P(X=x|Y=y) +} +\deqn{ +I(X;Y) = H(X) - H(X|Y) +} +\deqn{ +NMI(X;Y) = \frac{I(X;Y)}{\sqrt{H(X)H(Y)}} +} + +Where \eqn{X} is the cluster variable to be evaluated and \eqn{Y} is the true +cluster variable. \eqn{x} and \eqn{y} are the cluster labels in \eqn{X} and +\eqn{Y} respectively. \eqn{H} is the entropy and \eqn{I} is the mutual +information. + +The true clustering annotation must be specified as the base line. We suggest +setting it to the object cellMeta so that it can be easily used for many +other visualization and evaluation functions. + +The NMI can be calculated for only specified datasets, since true annotation +might not be available for all datasets. Evaluation for only one or a few +datasets can be done by specifying \code{useDatasets}. If \code{useDatasets} +is specified, the argument checking for \code{trueCluster} and +\code{useCluster} will be enforced to match the cells in the specified +datasets. +} +\examples{ +# Assume the true cluster in `pbmcPlot` is "leiden_cluster" +# generate fake new labeling +fake <- sample(1:7, ncol(pbmcPlot), replace = TRUE) +# Insert into cellMeta +pbmcPlot$new <- factor(fake) +calcNMI(pbmcPlot, trueCluster = "leiden_cluster", useCluster = "new") + +# Now assume we got existing base line annotation only for "stim" dataset +nStim <- ncol(dataset(pbmcPlot, "stim")) +stimTrueLabel <- factor(fake[1:nStim]) +# Insert into cellMeta +cellMeta(pbmcPlot, "stim_true_label", useDatasets = "stim") <- stimTrueLabel +# Assume "leiden_cluster" is the clustering result we got and need to be +# evaluated +calcNMI(pbmcPlot, trueCluster = "stim_true_label", + useCluster = "leiden_cluster", useDatasets = "stim") + +# Comparison of the same labeling should always yield 1. +calcNMI(pbmcPlot, trueCluster = "leiden_cluster", useCluster = "leiden_cluster") +} diff --git a/man/liger-DEG.Rd b/man/liger-DEG.Rd index 8e8ae0d9..f865c13e 100644 --- a/man/liger-DEG.Rd +++ b/man/liger-DEG.Rd @@ -12,11 +12,13 @@ runPairwiseDEG( groupCtrl, variable1 = NULL, variable2 = NULL, + splitBy = NULL, method = c("pseudoBulk", "wilcoxon"), usePeak = FALSE, useReplicate = NULL, nPsdRep = 5, minCellPerRep = 10, + printDisgnostic = FALSE, seed = 1, verbose = getOption("ligerVerbose", TRUE) ) @@ -31,6 +33,7 @@ runMarkerDEG( useReplicate = NULL, nPsdRep = 5, minCellPerRep = 10, + printDisgnostic = FALSE, seed = 1, verbose = getOption("ligerVerbose", TRUE) ) @@ -47,6 +50,9 @@ runWilcoxon( \item{groupTest, groupCtrl, variable1, variable2}{Condition specification. See \code{?runPairwiseDEG} section \bold{Pairwise DEG Scenarios} for detail.} +\item{splitBy}{Name(s) of the variable(s) in \code{cellMeta} to split the +comparison. See Details. Default \code{NULL}.} + \item{method}{DEG test method to use. Choose from \code{"pseudoBulk"} or \code{"wilcoxon"}. Default \code{"pseudoBulk"}} @@ -63,6 +69,9 @@ will create \code{nPsdRep} pseudo replicates per group.} \item{minCellPerRep}{Numeric, will not make pseudo-bulk for replicate with less than this number of cells. Default \code{10}.} +\item{printDisgnostic}{Logical. Whether to show more detail when +\code{verbose = TRUE}. Default \code{FALSE}.} + \item{seed}{Random seed to use for pseudo-replicate generation. Default \code{1}.} @@ -73,9 +82,6 @@ less than this number of cells. Default \code{10}.} performed for each level of this variable. Multiple variables will be combined. Default \code{NULL} uses default cluster.} -\item{splitBy}{Split data by \code{cellMeta} variable(s) here and identify -markers for \code{conditionBy} within each chunk. Default \code{NULL}.} - \item{useDatasets}{Datasets to perform marker detection within. Default \code{NULL} will use all datasets.} @@ -87,17 +93,119 @@ cells, while \code{"datasets"} run within each cluster and compare each dataset against all other datasets.} } \value{ -A data.frame with DEG information +A data.frame with DEG information with the following field: +\enumerate{ + \item{feature - Gene names} + \item{group - Test group name. Multiple tests might be present for each + function call. This is the main variable to distinguish the tests. For a + pairwise test, a row with a certain group name represents the test result + between the this group against the other control group; When split by a + variable, it would be presented in "split.group" format, meaning the stats + is by comparing the group in the split level against the control group in + the same split level. When running marker detection without splitting, + a row with group "a" represents the stats of the gene in group "a" against + all other cells. When running split marker detection, the group name would + be in "split.group" format, meaning the stats is by comparing the group in + the split level against all other cells in the same split level.} + \item{logFC - Log fold change} + \item{pval - P-value} + \item{padj - Adjusted p-value} + \item{avgExpr - Mean expression in the test group indicated by the "group" + field. Only available for wilcoxon tests.} + \item{statistic - Wilcoxon rank-sum test statistic. Only available for + wilcoxon tests.} + \item{auc - Area under the ROC curve. Only available for wilcoxon tests.} + \item{pct_in - Percentage of cells in the test group, indicated by the + "group" field, that express the feature. Only available for wilcoxon + tests.} + \item{pct_out - Percentage of cells in the control group or other cells, as + explained for the "group" field, that express the feature. Only available + for wilcoxon tests.} +} } \description{ Find DEG between two groups. Two methods are supported: -\code{"wilcoxon"} and \code{"pseudoBulk"}. Wilcoxon rank sum test is -performed on single-cell level, while pseudo-bulk method aggregates cells -basing on biological replicates and calls bulk RNAseq DE methods, DESeq2 wald -test. When real biological replicates are not available, pseudo replicates -can be generated. Please see below for detailed scenario usage. +\code{"pseudoBulk"} and \code{"wilcoxon"}. Pseudo-bulk method aggregates +cells basing on biological replicates and calls bulk RNAseq DE methods, +DESeq2 wald test, while Wilcoxon rank sum test is performed on single-cell +level. + +While using pseudo-bulk method, it is generally recommended that you have +these variables available in your object: + +\enumerate{ + \item{The cell type or cluster labeling, dividing the cells by + functionality. This can be obtained from prior study or computed with + \code{\link{runCluster}}} + \item{The biological replicate labeling, most of the time the + \code{"dataset"} variable automatically generated when the + \linkS4class{liger} object is created. } + \item{The condition labeling that reflects the study design, such as the + treatment or disease status for each sample/dataset.} +} + +Please see below for detailed scenarios. +} +\section{Using Wilcoxon rank-sum test}{ + +Wilcoxon rank-sum test works for each gene and is based on the rank of the +expression in each cell. LIGER provides dataset integration but does not +"correct" the expression values. Projects with strong batch effects or +integrate drastically different modalities should be cautious when using +this method. +} + +\section{Comparing difference between/across cell types}{ + +Most of times, people would want to know what cell types are for each cluster +after clustering. This can be done with a marker detection method that test +each cluster against all the other cells. This can be done with a command +like \code{runMarkerDEG(object, conditionBy = "cluster_var")}. When using +default pseudo-bulk method, users should additionaly determine the +pseudo-bulk setup parameters. If the real biological replicate variable is +available, it should be supplied to argument \code{useReplicate}, otherwise, +set \code{useReplicate = NULL} and use pseudo-replicate setup. +} + +\section{Compare between conditions}{ + +It is frequently needed to identify the difference between conditions. Users +can simply set \code{conditionBy = "condition_var"}. However, most of time, +such comparisons should be ideally done in a per-cluster manner. This can be +done by setting \code{splitBy = "cluster_var"}. This will run a loop for each +cluster, and within the group of cells, compare each condition against all +other cells in the cluster. + +In the scenario when users only need to compare two conditions for each +cluster, running \code{runPairwiseDEG(object, groupTest = "condition1", +groupCtrl = "condition2", variable1 = "condition_var", +splitBy = "cluster_var")} would address the need. + +For both use case, if pseudo-bulk (default) method is used, users should +determine the pseudo-bulk setup parameters as mentioned in the previous +section. +} + +\section{Detailed \code{runMarkerDEG} usage}{ + +Marker detection is generally performed in a one vs. rest manner. The +grouping of such condition is specified by \code{conditionBy}, which should +be a column name in \code{cellMeta}. When \code{splitBy} is specified as +another variable name in \code{cellMeta}, the marker detection will be +iteratively done for within each level of \code{splitBy} variable. + +For example, when \code{conditionBy = "celltype"} and \code{splitBy = NULL}, +marker detection will be performed by comparing all cells of "celltype_i" +against all other cells, and etc. This is analogous to the old version when +running \code{runWilcoxon(method = "cluster")}. + +When \code{conditionBy = "gender"} and \code{splitBy = "leiden_cluster"}, +marker detection will be performed by comparing "gender_i" cells from "cluster_j" +against other cells from "cluster_j", and etc. This is analogous to the old +version when running \code{runWilcoxon(method = "dataset")}. } -\section{Pairwise DEG Scenarios}{ + +\section{Detailed \code{runPairwiseDEG} usage}{ Users can select classes of cells from a variable in \code{cellMeta}. \code{variable1} and \code{variable2} are used to specify a column in @@ -118,39 +226,25 @@ When both \code{variable1} and \code{variable2} are missing, \code{groupTest} and \code{groupCtrl} should be valid index of cells in \code{object}. } -\section{Marker Detection Scenarios}{ - -Marker detection is generally performed in a one vs. rest manner. The -grouping of such condition is specified by \code{conditionBy}, which should -be a column name in \code{cellMeta}. When \code{splitBy} is specified as -another variable name in \code{cellMeta}, the marker detection will be -iteratively done for within each level of \code{splitBy} variable. - -For example, when \code{conditionBy = "celltype"} and \code{splitBy = NULL}, -marker detection will be performed by comparing all cells of "celltype_i" -against all other cells, and etc. +\examples{ +pbmc$leiden_cluster <- pbmcPlot$leiden_cluster -When \code{conditionBy = "celltype"} and \code{splitBy = "gender"}, marker -detection will be performed by comparing "celltype_i" cells from "gender_j" -against other cells from "gender_j", and etc. -} +# Identify cluster markers +degStats1 <- runMarkerDEG(pbmc, conditionBy = "leiden_cluster", + minCellPerRep = 5) -\examples{ # Compare between cluster "0" and cluster "1" -degStats <- runPairwiseDEG(pbmcPlot, groupTest = 0, groupCtrl = 1, - variable1 = "leiden_cluster") -# Compare between all cells from cluster "5" and -# all cells from dataset "stim" -degStats <- runPairwiseDEG(pbmcPlot, groupTest = "5", groupCtrl = "stim", - variable1 = "leiden_cluster", - variable2 = "dataset") -# Identify markers for each cluster. Equivalent to old version -# `runWilcoxon(method = "cluster")` -pbmc$leiden_cluster <- pbmcPlot$leiden_cluster -markerStats <- runMarkerDEG(pbmc, conditionBy = "leiden_cluster") -# Identify dataset markers within each cluster. Equivalent to old version -# `runWilcoxon(method = "dataset")`. -markerStatsList <- runMarkerDEG(pbmc, conditionBy = "dataset", - splitBy = "leiden_cluster", - minCellPerRep = 2) +degStats2 <- runPairwiseDEG(pbmc, groupTest = 0, groupCtrl = 1, + variable1 = "leiden_cluster") + +# Compare "stim" data against "ctrl" data within each cluster +degStats3 <- runPairwiseDEG(pbmc, groupTest = "stim", groupCtrl = "ctrl", + variable1 = "dataset", + splitBy = "leiden_cluster", + nPsdRep = 2, minCellPerRep = 4) + +# Identify dataset markers within each cluster. +degStats4 <- runMarkerDEG(pbmc, conditionBy = "dataset", + splitBy = "leiden_cluster", nPsdRep = 2, + minCellPerRep = 4) } diff --git a/man/ligerToH5AD.Rd b/man/ligerToH5AD.Rd new file mode 100644 index 00000000..3e048316 --- /dev/null +++ b/man/ligerToH5AD.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ligerToH5AD.R +\name{ligerToH5AD} +\alias{ligerToH5AD} +\title{Write liger object to H5AD files} +\usage{ +ligerToH5AD( + object, + filename, + overwrite = FALSE, + verbose = getOption("ligerVerbose", TRUE) +) +} +\arguments{ +\item{object}{A \linkS4class{liger} object} + +\item{filename}{A character string, the path to the H5AD file to be written} + +\item{overwrite}{Logical, whether to overwrite the file if it exists.} + +\item{verbose}{Logical. Whether to show information of the progress. Default +\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} +} +\value{ +No return value, an H5AD file is written to disk +} +\description{ +Write liger object to H5AD files +} +\examples{ +ligerToH5AD(pbmc, filename = tempfile(fileext = ".h5ad")) +}