From c439d5fa909d0a9beb7d81fc39702e088373efdc Mon Sep 17 00:00:00 2001 From: ktpolanski Date: Fri, 8 Dec 2023 11:40:16 +0000 Subject: [PATCH] add consensus mode in multi cluster qc, document DF input for cellwise QC --- sctk/__init__.py | 2 +- sctk/_pipeline.py | 63 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/sctk/__init__.py b/sctk/__init__.py index 007d681..e78344d 100644 --- a/sctk/__init__.py +++ b/sctk/__init__.py @@ -99,13 +99,13 @@ auto_zoom_in, calculate_qc, cellwise_qc, - cluster_qc_find_resolution, clusterwise_qc, crossmap, custom_pipeline, generate_qc_clusters, get_good_sized_batch, integrate, + multi_resolution_cluster_qc, recluster_subset, simple_default_pipeline, default_metric_params_df, diff --git a/sctk/_pipeline.py b/sctk/_pipeline.py index 2aef3fa..16e6e33 100644 --- a/sctk/_pipeline.py +++ b/sctk/_pipeline.py @@ -393,10 +393,10 @@ def cellwise_qc(adata, metrics=None, cell_qc_key="cell_passed_qc", uns_qc_key="s Args: adata: AnnData object to filter cells from. - metrics: Optional list/tuple of metric names or dictionary of metric - names and their corresponding parameters. If not provided, the function - uses a set of default metrics. For defaults and an explanation, please - refer to the QC workflow demo notebook. + metrics: Optional data frame specifying the parameters for each metric. + If not provided, the function uses a set of default metrics stored + in ``sctk.default_metric_params_df``. For defaults and an + explanation, please refer to the QC workflow demo notebook. cell_qc_key: Obs column in the object to store the per-cell QC calls in. uns_qc_key: Uns key to store the determined QC ranges used in filtering. **kwargs: Additional keyword arguments to pass to the @@ -767,24 +767,33 @@ def clusterwise_qc( ad.obs[key_added] = ad.obs[clus_key].isin(good_qc_clusters) -def cluster_qc_find_resolution(ad, metrics, - failed=True, - res=np.arange(0.1,1.1,0.1), - n_pcs=None, - n_neighbors=None, - threshold=0.5, - clus_key="qc_cluster", - umap_key="X_umap_qc", - cell_qc_key="cell_passed_qc", - key_added="cluster_passed_qc", - ) -> None: +def multi_resolution_cluster_qc(ad, metrics, + failed=True, + res=np.arange(0.1,1.1,0.1), + n_pcs=None, + n_neighbors=None, + threshold=0.5, + clus_key="qc_cluster", + umap_key="X_umap_qc", + cell_qc_key="cell_passed_qc", + key_added="cluster_passed_qc", + consensus_threshold=0.5, + consensus_frac_key="consensus_fraction", + consensus_call_key="consensus_passed_qc" + ) -> None: """ Run ``generate_qc_clusters()`` and ``clusterwise_qc()`` for a number of - potential resolutions. The optimal resolution is determined as the one that - grants the highest Jaccard index between cell-level and cluster-level QC - calls. + potential resolutions. Uses the multiple clusterings to identify a more + robust set of QC calls than that derived from a single resolution. Proposes + two sets of QC calls: - Stores optimal resolution clusters and QC calls in input object. + 1. A clustering resolution is proposed as the one that grants the highest + Jaccard index between cell-level and cluster-level QC calls. + + 2. For each cell, the fraction of tested resolutions where the cell passes + QC is computed. This fraction is then thresholded to get a QC call. + + Stores both sets of QC calls in input object. Args: ad: AnnData object to generate QC clusters for. @@ -809,6 +818,13 @@ def cluster_qc_find_resolution(ad, metrics, cell QC calls from obs in the AnnData. key_added: ``clusterwise_qc()`` argument. Key to use for storing the results in the AnnData obs object. + consensus_threshold: A cell has to pass QC in more than this fraction of + tested resolutions to be flagged as a good QC cell in the consensus + calls. + consensus_frac_key: Key to use for storing the consensus fraction in the + AnnData obs. + consensus_call_key: Key to use for storing the consensus calls + (thresholded consensus fraction) in the AnnData obs. Returns: None. @@ -823,11 +839,13 @@ def cluster_qc_find_resolution(ad, metrics, >>> sctk.calculate_qc(adata) >>> metrics_list = ["n_counts", "n_genes", "percent_mito", "percent_ribo", "percent_hb"] >>> sctk.cellwise_qc(adata) - >>> sctk.cluster_qc_find_resolution(adata, metrics=metrics_list) + >>> sctk.multi_resolution_cluster_qc(adata, metrics=metrics_list) """ # store the qc cluster object into a helper variable, along with the Jaccards we make aux_ad = None jaccards = [] + # initialise consensus count (to become fraction later) column + ad.obs[consensus_frac_key] = 0 for sres in res: # compute the clusters and then get cluster-level QC calls # calling with aux_ad=None in the first pass will work just fine @@ -853,6 +871,8 @@ def cluster_qc_find_resolution(ad, metrics, jaccards.append(np.sum(~ad.obs[cell_qc_key] & ~ad.obs[key_added])/np.sum(~ad.obs[cell_qc_key] | ~ad.obs[key_added])) else: jaccards.append(np.sum(ad.obs[cell_qc_key] & ad.obs[key_added])/np.sum(ad.obs[cell_qc_key] | ad.obs[key_added])) + # tick up the QC passed count for consensus computation + ad.obs[consensus_frac_key] += ad.obs[key_added].astype(int) # find our best Jaccard and set the output object to have the corresponding clustering best_res = res[np.argmax(jaccards)] print("Best overlap found for resolution "+str(best_res)) @@ -871,6 +891,9 @@ def cluster_qc_find_resolution(ad, metrics, cell_qc_key=cell_qc_key, key_added=key_added ) + # compute the consensus fraction and calls + ad.obs[consensus_frac_key] = ad.obs[consensus_frac_key]/len(res) + ad.obs[consensus_call_key] = (ad.obs[consensus_frac_key] > consensus_threshold) def get_good_sized_batch(batches, min_size=10) -> list: