Added SEACells model to calculate the metacells

### pp Module: - Added `max_cells_ratio` and `max_genes_ratio` to control the max threshold in qc of scRNA-seq ### single Module: - Added `SEACells` model to calculate the metacells from scRNA-seq ### space Module: - Added `STAligner` to integrate multi stRNA-seq
Starlitnightly · Jan 3, 2024 · 2bbf9cb · 2bbf9cb
1 parent 53dff3b
commit 2bbf9cb
Show file tree

Hide file tree

Showing 24 changed files with 5,144 additions and 136 deletions.
diff --git a/omicverse/SEACells/Rscripts/chromVAR.R b/omicverse/SEACells/Rscripts/chromVAR.R
@@ -0,0 +1,67 @@
+library(chromVAR)
+library(SummarizedExperiment)
+library(Matrix)
+
+library(BSgenome.Hsapiens.UCSC.hg38)
+
+library(motifmatchr)
+library(chromVARmotifs)
+
+# parse arguments
+argReader = commandArgs(trailingOnly=TRUE)
+base_dir <- argReader[1]
+
+print(paste0("Loading files (peaks.bed, sampling_depth.txt, counts.txt, peak_names.txt and cell_names.txt) from base directory ",base_dir))
+
+peaks <- getPeaks(paste0(base_dir,"peaks.bed"), sort_peaks = TRUE)
+
+sampling_depth <- as.matrix(read.delim(paste0(base_dir, 'sampling_depth.txt'), header=F))
+sampling_depth <- as.numeric(sampling_depth)
+
+print("Loading counts...")
+
+my_counts_matrix <- as.matrix(read.delim(paste0(base_dir, 'counts.txt'), header=F))
+
+row_names <- as.matrix(read.delim(paste0(base_dir, 'peak_names.txt'), header=F))
+rownames(my_counts_matrix) <- row_names
+
+col_names <- as.matrix(read.delim(paste0(base_dir, 'cell_names.txt'), header=F))
+colnames(my_counts_matrix) <- col_names
+
+print("Creating SummarizedExperiment...")
+
+fragment_counts <- SummarizedExperiment(assays =  list(counts = my_counts_matrix),
+                                        rowRanges = peaks,
+                                        colData = sampling_depth)
+
+colnames(colData(fragment_counts)) <- c('depth')
+
+fragment_counts
+
+rm(my_counts_matrix)
+rm(peaks)
+rm(sampling_depth)
+rm(row_names)
+
+
+fragment_counts <- addGCBias(fragment_counts,
+                             genome = BSgenome.Hsapiens.UCSC.hg38)
+
+print("Matching motifs...")
+
+data('human_pwms_v2')
+motifs = human_pwms_v2
+motif_ix <- matchMotifs(motifs, fragment_counts, genome = BSgenome.Hsapiens.UCSC.hg38)
+
+rm(motifs)
+
+print("Computing deviations...")
+
+dev <- computeDeviations(object = fragment_counts, annotations = motif_ix)
+variability <- computeVariability(dev)
+
+write.csv(deviations(dev), paste0(base_dir,"deviations.csv"))
+write.csv(deviationScores(dev) ,paste0(base_dir,"deviationScores.csv"))
+write.csv(variability ,paste0(base_dir,"variability.csv"))
+
+print("Finished writing files.")
diff --git a/omicverse/SEACells/Rscripts/tanay.R b/omicverse/SEACells/Rscripts/tanay.R
@@ -0,0 +1,4 @@
+# Title     : TODO
+# Objective : TODO
+# Created by: sitarapersad
+# Created on: 5/14/21
diff --git a/omicverse/SEACells/__init__.py b/omicverse/SEACells/__init__.py
@@ -0,0 +1,2 @@
+from .core import *
+from .evaluate import *
diff --git a/omicverse/SEACells/accessibility.py b/omicverse/SEACells/accessibility.py
@@ -0,0 +1,114 @@
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+
+def determine_metacell_open_peaks(
+    atac_meta_ad,
+    peak_set=None,
+    low_dim_embedding="X_svd",
+    pval_cutoff=1e-2,
+    read_len=147,
+    n_neighbors=3,
+    n_jobs=1,
+):
+    """Determine the set of peaks that are open in each metacell.
+
+    :param atac_meta_ad: (Anndata) ATAC metacell Anndata created using `prepare_multiome_anndata`
+    :param peak_set: (pd.Series) Subset of peaks to test. All peaks are tested by default
+    :param low_dim_embedding: (str) `atac_meta_ad.obsm` field for nearest neighbor computation
+    :param p_val_cutoff: (float) Nominal p-value cutoff for open peaks
+    :param read_len: (int) Fragment length
+    :param n_jobs: (int) number of jobs for parallel processing
+
+    :atac_meta_ad is modified inplace with `.obsm['OpenPeaks']` indicating the set of open peaks in each metacell
+    """
+    from scipy.stats import multinomial, poisson
+    from sklearn.neighbors import NearestNeighbors
+
+    # Effective genome length for background computaiton
+    eff_genome_length = atac_meta_ad.shape[1] * 5000
+
+    # Set up container
+    if peak_set is None:
+        peak_set = atac_meta_ad.var_names
+    open_peaks = pd.DataFrame(0, index=atac_meta_ad.obs_names, columns=peak_set)
+
+    # Metacell neighbors
+    nbrs = NearestNeighbors(n_neighbors=n_neighbors)
+    nbrs.fit(atac_meta_ad.obsm[low_dim_embedding])
+    meta_nbrs = pd.DataFrame(
+        atac_meta_ad.obs_names.values[
+            nbrs.kneighbors(atac_meta_ad.obsm[low_dim_embedding])[1]
+        ],
+        index=atac_meta_ad.obs_names,
+    )
+
+    for m in tqdm(open_peaks.index):
+        # Boost using local neighbors
+        frag_counts = np.ravel(
+            atac_meta_ad[meta_nbrs.loc[m, :].values, :][:, peak_set].X.sum(axis=0)
+        )
+        frag_distr = frag_counts / np.sum(frag_counts).astype(np.float64)
+
+        # Multinomial distribution
+        while not 0 < np.sum(frag_distr) < 1 - 1e-5:
+            frag_distr = np.absolute(frag_distr - np.finfo(np.float32).epsneg)
+        # Sample from multinomial distribution
+        frag_counts = multinomial.rvs(
+            np.percentile(atac_meta_ad.obs["n_counts"], 100), frag_distr
+        )
+
+        # Compute background poisson distribution
+        total_frags = frag_counts.sum()
+        glambda = (read_len * total_frags) / eff_genome_length
+
+        # Significant peaks
+        cutoff = pval_cutoff / np.sum(frag_counts > 0)
+        open_peaks.loc[m, frag_counts >= poisson.ppf(1 - cutoff, glambda)] = 1
+
+    # Update ATAC Metadata object
+    atac_meta_ad.layers["OpenPeaks"] = open_peaks.values
+
+
+def get_gene_accessibility(
+    atac_meta_ad, gene_peak_cors, gene_set=None, pval_cutoff=1e-1, cor_cutoff=0.1
+):
+    """Gene accessibility scores for each. The score is defined as the fraction of significantly correlated peaks that are open in a given metacell.
+
+    :param atac_meta_ad: (Anndata) ATAC metacell Anndata created using `prepare_multiome_anndata`
+    :param gene_peak_cors: (pd.Series) Output of `get_gene_peak_correlations` function
+    :param gene_set: (pd.Series) Subset of genes to compute accessibility scores. All genes are used by default.
+    :param p_val_cutoff: (float) Nominal p-value cutoff for test of significance of correlation
+    :param cor_cutoff: (float) Correlation cutoff
+
+    :atac_meta_ad is modified inplace with `.obsm['GeneAccessibility']` indicating the gene scores for each metacell
+    """
+    if "OpenPeaks" not in atac_meta_ad.layers.keys():
+        raise Exception(
+            "Run determine_metacell_open_peaks to compute gene accessibility"
+        )
+    open_peaks = pd.DataFrame(
+        atac_meta_ad.layers["OpenPeaks"],
+        index=atac_meta_ad.obs_names,
+        columns=atac_meta_ad.var_names,
+    )
+
+    if gene_set is None:
+        gene_set = gene_peak_cors.index
+
+    # Container
+    gene_accessiblity = pd.DataFrame(-1, index=atac_meta_ad.obs_names, columns=gene_set)
+    for gene in tqdm(gene_set):
+        df = gene_peak_cors[gene]
+
+        # Skip if there no correlated peaks
+        if type(df) is int:
+            continue
+        gene_peaks = df.index[(df["pval"] < pval_cutoff) & (df["cor"] > cor_cutoff)]
+
+        # Identify fraction open
+        s = open_peaks.loc[:, gene_peaks].sum(axis=1)
+        gene_accessiblity.loc[s.index, gene] = s.values / len(gene_peaks)
+
+    atac_meta_ad.obsm["GeneAccessibility"] = gene_accessiblity
diff --git a/omicverse/SEACells/build_graph.py b/omicverse/SEACells/build_graph.py
@@ -0,0 +1,189 @@
+# optimization
+# for parallelizing stuff
+from multiprocessing import cpu_count
+
+import numpy as np
+from joblib import Parallel, delayed
+from scipy.sparse import lil_matrix
+from tqdm.notebook import tqdm
+
+# get number of cores for multiprocessing
+NUM_CORES = cpu_count()
+
+##########################################################
+# Helper functions for parallelizing kernel construction
+##########################################################
+
+
+def kth_neighbor_distance(distances, k, i):
+    """Returns distance to kth nearest neighbor.
+
+    Distances: sparse CSR matrix
+    k: kth nearest neighbor
+    i: index of row
+    .
+    """
+    # convert row to 1D array
+    row_as_array = distances[i, :].toarray().ravel()
+
+    # number of nonzero elements
+    num_nonzero = np.sum(row_as_array > 0)
+
+    # argsort
+    kth_neighbor_idx = np.argsort(np.argsort(-row_as_array)) == num_nonzero - k
+    return np.linalg.norm(row_as_array[kth_neighbor_idx])
+
+
+def rbf_for_row(G, data, median_distances, i):
+    """Helper function for computing radial basis function kernel for each row of the data matrix.
+
+    :param G: (array) KNN graph representing nearest neighbour connections between cells
+    :param data: (array) data matrix between which euclidean distances are computed for RBF
+    :param median_distances: (array) radius for RBF - the median distance between cell and k nearest-neighbours
+    :param i: (int) data row index for which RBF is calculated
+    :return: sparse matrix containing computed RBF for row
+    """
+    # convert row to binary numpy array
+    row_as_array = G[i, :].toarray().ravel()
+
+    # compute distances ||x - y||^2 in PC/original X space
+    numerator = np.sum(np.square(data[i, :] - data), axis=1, keepdims=False)
+
+    # compute radii - median distance is distance to kth nearest neighbor
+    denominator = median_distances[i] * median_distances
+
+    # exp
+    full_row = np.exp(-numerator / denominator)
+
+    # masked row - to contain only indices captured by G matrix
+    masked_row = np.multiply(full_row, row_as_array)
+
+    return lil_matrix(masked_row)
+
+
+##########################################################
+# Archetypal Analysis Metacell Graph
+##########################################################
+
+
+class SEACellGraph:
+    """SEACell graph class."""
+
+    def __init__(self, ad, build_on="X_pca", n_cores: int = -1, verbose: bool = False):
+        """SEACell graph class.
+
+        :param ad: (anndata.AnnData) object containing data for which metacells are computed
+        :param build_on: (str) key corresponding to matrix in ad.obsm which is used to compute kernel for metacells
+                        Typically 'X_pca' for scRNA or 'X_svd' for scATAC
+        :param n_cores: (int) number of cores for multiprocessing. If unspecified, computed automatically as
+                        number of CPU cores
+        :param verbose: (bool) whether or not to suppress verbose program logging
+        """
+        """Initialize model parameters"""
+        # data parameters
+        self.n, self.d = ad.obsm[build_on].shape
+
+        # indices of each point
+        self.indices = np.array(range(self.n))
+
+        # save data
+        self.ad = ad
+        self.build_on = build_on
+
+        self.knn_graph = None
+        self.sym_graph = None
+
+        # number of cores for parallelization
+        if n_cores != -1:
+            self.num_cores = n_cores
+        else:
+            self.num_cores = NUM_CORES
+
+        self.M = None  # similarity matrix
+        self.G = None  # graph
+        self.T = None  # transition matrix
+
+        # model params
+        self.verbose = verbose
+
+    ##############################################################
+    # Methods related to kernel + sim matrix construction
+    ##############################################################
+
+    def rbf(self, k: int = 15, graph_construction="union"):
+        """Initialize adaptive bandwith RBF kernel (as described in C-isomap).
+
+        :param k: (int) number of nearest neighbors for RBF kernel
+        :return: (sparse matrix) constructed RBF kernel
+        """
+        import scanpy as sc
+
+        if self.verbose:
+            print("Computing kNN graph using scanpy NN ...")
+
+        # compute kNN and the distance from each point to its nearest neighbors
+        sc.pp.neighbors(self.ad, use_rep=self.build_on, n_neighbors=k, knn=True)
+        knn_graph_distances = self.ad.obsp["distances"]
+
+        # Binarize distances to get connectivity
+        knn_graph = knn_graph_distances.copy()
+        knn_graph[knn_graph != 0] = 1
+        # Include self as neighbour
+        knn_graph.setdiag(1)
+
+        self.knn_graph = knn_graph
+        if self.verbose:
+            print("Computing radius for adaptive bandwidth kernel...")
+
+            # compute median distance for each point amongst k-nearest neighbors
+        with Parallel(n_jobs=self.num_cores, backend="threading") as parallel:
+            median = k // 2
+            median_distances = parallel(
+                delayed(kth_neighbor_distance)(knn_graph_distances, median, i)
+                for i in tqdm(range(self.n))
+            )
+
+        # convert to numpy array
+        median_distances = np.array(median_distances)
+
+        if self.verbose:
+            print("Making graph symmetric...")
+
+        print(
+            f"Parameter graph_construction = {graph_construction} being used to build KNN graph..."
+        )
+        if graph_construction == "union":
+            sym_graph = (knn_graph + knn_graph.T > 0).astype(float)
+        elif graph_construction in ["intersect", "intersection"]:
+            knn_graph = (knn_graph > 0).astype(float)
+            sym_graph = knn_graph.multiply(knn_graph.T)
+        else:
+            raise ValueError(
+                f"Parameter graph_construction = {graph_construction} is not valid. \
+             Please select `union` or `intersection`"
+            )
+
+        self.sym_graph = sym_graph
+        if self.verbose:
+            print("Computing RBF kernel...")
+
+        with Parallel(n_jobs=self.num_cores, backend="threading") as parallel:
+            similarity_matrix_rows = parallel(
+                delayed(rbf_for_row)(
+                    sym_graph, self.ad.obsm[self.build_on], median_distances, i
+                )
+                for i in tqdm(range(self.n))
+            )
+
+        if self.verbose:
+            print("Building similarity LIL matrix...")
+
+        similarity_matrix = lil_matrix((self.n, self.n))
+        for i in tqdm(range(self.n)):
+            similarity_matrix[i] = similarity_matrix_rows[i]
+
+        if self.verbose:
+            print("Constructing CSR matrix...")
+
+        self.M = (similarity_matrix).tocsr()
+        return self.M