-
-
Notifications
You must be signed in to change notification settings - Fork 67
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added
SEACells
model to calculate the metacells
### pp Module: - Added `max_cells_ratio` and `max_genes_ratio` to control the max threshold in qc of scRNA-seq ### single Module: - Added `SEACells` model to calculate the metacells from scRNA-seq ### space Module: - Added `STAligner` to integrate multi stRNA-seq
- Loading branch information
1 parent
53dff3b
commit 2bbf9cb
Showing
24 changed files
with
5,144 additions
and
136 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
library(chromVAR) | ||
library(SummarizedExperiment) | ||
library(Matrix) | ||
|
||
library(BSgenome.Hsapiens.UCSC.hg38) | ||
|
||
library(motifmatchr) | ||
library(chromVARmotifs) | ||
|
||
# parse arguments | ||
argReader = commandArgs(trailingOnly=TRUE) | ||
base_dir <- argReader[1] | ||
|
||
print(paste0("Loading files (peaks.bed, sampling_depth.txt, counts.txt, peak_names.txt and cell_names.txt) from base directory ",base_dir)) | ||
|
||
peaks <- getPeaks(paste0(base_dir,"peaks.bed"), sort_peaks = TRUE) | ||
|
||
sampling_depth <- as.matrix(read.delim(paste0(base_dir, 'sampling_depth.txt'), header=F)) | ||
sampling_depth <- as.numeric(sampling_depth) | ||
|
||
print("Loading counts...") | ||
|
||
my_counts_matrix <- as.matrix(read.delim(paste0(base_dir, 'counts.txt'), header=F)) | ||
|
||
row_names <- as.matrix(read.delim(paste0(base_dir, 'peak_names.txt'), header=F)) | ||
rownames(my_counts_matrix) <- row_names | ||
|
||
col_names <- as.matrix(read.delim(paste0(base_dir, 'cell_names.txt'), header=F)) | ||
colnames(my_counts_matrix) <- col_names | ||
|
||
print("Creating SummarizedExperiment...") | ||
|
||
fragment_counts <- SummarizedExperiment(assays = list(counts = my_counts_matrix), | ||
rowRanges = peaks, | ||
colData = sampling_depth) | ||
|
||
colnames(colData(fragment_counts)) <- c('depth') | ||
|
||
fragment_counts | ||
|
||
rm(my_counts_matrix) | ||
rm(peaks) | ||
rm(sampling_depth) | ||
rm(row_names) | ||
|
||
|
||
fragment_counts <- addGCBias(fragment_counts, | ||
genome = BSgenome.Hsapiens.UCSC.hg38) | ||
|
||
print("Matching motifs...") | ||
|
||
data('human_pwms_v2') | ||
motifs = human_pwms_v2 | ||
motif_ix <- matchMotifs(motifs, fragment_counts, genome = BSgenome.Hsapiens.UCSC.hg38) | ||
|
||
rm(motifs) | ||
|
||
print("Computing deviations...") | ||
|
||
dev <- computeDeviations(object = fragment_counts, annotations = motif_ix) | ||
variability <- computeVariability(dev) | ||
|
||
write.csv(deviations(dev), paste0(base_dir,"deviations.csv")) | ||
write.csv(deviationScores(dev) ,paste0(base_dir,"deviationScores.csv")) | ||
write.csv(variability ,paste0(base_dir,"variability.csv")) | ||
|
||
print("Finished writing files.") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Title : TODO | ||
# Objective : TODO | ||
# Created by: sitarapersad | ||
# Created on: 5/14/21 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .core import * | ||
from .evaluate import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from tqdm import tqdm | ||
|
||
|
||
def determine_metacell_open_peaks( | ||
atac_meta_ad, | ||
peak_set=None, | ||
low_dim_embedding="X_svd", | ||
pval_cutoff=1e-2, | ||
read_len=147, | ||
n_neighbors=3, | ||
n_jobs=1, | ||
): | ||
"""Determine the set of peaks that are open in each metacell. | ||
:param atac_meta_ad: (Anndata) ATAC metacell Anndata created using `prepare_multiome_anndata` | ||
:param peak_set: (pd.Series) Subset of peaks to test. All peaks are tested by default | ||
:param low_dim_embedding: (str) `atac_meta_ad.obsm` field for nearest neighbor computation | ||
:param p_val_cutoff: (float) Nominal p-value cutoff for open peaks | ||
:param read_len: (int) Fragment length | ||
:param n_jobs: (int) number of jobs for parallel processing | ||
:atac_meta_ad is modified inplace with `.obsm['OpenPeaks']` indicating the set of open peaks in each metacell | ||
""" | ||
from scipy.stats import multinomial, poisson | ||
from sklearn.neighbors import NearestNeighbors | ||
|
||
# Effective genome length for background computaiton | ||
eff_genome_length = atac_meta_ad.shape[1] * 5000 | ||
|
||
# Set up container | ||
if peak_set is None: | ||
peak_set = atac_meta_ad.var_names | ||
open_peaks = pd.DataFrame(0, index=atac_meta_ad.obs_names, columns=peak_set) | ||
|
||
# Metacell neighbors | ||
nbrs = NearestNeighbors(n_neighbors=n_neighbors) | ||
nbrs.fit(atac_meta_ad.obsm[low_dim_embedding]) | ||
meta_nbrs = pd.DataFrame( | ||
atac_meta_ad.obs_names.values[ | ||
nbrs.kneighbors(atac_meta_ad.obsm[low_dim_embedding])[1] | ||
], | ||
index=atac_meta_ad.obs_names, | ||
) | ||
|
||
for m in tqdm(open_peaks.index): | ||
# Boost using local neighbors | ||
frag_counts = np.ravel( | ||
atac_meta_ad[meta_nbrs.loc[m, :].values, :][:, peak_set].X.sum(axis=0) | ||
) | ||
frag_distr = frag_counts / np.sum(frag_counts).astype(np.float64) | ||
|
||
# Multinomial distribution | ||
while not 0 < np.sum(frag_distr) < 1 - 1e-5: | ||
frag_distr = np.absolute(frag_distr - np.finfo(np.float32).epsneg) | ||
# Sample from multinomial distribution | ||
frag_counts = multinomial.rvs( | ||
np.percentile(atac_meta_ad.obs["n_counts"], 100), frag_distr | ||
) | ||
|
||
# Compute background poisson distribution | ||
total_frags = frag_counts.sum() | ||
glambda = (read_len * total_frags) / eff_genome_length | ||
|
||
# Significant peaks | ||
cutoff = pval_cutoff / np.sum(frag_counts > 0) | ||
open_peaks.loc[m, frag_counts >= poisson.ppf(1 - cutoff, glambda)] = 1 | ||
|
||
# Update ATAC Metadata object | ||
atac_meta_ad.layers["OpenPeaks"] = open_peaks.values | ||
|
||
|
||
def get_gene_accessibility( | ||
atac_meta_ad, gene_peak_cors, gene_set=None, pval_cutoff=1e-1, cor_cutoff=0.1 | ||
): | ||
"""Gene accessibility scores for each. The score is defined as the fraction of significantly correlated peaks that are open in a given metacell. | ||
:param atac_meta_ad: (Anndata) ATAC metacell Anndata created using `prepare_multiome_anndata` | ||
:param gene_peak_cors: (pd.Series) Output of `get_gene_peak_correlations` function | ||
:param gene_set: (pd.Series) Subset of genes to compute accessibility scores. All genes are used by default. | ||
:param p_val_cutoff: (float) Nominal p-value cutoff for test of significance of correlation | ||
:param cor_cutoff: (float) Correlation cutoff | ||
:atac_meta_ad is modified inplace with `.obsm['GeneAccessibility']` indicating the gene scores for each metacell | ||
""" | ||
if "OpenPeaks" not in atac_meta_ad.layers.keys(): | ||
raise Exception( | ||
"Run determine_metacell_open_peaks to compute gene accessibility" | ||
) | ||
open_peaks = pd.DataFrame( | ||
atac_meta_ad.layers["OpenPeaks"], | ||
index=atac_meta_ad.obs_names, | ||
columns=atac_meta_ad.var_names, | ||
) | ||
|
||
if gene_set is None: | ||
gene_set = gene_peak_cors.index | ||
|
||
# Container | ||
gene_accessiblity = pd.DataFrame(-1, index=atac_meta_ad.obs_names, columns=gene_set) | ||
for gene in tqdm(gene_set): | ||
df = gene_peak_cors[gene] | ||
|
||
# Skip if there no correlated peaks | ||
if type(df) is int: | ||
continue | ||
gene_peaks = df.index[(df["pval"] < pval_cutoff) & (df["cor"] > cor_cutoff)] | ||
|
||
# Identify fraction open | ||
s = open_peaks.loc[:, gene_peaks].sum(axis=1) | ||
gene_accessiblity.loc[s.index, gene] = s.values / len(gene_peaks) | ||
|
||
atac_meta_ad.obsm["GeneAccessibility"] = gene_accessiblity |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
# optimization | ||
# for parallelizing stuff | ||
from multiprocessing import cpu_count | ||
|
||
import numpy as np | ||
from joblib import Parallel, delayed | ||
from scipy.sparse import lil_matrix | ||
from tqdm.notebook import tqdm | ||
|
||
# get number of cores for multiprocessing | ||
NUM_CORES = cpu_count() | ||
|
||
########################################################## | ||
# Helper functions for parallelizing kernel construction | ||
########################################################## | ||
|
||
|
||
def kth_neighbor_distance(distances, k, i): | ||
"""Returns distance to kth nearest neighbor. | ||
Distances: sparse CSR matrix | ||
k: kth nearest neighbor | ||
i: index of row | ||
. | ||
""" | ||
# convert row to 1D array | ||
row_as_array = distances[i, :].toarray().ravel() | ||
|
||
# number of nonzero elements | ||
num_nonzero = np.sum(row_as_array > 0) | ||
|
||
# argsort | ||
kth_neighbor_idx = np.argsort(np.argsort(-row_as_array)) == num_nonzero - k | ||
return np.linalg.norm(row_as_array[kth_neighbor_idx]) | ||
|
||
|
||
def rbf_for_row(G, data, median_distances, i): | ||
"""Helper function for computing radial basis function kernel for each row of the data matrix. | ||
:param G: (array) KNN graph representing nearest neighbour connections between cells | ||
:param data: (array) data matrix between which euclidean distances are computed for RBF | ||
:param median_distances: (array) radius for RBF - the median distance between cell and k nearest-neighbours | ||
:param i: (int) data row index for which RBF is calculated | ||
:return: sparse matrix containing computed RBF for row | ||
""" | ||
# convert row to binary numpy array | ||
row_as_array = G[i, :].toarray().ravel() | ||
|
||
# compute distances ||x - y||^2 in PC/original X space | ||
numerator = np.sum(np.square(data[i, :] - data), axis=1, keepdims=False) | ||
|
||
# compute radii - median distance is distance to kth nearest neighbor | ||
denominator = median_distances[i] * median_distances | ||
|
||
# exp | ||
full_row = np.exp(-numerator / denominator) | ||
|
||
# masked row - to contain only indices captured by G matrix | ||
masked_row = np.multiply(full_row, row_as_array) | ||
|
||
return lil_matrix(masked_row) | ||
|
||
|
||
########################################################## | ||
# Archetypal Analysis Metacell Graph | ||
########################################################## | ||
|
||
|
||
class SEACellGraph: | ||
"""SEACell graph class.""" | ||
|
||
def __init__(self, ad, build_on="X_pca", n_cores: int = -1, verbose: bool = False): | ||
"""SEACell graph class. | ||
:param ad: (anndata.AnnData) object containing data for which metacells are computed | ||
:param build_on: (str) key corresponding to matrix in ad.obsm which is used to compute kernel for metacells | ||
Typically 'X_pca' for scRNA or 'X_svd' for scATAC | ||
:param n_cores: (int) number of cores for multiprocessing. If unspecified, computed automatically as | ||
number of CPU cores | ||
:param verbose: (bool) whether or not to suppress verbose program logging | ||
""" | ||
"""Initialize model parameters""" | ||
# data parameters | ||
self.n, self.d = ad.obsm[build_on].shape | ||
|
||
# indices of each point | ||
self.indices = np.array(range(self.n)) | ||
|
||
# save data | ||
self.ad = ad | ||
self.build_on = build_on | ||
|
||
self.knn_graph = None | ||
self.sym_graph = None | ||
|
||
# number of cores for parallelization | ||
if n_cores != -1: | ||
self.num_cores = n_cores | ||
else: | ||
self.num_cores = NUM_CORES | ||
|
||
self.M = None # similarity matrix | ||
self.G = None # graph | ||
self.T = None # transition matrix | ||
|
||
# model params | ||
self.verbose = verbose | ||
|
||
############################################################## | ||
# Methods related to kernel + sim matrix construction | ||
############################################################## | ||
|
||
def rbf(self, k: int = 15, graph_construction="union"): | ||
"""Initialize adaptive bandwith RBF kernel (as described in C-isomap). | ||
:param k: (int) number of nearest neighbors for RBF kernel | ||
:return: (sparse matrix) constructed RBF kernel | ||
""" | ||
import scanpy as sc | ||
|
||
if self.verbose: | ||
print("Computing kNN graph using scanpy NN ...") | ||
|
||
# compute kNN and the distance from each point to its nearest neighbors | ||
sc.pp.neighbors(self.ad, use_rep=self.build_on, n_neighbors=k, knn=True) | ||
knn_graph_distances = self.ad.obsp["distances"] | ||
|
||
# Binarize distances to get connectivity | ||
knn_graph = knn_graph_distances.copy() | ||
knn_graph[knn_graph != 0] = 1 | ||
# Include self as neighbour | ||
knn_graph.setdiag(1) | ||
|
||
self.knn_graph = knn_graph | ||
if self.verbose: | ||
print("Computing radius for adaptive bandwidth kernel...") | ||
|
||
# compute median distance for each point amongst k-nearest neighbors | ||
with Parallel(n_jobs=self.num_cores, backend="threading") as parallel: | ||
median = k // 2 | ||
median_distances = parallel( | ||
delayed(kth_neighbor_distance)(knn_graph_distances, median, i) | ||
for i in tqdm(range(self.n)) | ||
) | ||
|
||
# convert to numpy array | ||
median_distances = np.array(median_distances) | ||
|
||
if self.verbose: | ||
print("Making graph symmetric...") | ||
|
||
print( | ||
f"Parameter graph_construction = {graph_construction} being used to build KNN graph..." | ||
) | ||
if graph_construction == "union": | ||
sym_graph = (knn_graph + knn_graph.T > 0).astype(float) | ||
elif graph_construction in ["intersect", "intersection"]: | ||
knn_graph = (knn_graph > 0).astype(float) | ||
sym_graph = knn_graph.multiply(knn_graph.T) | ||
else: | ||
raise ValueError( | ||
f"Parameter graph_construction = {graph_construction} is not valid. \ | ||
Please select `union` or `intersection`" | ||
) | ||
|
||
self.sym_graph = sym_graph | ||
if self.verbose: | ||
print("Computing RBF kernel...") | ||
|
||
with Parallel(n_jobs=self.num_cores, backend="threading") as parallel: | ||
similarity_matrix_rows = parallel( | ||
delayed(rbf_for_row)( | ||
sym_graph, self.ad.obsm[self.build_on], median_distances, i | ||
) | ||
for i in tqdm(range(self.n)) | ||
) | ||
|
||
if self.verbose: | ||
print("Building similarity LIL matrix...") | ||
|
||
similarity_matrix = lil_matrix((self.n, self.n)) | ||
for i in tqdm(range(self.n)): | ||
similarity_matrix[i] = similarity_matrix_rows[i] | ||
|
||
if self.verbose: | ||
print("Constructing CSR matrix...") | ||
|
||
self.M = (similarity_matrix).tocsr() | ||
return self.M |
Oops, something went wrong.