Skip to content

Commit

Permalink
Added SEACells model to calculate the metacells
Browse files Browse the repository at this point in the history
### pp Module:
- Added `max_cells_ratio` and `max_genes_ratio` to control the max threshold in qc of scRNA-seq

### single Module:
- Added `SEACells` model to calculate the metacells from scRNA-seq

### space Module:
- Added `STAligner` to integrate multi stRNA-seq
  • Loading branch information
Starlitnightly committed Jan 3, 2024
1 parent 53dff3b commit 2bbf9cb
Show file tree
Hide file tree
Showing 24 changed files with 5,144 additions and 136 deletions.
67 changes: 67 additions & 0 deletions omicverse/SEACells/Rscripts/chromVAR.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
library(chromVAR)
library(SummarizedExperiment)
library(Matrix)

library(BSgenome.Hsapiens.UCSC.hg38)

library(motifmatchr)
library(chromVARmotifs)

# parse arguments
argReader = commandArgs(trailingOnly=TRUE)
base_dir <- argReader[1]

print(paste0("Loading files (peaks.bed, sampling_depth.txt, counts.txt, peak_names.txt and cell_names.txt) from base directory ",base_dir))

peaks <- getPeaks(paste0(base_dir,"peaks.bed"), sort_peaks = TRUE)

sampling_depth <- as.matrix(read.delim(paste0(base_dir, 'sampling_depth.txt'), header=F))
sampling_depth <- as.numeric(sampling_depth)

print("Loading counts...")

my_counts_matrix <- as.matrix(read.delim(paste0(base_dir, 'counts.txt'), header=F))

row_names <- as.matrix(read.delim(paste0(base_dir, 'peak_names.txt'), header=F))
rownames(my_counts_matrix) <- row_names

col_names <- as.matrix(read.delim(paste0(base_dir, 'cell_names.txt'), header=F))
colnames(my_counts_matrix) <- col_names

print("Creating SummarizedExperiment...")

fragment_counts <- SummarizedExperiment(assays = list(counts = my_counts_matrix),
rowRanges = peaks,
colData = sampling_depth)

colnames(colData(fragment_counts)) <- c('depth')

fragment_counts

rm(my_counts_matrix)
rm(peaks)
rm(sampling_depth)
rm(row_names)


fragment_counts <- addGCBias(fragment_counts,
genome = BSgenome.Hsapiens.UCSC.hg38)

print("Matching motifs...")

data('human_pwms_v2')
motifs = human_pwms_v2
motif_ix <- matchMotifs(motifs, fragment_counts, genome = BSgenome.Hsapiens.UCSC.hg38)

rm(motifs)

print("Computing deviations...")

dev <- computeDeviations(object = fragment_counts, annotations = motif_ix)
variability <- computeVariability(dev)

write.csv(deviations(dev), paste0(base_dir,"deviations.csv"))
write.csv(deviationScores(dev) ,paste0(base_dir,"deviationScores.csv"))
write.csv(variability ,paste0(base_dir,"variability.csv"))

print("Finished writing files.")
4 changes: 4 additions & 0 deletions omicverse/SEACells/Rscripts/tanay.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Title : TODO
# Objective : TODO
# Created by: sitarapersad
# Created on: 5/14/21
2 changes: 2 additions & 0 deletions omicverse/SEACells/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .core import *
from .evaluate import *
114 changes: 114 additions & 0 deletions omicverse/SEACells/accessibility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import numpy as np
import pandas as pd
from tqdm import tqdm


def determine_metacell_open_peaks(
atac_meta_ad,
peak_set=None,
low_dim_embedding="X_svd",
pval_cutoff=1e-2,
read_len=147,
n_neighbors=3,
n_jobs=1,
):
"""Determine the set of peaks that are open in each metacell.
:param atac_meta_ad: (Anndata) ATAC metacell Anndata created using `prepare_multiome_anndata`
:param peak_set: (pd.Series) Subset of peaks to test. All peaks are tested by default
:param low_dim_embedding: (str) `atac_meta_ad.obsm` field for nearest neighbor computation
:param p_val_cutoff: (float) Nominal p-value cutoff for open peaks
:param read_len: (int) Fragment length
:param n_jobs: (int) number of jobs for parallel processing
:atac_meta_ad is modified inplace with `.obsm['OpenPeaks']` indicating the set of open peaks in each metacell
"""
from scipy.stats import multinomial, poisson
from sklearn.neighbors import NearestNeighbors

# Effective genome length for background computaiton
eff_genome_length = atac_meta_ad.shape[1] * 5000

# Set up container
if peak_set is None:
peak_set = atac_meta_ad.var_names
open_peaks = pd.DataFrame(0, index=atac_meta_ad.obs_names, columns=peak_set)

# Metacell neighbors
nbrs = NearestNeighbors(n_neighbors=n_neighbors)
nbrs.fit(atac_meta_ad.obsm[low_dim_embedding])
meta_nbrs = pd.DataFrame(
atac_meta_ad.obs_names.values[
nbrs.kneighbors(atac_meta_ad.obsm[low_dim_embedding])[1]
],
index=atac_meta_ad.obs_names,
)

for m in tqdm(open_peaks.index):
# Boost using local neighbors
frag_counts = np.ravel(
atac_meta_ad[meta_nbrs.loc[m, :].values, :][:, peak_set].X.sum(axis=0)
)
frag_distr = frag_counts / np.sum(frag_counts).astype(np.float64)

# Multinomial distribution
while not 0 < np.sum(frag_distr) < 1 - 1e-5:
frag_distr = np.absolute(frag_distr - np.finfo(np.float32).epsneg)
# Sample from multinomial distribution
frag_counts = multinomial.rvs(
np.percentile(atac_meta_ad.obs["n_counts"], 100), frag_distr
)

# Compute background poisson distribution
total_frags = frag_counts.sum()
glambda = (read_len * total_frags) / eff_genome_length

# Significant peaks
cutoff = pval_cutoff / np.sum(frag_counts > 0)
open_peaks.loc[m, frag_counts >= poisson.ppf(1 - cutoff, glambda)] = 1

# Update ATAC Metadata object
atac_meta_ad.layers["OpenPeaks"] = open_peaks.values


def get_gene_accessibility(
atac_meta_ad, gene_peak_cors, gene_set=None, pval_cutoff=1e-1, cor_cutoff=0.1
):
"""Gene accessibility scores for each. The score is defined as the fraction of significantly correlated peaks that are open in a given metacell.
:param atac_meta_ad: (Anndata) ATAC metacell Anndata created using `prepare_multiome_anndata`
:param gene_peak_cors: (pd.Series) Output of `get_gene_peak_correlations` function
:param gene_set: (pd.Series) Subset of genes to compute accessibility scores. All genes are used by default.
:param p_val_cutoff: (float) Nominal p-value cutoff for test of significance of correlation
:param cor_cutoff: (float) Correlation cutoff
:atac_meta_ad is modified inplace with `.obsm['GeneAccessibility']` indicating the gene scores for each metacell
"""
if "OpenPeaks" not in atac_meta_ad.layers.keys():
raise Exception(
"Run determine_metacell_open_peaks to compute gene accessibility"
)
open_peaks = pd.DataFrame(
atac_meta_ad.layers["OpenPeaks"],
index=atac_meta_ad.obs_names,
columns=atac_meta_ad.var_names,
)

if gene_set is None:
gene_set = gene_peak_cors.index

# Container
gene_accessiblity = pd.DataFrame(-1, index=atac_meta_ad.obs_names, columns=gene_set)
for gene in tqdm(gene_set):
df = gene_peak_cors[gene]

# Skip if there no correlated peaks
if type(df) is int:
continue
gene_peaks = df.index[(df["pval"] < pval_cutoff) & (df["cor"] > cor_cutoff)]

# Identify fraction open
s = open_peaks.loc[:, gene_peaks].sum(axis=1)
gene_accessiblity.loc[s.index, gene] = s.values / len(gene_peaks)

atac_meta_ad.obsm["GeneAccessibility"] = gene_accessiblity
189 changes: 189 additions & 0 deletions omicverse/SEACells/build_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# optimization
# for parallelizing stuff
from multiprocessing import cpu_count

import numpy as np
from joblib import Parallel, delayed
from scipy.sparse import lil_matrix
from tqdm.notebook import tqdm

# get number of cores for multiprocessing
NUM_CORES = cpu_count()

##########################################################
# Helper functions for parallelizing kernel construction
##########################################################


def kth_neighbor_distance(distances, k, i):
"""Returns distance to kth nearest neighbor.
Distances: sparse CSR matrix
k: kth nearest neighbor
i: index of row
.
"""
# convert row to 1D array
row_as_array = distances[i, :].toarray().ravel()

# number of nonzero elements
num_nonzero = np.sum(row_as_array > 0)

# argsort
kth_neighbor_idx = np.argsort(np.argsort(-row_as_array)) == num_nonzero - k
return np.linalg.norm(row_as_array[kth_neighbor_idx])


def rbf_for_row(G, data, median_distances, i):
"""Helper function for computing radial basis function kernel for each row of the data matrix.
:param G: (array) KNN graph representing nearest neighbour connections between cells
:param data: (array) data matrix between which euclidean distances are computed for RBF
:param median_distances: (array) radius for RBF - the median distance between cell and k nearest-neighbours
:param i: (int) data row index for which RBF is calculated
:return: sparse matrix containing computed RBF for row
"""
# convert row to binary numpy array
row_as_array = G[i, :].toarray().ravel()

# compute distances ||x - y||^2 in PC/original X space
numerator = np.sum(np.square(data[i, :] - data), axis=1, keepdims=False)

# compute radii - median distance is distance to kth nearest neighbor
denominator = median_distances[i] * median_distances

# exp
full_row = np.exp(-numerator / denominator)

# masked row - to contain only indices captured by G matrix
masked_row = np.multiply(full_row, row_as_array)

return lil_matrix(masked_row)


##########################################################
# Archetypal Analysis Metacell Graph
##########################################################


class SEACellGraph:
"""SEACell graph class."""

def __init__(self, ad, build_on="X_pca", n_cores: int = -1, verbose: bool = False):
"""SEACell graph class.
:param ad: (anndata.AnnData) object containing data for which metacells are computed
:param build_on: (str) key corresponding to matrix in ad.obsm which is used to compute kernel for metacells
Typically 'X_pca' for scRNA or 'X_svd' for scATAC
:param n_cores: (int) number of cores for multiprocessing. If unspecified, computed automatically as
number of CPU cores
:param verbose: (bool) whether or not to suppress verbose program logging
"""
"""Initialize model parameters"""
# data parameters
self.n, self.d = ad.obsm[build_on].shape

# indices of each point
self.indices = np.array(range(self.n))

# save data
self.ad = ad
self.build_on = build_on

self.knn_graph = None
self.sym_graph = None

# number of cores for parallelization
if n_cores != -1:
self.num_cores = n_cores
else:
self.num_cores = NUM_CORES

self.M = None # similarity matrix
self.G = None # graph
self.T = None # transition matrix

# model params
self.verbose = verbose

##############################################################
# Methods related to kernel + sim matrix construction
##############################################################

def rbf(self, k: int = 15, graph_construction="union"):
"""Initialize adaptive bandwith RBF kernel (as described in C-isomap).
:param k: (int) number of nearest neighbors for RBF kernel
:return: (sparse matrix) constructed RBF kernel
"""
import scanpy as sc

if self.verbose:
print("Computing kNN graph using scanpy NN ...")

# compute kNN and the distance from each point to its nearest neighbors
sc.pp.neighbors(self.ad, use_rep=self.build_on, n_neighbors=k, knn=True)
knn_graph_distances = self.ad.obsp["distances"]

# Binarize distances to get connectivity
knn_graph = knn_graph_distances.copy()
knn_graph[knn_graph != 0] = 1
# Include self as neighbour
knn_graph.setdiag(1)

self.knn_graph = knn_graph
if self.verbose:
print("Computing radius for adaptive bandwidth kernel...")

# compute median distance for each point amongst k-nearest neighbors
with Parallel(n_jobs=self.num_cores, backend="threading") as parallel:
median = k // 2
median_distances = parallel(
delayed(kth_neighbor_distance)(knn_graph_distances, median, i)
for i in tqdm(range(self.n))
)

# convert to numpy array
median_distances = np.array(median_distances)

if self.verbose:
print("Making graph symmetric...")

print(
f"Parameter graph_construction = {graph_construction} being used to build KNN graph..."
)
if graph_construction == "union":
sym_graph = (knn_graph + knn_graph.T > 0).astype(float)
elif graph_construction in ["intersect", "intersection"]:
knn_graph = (knn_graph > 0).astype(float)
sym_graph = knn_graph.multiply(knn_graph.T)
else:
raise ValueError(
f"Parameter graph_construction = {graph_construction} is not valid. \
Please select `union` or `intersection`"
)

self.sym_graph = sym_graph
if self.verbose:
print("Computing RBF kernel...")

with Parallel(n_jobs=self.num_cores, backend="threading") as parallel:
similarity_matrix_rows = parallel(
delayed(rbf_for_row)(
sym_graph, self.ad.obsm[self.build_on], median_distances, i
)
for i in tqdm(range(self.n))
)

if self.verbose:
print("Building similarity LIL matrix...")

similarity_matrix = lil_matrix((self.n, self.n))
for i in tqdm(range(self.n)):
similarity_matrix[i] = similarity_matrix_rows[i]

if self.verbose:
print("Constructing CSR matrix...")

self.M = (similarity_matrix).tocsr()
return self.M
Loading

0 comments on commit 2bbf9cb

Please sign in to comment.