diff --git a/imaging_transcriptomics/corr.py b/imaging_transcriptomics/corr.py index b45b61a..dfa6797 100644 --- a/imaging_transcriptomics/corr.py +++ b/imaging_transcriptomics/corr.py @@ -96,7 +96,7 @@ def bootstrap_correlation(self, imaging_data, permuted_imaging, return def gsea(self, gene_set="lake", outdir=None, - gene_limit=500, n_perm=1_000): # pragma: no cover, long to run + gene_limit=1500, n_perm=1_000): # pragma: no cover, long to run """Perform GSEA on the correlation. The function runs a first gsea with the data and then runs the same @@ -117,10 +117,10 @@ def gsea(self, gene_set="lake", outdir=None, rnk = pd.DataFrame( zip(gene_list, self.gene_results.results.corr[0, :])) gsea_results = gseapy.prerank(rnk, gene_set, + max_size=gene_limit, outdir=None, permutation_num=0, - seed=1234, - max_size=gene_limit) + seed=1234) _origin_es = gsea_results.res2d.ES.to_numpy() _boot_es = np.zeros((_origin_es.shape[0], n_perm)) # perform the GSEA on the permutations @@ -128,29 +128,30 @@ def gsea(self, gene_set="lake", outdir=None, rnk = pd.DataFrame( zip(gene_list, self.gene_results.results.boot_corr[:, i])) _gsea_res = gseapy.prerank(rnk, gene_set, + max_size=gene_limit, permutation_num=0, no_plot=True, outdir=None, - seed=1234, - max_size=gene_limit) - _boot_es[:, i] = _gsea_res.res2d.ES.to_numpy() - # calculate the p-value + seed=1234) + _boot_es[:, i] = _gsea_res.res2d.ES.values + # calculate the p-value and NES + boot_nes_mean = np.mean(_boot_es, axis=1) + boot_nes_std = np.std(_boot_es, axis=1) + boot_nes = (boot_nes_mean - _origin_es) / boot_nes_std _p_val = np.zeros((_origin_es.shape[0],)) _eps = .00001 for i in range(_origin_es.shape[0]): - _p_val[i] = np.sum(_boot_es[i, :] >= _origin_es[i]) / n_perm + _p_val[i] = np.sum(_boot_es[i, :] >= _origin_es[i]) / n_perm if _origin_es[i] >= 0 else np.sum(_boot_es[i, :] <= _origin_es[i]) / n_perm if _p_val[i] == 0.0: _p_val[i] += _eps # calculate the p-value corrected - print(_p_val) _, _p_corr, _, _ = multipletests(_p_val, method='fdr_bh', is_sorted=False) - print(_p_corr) # Prepare data to save _out_data = OrderedDict() _out_data["Term"] = gsea_results.res2d.Term.values.tolist() _out_data["es"] = gsea_results.res2d.ES.values - _out_data["nes"] = gsea_results.res2d.NES.values + _out_data["nes"] = boot_nes _out_data["p_val"] = _p_val _out_data["fdr"] = _p_corr #_out_data["genest_size"] = gsea_results.res2d.Geneset_Size.values @@ -164,14 +165,14 @@ def gsea(self, gene_set="lake", outdir=None, assert outdir.exists() out_df.to_csv(outdir / "gsea_corr_results.tsv", index=False, sep="\t") - for i in range(len(gsea_results.res2d.index)): - term = gsea_results.res2d.index[i] - gsea_results.results[term]["pval"] = _p_val[i] - gsea_results.results[term]["fdr"] = _p_corr[i] - gseaplot(rank_metric=gsea_results.ranking, - term=term, - **gsea_results.results[term], - ofname=f"{outdir}/{term}_corr_prerank.pdf") + # for i in range(len(gsea_results.res2d.index)): + # term = gsea_results.res2d.index[i] + # gsea_results.results[term]["pval"] = _p_val[i] + # gsea_results.results[term]["fdr"] = _p_corr[i] + # gseaplot(rank_metric=gsea_results.ranking, + # term=term, + # **gsea_results.results[term], + # ofname=f"{outdir}/{term}_corr_prerank.pdf") # --------- SAVE FUNCTIONS --------- # def save_results(self, outdir): # pragma: no cover, simply saves stuff diff --git a/imaging_transcriptomics/genes.py b/imaging_transcriptomics/genes.py index fe4542c..d3dd764 100644 --- a/imaging_transcriptomics/genes.py +++ b/imaging_transcriptomics/genes.py @@ -3,7 +3,7 @@ import yaml from pathlib import Path from pyls import pls_regression -from scipy.stats import zscore, norm, false_discovery_rate +from scipy.stats import zscore, norm from statsmodels.stats.multitest import multipletests import numpy as np from collections import OrderedDict @@ -188,7 +188,7 @@ def compute(self): self.boot.pval_corr[component, :] = _p_corr return - def gsea(self, gene_set="lake", outdir=None, gene_limit=500, n_iter=1000): + def gsea(self, gene_set="lake", outdir=None, gene_limit=1500, n_iter=1000): """Perform a GSEA analysis on the z-scored weights.""" assert isinstance(self.orig, OrigPLS) assert isinstance(self.boot, BootPLS) @@ -203,10 +203,10 @@ def gsea(self, gene_set="lake", outdir=None, gene_limit=500, n_iter=1000): rnk = pd.DataFrame(zip(gene_list, self.orig.zscored[_component, :])) gsea_results = gseapy.prerank(rnk, gene_set, + max_size=gene_limit, outdir=None, seed=1234, - permutation_num=n_iter, - max_size=gene_limit) + permutation_num=n_iter) _origin_es = gsea_results.res2d.es.to_numpy() _boot_es = np.zeros((_origin_es.shape[0], n_iter)) for i in range(n_iter): @@ -217,14 +217,14 @@ def gsea(self, gene_set="lake", outdir=None, gene_limit=500, n_iter=1000): ) ) gsea_res = gseapy.prerank(rnk, gene_set, + max_size=gene_limit, outdir=None, seed=1234, - permutation_num=1, - max_size=gene_limit) + permutation_num=1) _boot_es[:, i] = gsea_res.res2d.es.to_numpy() _p_val = np.zeros((_origin_es.shape[0],)) for i in range(_origin_es.shape[0]): - _p_val[i] = np.sum(_boot_es[i, :] >= _origin_es[i]) / n_iter + _p_val[i] = np.sum(_boot_es[i, :] >= _origin_es[i]) / n_iter if _origin_es[i] >= 0 else np.sum(_boot_es[i, :] <= _origin_es[i]) / n_iter # calculate the p-value corrected _, _p_corr, _, _ = multipletests(_p_val, method='fdr_bh', is_sorted=False) @@ -366,10 +366,9 @@ def compute_pval(self): # make sure the order of the other is the same. logger.info("Computing p values.") for i in range(self.n_genes): - self.pval[0, i] = np.sum( - self.boot_corr[i, :] >= self.corr[0, i]) / self._n_iter + self.pval[0, i] = np.sum(self.boot_corr[i, :] >= self.corr[0, i]) / self._n_iter if self.corr[0, i] >= 0 else np.sum(self.boot_corr[i, :] <= self.corr[0, i]) / self._n_iter _, p_corr, _, _ = multipletests(self.pval[0, :], method='fdr_bh', - is_sorted=True) + is_sorted=False) self.pval_corr[0, :] = p_corr return diff --git a/imaging_transcriptomics/transcriptomics.py b/imaging_transcriptomics/transcriptomics.py index 95baea9..6dc8b6a 100644 --- a/imaging_transcriptomics/transcriptomics.py +++ b/imaging_transcriptomics/transcriptomics.py @@ -288,7 +288,7 @@ def _save_object(self, outdir, name): return # --------- RUN ANALYSIS --------- # - def gsea(self, outdir=None, gene_set="lake", gene_limit=500): + def gsea(self, outdir=None, gene_set="lake", gene_limit=1500): if self.method == "corr": self.analysis.gsea(gene_set=gene_set, outdir=outdir, @@ -301,7 +301,7 @@ def gsea(self, outdir=None, gene_set="lake", gene_limit=500): def run(self, outdir=None, scan_name="", gsea=False, gene_set="lake", save_res=True, n_cpu=4, - gene_limit=500): # pragma: no cover + gene_limit=1500): # pragma: no cover """Method to run the imaging transcriptomics analysis. :param str outdir: path to the output directory, if not provided the