Skip to content

Commit

Permalink
Fix pval bug
Browse files Browse the repository at this point in the history
  • Loading branch information
alegiac95 committed Jun 10, 2024
1 parent fead727 commit 689d044
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 31 deletions.
39 changes: 20 additions & 19 deletions imaging_transcriptomics/corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def bootstrap_correlation(self, imaging_data, permuted_imaging,
return

def gsea(self, gene_set="lake", outdir=None,
gene_limit=500, n_perm=1_000): # pragma: no cover, long to run
gene_limit=1500, n_perm=1_000): # pragma: no cover, long to run
"""Perform GSEA on the correlation.
The function runs a first gsea with the data and then runs the same
Expand All @@ -117,40 +117,41 @@ def gsea(self, gene_set="lake", outdir=None,
rnk = pd.DataFrame(
zip(gene_list, self.gene_results.results.corr[0, :]))
gsea_results = gseapy.prerank(rnk, gene_set,
max_size=gene_limit,
outdir=None,
permutation_num=0,
seed=1234,
max_size=gene_limit)
seed=1234)
_origin_es = gsea_results.res2d.ES.to_numpy()
_boot_es = np.zeros((_origin_es.shape[0], n_perm))
# perform the GSEA on the permutations
for i in range(n_perm):
rnk = pd.DataFrame(
zip(gene_list, self.gene_results.results.boot_corr[:, i]))
_gsea_res = gseapy.prerank(rnk, gene_set,
max_size=gene_limit,
permutation_num=0,
no_plot=True,
outdir=None,
seed=1234,
max_size=gene_limit)
_boot_es[:, i] = _gsea_res.res2d.ES.to_numpy()
# calculate the p-value
seed=1234)
_boot_es[:, i] = _gsea_res.res2d.ES.values
# calculate the p-value and NES
boot_nes_mean = np.mean(_boot_es, axis=1)
boot_nes_std = np.std(_boot_es, axis=1)
boot_nes = (boot_nes_mean - _origin_es) / boot_nes_std
_p_val = np.zeros((_origin_es.shape[0],))
_eps = .00001
for i in range(_origin_es.shape[0]):
_p_val[i] = np.sum(_boot_es[i, :] >= _origin_es[i]) / n_perm
_p_val[i] = np.sum(_boot_es[i, :] >= _origin_es[i]) / n_perm if _origin_es[i] >= 0 else np.sum(_boot_es[i, :] <= _origin_es[i]) / n_perm
if _p_val[i] == 0.0:
_p_val[i] += _eps
# calculate the p-value corrected
print(_p_val)
_, _p_corr, _, _ = multipletests(_p_val, method='fdr_bh',
is_sorted=False)
print(_p_corr)
# Prepare data to save
_out_data = OrderedDict()
_out_data["Term"] = gsea_results.res2d.Term.values.tolist()
_out_data["es"] = gsea_results.res2d.ES.values
_out_data["nes"] = gsea_results.res2d.NES.values
_out_data["nes"] = boot_nes
_out_data["p_val"] = _p_val
_out_data["fdr"] = _p_corr
#_out_data["genest_size"] = gsea_results.res2d.Geneset_Size.values
Expand All @@ -164,14 +165,14 @@ def gsea(self, gene_set="lake", outdir=None,
assert outdir.exists()
out_df.to_csv(outdir / "gsea_corr_results.tsv", index=False,
sep="\t")
for i in range(len(gsea_results.res2d.index)):
term = gsea_results.res2d.index[i]
gsea_results.results[term]["pval"] = _p_val[i]
gsea_results.results[term]["fdr"] = _p_corr[i]
gseaplot(rank_metric=gsea_results.ranking,
term=term,
**gsea_results.results[term],
ofname=f"{outdir}/{term}_corr_prerank.pdf")
# for i in range(len(gsea_results.res2d.index)):
# term = gsea_results.res2d.index[i]
# gsea_results.results[term]["pval"] = _p_val[i]
# gsea_results.results[term]["fdr"] = _p_corr[i]
# gseaplot(rank_metric=gsea_results.ranking,
# term=term,
# **gsea_results.results[term],
# ofname=f"{outdir}/{term}_corr_prerank.pdf")

# --------- SAVE FUNCTIONS --------- #
def save_results(self, outdir): # pragma: no cover, simply saves stuff
Expand Down
19 changes: 9 additions & 10 deletions imaging_transcriptomics/genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import yaml
from pathlib import Path
from pyls import pls_regression
from scipy.stats import zscore, norm, false_discovery_rate
from scipy.stats import zscore, norm
from statsmodels.stats.multitest import multipletests
import numpy as np
from collections import OrderedDict
Expand Down Expand Up @@ -188,7 +188,7 @@ def compute(self):
self.boot.pval_corr[component, :] = _p_corr
return

def gsea(self, gene_set="lake", outdir=None, gene_limit=500, n_iter=1000):
def gsea(self, gene_set="lake", outdir=None, gene_limit=1500, n_iter=1000):
"""Perform a GSEA analysis on the z-scored weights."""
assert isinstance(self.orig, OrigPLS)
assert isinstance(self.boot, BootPLS)
Expand All @@ -203,10 +203,10 @@ def gsea(self, gene_set="lake", outdir=None, gene_limit=500, n_iter=1000):
rnk = pd.DataFrame(zip(gene_list,
self.orig.zscored[_component, :]))
gsea_results = gseapy.prerank(rnk, gene_set,
max_size=gene_limit,
outdir=None,
seed=1234,
permutation_num=n_iter,
max_size=gene_limit)
permutation_num=n_iter)
_origin_es = gsea_results.res2d.es.to_numpy()
_boot_es = np.zeros((_origin_es.shape[0], n_iter))
for i in range(n_iter):
Expand All @@ -217,14 +217,14 @@ def gsea(self, gene_set="lake", outdir=None, gene_limit=500, n_iter=1000):
)
)
gsea_res = gseapy.prerank(rnk, gene_set,
max_size=gene_limit,
outdir=None,
seed=1234,
permutation_num=1,
max_size=gene_limit)
permutation_num=1)
_boot_es[:, i] = gsea_res.res2d.es.to_numpy()
_p_val = np.zeros((_origin_es.shape[0],))
for i in range(_origin_es.shape[0]):
_p_val[i] = np.sum(_boot_es[i, :] >= _origin_es[i]) / n_iter
_p_val[i] = np.sum(_boot_es[i, :] >= _origin_es[i]) / n_iter if _origin_es[i] >= 0 else np.sum(_boot_es[i, :] <= _origin_es[i]) / n_iter
# calculate the p-value corrected
_, _p_corr, _, _ = multipletests(_p_val, method='fdr_bh',
is_sorted=False)
Expand Down Expand Up @@ -366,10 +366,9 @@ def compute_pval(self):
# make sure the order of the other is the same.
logger.info("Computing p values.")
for i in range(self.n_genes):
self.pval[0, i] = np.sum(
self.boot_corr[i, :] >= self.corr[0, i]) / self._n_iter
self.pval[0, i] = np.sum(self.boot_corr[i, :] >= self.corr[0, i]) / self._n_iter if self.corr[0, i] >= 0 else np.sum(self.boot_corr[i, :] <= self.corr[0, i]) / self._n_iter
_, p_corr, _, _ = multipletests(self.pval[0, :], method='fdr_bh',
is_sorted=True)
is_sorted=False)

self.pval_corr[0, :] = p_corr
return
Expand Down
4 changes: 2 additions & 2 deletions imaging_transcriptomics/transcriptomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def _save_object(self, outdir, name):
return

# --------- RUN ANALYSIS --------- #
def gsea(self, outdir=None, gene_set="lake", gene_limit=500):
def gsea(self, outdir=None, gene_set="lake", gene_limit=1500):

if self.method == "corr":
self.analysis.gsea(gene_set=gene_set, outdir=outdir,
Expand All @@ -301,7 +301,7 @@ def gsea(self, outdir=None, gene_set="lake", gene_limit=500):

def run(self, outdir=None, scan_name="", gsea=False,
gene_set="lake", save_res=True, n_cpu=4,
gene_limit=500): # pragma: no cover
gene_limit=1500): # pragma: no cover
"""Method to run the imaging transcriptomics analysis.
:param str outdir: path to the output directory, if not provided the
Expand Down

0 comments on commit 689d044

Please sign in to comment.