Skip to content

Commit

Permalink
Merge pull request #43 from BackofenLab/workMalek
Browse files Browse the repository at this point in the history
Automation of database download
  • Loading branch information
Maluuck authored Aug 17, 2023
2 parents 7040329 + 75a9685 commit 81d4ea0
Show file tree
Hide file tree
Showing 22 changed files with 132,908 additions and 2,331 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ config.yml
credentials.yml
.idea
pid-info.txt

malek/*
# Created by https://www.toptal.com/developers/gitignore/api/node,maven,python,pycharm,intellij,visualstudiocode,macos,windows,linux
# Edit at https://www.toptal.com/developers/gitignore?templates=node,maven,python,pycharm,intellij,visualstudiocode,macos,windows,linux

Expand Down
8 changes: 6 additions & 2 deletions backend/src/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
stopwatch.round("pvalue_enrichment")

# calculate Benjamini-Hochberg FDR
p_vals = []
rank_lst = []
# Set cutoff value for p_value and fdr_rate
cutoff = 1e-318
prev = 0
# Loop over p_value column in Dataframe
for i, val in enumerate(df_terms["p_value"]):
Expand All @@ -120,11 +123,12 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
if prev < p_adj and i != 0:
p_adj = prev
prev = p_adj
val, p_adj = (cutoff, cutoff) if val <= cutoff or p_adj <= cutoff else (val, p_adj)
p_vals += [val]
rank_lst += [p_adj]

# Update Dataframe
df_terms["fdr_rate"] = rank_lst

df_terms["p_value"] = p_vals
# Remove all entries where FDR >= 0.05
df_terms = df_terms[df_terms["fdr_rate"] < alpha]
df_terms = df_terms.reset_index(drop=True)
Expand Down
84 changes: 84 additions & 0 deletions backend/src/pathway_data/cal_overlap_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python

"""
A script to calculate the score of pairwise protein overlap between
functional terms
"""

__author__ = "Dilmurat Yusuf"

import itertools
import pandas as pd
import numpy as np
import multiprocessing
import gzip
import time


def cal_overlap_score(id1, id2, set1, set2, size1, size2):
"""
calculate the overlap between two sets measured by
the fraction of smaller set, which corresponds to
the size of overlap
"""

# calculate the size of intersection
intersection_size = len(set1 & set2)

# the sizes of sets are pre-calculated
# so when performing pairwise comparison
# len() won't be repeated over the same set
min_set = min(size1, size2)

return id1, id2, intersection_size / min_set


def cal_overlap_score_worker(args):
"""
for multiprocessing
"""

i, j = args
score = cal_overlap_score(
df.loc[i].external_id, df.loc[j].external_id, df.loc[i].genes, df.loc[j].genes, df.loc[i].sizes, df.loc[j].sizes
)

# an abitratry threshold, assuming > 50% should indicate
# a strong relation
if score[2] < 0.5:
return None
else:
return f"{score[0]},{score[1]},{score[2]}\n"


start_time = time.time()

target_file = "data/AllPathways_mouse.csv"
df = pd.read_csv(target_file)
df = df.dropna()
df["genes"] = df["genes"].apply(eval).apply(set)
df = df[["id", "genes"]]
# t df = df.head(200)
# t df = pd.DataFrame({'proteins': [{1, 2 , 'c'}, {1, 2, 3, 'd'}, {1, 2, 3, 'a', 'b'}, {'bla'}], 'external_id': ['a','b','c', 'd']})

# calculate the sizes of each element
# this will be used later when calcualted overlap score
df["sizes"] = [len(ele) for ele in df["genes"]]

score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz"
start_time = time.time()
with gzip.open(score_file, "wb") as f:
pool = multiprocessing.Pool()
results = pool.map(cal_overlap_score_worker, itertools.combinations(range(len(df)), 2))
# remove None from the list
# None corresponds to overlap sta < 0.5
results = set(results)
results.remove(None)
f.write("".join(results).encode())
print(f"completed overlap calculation and saved in {score_file}")

time_cost = (time.time() - start_time) / 3600
time_file = "/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt"
with open(time_file, "w") as f:
f.write(f"time cost: {time_cost} hours\n")
print(f"time cost at {time_file}")
29,842 changes: 29,842 additions & 0 deletions backend/src/pathway_data/data/AllPathways_human.csv

Large diffs are not rendered by default.

28,906 changes: 28,906 additions & 0 deletions backend/src/pathway_data/data/AllPathways_mouse.csv

Large diffs are not rendered by default.

29,569 changes: 29,569 additions & 0 deletions backend/src/pathway_data/data/human_all_pathways.gmt

Large diffs are not rendered by default.

Loading

0 comments on commit 81d4ea0

Please sign in to comment.