Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automation of database download #43

Merged
merged 33 commits into from
Aug 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
d051453
Automate dl from baderlabs, merged with KEGG
Maluuck Jul 23, 2023
84e1506
updated KEGG files
Maluuck Jul 23, 2023
544bf83
NN-166 remove duplicates from pathways
Maluuck Jul 23, 2023
ad84812
NN-166 make separate folder for pathway automation
Maluuck Jul 27, 2023
99acf2b
NN-166 add useful functions for data automation
Maluuck Jul 27, 2023
a76f213
NN-166 add function for parsing used in KEGG scraping
Maluuck Jul 27, 2023
ddc0381
NN-166 retrieve latest KEGG version
Maluuck Jul 27, 2023
d31317c
NN-166 get all pathways and their information
Maluuck Jul 27, 2023
defe2b4
NN-166 map id and genes from KEGG to String
Maluuck Jul 27, 2023
4d9e704
NN-166 write diseases, drugs and compounds to file
Maluuck Jul 27, 2023
f98e660
NN-166 final KEGG scraping method
Maluuck Jul 27, 2023
63d45c9
NN-166 edit file location of KEGG file
Maluuck Jul 27, 2023
eeed646
NN-166 drop duplicates only if same name+category
Maluuck Jul 27, 2023
ae73267
NN-166 func to check if database up-to-date
Maluuck Jul 27, 2023
ed82fa7
NN-166 final changes to main and filepath fix
Maluuck Jul 27, 2023
eca5ec0
uncomment line
Maluuck Jul 27, 2023
f499928
NN-166 introduce mapping from symbols to Ensembl
Maluuck Aug 3, 2023
f21ae20
NN-166 linted
Maluuck Aug 3, 2023
2511ef3
NN-166 fix links, download correct file (symbols)
Maluuck Aug 3, 2023
b9c2c2b
fix for build error
Maluuck Aug 3, 2023
1a877be
Fix build error by installing mygene via pip
Maluuck Aug 3, 2023
d776c40
NN-166 handle 404 error, change variable name ids
Maluuck Aug 3, 2023
4c9b596
NN-166 Use symbol to Ensembl mapping
Maluuck Aug 3, 2023
a6ef58b
NN-166 changes regarding url's
Maluuck Aug 3, 2023
17c76fc
NN-166 map KEGG Pathway symbols to Ensembl
Maluuck Aug 3, 2023
0caaa3d
NN-166 Last changes, human data now included
Maluuck Aug 3, 2023
e6eeb6a
edit gitignore file
Maluuck Aug 7, 2023
55c8f2d
NN-278 Extract networkit code to use in diff files
Maluuck Jul 10, 2023
91413dc
NN-278 Added eigenvector centrality to term_graph
Maluuck Jul 10, 2023
5302e8a
NN-281 graph.py is now used for all graph things
Maluuck Jul 10, 2023
84e64a1
NN-320 assigned 1e-318 for values <= 1e-318
Maluuck Aug 10, 2023
7d40765
Files relevant to Database
Maluuck Aug 17, 2023
75a9685
NN-339 overlap calculation
Maluuck Aug 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ config.yml
credentials.yml
.idea
pid-info.txt

malek/*
# Created by https://www.toptal.com/developers/gitignore/api/node,maven,python,pycharm,intellij,visualstudiocode,macos,windows,linux
# Edit at https://www.toptal.com/developers/gitignore?templates=node,maven,python,pycharm,intellij,visualstudiocode,macos,windows,linux

Expand Down
8 changes: 6 additions & 2 deletions backend/src/enrichment.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,10 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
stopwatch.round("pvalue_enrichment")

# calculate Benjamini-Hochberg FDR
p_vals = []
rank_lst = []
# Set cutoff value for p_value and fdr_rate
cutoff = 1e-318
prev = 0
# Loop over p_value column in Dataframe
for i, val in enumerate(df_terms["p_value"]):
Expand All @@ -120,11 +123,12 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
if prev < p_adj and i != 0:
p_adj = prev
prev = p_adj
val, p_adj = (cutoff, cutoff) if val <= cutoff or p_adj <= cutoff else (val, p_adj)
p_vals += [val]
rank_lst += [p_adj]

# Update Dataframe
df_terms["fdr_rate"] = rank_lst

df_terms["p_value"] = p_vals
# Remove all entries where FDR >= 0.05
df_terms = df_terms[df_terms["fdr_rate"] < alpha]
df_terms = df_terms.reset_index(drop=True)
Expand Down
84 changes: 84 additions & 0 deletions backend/src/pathway_data/cal_overlap_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python

"""
A script to calculate the score of pairwise protein overlap between
functional terms
"""

__author__ = "Dilmurat Yusuf"

import itertools
import pandas as pd
import numpy as np
import multiprocessing
import gzip
import time


def cal_overlap_score(id1, id2, set1, set2, size1, size2):
    """
    Calculate the overlap between two sets as a fraction of the smaller set.

    The score is ``len(set1 & set2) / min(size1, size2)``.

    Args:
        id1: identifier of the first set; returned unchanged.
        id2: identifier of the second set; returned unchanged.
        set1: the first set.
        set2: the second set.
        size1: pre-calculated size of ``set1``.
        size2: pre-calculated size of ``set2``.

    Returns:
        A tuple ``(id1, id2, score)``. The score is 0.0 when the smaller
        size is 0.
    """

    # the sizes of the sets are pre-calculated by the caller,
    # so when performing pairwise comparison
    # len() won't be repeated over the same set
    min_size = min(size1, size2)

    # an empty set cannot overlap with anything; returning early also
    # avoids a ZeroDivisionError below
    if min_size == 0:
        return id1, id2, 0.0

    # calculate the size of the intersection
    intersection_size = len(set1 & set2)

    return id1, id2, intersection_size / min_size


def cal_overlap_score_worker(args):
    """
    Worker for multiprocessing: score one pair of rows of the global ``df``.

    Args:
        args: a pair ``(i, j)`` of row positions in ``df``.

    Returns:
        A CSV line of the form ``<id1>,<id2>,<score>`` terminated by a
        newline, or None when the score is below 0.5.
    """

    i, j = args

    # look the rows up by position: the pairs are built from
    # range(len(df)), while the index labels may have gaps after dropna()
    row_i = df.iloc[i]
    row_j = df.iloc[j]

    # the identifier column kept when the data is loaded is "id"
    id1, id2, score = cal_overlap_score(
        row_i["id"], row_j["id"], row_i["genes"], row_j["genes"], row_i["sizes"], row_j["sizes"]
    )

    # an arbitrary threshold, assuming >= 50% should indicate
    # a strong relation
    if score < 0.5:
        return None
    return f"{id1},{id2},{score}\n"


# Load the pathway table at import time so that worker processes can read
# the global ``df`` regardless of the multiprocessing start method.
target_file = "data/AllPathways_mouse.csv"
df = pd.read_csv(target_file)
df = df.dropna()
# NOTE(review): eval() executes whatever expression is stored in the CSV.
# This is only acceptable if the file is trusted; consider
# ast.literal_eval if the column holds plain list literals.
df["genes"] = df["genes"].apply(eval).apply(set)
df = df[["id", "genes"]]
# renumber the rows 0..n-1 so that positions taken from range(len(df))
# are valid index labels even though dropna() removed rows
df = df.reset_index(drop=True)
# for a quick trial run, limit the input, e.g. df = df.head(200)

# calculate the sizes of each element
# this will be used later when calculating the overlap score
df["sizes"] = [len(ele) for ele in df["genes"]]

score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz"
# relative path, consistent with score_file
time_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt"

# only the parent process may start the pool; without this guard a
# spawned worker re-importing this module would try to start its own
if __name__ == "__main__":
    start_time = time.time()

    # the context manager shuts the pool down once the map has finished
    with multiprocessing.Pool() as pool:
        results = pool.map(cal_overlap_score_worker, itertools.combinations(range(len(df)), 2))

    # remove None from the results
    # None corresponds to an overlap score < 0.5
    # dict.fromkeys drops duplicate lines while keeping a stable order
    lines = dict.fromkeys(line for line in results if line is not None)

    with gzip.open(score_file, "wb") as f:
        f.write("".join(lines).encode())
    print(f"completed overlap calculation and saved in {score_file}")

    time_cost = (time.time() - start_time) / 3600
    with open(time_file, "w") as f:
        f.write(f"time cost: {time_cost} hours\n")
    print(f"time cost at {time_file}")
29,842 changes: 29,842 additions & 0 deletions backend/src/pathway_data/data/AllPathways_human.csv

Large diffs are not rendered by default.

28,906 changes: 28,906 additions & 0 deletions backend/src/pathway_data/data/AllPathways_mouse.csv

Large diffs are not rendered by default.

29,569 changes: 29,569 additions & 0 deletions backend/src/pathway_data/data/human_all_pathways.gmt

Large diffs are not rendered by default.

Loading
Loading