-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #43 from BackofenLab/workMalek
Automation of database download
- Loading branch information
Showing
22 changed files
with
132,908 additions
and
2,331 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/usr/bin/env python | ||
|
||
""" | ||
A script to calculate the score of pairwise protein overlap between | ||
functional terms | ||
""" | ||
|
||
__author__ = "Dilmurat Yusuf" | ||
|
||
import itertools | ||
import pandas as pd | ||
import numpy as np | ||
import multiprocessing | ||
import gzip | ||
import time | ||
|
||
|
||
def cal_overlap_score(id1, id2, set1, set2, size1, size2):
    """
    Calculate the overlap between two sets, measured as the size of
    their intersection divided by the size of the smaller set.

    Args:
        id1: identifier of the first set, returned unchanged.
        id2: identifier of the second set, returned unchanged.
        set1: first set of members.
        set2: second set of members.
        size1: pre-calculated size of set1.
        size2: pre-calculated size of set2.

    Returns:
        A tuple (id1, id2, score). The score is 0.0 when the smaller
        set is empty.
    """
    # the sizes of sets are pre-calculated
    # so when performing pairwise comparison
    # len() won't be repeated over the same set
    min_size = min(size1, size2)

    # an empty set cannot overlap with anything; returning early also
    # avoids dividing by zero
    if min_size == 0:
        return id1, id2, 0.0

    # calculate the size of intersection
    intersection_size = len(set1 & set2)

    return id1, id2, intersection_size / min_size
|
||
|
||
def cal_overlap_score_worker(args):
    """
    Score one pair of functional terms; used as the multiprocessing task.

    Args:
        args: a tuple (i, j) of row positions in the module-level
            data frame `df`.

    Returns:
        The CSV line "<id of row i>,<id of row j>,<score>\\n" when the
        overlap score is at least 0.5, otherwise None.
    """
    i, j = args

    # the pairs are built from range(len(df)), i.e. they are row
    # positions, so the rows are looked up by position (iloc) rather than
    # by index label (loc); each row is fetched only once
    row_i = df.iloc[i]
    row_j = df.iloc[j]

    # the data frame is reduced to the "id" and "genes" columns (plus the
    # derived "sizes"), so the term identifier is read from "id"
    id1, id2, score = cal_overlap_score(
        row_i["id"], row_j["id"], row_i["genes"], row_j["genes"], row_i["sizes"], row_j["sizes"]
    )

    # an arbitrary threshold, assuming >= 50% should indicate
    # a strong relation
    if score < 0.5:
        return None
    return f"{id1},{id2},{score}\n"
|
||
|
||
# The data frame is built at module level (outside the __main__ guard) on
# purpose: the worker function reads the global `df`, and worker processes
# started with the "spawn" method re-import this module to obtain it.
target_file = "data/AllPathways_mouse.csv"
df = pd.read_csv(target_file)
df = df.dropna()
# NOTE(review): eval() executes whatever text is stored in the "genes"
# column; if this CSV can ever come from an untrusted source, parse it
# with ast.literal_eval instead.
df["genes"] = df["genes"].apply(eval).apply(set)
# dropna() leaves gaps in the index; renumber the rows so that the
# positions 0..len(df)-1 used for pairing are also valid index labels
df = df[["id", "genes"]].reset_index(drop=True)
# for a quick trial run, restrict the input with: df = df.head(200)

# calculate the size of each gene set once
# this will be used later when calculating the overlap score
df["sizes"] = df["genes"].apply(len)

score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz"
# relative path, next to score_file (the former leading "/" pointed at the
# file-system root instead of the project's data directory)
time_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt"

# only the main process may create the pool; without this guard every
# spawned worker would re-run the whole calculation on import
if __name__ == "__main__":
    # only the pairwise calculation is timed, not the loading above
    start_time = time.time()

    pairs = itertools.combinations(range(len(df)), 2)
    with multiprocessing.Pool() as pool, gzip.open(score_file, "wb") as f:
        # imap streams the pairs and the results in order instead of
        # materialising all of them in memory as map() does
        for line in pool.imap(cal_overlap_score_worker, pairs, chunksize=10_000):
            # None corresponds to an overlap score < 0.5
            if line is not None:
                f.write(line.encode())
    print(f"completed overlap calculation and saved in {score_file}")

    time_cost = (time.time() - start_time) / 3600
    with open(time_file, "w") as f:
        f.write(f"time cost: {time_cost} hours\n")
    print(f"time cost at {time_file}")
29,842 changes: 29,842 additions & 0 deletions
29,842
backend/src/pathway_data/data/AllPathways_human.csv
Large diffs are not rendered by default.
Oops, something went wrong.
28,906 changes: 28,906 additions & 0 deletions
28,906
backend/src/pathway_data/data/AllPathways_mouse.csv
Large diffs are not rendered by default.
Oops, something went wrong.
29,569 changes: 29,569 additions & 0 deletions
29,569
backend/src/pathway_data/data/human_all_pathways.gmt
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.