Skip to content

Commit

Permalink
NN-339 added overlap and slightly modified code
Browse files Browse the repository at this point in the history
  • Loading branch information
Maluuck committed Aug 24, 2023
1 parent 9b747a1 commit 0877293
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 33 deletions.
71 changes: 38 additions & 33 deletions backend/src/pathway_data/cal_overlap_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

"""
A script to calculate the score of pairwise protein overlap between
functional terms
functional terms.
If running in the background, use: python cal_overlap_score.py >/dev/null 2>&1 &
otherwise an "[Errno 5] Input/output error" occurs.
"""

__author__ = "Dilmurat Yusuf"
Expand Down Expand Up @@ -40,7 +42,7 @@ def cal_overlap_score_worker(args):

i, j = args
score = cal_overlap_score(
df.loc[i].external_id, df.loc[j].external_id, df.loc[i].genes, df.loc[j].genes, df.loc[i].sizes, df.loc[j].sizes
df.loc[i].id, df.loc[j].id, df.loc[i].genes, df.loc[j].genes, df.loc[i].sizes, df.loc[j].sizes
)

# an arbitrary threshold, assuming > 50% should indicate
Expand All @@ -51,34 +53,37 @@ def cal_overlap_score_worker(args):
return f"{score[0]},{score[1]},{score[2]}\n"


# Top-level driver: load the pathway table, score every pair of functional
# terms in parallel, then write the retained scores and the elapsed time.
# NOTE(review): this first start_time is overwritten further down before it is
# ever read, so the loading steps are not part of the reported time.
start_time = time.time()

target_file = "data/AllPathways_mouse.csv"
df = pd.read_csv(target_file)
# drop rows with missing values; the original index labels are kept as-is
df = df.dropna()
# NOTE(review): eval() executes whatever text is stored in the "genes" column,
# so this is only safe while the CSV is a trusted file.
df["genes"] = df["genes"].apply(eval).apply(set)
df = df[["id", "genes"]]
# t df = df.head(200)
# t df = pd.DataFrame({'proteins': [{1, 2 , 'c'}, {1, 2, 3, 'd'}, {1, 2, 3, 'a', 'b'}, {'bla'}], 'external_id': ['a','b','c', 'd']})

# calculate the size of each gene set
# this will be used later when calculating the overlap score
df["sizes"] = [len(ele) for ele in df["genes"]]

score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz"
# timing effectively starts here (scoring + writing only)
start_time = time.time()
with gzip.open(score_file, "wb") as f:
# NOTE(review): the pool is never closed or joined.
pool = multiprocessing.Pool()
# one task per unordered pair of row positions (0 .. len(df) - 1)
results = pool.map(cal_overlap_score_worker, itertools.combinations(range(len(df)), 2))
# remove None from the results
# None corresponds to an overlap score < 0.5
# converting to a set also drops duplicate lines and loses the original order
results = set(results)
# NOTE(review): set.remove raises KeyError when no None is present,
# i.e. when every pair passed the threshold.
results.remove(None)
f.write("".join(results).encode())
print(f"completed overlap calculation and saved in {score_file}")

# elapsed wall-clock time, converted from seconds to hours
time_cost = (time.time() - start_time) / 3600
# NOTE(review): the leading "/" makes this an absolute path, unlike score_file
# which is relative to the working directory.
time_file = "/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt"
with open(time_file, "w") as f:
f.write(f"time cost: {time_cost} hours\n")
print(f"time cost at {time_file}")
# Top-level driver: load the pathway table, score every pair of functional
# terms in parallel, then write the retained scores and the elapsed time.
# Any failure is recorded in error.txt instead of being printed.
try:
    target_file = "data/AllPathways_mouse.csv"
    df = pd.read_csv(target_file)
    # The worker looks rows up with df.loc[i] for i in range(len(df)), so the
    # index must be a gap-free 0..n-1 range after rows with NaN are dropped.
    df = df.dropna().reset_index(drop=True)
    # NOTE(review): eval() executes whatever text is stored in the "genes"
    # column, so this is only safe while the CSV is a trusted file. If the
    # cells are plain literals, ast.literal_eval would be the safer parser.
    df["genes"] = df["genes"].apply(eval).apply(set)
    df = df[["id", "genes"]]
    # t df = df.head(200)
    # t df = pd.DataFrame({'proteins': [{1, 2 , 'c'}, {1, 2, 3, 'd'}, {1, 2, 3, 'a', 'b'}, {'bla'}], 'external_id': ['a','b','c', 'd']})

    # calculate the size of each gene set
    # this will be used later when calculating the overlap score
    df["sizes"] = [len(ele) for ele in df["genes"]]

    score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz"
    # Timing covers the pairwise scoring and the write, not the CSV loading.
    start_time = time.time()
    with gzip.open(score_file, "wb") as f:
        # NOTE(review): the worker reads the module-level df, so worker
        # processes must see it (e.g. via the fork start method) - confirm on
        # the target platform.
        # The context manager shuts the pool down once map() has returned.
        with multiprocessing.Pool() as pool:
            results = pool.map(
                cal_overlap_score_worker,
                itertools.combinations(range(len(df)), 2),
            )
        # Drop the None entries (pairs below the worker's score threshold).
        # dict.fromkeys de-duplicates like the former set() did but keeps the
        # rows in a deterministic order, and filtering never raises when no
        # None is present (set.remove(None) would raise KeyError).
        kept = dict.fromkeys(line for line in results if line is not None)
        f.write("".join(kept).encode())
    print(f"completed overlap calculation and saved in {score_file}")

    # elapsed wall-clock time, converted from seconds to hours
    time_cost = (time.time() - start_time) / 3600
    time_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt"
    with open(time_file, "w") as f:
        f.write(f"time cost: {time_cost} hours\n")
    print(f"time cost at {time_file}")
except Exception:
    # Deliberately best-effort: record the failure in a file rather than
    # printing it. The full traceback (exception type, message and location)
    # is written so the failure can be diagnosed afterwards.
    import traceback

    with open("error.txt", "w") as error:
        error.write(traceback.format_exc())
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
time cost: 1.909882405400276 hours

0 comments on commit 0877293

Please sign in to comment.