NN-339 added overlap and slightly modified code

BackofenLab · Aug 24, 2023 · 0877293 · 0877293
1 parent 9b747a1
commit 0877293
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 33 deletions.
diff --git a/backend/src/pathway_data/cal_overlap_score.py b/backend/src/pathway_data/cal_overlap_score.py
@@ -2,7 +2,9 @@
 
 """
 A script to calculate the score of pairwise protein overlap between
-functional terms
+functional terms.
+If running in the background, use: python cal_overlap_score.py >/dev/null 2>&1 &
+otherwise an [Errno 5] Input/output error occurs.
 """
 
 __author__ = "Dilmurat Yusuf"
@@ -40,7 +42,7 @@ def cal_overlap_score_worker(args):
 
     i, j = args
     score = cal_overlap_score(
-        df.loc[i].external_id, df.loc[j].external_id, df.loc[i].genes, df.loc[j].genes, df.loc[i].sizes, df.loc[j].sizes
+        df.loc[i].id, df.loc[j].id, df.loc[i].genes, df.loc[j].genes, df.loc[i].sizes, df.loc[j].sizes
     )
 
     # an arbitrary threshold, assuming > 50% should indicate
@@ -51,34 +53,37 @@ def cal_overlap_score_worker(args):
         return f"{score[0]},{score[1]},{score[2]}\n"
 
 
-start_time = time.time()
-
-target_file = "data/AllPathways_mouse.csv"
-df = pd.read_csv(target_file)
-df = df.dropna()
-df["genes"] = df["genes"].apply(eval).apply(set)
-df = df[["id", "genes"]]
-# t df = df.head(200)
-# t df = pd.DataFrame({'proteins': [{1, 2 , 'c'}, {1, 2, 3, 'd'}, {1, 2, 3, 'a', 'b'}, {'bla'}], 'external_id': ['a','b','c', 'd']})
-
-# calculate the sizes of each element
-# this will be used later when calcualted overlap score
-df["sizes"] = [len(ele) for ele in df["genes"]]
-
-score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz"
-start_time = time.time()
-with gzip.open(score_file, "wb") as f:
-    pool = multiprocessing.Pool()
-    results = pool.map(cal_overlap_score_worker, itertools.combinations(range(len(df)), 2))
-    # remove None from the list
-    # None corresponds to overlap sta < 0.5
-    results = set(results)
-    results.remove(None)
-    f.write("".join(results).encode())
-print(f"completed overlap calculation and saved in {score_file}")
-
-time_cost = (time.time() - start_time) / 3600
-time_file = "/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt"
-with open(time_file, "w") as f:
-    f.write(f"time cost: {time_cost} hours\n")
-print(f"time cost at {time_file}")
+try:
+    start_time = time.time()
+
+    target_file = "data/AllPathways_mouse.csv"
+    df = pd.read_csv(target_file)
+    df = df.dropna()
+    df["genes"] = df["genes"].apply(eval).apply(set)
+    df = df[["id", "genes"]]
+    # t df = df.head(200)
+    # t df = pd.DataFrame({'proteins': [{1, 2 , 'c'}, {1, 2, 3, 'd'}, {1, 2, 3, 'a', 'b'}, {'bla'}], 'external_id': ['a','b','c', 'd']})
+
+    # calculate the size of each element;
+    # this will be used later when calculating the overlap score
+    df["sizes"] = [len(ele) for ele in df["genes"]]
+
+    score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz"
+    start_time = time.time()
+    with gzip.open(score_file, "wb") as f:
+        pool = multiprocessing.Pool()
+        results = pool.map(cal_overlap_score_worker, itertools.combinations(range(len(df)), 2))
+        # remove None from the list
+        # None corresponds to overlap sta < 0.5
+        results = set(results)
+        results.remove(None)
+        f.write("".join(results).encode())
+    print(f"completed overlap calculation and saved in {score_file}")
+    time_cost = (time.time() - start_time) / 3600
+    time_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt"
+    with open(time_file, "w") as f:
+        f.write(f"time cost: {time_cost} hours\n")
+    print(f"time cost at {time_file}")
+except Exception as e:
+    with open("error.txt", "w") as error:
+        error.write(str(e))
diff --git a/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz b/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz
diff --git a/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt b/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt
@@ -0,0 +1 @@
+time cost: 1.909882405400276 hours