diff --git a/backend/src/pathway_data/cal_overlap_score.py b/backend/src/pathway_data/cal_overlap_score.py index aa3a0031..84c067bc 100644 --- a/backend/src/pathway_data/cal_overlap_score.py +++ b/backend/src/pathway_data/cal_overlap_score.py @@ -2,7 +2,9 @@ """ A script to calculate the score of pairwise protein overlap between -functional terms +functional terms. +If running in the background use: python cal_overlap_score.py >/dev/null 2>&1 & +otherwise error[5] input/output error happens """ __author__ = "Dilmurat Yusuf" @@ -40,7 +42,7 @@ def cal_overlap_score_worker(args): i, j = args score = cal_overlap_score( - df.loc[i].external_id, df.loc[j].external_id, df.loc[i].genes, df.loc[j].genes, df.loc[i].sizes, df.loc[j].sizes + df.loc[i].id, df.loc[j].id, df.loc[i].genes, df.loc[j].genes, df.loc[i].sizes, df.loc[j].sizes ) # an abitratry threshold, assuming > 50% should indicate @@ -51,34 +53,37 @@ def cal_overlap_score_worker(args): return f"{score[0]},{score[1]},{score[2]}\n" -start_time = time.time() - -target_file = "data/AllPathways_mouse.csv" -df = pd.read_csv(target_file) -df = df.dropna() -df["genes"] = df["genes"].apply(eval).apply(set) -df = df[["id", "genes"]] -# t df = df.head(200) -# t df = pd.DataFrame({'proteins': [{1, 2 , 'c'}, {1, 2, 3, 'd'}, {1, 2, 3, 'a', 'b'}, {'bla'}], 'external_id': ['a','b','c', 'd']}) - -# calculate the sizes of each element -# this will be used later when calcualted overlap score -df["sizes"] = [len(ele) for ele in df["genes"]] - -score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz" -start_time = time.time() -with gzip.open(score_file, "wb") as f: - pool = multiprocessing.Pool() - results = pool.map(cal_overlap_score_worker, itertools.combinations(range(len(df)), 2)) - # remove None from the list - # None corresponds to overlap sta < 0.5 - results = set(results) - results.remove(None) - f.write("".join(results).encode()) -print(f"completed overlap calculation and saved in {score_file}") - -time_cost 
= (time.time() - start_time) / 3600 -time_file = "/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt" -with open(time_file, "w") as f: - f.write(f"time cost: {time_cost} hours\n") -print(f"time cost at {time_file}") +try: + start_time = time.time() + + target_file = "data/AllPathways_mouse.csv" + df = pd.read_csv(target_file) + df = df.dropna().reset_index(drop=True)  # keep labels 0..n-1: the worker looks rows up with df.loc[i] + df["genes"] = df["genes"].apply(eval).apply(set) + df = df[["id", "genes"]] + # t df = df.head(200) + # t df = pd.DataFrame({'proteins': [{1, 2 , 'c'}, {1, 2, 3, 'd'}, {1, 2, 3, 'a', 'b'}, {'bla'}], 'external_id': ['a','b','c', 'd']}) + + # precompute the size of each gene set; + # it is reused when calculating the overlap score + df["sizes"] = [len(ele) for ele in df["genes"]] + + score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz" + start_time = time.time() + with gzip.open(score_file, "wb") as f: + pool = multiprocessing.Pool() + results = pool.map(cal_overlap_score_worker, itertools.combinations(range(len(df)), 2)) + # deduplicate, then drop the None placeholder (overlap score < 0.5); + # discard() does not raise when no pair fell below the threshold + results = set(results) + results.discard(None) + f.write("".join(results).encode()) + print(f"completed overlap calculation and saved in {score_file}") + time_cost = (time.time() - start_time) / 3600 + time_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt" + with open(time_file, "w") as f: + f.write(f"time cost: {time_cost} hours\n") + print(f"time cost at {time_file}") +except Exception as e: + with open("error.txt", "w") as error: + error.write(repr(e)) diff --git a/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz b/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz new file mode 100644 index 00000000..7e6b9935 Binary files /dev/null and b/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz differ diff --git 
a/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt b/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt new file mode 100644 index 00000000..fdd10734 --- /dev/null +++ b/backend/src/pathway_data/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt @@ -0,0 +1 @@ +time cost: 1.909882405400276 hours