Merge pull request #43 from BackofenLab/workMalek

Automation of database download
BackofenLab · Aug 17, 2023 · 81d4ea0 · 81d4ea0
2 parents 7040329 + 75a9685
commit 81d4ea0
Show file tree

Hide file tree

Showing 22 changed files with 132,908 additions and 2,331 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,7 +4,7 @@ config.yml
 credentials.yml
 .idea
 pid-info.txt
-
+malek/*
 # Created by https://www.toptal.com/developers/gitignore/api/node,maven,python,pycharm,intellij,visualstudiocode,macos,windows,linux
 # Edit at https://www.toptal.com/developers/gitignore?templates=node,maven,python,pycharm,intellij,visualstudiocode,macos,windows,linux
 

diff --git a/backend/src/enrichment.py b/backend/src/enrichment.py
@@ -110,7 +110,10 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
     stopwatch.round("pvalue_enrichment")
 
     # calculate Benjamini-Hochberg FDR
+    p_vals = []
     rank_lst = []
+    # Set cutoff value for p_value and fdr_rate
+    cutoff = 1e-318
     prev = 0
     # Loop over p_value column in Dataframe
     for i, val in enumerate(df_terms["p_value"]):
@@ -120,11 +123,12 @@ def functional_enrichment(driver: neo4j.Driver, in_proteins, species_id: Any):
         if prev < p_adj and i != 0:
             p_adj = prev
         prev = p_adj
+        val, p_adj = (cutoff, cutoff) if val <= cutoff or p_adj <= cutoff else (val, p_adj)
+        p_vals += [val]
         rank_lst += [p_adj]
-
     # Update Dataframe
     df_terms["fdr_rate"] = rank_lst
-
+    df_terms["p_value"] = p_vals
     # Remove all entries where FDR >= 0.05
     df_terms = df_terms[df_terms["fdr_rate"] < alpha]
     df_terms = df_terms.reset_index(drop=True)

diff --git a/backend/src/pathway_data/cal_overlap_score.py b/backend/src/pathway_data/cal_overlap_score.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+
+"""
+A script to calculate the score of pairwise protein overlap between
+functional terms
+"""
+
+__author__ = "Dilmurat Yusuf"
+
+import itertools
+import pandas as pd
+import numpy as np
+import multiprocessing
+import gzip
+import time
+
+
+def cal_overlap_score(id1, id2, set1, set2, size1, size2):
+    """
+    calculate the overlap between two sets measured by
+    the fraction of smaller set, which corresponds to
+    the size of overlap
+    """
+
+    # calculate the size of intersection
+    intersection_size = len(set1 & set2)
+
+    # the sizes of sets are pre-calculated
+    # so when performing pairwise comparison
+    # len() won't be repeated over the same set
+    min_set = min(size1, size2)
+
+    return id1, id2, intersection_size / min_set
+
+
+def cal_overlap_score_worker(args):
+    """
+    for multiprocessing
+    """
+
+    i, j = args
+    score = cal_overlap_score(
+        df.loc[i].external_id, df.loc[j].external_id, df.loc[i].genes, df.loc[j].genes, df.loc[i].sizes, df.loc[j].sizes
+    )
+
+    # an abitratry threshold, assuming > 50% should indicate
+    # a strong relation
+    if score[2] < 0.5:
+        return None
+    else:
+        return f"{score[0]},{score[1]},{score[2]}\n"
+
+
+start_time = time.time()
+
+target_file = "data/AllPathways_mouse.csv"
+df = pd.read_csv(target_file)
+df = df.dropna()
+df["genes"] = df["genes"].apply(eval).apply(set)
+df = df[["id", "genes"]]
+# t df = df.head(200)
+# t df = pd.DataFrame({'proteins': [{1, 2 , 'c'}, {1, 2, 3, 'd'}, {1, 2, 3, 'a', 'b'}, {'bla'}], 'external_id': ['a','b','c', 'd']})
+
+# calculate the sizes of each element
+# this will be used later when calcualted overlap score
+df["sizes"] = [len(ele) for ele in df["genes"]]
+
+score_file = "data/Overlap/MusMusculusDATA/functional_terms_overlap.csv.gz"
+start_time = time.time()
+with gzip.open(score_file, "wb") as f:
+    pool = multiprocessing.Pool()
+    results = pool.map(cal_overlap_score_worker, itertools.combinations(range(len(df)), 2))
+    # remove None from the list
+    # None corresponds to overlap sta < 0.5
+    results = set(results)
+    results.remove(None)
+    f.write("".join(results).encode())
+print(f"completed overlap calculation and saved in {score_file}")
+
+time_cost = (time.time() - start_time) / 3600
+time_file = "/data/Overlap/MusMusculusDATA/functional_terms_overlap_time_cost.txt"
+with open(time_file, "w") as f:
+    f.write(f"time cost: {time_cost} hours\n")
+print(f"time cost at {time_file}")
diff --git a/backend/src/pathway_data/data/AllPathways_human.csv b/backend/src/pathway_data/data/AllPathways_human.csv
diff --git a/backend/src/pathway_data/data/AllPathways_mouse.csv b/backend/src/pathway_data/data/AllPathways_mouse.csv
diff --git a/backend/src/pathway_data/data/human_all_pathways.gmt b/backend/src/pathway_data/data/human_all_pathways.gmt