Merge pull request #173 from BioDataFuse/example

update use case
BioDataFuse · Sep 18, 2024 · 72a4e09 · 72a4e09
2 parents 6192e4e + 1dd02cb
commit 72a4e09
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 18 deletions.
diff --git a/examples/gene_to_graph_workflow.ipynb b/examples/gene_to_graph_workflow.ipynb
@@ -2196,7 +2196,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -2288,7 +2288,7 @@
        "4  [{'pathway_id': nan, 'pathway_label': nan, 'pa...  "
       ]
      },
-     "execution_count": 14,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2300,6 +2300,36 @@
     "minerva_df.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'datasource': 'MINERVA',\n",
+       " 'metadata': {'source_version': '17.1.3'},\n",
+       " 'query': {'size': 6,\n",
+       "  'input_type': 'Ensembl',\n",
+       "  'MINERVA project': 'COVID19 Disease Map',\n",
+       "  'MINERVA project URL': 'https://covid19map.elixir-luxembourg.org/minerva/',\n",
+       "  'time': '0:00:47.612140',\n",
+       "  'date': '2024-09-10 15:23:35',\n",
+       "  'url': 'https://covid19map.elixir-luxembourg.org/minerva/',\n",
+       "  'number_of_added_nodes': 1,\n",
+       "  'number_of_added_edges': 1}}"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "minerva_metadata"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 15,

diff --git a/examples/usecases/PCS/PCS_usecase.ipynb b/examples/usecases/PCS/PCS_usecase.ipynb
@@ -518,7 +518,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Add litaliterature-based data\n",
+    "### Add literature-based data\n",
     "Genes found to be associated with Post-COVID-19"
    ]
   },
@@ -727,7 +727,7 @@
     "\n",
     "def get_literature_based_info(gene):\n",
     "    if gene in pcs_associated_genes[\"Gene\"].values:\n",
-    "        return literature_disease_attrs\n",
+    "        return [literature_disease_attrs]\n",
     "    else:\n",
     "        return [{\"disease_name\": np.nan, \"id\": np.nan, \"source\": np.nan}]\n",
     "\n",
@@ -745,7 +745,7 @@
     {
      "data": {
       "text/plain": [
-       "362    {'disease_name': 'Post-COVID-19', 'id': 'C0000...\n",
+       "362    [{'disease_name': 'Post-COVID-19', 'id': 'C000...\n",
        "Name: literature_based_info, dtype: object"
       ]
      },
@@ -2244,7 +2244,7 @@
        "      <td>ENSG00000152592</td>\n",
        "      <td>Ensembl</td>\n",
        "      <td>[{'disease_name': 'Hypophosphatemic Rickets', ...</td>\n",
-       "      <td>{'disease_name': 'Post-COVID-19', 'id': 'C0000...</td>\n",
+       "      <td>[{'disease_name': 'Post-COVID-19', 'id': 'C000...</td>\n",
        "      <td>[{'chembl_id': nan, 'drugbank_id': nan, 'compo...</td>\n",
        "      <td>[{'pathway_id': nan, 'pathway_label': nan, 'pa...</td>\n",
        "      <td>[{'pathway_id': 'WP3971', 'pathway_label': 'OS...</td>\n",
@@ -2315,7 +2315,7 @@
        "3  [{'disease_name': 'Cystic Fibrosis', 'HPO': ''...   \n",
        "\n",
        "                               literature_based_info  \\\n",
-       "0  {'disease_name': 'Post-COVID-19', 'id': 'C0000...   \n",
+       "0  [{'disease_name': 'Post-COVID-19', 'id': 'C000...   \n",
        "1  [{'disease_name': nan, 'id': nan, 'source': nan}]   \n",
        "2  [{'disease_name': nan, 'id': nan, 'source': nan}]   \n",
        "3  [{'disease_name': nan, 'id': nan, 'source': nan}]   \n",
@@ -2374,7 +2374,7 @@
     {
      "data": {
       "text/plain": [
-       "{'disease_name': 'Post-COVID-19', 'id': 'C00000', 'source': 'PMID: 37675861'}"
+       "[{'disease_name': 'Post-COVID-19', 'id': 'C00000', 'source': 'PMID: 37675861'}]"
       ]
      },
      "execution_count": 29,
@@ -2793,7 +2793,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2857,7 +2857,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2893,7 +2893,7 @@
     "    ```\n",
     "- Add `apoc.conf` file to **conf** subfolder of the DBMS folder\n",
     "- Open Neo4j Browser\n",
-    "- (Optionl, only run if you have imported a graph  before) Remove all the nodes before importing `.graphml` file\n",
+    "- (Optional, only run if you have imported a graph before) Remove all the nodes before importing `.graphml` file\n",
     "\n",
     "    ```\n",
     "    MATCH (n) DETACH DELETE n\n",
@@ -2919,7 +2919,7 @@
     "    ```\n",
     "\n",
     "- Count the number of each node type\n",
-    "    - total (```MATCH (n) RETURN count(n)```) = 19859\n",
+    "    - total (```MATCH (n) RETURN count(n)```) = 19860\n",
     "        - Gene (```MATCH (n:Gene) RETURN count(n)```) = 1667\n",
     "        - Pathway (```MATCH (n:Pathway) RETURN count(n)```) = 1847\n",
     "            - WikiPathways (```MATCH (n:Pathway {source: \"WikiPathways\"}) RETURN count(n)```) = 678\n",
@@ -2928,11 +2928,13 @@
     "        - Biological Process (```MATCH (n:`Biological Process`) RETURN count(n)```) = 4624\n",
     "        - Molecular Function (```MATCH (n:`Molecular Function`) RETURN count(n)```) = 1327\n",
     "        - Cellular Component (```MATCH (n:`Cellular Component`) RETURN count(n)```) = 736\n",
-    "        - Disease (```MATCH (n:Disease) RETURN count(n)```) = 2913\n",
+    "        - Disease (```MATCH (n:Disease) RETURN count(n)```) = 2914\n",
+    "            - DISGENET (```MATCH (n:Disease {source: \"DISGENET\"}) RETURN count(n)```) = 2913\n",
+    "            - Literature (```MATCH (n:Disease {source: \"PMID: 37675861\"}) RETURN count(n)```) = 1\n",
     "        - Compound (```MATCH (n:Compound) RETURN count(n)```) = 2244\n",
     "        - Side Effect (```MATCH (n:`Side Effect`) RETURN count(n)```) = 4501\n",
     "- Count the number of each edge type\n",
-    "    - total (```MATCH ()-[r]->() RETURN count(r)```) = 101630\n",
+    "    - total (```MATCH ()-[r]->() RETURN count(r)```) = 101659\n",
     "        - interacts_with (```MATCH ()-[r:interacts_with]->() RETURN count(r)```) = 16844\n",
     "        - part_of (```MATCH ()-[r:part_of]->() RETURN count(r)```) = 30066 \n",
     "            - WikiPathways (```MATCH ()-[r:part_of {source: \"WikiPathways\"}]->() RETURN count(r)```) = 3174\n",
@@ -2942,7 +2944,9 @@
     "        - treats (```MATCH ()-[r:treats]->() RETURN count(r)```) = 8215\n",
     "        - has_side_effect (```MATCH ()-[r:has_side_effect]->() RETURN count(r)```) = 38328\n",
     "        - inhibits (```MATCH ()-[r:inhibits]->() RETURN count(r)```) = 71\n",
-    "        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) = 7607\n",
+    "        - associated_with (```MATCH ()-[r:associated_with]->() RETURN count(r)```) = 7636\n",
+    "            - DISGENET (```MATCH ()-[r:associated_with {source: \"DISGENET\"}]->() RETURN count(r)```) = 7607\n",
+    "            - Literature (```MATCH ()-[r:associated_with{source: \"PMID: 37675861\"}]->() RETURN count(r)```) = 29\n",
     "\n",
     "- Export the graph as a `.csv` file\n",
     "\n",

diff --git a/src/pyBiodatafuse/constants.py b/src/pyBiodatafuse/constants.py
@@ -287,13 +287,13 @@
     "el": None,
     "label": GENE_DISEASE_EDGE_LABEL,
 }
-# Literature
 
+# Literature
+LITERATURE_NODE_MAIN_LABEL = "id"
 LITERATURE_DISEASE_NODE_ATTRS = {
     "source": None,
     "name": None,
     "id": None,
-    "UMLS": None,
     "labels": DISEASE_NODE_LABELS,
 }
 LITERATURE_DISEASE_EDGE_ATTRS = {

diff --git a/src/pyBiodatafuse/graph/generator.py b/src/pyBiodatafuse/graph/generator.py
@@ -36,6 +36,7 @@
     LITERATURE_DISEASE_COL,
     LITERATURE_DISEASE_EDGE_ATTRS,
     LITERATURE_DISEASE_NODE_ATTRS,
+    LITERATURE_NODE_MAIN_LABEL,
     MINERVA,
     MOLMEDB_COMPOUND_NODE_ATTRS,
     MOLMEDB_PROTEIN_COMPOUND_COL,
@@ -226,7 +227,7 @@ def add_literature_gene_disease_subgraph(g, gene_node_label, annot_list):
     """
     for annot in annot_list:
         if not pd.isna(annot["disease_name"]):
-            annot_node_label = annot[DISEASE_NODE_MAIN_LABEL]
+            annot_node_label = annot[LITERATURE_NODE_MAIN_LABEL]
             annot_node_attrs = LITERATURE_DISEASE_NODE_ATTRS.copy()
             annot_node_attrs["source"] = annot["source"]
             annot_node_attrs["name"] = annot["disease_name"]