Small fixes: PubChem and BridgeDb (#197)

BioDataFuse · Nov 21, 2024 · 05f56af · 05f56af
1 parent a7b14dd
commit 05f56af
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 6 deletions.
diff --git a/src/pyBiodatafuse/annotators/pubchem.py b/src/pyBiodatafuse/annotators/pubchem.py
@@ -5,6 +5,7 @@
 
 import datetime
 import os
+from tqdm import tqdm
 import warnings
 from string import Template
 from typing import Any, Dict, Tuple
@@ -95,7 +96,7 @@ def get_protein_compound_screened(bridgedb_df: pd.DataFrame) -> Tuple[pd.DataFra
 
     intermediate_df = pd.DataFrame()
 
-    for protein_str in query_protein_list:
+    for protein_str in tqdm(query_protein_list, desc="Querying PubChem"):
         query_count += 1
 
         sparql_query_template = Template(sparql_query)

diff --git a/src/pyBiodatafuse/id_mapper.py b/src/pyBiodatafuse/id_mapper.py
@@ -108,14 +108,16 @@ def bridgedb_xref(
     if not input_datasource:
         raise ValueError("Please provide the identifier datasource, e.g. HGNC")
 
-    if output_datasource is None or "All":
+    if output_datasource is None:
         output_datasource = [
             "Uniprot-TrEMBL",
             "NCBI Gene",
             "Ensembl",
             "HGNC Accession Number",
             "HGNC",
         ]
+    else:
+        assert isinstance(output_datasource, list), "output_datasource must be a list"
 
     data_sources = read_resource_files()
     input_source = data_sources.loc[data_sources["source"] == input_datasource, "systemCode"].iloc[
@@ -182,13 +184,13 @@ def bridgedb_xref(
     bridgedb = bridgedb.dropna(subset=["target.source"])
 
     # Subset based on the output_datasource
-    bridgedb = bridgedb[bridgedb["target.source"].isin(output_datasource)]
+    bridgedb_subset = bridgedb[bridgedb["target.source"].isin(output_datasource)]
 
-    bridgedb = bridgedb.drop_duplicates()
+    bridgedb_subset = bridgedb_subset.drop_duplicates()
     identifiers.columns = [
         "{}{}".format(c, "" if c in "identifier" else "_dea") for c in identifiers.columns
     ]
-    bridgedb = bridgedb.merge(identifiers, on="identifier")
+    bridgedb_subset = bridgedb_subset.merge(identifiers, on="identifier")
 
     """Metadata details"""
     # Get the current date and time
@@ -216,7 +218,7 @@ def bridgedb_xref(
         },
     }
 
-    return bridgedb, bridgedb_metadata
+    return bridgedb_subset, bridgedb_metadata
 
 
 """PubChem helper functions."""