ENH: add a build-eggnog-diamond-db action (#116)

* Ignore runinfo * Add colorify to utils * Register action * colorify only with green * Update q2_moshpit/plugin_setup.py Co-authored-by: Michal Ziemski <[email protected]> * Updated the output name and description * Implement build_eggnog_diamond_db action * Add test * Further refactor EggnogSequenceTaxa to EggnogProteinSequences * Add validation for taxon IDs and corresponding test * Eliminate duplicated method * Apply suggestions from code review Co-authored-by: Michal Ziemski <[email protected]> * unit tests for _validate_taxon_id function --------- Co-authored-by: Michal Ziemski <[email protected]>
bokulich-lab · Jan 19, 2024 · f4b9644 · f4b9644
1 parent af7fb96
commit f4b9644
Show file tree

Hide file tree

Showing 7 changed files with 243 additions and 5 deletions.
diff --git a/.gitignore b/.gitignore
@@ -139,5 +139,5 @@ dmypy.json
 # Ignore notebooks
 **/*.ipynb
 
-# Ignore parsl stuff
+# Ignore parsl dir
 runinfo
diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py
@@ -8,11 +8,12 @@
 from ._method import eggnog_diamond_search, eggnog_annotate
 from ._dbs import (
     fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
-    fetch_eggnog_proteins
+    fetch_eggnog_proteins, build_eggnog_diamond_db
 )
 
 
 __all__ = [
     'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
-    'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins'
+    'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins',
+    'build_eggnog_diamond_db',
 ]
diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
@@ -6,7 +6,9 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
+import pandas as pd
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
+import shutil
 from q2_types_genomics.reference_db import (
     EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
     EggnogProteinSequencesDirFmt
@@ -174,3 +176,62 @@ def fetch_eggnog_proteins() -> EggnogProteinSequencesDirFmt:
     ))
 
     return eggnog_fa
+
+
+def build_eggnog_diamond_db(
+        eggnog_proteins: EggnogProteinSequencesDirFmt,
+        taxon: int
+) -> DiamondDatabaseDirFmt:
+    """
+    Build a DIAMOND database from the eggNOG protein sequences that
+    belong to a single taxon.
+
+    Parameters
+    ----------
+    eggnog_proteins : EggnogProteinSequencesDirFmt
+        Directory passed to ``create_dbs.py`` as ``--data_dir``.
+    taxon : int
+        Taxon ID passed to ``create_dbs.py`` as ``--taxids``.
+
+    Returns
+    -------
+    DiamondDatabaseDirFmt
+        Directory format into which ``ref_db.dmnd`` has been moved.
+
+    Raises
+    ------
+    ValueError
+        If ``_validate_taxon_id`` rejects ``taxon``.
+    """
+    # Reject unknown taxon IDs before any external command is started
+    _validate_taxon_id(eggnog_proteins, taxon)
+
+    diamond_db = DiamondDatabaseDirFmt()
+    proteins_dir = str(eggnog_proteins)
+    db_filename = "ref_db.dmnd"
+
+    run_command([
+        "create_dbs.py",
+        "--data_dir", proteins_dir,
+        "--taxids", str(taxon),
+        "--dbname", "ref_db"
+    ])
+
+    # create_dbs.py writes the database inside the eggnog_proteins
+    # directory, so it has to be relocated into the output directory.
+    shutil.move(
+        os.path.join(proteins_dir, db_filename),
+        os.path.join(str(diamond_db), db_filename)
+    )
+
+    return diamond_db
+
+
+def _validate_taxon_id(eggnog_proteins, taxon):
+    """
+    Check that ``taxon`` occurs in at least one lineage listed in the
+    ``e5.taxid_info.tsv`` file found in ``eggnog_proteins``.
+
+    Parameters
+    ----------
+    eggnog_proteins
+        Object whose ``str()`` is the directory containing
+        ``e5.taxid_info.tsv``.
+    taxon : int
+        Taxon ID to look up; it is compared in its string form.
+
+    Raises
+    ------
+    ValueError
+        If ``taxon`` is not part of any entry of the "Taxid Lineage"
+        column.
+    """
+    # Read the lineage column as text. With type inference a column that
+    # only holds single-ID lineages is parsed as integers, which cannot
+    # be split on commas.
+    taxid_info = pd.read_csv(
+        os.path.join(str(eggnog_proteins), "e5.taxid_info.tsv"),
+        sep="\t",
+        dtype={"Taxid Lineage": str}
+    )
+
+    # Collect every ID mentioned in any lineage; empty cells are skipped
+    tax_ids = {
+        tax_id.strip()
+        for lineage in taxid_info["Taxid Lineage"].dropna()
+        for tax_id in lineage.split(",")
+    }
+
+    if str(taxon) not in tax_ids:
+        raise ValueError(
+            f"'{taxon}' is not valid taxon ID. "
+            "To view all valid taxon IDs inspect e5.taxid_info.tsv "
+            "file in the eggnog_proteins input."
+        )
diff --git a/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv b/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv
diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -10,10 +10,12 @@
 from qiime2.plugin.testing import TestPluginBase
 from .._dbs import (
     fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins,
-    fetch_diamond_db
+    fetch_diamond_db, build_eggnog_diamond_db, _validate_taxon_id
 )
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
-from q2_types_genomics.reference_db import NCBITaxonomyDirFmt
+from q2_types_genomics.reference_db import (
+    NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt
+)
 
 
 class TestFetchDB(TestPluginBase):
@@ -147,3 +149,48 @@ def test_fetch_eggnog_fasta(self, subp_run):
 
         # Check that commands are ran as expected
         subp_run.assert_has_calls([first_call, second_call], any_order=False)
+
+    @patch("q2_moshpit.eggnog._dbs._validate_taxon_id")
+    @patch("subprocess.run")
+    @patch("shutil.move")
+    def test_build_eggnog_diamond_db(self, shut_mv, subp_run, _val):
+        """
+        The action runs create_dbs.py exactly once and moves ref_db.dmnd
+        from the input directory into the returned directory format.
+        """
+        # Taxon validation, subprocess.run and shutil.move are patched,
+        # so no external command is run and no file is moved.
+        eggnog_input = EggnogProteinSequencesDirFmt()
+        result = build_eggnog_diamond_db(eggnog_input, taxon=2)
+
+        input_dir, output_dir = str(eggnog_input), str(result)
+
+        # The external script must be invoked once with these arguments
+        subp_run.assert_called_once_with(
+            [
+                "create_dbs.py",
+                "--data_dir", input_dir,
+                "--taxids", "2",
+                "--dbname", "ref_db"
+            ],
+            check=True
+        )
+
+        # The generated database must be moved from input to output
+        shut_mv.assert_called_once_with(
+            os.path.join(input_dir, "ref_db.dmnd"),
+            os.path.join(output_dir, "ref_db.dmnd")
+        )
+
+    def test_validate_taxon_id_invalid(self):
+        """Taxon 0 is treated as invalid and must raise a ValueError."""
+        eggnog_proteins = EggnogProteinSequencesDirFmt(
+            self.get_data_path('build_eggnog_diamond_db/'), 'r'
+        )
+
+        expected_msg = "'0' is not valid taxon ID. "
+        with self.assertRaisesRegex(ValueError, expected_msg):
+            _validate_taxon_id(eggnog_proteins, 0)
+
+    def test_validate_taxon_id_valid(self):
+        """Taxon 2 must pass validation without raising."""
+        fixture_dir = self.get_data_path('build_eggnog_diamond_db/')
+        _validate_taxon_id(
+            EggnogProteinSequencesDirFmt(fixture_dir, 'r'), 2
+        )
diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
@@ -591,6 +591,34 @@
                 "storage space is required to run this action. "
 )
 
+# Register the build-eggnog-diamond-db action: it takes the eggNOG protein
+# sequences plus a taxon ID and produces a DIAMOND reference database.
+plugin.methods.register_function(
+    function=q2_moshpit.eggnog.build_eggnog_diamond_db,
+    inputs={
+        'eggnog_proteins': ReferenceDB[EggnogProteinSequences],
+    },
+    input_descriptions={
+        'eggnog_proteins': "eggNOG database of protein sequences and "
+                           "their corresponding taxonomy information "
+                           "(generated through the `fetch-eggnog-proteins` "
+                           "action)."
+    },
+    parameters={
+        # NOTE(review): qiime2's Range is believed to exclude its upper
+        # bound by default - confirm whether 1579337 itself should be
+        # accepted as a taxon ID.
+        'taxon': Int % Range(2, 1579337)
+    },
+    parameter_descriptions={
+        'taxon': "Taxon ID number."
+    },
+    outputs=[("diamond_db", ReferenceDB[Diamond])],
+    # Adjacent string literals are joined without a separator, so each
+    # fragment that continues on the next line ends with a space.
+    output_descriptions={
+        "diamond_db": "Complete Diamond reference database for the "
+                      "specified taxon."
+    },
+    name="Create a DIAMOND formatted reference database for the "
+         "specified taxon.",
+    description="Creates a DIAMOND database which contains the protein "
+                "sequences that belong to the specified taxon.",
+)
+
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.eggnog_diamond_search,
     inputs={

diff --git a/setup.py b/setup.py
@@ -47,6 +47,7 @@
         ],
         'q2_moshpit.eggnog': [
             'tests/data/*',
+            'tests/data/build_eggnog_diamond_db/*',
             'tests/data/contig-sequences-1/*',
             'tests/data/mag-sequences/*',
             'tests/data/random-db-1/*',