Skip to content

Commit

Permalink
ENH: add a build-eggnog-diamond-db action (#116)
Browse files Browse the repository at this point in the history
* Ignore runinfo

* Add colorify to utils

* Register action

* colorify only with green

* Update q2_moshpit/plugin_setup.py

Co-authored-by: Michal Ziemski <[email protected]>

* Updated the output name and description

* Implement build_eggnog_diamond_db action

* Add test

* Further refactor EggnogSequenceTaxa to EggnogProteinSequences

* Add validation for taxon IDs and corresponding test

* Eliminate duplicated method

* Apply suggestions from code review

Co-authored-by: Michal Ziemski <[email protected]>

* unit tests for _validate_taxon_id function

---------

Co-authored-by: Michal Ziemski <[email protected]>
  • Loading branch information
Sann5 and misialq authored Jan 19, 2024
1 parent af7fb96 commit f4b9644
Show file tree
Hide file tree
Showing 7 changed files with 243 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,5 +139,5 @@ dmypy.json
# Ignore notebooks
**/*.ipynb

# Ignore parsl stuff
# Ignore parsl dir
runinfo
5 changes: 3 additions & 2 deletions q2_moshpit/eggnog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
from ._method import eggnog_diamond_search, eggnog_annotate
from ._dbs import (
fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
fetch_eggnog_proteins
fetch_eggnog_proteins, build_eggnog_diamond_db
)


__all__ = [
'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins'
'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins',
'build_eggnog_diamond_db',
]
61 changes: 61 additions & 0 deletions q2_moshpit/eggnog/_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
import pandas as pd
from q2_types.feature_data import ProteinSequencesDirectoryFormat
import shutil
from q2_types_genomics.reference_db import (
EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
EggnogProteinSequencesDirFmt
Expand Down Expand Up @@ -174,3 +176,62 @@ def fetch_eggnog_proteins() -> EggnogProteinSequencesDirFmt:
))

return eggnog_fa


def build_eggnog_diamond_db(
        eggnog_proteins: EggnogProteinSequencesDirFmt,
        taxon: int
) -> DiamondDatabaseDirFmt:
    """
    Build a DIAMOND database holding only the protein sequences
    that belong to the given taxon.

    Parameters
    ----------
    eggnog_proteins : EggnogProteinSequencesDirFmt
        Directory format passed to ``create_dbs.py`` as ``--data_dir``;
        it must also contain the ``e5.taxid_info.tsv`` file used for
        taxon ID validation.
    taxon : int
        Taxon ID whose sequences should end up in the database.

    Returns
    -------
    DiamondDatabaseDirFmt
        Directory format containing the generated ``ref_db.dmnd`` file.

    Raises
    ------
    ValueError
        Raised by ``_validate_taxon_id`` when ``taxon`` is not a known
        taxon ID.
    """
    # Fail before doing any work if the taxon ID is unknown
    _validate_taxon_id(eggnog_proteins, taxon)

    proteins_dir = str(eggnog_proteins)
    db_file_name = "ref_db.dmnd"

    # Output artifact directory
    diamond_db = DiamondDatabaseDirFmt()

    # Delegate the actual database construction to the external script
    run_command([
        "create_dbs.py",
        "--data_dir", proteins_dir,
        "--taxids", str(taxon),
        "--dbname", "ref_db"
    ])

    # The script writes the database inside the eggnog_proteins
    # directory, so relocate it into the output directory
    shutil.move(
        os.path.join(proteins_dir, db_file_name),
        os.path.join(str(diamond_db), db_file_name)
    )

    return diamond_db


def _validate_taxon_id(eggnog_proteins, taxon):
    """
    Check that ``taxon`` appears in the taxonomy table shipped with the
    eggNOG protein sequences.

    Parameters
    ----------
    eggnog_proteins
        Object whose ``str()`` is the path of a directory containing
        the ``e5.taxid_info.tsv`` file.
    taxon : int
        Taxon ID to look up.

    Raises
    ------
    ValueError
        If ``taxon`` is not found in any entry of the "Taxid Lineage"
        column.
    """
    # Read every column as text: with inferred dtypes a lineage holding a
    # single ID is parsed as a number, which cannot be split on commas.
    taxid_info = pd.read_csv(
        os.path.join(str(eggnog_proteins), "e5.taxid_info.tsv"),
        sep="\t",
        dtype=str
    )

    # Collect every ID mentioned in any lineage. Rows without a lineage
    # are read as NaN and skipped; surrounding whitespace is ignored.
    tax_ids = {
        tax_id.strip()
        for lineage in taxid_info["Taxid Lineage"].dropna()
        for tax_id in lineage.split(",")
    }

    # Reject the provided taxon ID if it is not among the known ones
    if str(taxon) not in tax_ids:
        raise ValueError(
            f"'{taxon}' is not valid taxon ID. "
            "To view all valid taxon IDs inspect e5.taxid_info.tsv "
            "file in the eggnog_proteins input."
        )
100 changes: 100 additions & 0 deletions q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv

Large diffs are not rendered by default.

51 changes: 49 additions & 2 deletions q2_moshpit/eggnog/tests/test_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
from qiime2.plugin.testing import TestPluginBase
from .._dbs import (
fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins,
fetch_diamond_db
fetch_diamond_db, build_eggnog_diamond_db, _validate_taxon_id
)
from q2_types.feature_data import ProteinSequencesDirectoryFormat
from q2_types_genomics.reference_db import NCBITaxonomyDirFmt
from q2_types_genomics.reference_db import (
NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt
)


class TestFetchDB(TestPluginBase):
Expand Down Expand Up @@ -147,3 +149,48 @@ def test_fetch_eggnog_fasta(self, subp_run):

# Check that commands are run as expected
subp_run.assert_has_calls([first_call, second_call], any_order=False)

@patch("q2_moshpit.eggnog._dbs._validate_taxon_id")
@patch("subprocess.run")
@patch("shutil.move")
def test_build_eggnog_diamond_db(self, shut_mv, subp_run, _val):
    """
    build_eggnog_diamond_db runs create_dbs.py once and then moves the
    resulting ref_db.dmnd into the returned directory format.

    Taxon validation, subprocess.run and shutil.move are all patched,
    so no external command is executed and no file is touched.
    """
    proteins = EggnogProteinSequencesDirFmt()

    result_db = build_eggnog_diamond_db(proteins, taxon=2)

    # The external script must be invoked exactly once, with the
    # input directory, the taxon ID and the database name
    subp_run.assert_called_once_with(
        [
            "create_dbs.py",
            "--data_dir", str(proteins),
            "--taxids", "2",
            "--dbname", "ref_db"
        ],
        check=True
    )

    # The database file must be moved from the input directory to
    # the output directory
    db_file_name = "ref_db.dmnd"
    shut_mv.assert_called_once_with(
        os.path.join(str(proteins), db_file_name),
        os.path.join(str(result_db), db_file_name)
    )

def test_validate_taxon_id_invalid(self):
    """_validate_taxon_id raises ValueError for taxon ID 0."""
    # Load the fixture directory holding e5.taxid_info.tsv
    fixture_dir = self.get_data_path('build_eggnog_diamond_db/')
    proteins = EggnogProteinSequencesDirFmt(fixture_dir, 'r')

    # Taxon 0 is invalid, so the call must fail with this message
    expected_msg = "'0' is not valid taxon ID. "
    with self.assertRaisesRegex(ValueError, expected_msg):
        _validate_taxon_id(proteins, 0)

def test_validate_taxon_id_valid(self):
    """_validate_taxon_id accepts taxon ID 2 without raising."""
    proteins = EggnogProteinSequencesDirFmt(
        self.get_data_path('build_eggnog_diamond_db/'), 'r'
    )

    # Taxon 2 is expected to be present in the fixture; the test
    # passes as long as no exception is raised
    _validate_taxon_id(proteins, 2)
28 changes: 28 additions & 0 deletions q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,34 @@
"storage space is required to run this action. "
)

# Register the action that builds a taxon-specific DIAMOND database
# from the eggNOG protein sequences.
plugin.methods.register_function(
    function=q2_moshpit.eggnog.build_eggnog_diamond_db,
    inputs={
        'eggnog_proteins': ReferenceDB[EggnogProteinSequences],
    },
    input_descriptions={
        'eggnog_proteins': "eggNOG database of protein sequences and "
                           "their corresponding taxonomy information "
                           "(generated through the `fetch-eggnog-proteins` "
                           "action)."
    },
    parameters={
        # NOTE(review): Range excludes its upper bound by default, so
        # 1579337 itself is rejected - confirm this is intended.
        'taxon': Int % Range(2, 1579337)
    },
    parameter_descriptions={
        'taxon': "Taxon ID number."
    },
    outputs=[("diamond_db", ReferenceDB[Diamond])],
    # Adjacent string literals are concatenated verbatim, so each
    # fragment that is continued must end with a space.
    output_descriptions={
        "diamond_db": "Complete Diamond reference database for the "
                      "specified taxon."
    },
    name="Create a DIAMOND formatted reference database for the "
         "specified taxon.",
    description="Creates a DIAMOND database which contains the protein "
                "sequences that belong to the specified taxon.",
)

plugin.methods.register_function(
function=q2_moshpit.eggnog.eggnog_diamond_search,
inputs={
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
],
'q2_moshpit.eggnog': [
'tests/data/*',
'tests/data/build_eggnog_diamond_db/*',
'tests/data/contig-sequences-1/*',
'tests/data/mag-sequences/*',
'tests/data/random-db-1/*',
Expand Down

0 comments on commit f4b9644

Please sign in to comment.