Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add fetch-eggnog-proteins action #109

Merged
merged 20 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions q2_moshpit/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,27 @@
import subprocess
from typing import List

# ANSI escape sequences used to color terminal output, keyed by short names
# (the key set is meant to mirror a curses-style palette). The trailing
# comments describe the standard meaning of each SGR code; the actual shade
# shown depends on the terminal.
SHELL_COLORS = {
    # Bold white text on a colored background
    "wr": "\033[1;37;41m",  # red background
    "wo": "\033[1;37;43m",  # yellow background (stands in for orange)
    "wm": "\033[1;37;45m",  # magenta background
    "wb": "\033[1;37;46m",  # cyan background (stands in for blue)
    "bw": "\033[1;37;40m",  # black background
    # Bold foreground colors
    "lblue": "\033[1;34m",  # bold blue
    "lred": "\033[1;31m",  # bold red
    "lgreen": "\033[1;32m",  # bold green
    "yellow": "\033[1;33m",  # bold yellow
    # Regular foreground colors
    "cyan": "\033[36m",  # cyan
    "blue": "\033[34m",  # blue
    "green": "\033[32m",  # green
    "orange": "\033[33m",  # yellow (stands in for orange)
    "red": "\033[31m",  # red
    "magenta": "\033[35m",  # magenta
    # Reset codes
    "white": "\033[0m",  # resets all attributes to the terminal default
    None: "\033[0m",  # end marker: same reset code
}

Sann5 marked this conversation as resolved.
Show resolved Hide resolved

def run_command(cmd, env=None, verbose=True, pipe=False):
if verbose:
Expand Down Expand Up @@ -70,3 +91,7 @@ def _process_common_input_params(processing_func, params: dict) -> List[str]:
continue

return processed_args


def colorify(string, color):
    """
    Wrap `string` in the escape code for `color`, followed by the reset code.

    `color` must be a key of `SHELL_COLORS` (a `KeyError` is raised
    otherwise); the reset code is the entry stored under the `None` key.
    `string` is converted with `str()` before being wrapped.
    """
    start_code = SHELL_COLORS[color]
    reset_code = SHELL_COLORS[None]
    return f"{start_code}{string!s}{reset_code}"
8 changes: 6 additions & 2 deletions q2_moshpit/eggnog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
# ----------------------------------------------------------------------------


# NOTE(review): the first import below takes `fetch_eggnog_db` from
# `._method`, while the two imports after it take it from `._dbs` instead.
# The first line looks like the pre-change version left over from the diff
# view - confirm which one is current.
from ._method import (eggnog_diamond_search, eggnog_annotate, fetch_eggnog_db)
from ._method import (eggnog_diamond_search, eggnog_annotate)
from ._dbs import fetch_eggnog_db, fetch_eggnog_fasta

# NOTE(review): `__all__` is assigned twice; the second assignment replaces
# the first and additionally exports `fetch_eggnog_fasta`. The first one is
# presumably the pre-change line from the diff view - confirm.
__all__ = ['eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db']
__all__ = [
'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
'fetch_eggnog_fasta'
]
89 changes: 89 additions & 0 deletions q2_moshpit/eggnog/_dbs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
from q2_types_genomics.reference_db import (
EggnogRefDirFmt, EggnogSequenceTaxaDirFmt
)
from .._utils import run_command, colorify


def fetch_eggnog_db() -> EggnogRefDirFmt:
    """
    Fetch the eggNOG reference database.

    The download is delegated to eggNOG's `download_eggnog_data.py`
    script, which here downloads 3 files amounting to 47Gb in total.
    The files are saved into a newly created `EggnogRefDirFmt`
    directory, which is returned.
    """
    # Directory format that receives the downloaded files
    db_dir = EggnogRefDirFmt()

    # Flags handed to download_eggnog_data.py:
    #   -y          answer yes to all prompts thrown by the script
    #   -D          do not download the Diamond database
    #   --data_dir  location where the downloads are saved
    download_cmd = ["download_eggnog_data.py", "-y", "-D"]
    download_cmd += ["--data_dir", str(db_dir.path)]
    run_command(download_cmd)

    return db_dir


def fetch_eggnog_fasta() -> EggnogSequenceTaxaDirFmt:
    """
    Downloads the eggNOG proteome sequences and their taxonomy table.

    `wget` is used to fetch `e5.proteomes.faa` and `e5.taxid_info.tsv`
    from eggnog5.embl.de into a newly created
    `EggnogSequenceTaxaDirFmt` directory, which is returned. Colored
    progress messages are printed before and after each download.
    """
    base_url = "http://eggnog5.embl.de/download/eggnog_5.0"
    fasta_name = "e5.proteomes.faa"
    taxonomy_name = "e5.taxid_info.tsv"

    # Directory format that will hold both downloaded files
    eggnog_fa = EggnogSequenceTaxaDirFmt()
    out_dir = str(eggnog_fa)

    # Protein sequences; -O makes wget write to the given path
    print(colorify("Downloading fasta file...", "lgreen"))
    fasta_cmd = [
        "wget", "-e", "robots=off",
        "-O", os.path.join(out_dir, fasta_name),
        f"{base_url}/{fasta_name}"
    ]
    run_command(cmd=fasta_cmd)

    # Taxonomy information for the downloaded sequences
    taxonomy_msg = "Download completed.\nDownloading taxonomy file..."
    print(colorify(taxonomy_msg, "lgreen"))
    taxonomy_cmd = [
        "wget", "-e", "robots=off",
        "-O", os.path.join(out_dir, taxonomy_name),
        f"{base_url}/{taxonomy_name}"
    ]
    run_command(cmd=taxonomy_cmd)

    # Tell the user that the downloads are done. The actual copying
    # will be taken care of by qiime behind the scenes.
    done_msg = (
        "Download completed. \n"
        "Copying files from temporary directory to final location "
        "(this will take a few minutes)..."
    )
    print(colorify(done_msg, "lgreen"))

    return eggnog_fa
26 changes: 0 additions & 26 deletions q2_moshpit/eggnog/_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from q2_types_genomics.reference_db import EggnogRefDirFmt
from q2_types.feature_data import DNAFASTAFormat
from q2_types_genomics.reference_db import DiamondDatabaseDirFmt
from .._utils import run_command
from q2_types_genomics.feature_data import (
OrthologAnnotationDirFmt, MAGSequencesDirFmt
)
Expand Down Expand Up @@ -132,28 +131,3 @@ def _annotate_seed_orthologs_runner(seed_ortholog, eggnog_db, sample_label,
cmds.append('--dbmem')

subprocess.run(cmds, check=True)


def fetch_eggnog_db() -> EggnogRefDirFmt:
    """
    Fetch the eggNOG reference database.

    The download is delegated to eggNOG's `download_eggnog_data.py`
    script, which here downloads 3 files amounting to 47Gb in total.
    The files are saved into a newly created `EggnogRefDirFmt`
    directory, which is returned.
    """
    # Directory format that receives the downloaded files
    db_dir = EggnogRefDirFmt()

    # Flags handed to download_eggnog_data.py:
    #   -y          answer yes to all prompts thrown by the script
    #   -D          do not download the Diamond database
    #   --data_dir  location where the downloads are saved
    download_cmd = ["download_eggnog_data.py", "-y", "-D"]
    download_cmd += ["--data_dir", str(db_dir.path)]
    run_command(download_cmd)

    return db_dir
55 changes: 55 additions & 0 deletions q2_moshpit/eggnog/tests/test_dbs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
from unittest.mock import patch, call
from qiime2.plugin.testing import TestPluginBase
from .._dbs import fetch_eggnog_db, fetch_eggnog_fasta


class TestFetchDB(TestPluginBase):
    # Verifies the exact commands that the eggNOG fetch actions hand to
    # `subprocess.run`; that call is patched, so nothing is actually run.
    package = 'q2_moshpit.eggnog.tests'

    @patch("subprocess.run")
    def test_fetch_eggnog_db(self, subp_run):
        # With subprocess.run patched, the download script never executes
        db_dir = fetch_eggnog_db()

        # The download script must have been invoked exactly once
        subp_run.assert_called_once_with(
            [
                "download_eggnog_data.py", "-y", "-D",
                "--data_dir", str(db_dir)
            ],
            check=True
        )

    @patch("subprocess.run")
    def test_fetch_eggnog_fasta(self, subp_run):
        # With subprocess.run patched, wget never executes
        out_dir = str(fetch_eggnog_fasta())
        base_url = "http://eggnog5.embl.de/download/eggnog_5.0"

        # One wget call per file: the fasta file first, then the taxonomy
        expected_calls = [
            call(
                [
                    "wget", "-e", "robots=off", "-O",
                    os.path.join(out_dir, file_name),
                    f"{base_url}/{file_name}"
                ],
                check=True
            )
            for file_name in ("e5.proteomes.faa", "e5.taxid_info.tsv")
        ]

        # The calls have to appear in this order
        subp_run.assert_has_calls(expected_calls, any_order=False)
22 changes: 1 addition & 21 deletions q2_moshpit/eggnog/tests/test_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,12 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pandas as pd
import pandas.testing as pdt
from unittest.mock import patch
import qiime2
from qiime2.plugin.testing import TestPluginBase

from q2_types_genomics.feature_data import MAGSequencesDirFmt
from .._method import eggnog_diamond_search, eggnog_annotate, fetch_eggnog_db
from .._method import eggnog_diamond_search, eggnog_annotate
from q2_types_genomics.reference_db import (
DiamondDatabaseDirFmt, EggnogRefDirFmt)
from q2_types_genomics.per_sample_data import ContigSequencesDirFmt
Expand Down Expand Up @@ -85,20 +82,3 @@ def test_small_good_hits(self):
self.assertEqual(len(objs), 1)
df = objs[0][1].view(pd.DataFrame)
pdt.assert_frame_equal(df, exp)


class TestFetchDB(TestPluginBase):
    # Verifies the exact command that `fetch_eggnog_db` hands to
    # `subprocess.run`; that call is patched, so nothing is actually run.
    package = 'q2_moshpit.eggnog.tests'

    @patch("subprocess.run")
    def test_fetch_eggnog_db(self, subp_run):
        # With subprocess.run patched, the download script never executes
        db_dir = fetch_eggnog_db()

        # The download script must have been invoked exactly once
        subp_run.assert_called_once_with(
            [
                "download_eggnog_data.py", "-y", "-D",
                "--data_dir", str(db_dir)
            ],
            check=True
        )
23 changes: 22 additions & 1 deletion q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
from q2_types_genomics.kraken2._type import BrackenDB
from q2_types_genomics.per_sample_data import MAGs, Contigs
from q2_types_genomics.per_sample_data._type import AlignmentMap
from q2_types_genomics.reference_db import ReferenceDB, Diamond, Eggnog
from q2_types_genomics.reference_db import (
ReferenceDB, Diamond, Eggnog, EggnogSequenceTaxa
)

citations = Citations.load('citations.bib', package='q2_moshpit')

Expand Down Expand Up @@ -505,6 +507,25 @@
"storage space is required to run this action. "
)

# Register `fetch_eggnog_fasta` as a plugin method. It takes no inputs or
# parameters and produces a single `ReferenceDB[EggnogSequenceTaxa]` output.
# The implicitly concatenated string fragments below each end with a space
# so that the words at the line breaks do not run together.
plugin.methods.register_function(
    function=q2_moshpit.eggnog.fetch_eggnog_fasta,
    inputs={},
    parameters={},
    outputs=[("eggnog_fasta", ReferenceDB[EggnogSequenceTaxa])],
    output_descriptions={
        "eggnog_fasta": "Artifact containing the eggNOG database "
                        "of protein sequences and their corresponding "
                        "taxonomy information."
    },
    name="Fetch the databases necessary to run the "
         "build-eggnog-diamond-db action.",
    description="Downloads eggnog proteome database. "
                "This script downloads 2 files: "
                "(e5.proteomes.faa and e5.taxid_info.tsv) "
                "and creates an artifact with them. At least 18 Gb of "
                "storage space is required to run this action. "
)

plugin.methods.register_function(
function=q2_moshpit.eggnog.eggnog_diamond_search,
inputs={
Expand Down
Loading