Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add fetch-ncbi-taxonomy action #118

Merged
merged 26 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
194a545
Add colorify to utils
Sann5 Dec 22, 2023
ea08a40
Register action
Sann5 Dec 22, 2023
471aaad
colorify only with green
Sann5 Jan 11, 2024
4fd93d8
Update q2_moshpit/plugin_setup.py
Sann5 Jan 11, 2024
bc9f7c6
Updated the output name and description
Sann5 Jan 12, 2024
452824f
Implement build_eggnog_diamond_db action
Sann5 Jan 12, 2024
f4e711f
Add test
Sann5 Jan 12, 2024
9b0a4c0
Further refactor EggnogSequenceTaxa to EggnogProteinSequences
Sann5 Jan 15, 2024
db0a0c8
Add validation for taxon IDs and corresponding test
Sann5 Jan 15, 2024
527bfec
Implement fetch-ncbi-taxonomy
Sann5 Jan 18, 2024
92cb2f5
_write_version_tsv functionality to separate function.
Sann5 Jan 18, 2024
2cdae75
implement tests
Sann5 Jan 18, 2024
205913a
fix bug in tests
Sann5 Jan 18, 2024
8233505
Add ellipsis to green prompts
Sann5 Jan 18, 2024
650160d
remove duplicate action
Sann5 Jan 19, 2024
5cfb55b
Update q2_moshpit/plugin_setup.py
Sann5 Jan 19, 2024
35386b6
Merge branch 'main' into fetch_ncbi_iss_107
Sann5 Jan 22, 2024
1633654
correct indentation
Sann5 Jan 22, 2024
ca2dedb
Merge branch 'main' into fetch_ncbi_iss_107
Sann5 Jan 22, 2024
1ef872d
Remove version file + adjust tests
Sann5 Jan 23, 2024
a0bf457
Adjust file size in prompt
Sann5 Jan 23, 2024
f680567
Reorganize fetch_ncbi_taxonomy
Sann5 Jan 23, 2024
af0f652
Add tests
Sann5 Jan 23, 2024
c4e28f0
Update q2_moshpit/eggnog/_dbs.py
Sann5 Jan 24, 2024
301ae00
Eliminate duplicated action
Sann5 Jan 24, 2024
9a2de57
Reveiw comments Michal
Sann5 Jan 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion q2_moshpit/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import subprocess
import hashlib
from typing import List


Expand All @@ -26,7 +27,7 @@
return result

if env:
subprocess.run(cmd, env=env, check=True, **kwargs)

Check warning on line 30 in q2_moshpit/_utils.py

View check run for this annotation

Codecov / codecov/patch

q2_moshpit/_utils.py#L30

Added line #L30 was not covered by tests
else:
subprocess.run(cmd, check=True, **kwargs)

Expand Down Expand Up @@ -72,5 +73,14 @@
return processed_args


def colorify(string):
def colorify(string: str) -> str:
    """Return ``string`` wrapped in ANSI codes for bold green terminal text.

    The SGR sequence ``1;32`` (bold, green foreground) is emitted before the
    text and the reset sequence ``0`` after it, so that anything printed
    afterwards is not affected by the colouring.

    Parameters
    ----------
    string : str
        Text to be highlighted.

    Returns
    -------
    str
        The text surrounded by the start and reset escape sequences.
    """
    return f"\033[1;32m{string}\033[0m"


def _calculate_md5_from_file(file_path: str, chunk_size: int = 4096) -> str:
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so
    that arbitrarily large files can be hashed without loading them into
    memory in one go.

    Parameters
    ----------
    file_path : str
        Path to the file to hash.
    chunk_size : int, optional
        Number of bytes read per iteration. Defaults to 4096; a larger
        value reduces the number of reads needed for multi-gigabyte files.

    Returns
    -------
    str
        Lowercase hexadecimal MD5 digest of the file contents.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive integer.
    """
    # read(0) returns b"" immediately (yielding the digest of an empty
    # input) and a negative size reads the whole file at once, so both are
    # rejected explicitly instead of silently misbehaving.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}."
        )

    md5_hash = hashlib.md5()
    with open(file_path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                # An empty bytes object signals end of file
                break
            md5_hash.update(chunk)
    return md5_hash.hexdigest()
6 changes: 6 additions & 0 deletions q2_moshpit/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,9 @@ @article{buchfink_sensitive_2021
keywords = {Computational biology and bioinformatics, Genome informatics, Genomic analysis, Sequencing, Software},
pages = {366--368},
}

@misc{NCBI,
title = {National Center for Biotechnology Information (NCBI)},
url = {https://www.ncbi.nlm.nih.gov/},
note = {Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information;},
}
4 changes: 2 additions & 2 deletions q2_moshpit/eggnog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
from ._method import eggnog_diamond_search, eggnog_annotate
from ._dbs import (
fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
fetch_eggnog_proteins, build_eggnog_diamond_db
fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy
)


__all__ = [
'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins',
'build_eggnog_diamond_db',
'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy'
]
100 changes: 93 additions & 7 deletions q2_moshpit/eggnog/_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,18 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
import shutil
import pandas as pd
from qiime2.core.exceptions import ValidationError
from q2_types.feature_data import ProteinSequencesDirectoryFormat
import shutil
from q2_types_genomics.reference_db import (
EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
EggnogProteinSequencesDirFmt
)
from .._utils import run_command, _process_common_input_params, colorify
from .._utils import (
run_command, _process_common_input_params, colorify,
_calculate_md5_from_file
)
from ._utils import _parse_build_diamond_db_params


Expand Down Expand Up @@ -229,9 +233,91 @@ def _validate_taxon_id(eggnog_proteins, taxon):
)

# Check for overlap with provided taxon id
if not str(taxon) in tax_ids:
raise ValueError(
f"'{taxon}' is not valid taxon ID. "
"To view all valid taxon IDs inspect e5.taxid_info.tsv "
"file in the eggnog_proteins input."
if not str(taxon) in tax_ids:
raise ValueError(
f"'{taxon}' is not valid taxon ID. "
"To view all valid taxon IDs inspect e5.taxid_info.tsv "
"file in the eggnog_proteins input."
)


def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
    """
    Download the NCBI taxonomy dump and the protein accession-to-taxid
    mapping into a new NCBITaxonomyDirFmt object and return it.

    Each download is accompanied by its ".md5" companion file and checked
    with _collect_and_compare_md5. From the taxonomy dump only names.dmp
    and nodes.dmp are extracted, after which the zip archive is deleted.
    """
    def _download_and_verify(url: str, destination: str) -> None:
        # Fetch the data file first, then its checksum file, and finally
        # compare the two.
        for suffix in ("", ".md5"):
            run_command(
                cmd=["wget", "-O", f"{destination}{suffix}", f"{url}{suffix}"]
            )
        _collect_and_compare_md5(f"{destination}.md5", destination)

    ncbi_data = NCBITaxonomyDirFmt()
    target_dir = str(ncbi_data)
    zip_path = os.path.join(target_dir, "taxdmp.zip")
    proteins_path = os.path.join(target_dir, "prot.accession2taxid.gz")

    # Taxonomy dump: download, verify, unpack the two needed files
    print(colorify("Downloading *.dmp files..."))
    _download_and_verify(
        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip",
        zip_path
    )
    unzip_cmd = [
        "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
        "-d", target_dir
    ]
    run_command(cmd=unzip_cmd)
    os.remove(zip_path)

    # Protein accession to taxon ID mapping: download and verify
    print(colorify("Downloading proteins file (~8 GB)..."))
    _download_and_verify(
        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
        "prot.accession2taxid.gz",
        proteins_path
    )

    print(colorify(
        "Done! Moving data from temporary directory to final location..."
    ))
    return ncbi_data


def _collect_and_compare_md5(path_to_md5: str, path_to_file: str):
    """Verify a file against the MD5 hash stored in its checksum file.

    The expected hash is the first whitespace-separated token on the first
    line of ``path_to_md5``. It is compared with the hash computed from
    ``path_to_file``. On success the checksum file is deleted; on failure
    it is left in place.

    Parameters
    ----------
    path_to_md5 : str
        Path to the checksum file.
    path_to_file : str
        Path to the file whose integrity is checked.

    Raises
    ------
    ValidationError
        If the checksum file contains no hash on its first line, or if the
        expected and observed hashes differ.
    """
    # Read in hash from md5 file
    with open(path_to_md5, 'r') as f:
        fields = f.readline().split()

    # An empty or whitespace-only first line (e.g. a truncated download)
    # would otherwise surface as an uninformative IndexError.
    if not fields:
        raise ValidationError(
            "Download error. Data possibly corrupted.\n"
            f"{path_to_md5} does not contain an MD5 hash."
        )

    # hexdigest() always returns lowercase, so normalise before comparing
    expected_hash = fields[0].lower()

    # Calculate hash from file
    observed_hash = _calculate_md5_from_file(path_to_file)

    if observed_hash != expected_hash:
        raise ValidationError(
            "Download error. Data possibly corrupted.\n"
            f"{path_to_file} has an unexpected MD5 hash.\n\n"
            "Expected hash:\n"
            f"{expected_hash}\n\n"
            "Observed hash:\n"
            f"{observed_hash}"
        )

    # If no exception is raised, remove md5 file
    os.remove(path_to_md5)
1 change: 1 addition & 0 deletions q2_moshpit/eggnog/tests/data/md5/a.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I am a text file. Calculate an MD% hash from me.
1 change: 1 addition & 0 deletions q2_moshpit/eggnog/tests/data/md5/a.txt.md5
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
a583054a9831a6e7cc56ea5cd9cac40a a.txt
1 change: 1 addition & 0 deletions q2_moshpit/eggnog/tests/data/md5/b.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I am a another text file.
93 changes: 92 additions & 1 deletion q2_moshpit/eggnog/tests/test_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
import os
from unittest.mock import patch, call
from qiime2.plugin.testing import TestPluginBase
from qiime2.core.exceptions import ValidationError
from .._dbs import (
fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins,
fetch_diamond_db, build_eggnog_diamond_db, _validate_taxon_id
fetch_diamond_db, build_eggnog_diamond_db, fetch_ncbi_taxonomy,
_validate_taxon_id, _collect_and_compare_md5
)
from q2_types.feature_data import ProteinSequencesDirectoryFormat
from q2_types_genomics.reference_db import (
Expand Down Expand Up @@ -150,6 +152,95 @@ def test_fetch_eggnog_fasta(self, subp_run):
# Check that commands are ran as expected
subp_run.assert_has_calls([first_call, second_call], any_order=False)

@patch("q2_moshpit.eggnog._dbs._collect_and_compare_md5")
@patch("subprocess.run")
@patch("os.remove")
def test_fetch_ncbi_taxonomy(self, mock_os_rm, mock_run, mock_md5):
    """The action issues the expected wget/unzip commands, in order,
    verifies both downloads and removes the taxonomy zip archive."""
    # Everything that would touch the network or disk is patched out
    ncbi_data = fetch_ncbi_taxonomy()
    out_dir = str(ncbi_data)
    zip_path = os.path.join(out_dir, "taxdmp.zip")
    proteins_path = os.path.join(out_dir, "prot.accession2taxid.gz")

    dump_url = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
    proteins_url = (
        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
        "prot.accession2taxid.gz"
    )

    def wget(url, destination):
        # Shape of the subprocess.run call expected for one download
        return call(["wget", "-O", destination, url], check=True)

    unzip = call(
        [
            "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
            "-d", out_dir
        ],
        check=True,
    )
    expected_calls = [
        wget(dump_url, zip_path),
        wget(f"{dump_url}.md5", f"{zip_path}.md5"),
        unzip,
        wget(proteins_url, proteins_path),
        wget(f"{proteins_url}.md5", f"{proteins_path}.md5"),
    ]
    expected_md5_checks = [
        call(f"{zip_path}.md5", zip_path),
        call(f"{proteins_path}.md5", proteins_path),
    ]

    # Only the zip archive is deleted by the action itself
    mock_os_rm.assert_called_once_with(zip_path)
    mock_run.assert_has_calls(expected_calls, any_order=False)
    mock_md5.assert_has_calls(expected_md5_checks, any_order=False)

@patch("os.remove")
def test_collect_and_compare_md5_valid(self, mock_os_rm):
    """A matching hash raises nothing and the md5 file gets removed."""
    data_file = self.get_data_path("md5/a.txt")
    md5_file = f"{data_file}.md5"

    # Must complete without raising
    _collect_and_compare_md5(md5_file, data_file)

    # The checksum file is the one and only thing removed
    mock_os_rm.assert_called_once_with(md5_file)

@patch("os.remove")
def test_collect_and_compare_md5_invalid(self, mock_os_rm):
    """A mismatching hash raises ValidationError and removes nothing."""
    # Pair b.txt with the checksum that belongs to a.txt
    data_file = self.get_data_path("md5/b.txt")
    mismatched_md5 = self.get_data_path("md5/a.txt.md5")

    expected_message = "has an unexpected MD5 hash"
    with self.assertRaisesRegex(ValidationError, expected_message):
        _collect_and_compare_md5(mismatched_md5, data_file)

    # On failure the checksum file must be left untouched
    mock_os_rm.assert_not_called()

@patch("q2_moshpit.eggnog._dbs._validate_taxon_id")
@patch("subprocess.run")
@patch("shutil.move")
Expand Down
19 changes: 19 additions & 0 deletions q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,25 @@
"storage space is required to run this action. "
)


# Register fetch_ncbi_taxonomy as a plugin method. The action takes no
# inputs and no parameters and produces a single output named "taxonomy"
# of type ReferenceDB[NCBITaxonomy]. The "NCBI" entry of the plugin's
# citations is attached to the action.
plugin.methods.register_function(
function=q2_moshpit.eggnog.fetch_ncbi_taxonomy,
inputs={},
parameters={},
outputs=[("taxonomy", ReferenceDB[NCBITaxonomy])],
output_descriptions={
"taxonomy": "NCBI reference taxonomy."
},
name="Fetch NCBI reference taxonomy",
# The adjacent string literals below are concatenated into one description
description="Downloads NCBI reference taxonomy from the NCBI FTP server. "
"The resulting artifact is required by the "
"build-custom-diamond-db action if one wished to "
"create a Diamond data base with taxonomy features. "
"At least 30 GB of "
"storage space is required to run this action.",
citations=[citations["NCBI"]]
)

plugin.methods.register_function(
function=q2_moshpit.eggnog.build_eggnog_diamond_db,
inputs={
Expand Down
1 change: 1 addition & 0 deletions q2_moshpit/tests/data/md5/a.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I am a text file. Calculate an MD% hash from me.
1 change: 1 addition & 0 deletions q2_moshpit/tests/data/md5/b.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I am a another text file.
18 changes: 14 additions & 4 deletions q2_moshpit/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import unittest

from qiime2.plugin.testing import TestPluginBase

from .._utils import _construct_param, _process_common_input_params
from .._utils import (
_construct_param, _process_common_input_params,
_calculate_md5_from_file
)


def fake_processing_func(key, val):
Expand Down Expand Up @@ -113,6 +113,16 @@ def test_process_common_inputs_mix_with_falsy_values(self):
]
self.assertSetEqual(set(observed), set(expected))

def test_calculate_md5_from_pass(self):
    """The digest computed for a.txt equals its known MD5 hash."""
    known_hash = "a583054a9831a6e7cc56ea5cd9cac40a"
    digest = _calculate_md5_from_file(self.get_data_path("md5/a.txt"))
    self.assertEqual(digest, known_hash)

def test_calculate_md5_from_fail(self):
    """The digest computed for b.txt differs from the hash of a.txt."""
    hash_of_other_file = "a583054a9831a6e7cc56ea5cd9cac40a"
    digest = _calculate_md5_from_file(self.get_data_path("md5/b.txt"))
    self.assertNotEqual(digest, hash_of_other_file)


if __name__ == '__main__':
unittest.main()
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
'q2_moshpit': [
'citations.bib',
'tests/data/*',
'tests/data/md5/*',
"assets/busco/*",
"assets/busco/js/*",
"assets/busco/css/*",
Expand All @@ -47,6 +48,7 @@
],
'q2_moshpit.eggnog': [
'tests/data/*',
'tests/data/md5/*',
'tests/data/build_eggnog_diamond_db/*',
misialq marked this conversation as resolved.
Show resolved Hide resolved
'tests/data/contig-sequences-1/*',
'tests/data/mag-sequences/*',
Expand Down
Loading