def colorify(string: str) -> str:
    """Wrap *string* in ANSI escape codes for terminal highlighting.

    The text is preceded by the SGR sequence ``ESC[1;32m`` (bold, green
    foreground) and followed by ``ESC[0m``, which resets all attributes.
    """
    return "%s%s%s" % ('\033[1;32m', string, "\033[0m")


def _calculate_md5_from_file(
        file_path: str, chunk_size: int = 1024 * 1024
) -> str:
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is opened in binary mode and fed to the hash in pieces of at
    most *chunk_size* bytes, so memory use stays bounded regardless of the
    file size.

    Parameters
    ----------
    file_path : str
        Path of the file to hash.
    chunk_size : int, optional
        Maximum number of bytes requested per read. The digest does not
        depend on this value; it only trades memory for the number of
        reads. Defaults to 1 MiB.

    Returns
    -------
    str
        The MD5 digest as a lowercase hexadecimal string.

    Raises
    ------
    ValueError
        If *chunk_size* is not positive.
    """
    # read(0) returns b"" immediately, which would end the loop at once and
    # silently yield the digest of empty input; refuse such sizes up front.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}."
        )

    md5_hash = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel keeps calling read() until it returns b"",
        # i.e. until the end of the file is reached.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()
def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
    """
    Fetch the NCBI taxonomy files into a new NCBITaxonomyDirFmt directory.

    Two archives are downloaded with wget from the NCBI FTP server, each
    together with its ``.md5`` file, and checked against that hash:

    - ``taxdmp.zip``, from which only ``names.dmp`` and ``nodes.dmp`` are
      extracted; the archive itself is deleted afterwards,
    - ``prot.accession2taxid.gz``, which is kept as downloaded.

    Returns
    -------
    NCBITaxonomyDirFmt
        Directory format holding the three files listed above.

    Raises
    ------
    ValidationError
        If a downloaded file does not match its published MD5 hash.
    """
    ncbi_data = NCBITaxonomyDirFmt()
    out_dir = str(ncbi_data)
    zip_path = os.path.join(out_dir, "taxdmp.zip")
    proteins_path = os.path.join(out_dir, "prot.accession2taxid.gz")
    taxonomy_url = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy"

    # Download dump zip file + MD5 file
    print(colorify("Downloading *.dmp files..."))
    _download_and_verify(f"{taxonomy_url}/taxdmp.zip", zip_path)

    # -j drops the directory structure stored in the archive so that both
    # files land directly in the output directory.
    run_command(
        cmd=[
            "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
            "-d", out_dir
        ]
    )
    os.remove(zip_path)

    # Download proteins + MD5 file
    print(colorify("Downloading proteins file (~8 GB)..."))
    _download_and_verify(
        f"{taxonomy_url}/accession2taxid/prot.accession2taxid.gz",
        proteins_path
    )

    print(colorify(
        "Done! Moving data from temporary directory to final location..."
    ))
    return ncbi_data


def _download_and_verify(url: str, destination: str) -> None:
    """Download *url* and ``<url>.md5`` with wget, then check the hash.

    The file is written to *destination* and its hash file to
    ``<destination>.md5``; verification is delegated to
    _collect_and_compare_md5.
    """
    md5_path = f"{destination}.md5"
    run_command(cmd=["wget", "-O", destination, url])
    run_command(cmd=["wget", "-O", md5_path, f"{url}.md5"])
    _collect_and_compare_md5(md5_path, destination)


def _collect_and_compare_md5(path_to_md5: str, path_to_file: str) -> None:
    """Check that *path_to_file* has the MD5 hash stored in *path_to_md5*.

    The expected hash is the first whitespace-separated token on the first
    line of the MD5 file. On success the MD5 file is deleted; on failure it
    is left in place.

    Raises
    ------
    ValidationError
        If the MD5 file holds no hash on its first line, or if the hash
        computed from *path_to_file* differs from the expected one.
    """
    # Read in hash from md5 file
    with open(path_to_md5, 'r') as f:
        fields = f.readline().split()

    # An empty or blank first line would otherwise surface as a bare
    # IndexError that says nothing about the failed download.
    if not fields:
        raise ValidationError(
            "Download error. Data possibly corrupted.\n"
            f"{path_to_md5} does not contain an MD5 hash."
        )

    # hexdigest() is lowercase, so normalise the published hash as well.
    expected_hash = fields[0].lower()

    # Calculate hash from file
    observed_hash = _calculate_md5_from_file(path_to_file)

    if observed_hash != expected_hash:
        raise ValidationError(
            "Download error. Data possibly corrupted.\n"
            f"{path_to_file} has an unexpected MD5 hash.\n\n"
            "Expected hash:\n"
            f"{expected_hash}\n\n"
            "Observed hash:\n"
            f"{observed_hash}"
        )

    # If no exception is raised, remove md5 file
    os.remove(path_to_md5)
@patch("q2_moshpit.eggnog._dbs._collect_and_compare_md5")
@patch("subprocess.run")
@patch("os.remove")
def test_fetch_ncbi_taxonomy(self, mock_os_rm, mock_run, mock_md5):
    """fetch_ncbi_taxonomy issues the expected commands in order.

    subprocess.run, os.remove and the MD5 check are all patched, so
    nothing is actually downloaded, extracted, deleted or verified.
    """
    ncbi_data = fetch_ncbi_taxonomy()
    target_dir = str(ncbi_data)
    zip_path = os.path.join(target_dir, "taxdmp.zip")
    proteins_path = os.path.join(target_dir, "prot.accession2taxid.gz")

    taxonomy_ftp = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/"
    zip_url = taxonomy_ftp + "taxdmp.zip"
    proteins_url = taxonomy_ftp + "accession2taxid/prot.accession2taxid.gz"

    # One entry per external command, in the order they must be run.
    expected_commands = [
        ["wget", "-O", zip_path, zip_url],
        ["wget", "-O", zip_path + ".md5", zip_url + ".md5"],
        [
            "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
            "-d", target_dir
        ],
        ["wget", "-O", proteins_path, proteins_url],
        ["wget", "-O", proteins_path + ".md5", proteins_url + ".md5"],
    ]

    # Only the zip archive is removed by the function under test itself.
    mock_os_rm.assert_called_once_with(zip_path)
    mock_run.assert_has_calls(
        [call(command, check=True) for command in expected_commands],
        any_order=False
    )
    mock_md5.assert_has_calls(
        [
            call(zip_path + ".md5", zip_path),
            call(proteins_path + ".md5", proteins_path),
        ],
        any_order=False
    )


@patch("os.remove")
def test_collect_and_compare_md5_valid(self, mock_os_rm):
    """A matching hash raises nothing and the .md5 file is removed."""
    data_file = self.get_data_path("md5/a.txt")
    md5_file = f"{data_file}.md5"

    # Must complete without raising
    _collect_and_compare_md5(md5_file, data_file)

    mock_os_rm.assert_called_once_with(md5_file)


@patch("os.remove")
def test_collect_and_compare_md5_invalid(self, mock_os_rm):
    """A mismatching hash raises ValidationError and removes nothing."""
    data_file = self.get_data_path("md5/b.txt")
    # Hash file that belongs to a.txt, hence wrong for b.txt
    mismatched_md5 = self.get_data_path("md5/a.txt.md5")

    with self.assertRaisesRegex(
        ValidationError, "has an unexpected MD5 hash"
    ):
        _collect_and_compare_md5(mismatched_md5, data_file)

    mock_os_rm.assert_not_called()
# Register fetch_ncbi_taxonomy as a plugin method. It takes no inputs and no
# parameters and produces a single ReferenceDB[NCBITaxonomy] output.
plugin.methods.register_function(
    function=q2_moshpit.eggnog.fetch_ncbi_taxonomy,
    inputs={},
    parameters={},
    outputs=[("taxonomy", ReferenceDB[NCBITaxonomy])],
    output_descriptions={"taxonomy": "NCBI reference taxonomy."},
    name="Fetch NCBI reference taxonomy",
    description=(
        "Downloads NCBI reference taxonomy from the NCBI FTP server. "
        "The resulting artifact is required by the build-custom-diamond-db "
        "action if one wished to create a Diamond data base with taxonomy "
        "features. At least 30 GB of storage space is required to run this "
        "action."
    ),
    citations=[citations["NCBI"]],
)
def test_calculate_md5_from_pass(self):
    """The digest computed for md5/a.txt equals the expected hash."""
    expected_hash = "a583054a9831a6e7cc56ea5cd9cac40a"
    data_file = self.get_data_path("md5/a.txt")
    self.assertEqual(_calculate_md5_from_file(data_file), expected_hash)


def test_calculate_md5_from_fail(self):
    """The digest computed for md5/b.txt differs from that same hash."""
    unexpected_hash = "a583054a9831a6e7cc56ea5cd9cac40a"
    data_file = self.get_data_path("md5/b.txt")
    self.assertNotEqual(_calculate_md5_from_file(data_file), unexpected_hash)