bokulich-lab · Sann5 · Jan 29, 2024 · Dec 22, 2023 · Dec 22, 2023 · Jan 11, 2024
diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py
@@ -6,6 +6,7 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import subprocess
+import hashlib
 from typing import List
 
 
@@ -26,7 +27,7 @@
        return result

    if env:
        subprocess.run(cmd, env=env, check=True, **kwargs)
    else:
        subprocess.run(cmd, check=True, **kwargs)

@@ -72,5 +73,14 @@
     return processed_args
 
 
-def colorify(string):
+def colorify(string: str) -> str:  # wrap text in ANSI bold green, then reset
     return "%s%s%s" % ('\033[1;32m', string, "\033[0m")
+
+
+def _calculate_md5_from_file(file_path: str) -> str:
+    """Return the hex MD5 digest of ``file_path``, streamed in chunks."""
+    digest = hashlib.md5()
+    with open(file_path, 'rb') as handle:
+        for block in iter(lambda: handle.read(4096), b""):
+            digest.update(block)
+    return digest.hexdigest()
diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib
@@ -123,3 +123,9 @@ @article{buchfink_sensitive_2021
 	keywords = {Computational biology and bioinformatics, Genome informatics, Genomic analysis, Sequencing, Software},
 	pages = {366--368},
 }
+
+@misc{NCBI,
+  title = {National Center for Biotechnology Information (NCBI)},
+  url = {https://www.ncbi.nlm.nih.gov/},
+  note = {Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information;},
+}
diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py
@@ -8,12 +8,12 @@
 from ._method import eggnog_diamond_search, eggnog_annotate
 from ._dbs import (
     fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
-    fetch_eggnog_proteins, build_eggnog_diamond_db
+    fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy
 )
 
 
 __all__ = [
     'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
     'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins',
-    'build_eggnog_diamond_db',
+    'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy',
 ]
diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
@@ -6,14 +6,18 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
+import shutil
 import pandas as pd
+from qiime2.core.exceptions import ValidationError
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
-import shutil
 from q2_types_genomics.reference_db import (
     EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
     EggnogProteinSequencesDirFmt
 )
-from .._utils import run_command, _process_common_input_params, colorify
+from .._utils import (
+    run_command, _process_common_input_params, colorify,
+    _calculate_md5_from_file
+)
 from ._utils import _parse_build_diamond_db_params
 
 
@@ -229,9 +233,91 @@ def _validate_taxon_id(eggnog_proteins, taxon):
         )
 
     # Check for overlap with provided taxon id
-    if not str(taxon) in tax_ids:
-        raise ValueError(
-            f"'{taxon}' is not valid taxon ID. "
-            "To view all valid taxon IDs inspect e5.taxid_info.tsv "
-            "file in the eggnog_proteins input."
+    if str(taxon) not in tax_ids:  # validate once, at function scope
+        raise ValueError(
+            f"'{taxon}' is not valid taxon ID. "
+            "To view all valid taxon IDs inspect e5.taxid_info.tsv "
+            "file in the eggnog_proteins input."
+        )
+
+
+def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
+    """
+    Download the NCBI taxonomy dump and the protein accession-to-taxid map.
+
+    Each archive is fetched with ``wget`` together with its ``.md5`` file and
+    checked by ``_collect_and_compare_md5`` before it is used. Only
+    ``names.dmp`` and ``nodes.dmp`` are extracted from ``taxdmp.zip``, and the
+    zip is deleted afterwards.
+
+    Returns
+    -------
+    NCBITaxonomyDirFmt
+        Directory holding ``names.dmp``, ``nodes.dmp`` and
+        ``prot.accession2taxid.gz``.
+    """
+    ncbi_data = NCBITaxonomyDirFmt()
+    target_dir = str(ncbi_data)
+    zip_path = os.path.join(target_dir, "taxdmp.zip")
+    proteins_path = os.path.join(target_dir, "prot.accession2taxid.gz")
+
+    # Taxonomy dump: archive first, then its checksum file
+    print(colorify("Downloading *.dmp files..."))
+    zip_md5_path = f"{zip_path}.md5"
+    run_command(cmd=[
+        "wget", "-O", zip_path,
+        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
+    ])
+    run_command(cmd=[
+        "wget", "-O", zip_md5_path,
+        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip.md5"
+    ])
+    _collect_and_compare_md5(zip_md5_path, zip_path)
+
+    # Keep only the two dump files, flattened (-j) into the output folder
+    run_command(cmd=[
+        "unzip", "-j", zip_path, "names.dmp", "nodes.dmp", "-d", target_dir
+    ])
+    os.remove(zip_path)
+
+    # Protein accession -> taxid map, again followed by its checksum file
+    print(colorify("Downloading proteins file (~8 GB)..."))
+    proteins_md5_path = f"{proteins_path}.md5"
+    run_command(cmd=[
+        "wget", "-O", proteins_path,
+        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+        "prot.accession2taxid.gz"
+    ])
+    run_command(cmd=[
+        "wget", "-O", proteins_md5_path,
+        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+        "prot.accession2taxid.gz.md5"
+    ])
+    _collect_and_compare_md5(proteins_md5_path, proteins_path)
+
+    print(colorify(
+        "Done! Moving data from temporary directory to final location..."
+    ))
+    return ncbi_data
+
+
+def _collect_and_compare_md5(path_to_md5: str, path_to_file: str):
+    """
+    Raise ValidationError unless ``path_to_file`` has the MD5 hash stored
+    in ``path_to_md5``; on success the md5 file is deleted.
+    """
+    with open(path_to_md5, 'r') as md5_file:
+        # The hash is the first whitespace-separated token of line one
+        expected_hash = md5_file.readline().split()[0]
+    observed_hash = _calculate_md5_from_file(path_to_file)
+    if observed_hash != expected_hash:
+        raise ValidationError(
+            "Download error. Data possibly corrupted.\n"
+            f"{path_to_file} has an unexpected MD5 hash.\n\n"
+            "Expected hash:\n"
+            f"{expected_hash}\n\n"
+            "Observed hash:\n"
+            f"{observed_hash}"
         )
+
+    os.remove(path_to_md5)
diff --git a/q2_moshpit/eggnog/tests/data/md5/a.txt b/q2_moshpit/eggnog/tests/data/md5/a.txt
@@ -0,0 +1 @@
+I am a text file. Calculate an MD% hash from me.
diff --git a/q2_moshpit/eggnog/tests/data/md5/a.txt.md5 b/q2_moshpit/eggnog/tests/data/md5/a.txt.md5
@@ -0,0 +1 @@
+a583054a9831a6e7cc56ea5cd9cac40a a.txt
diff --git a/q2_moshpit/eggnog/tests/data/md5/b.txt b/q2_moshpit/eggnog/tests/data/md5/b.txt
@@ -0,0 +1 @@
+I am a another text file. 
diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -8,9 +8,11 @@
 import os
 from unittest.mock import patch, call
 from qiime2.plugin.testing import TestPluginBase
+from qiime2.core.exceptions import ValidationError
 from .._dbs import (
     fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins,
-    fetch_diamond_db, build_eggnog_diamond_db, _validate_taxon_id
+    fetch_diamond_db, build_eggnog_diamond_db, fetch_ncbi_taxonomy,
+    _validate_taxon_id, _collect_and_compare_md5
 )
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
 from q2_types_genomics.reference_db import (
@@ -150,6 +152,95 @@ def test_fetch_eggnog_fasta(self, subp_run):
         # Check that commands are ran as expected
         subp_run.assert_has_calls([first_call, second_call], any_order=False)
 
+    @patch("q2_moshpit.eggnog._dbs._collect_and_compare_md5")
+    @patch("subprocess.run")
+    @patch("os.remove")
+    def test_fetch_ncbi_taxonomy(self, mock_os_rm, mock_run, mock_md5):
+        # subprocess.run, os.remove and the md5 check are mocked out
+        ncbi_data = fetch_ncbi_taxonomy()
+        out_dir = str(ncbi_data)
+        zip_path = os.path.join(out_dir, "taxdmp.zip")
+        proteins_path = os.path.join(out_dir, "prot.accession2taxid.gz")
+        zip_md5_path = f"{zip_path}.md5"
+        proteins_md5_path = f"{proteins_path}.md5"
+
+        # Commands expected from subprocess.run, in the order they are issued
+        expected_run_calls = [
+            call(
+                [
+                    "wget", "-O", zip_path,
+                    "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
+                ],
+                check=True
+            ),
+            call(
+                [
+                    "wget", "-O", zip_md5_path,
+                    "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip.md5"
+                ],
+                check=True
+            ),
+            call(
+                [
+                    "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
+                    "-d", out_dir
+                ],
+                check=True,
+            ),
+            call(
+                [
+                    "wget", "-O", proteins_path,
+                    "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+                    "prot.accession2taxid.gz"
+                ],
+                check=True
+            ),
+            call(
+                [
+                    "wget", "-O", proteins_md5_path,
+                    "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+                    "prot.accession2taxid.gz.md5"
+                ],
+                check=True
+            )
+        ]
+
+        # Only the zip goes through os.remove here; the md5 helper is mocked
+        mock_os_rm.assert_called_once_with(zip_path)
+        mock_run.assert_has_calls(expected_run_calls, any_order=False)
+        mock_md5.assert_has_calls(
+            [
+                call(zip_md5_path, zip_path),
+                call(proteins_md5_path, proteins_path),
+            ],
+            any_order=False
+        )
+
+    @patch("os.remove")
+    def test_collect_and_compare_md5_valid(self, mock_os_rm):
+        data_file = self.get_data_path("md5/a.txt")
+        md5_file = f"{data_file}.md5"
+
+        # Matching hash: the call must return without raising ...
+        _collect_and_compare_md5(md5_file, data_file)
+        # ... and the md5 file is the one thing handed to os.remove
+        mock_os_rm.assert_called_once_with(md5_file)
+
+    @patch("os.remove")
+    def test_collect_and_compare_md5_invalid(self, mock_os_rm):
+        # Pair b.txt with the checksum file that belongs to a.txt
+        data_file = self.get_data_path("md5/b.txt")
+        mismatched_md5 = self.get_data_path("md5/a.txt.md5")
+        expected_msg = "has an unexpected MD5 hash"
+
+        # The mismatch must surface as a ValidationError
+        with self.assertRaisesRegex(ValidationError, expected_msg):
+            _collect_and_compare_md5(mismatched_md5, data_file)
+
+        # The md5 file is only removed after a successful comparison,
+        # so os.remove must not have been called
+        mock_os_rm.assert_not_called()
+
     @patch("q2_moshpit.eggnog._dbs._validate_taxon_id")
     @patch("subprocess.run")
     @patch("shutil.move")

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
@@ -591,6 +591,25 @@
                 "storage space is required to run this action. "
 )
 
+
+# Expose fetch_ncbi_taxonomy as a plugin method; it takes no inputs or
+# parameters and yields a single ReferenceDB[NCBITaxonomy] artifact.
+plugin.methods.register_function(
+    function=q2_moshpit.eggnog.fetch_ncbi_taxonomy,
+    inputs={},
+    parameters={},
+    outputs=[("taxonomy", ReferenceDB[NCBITaxonomy])],
+    output_descriptions={"taxonomy": "NCBI reference taxonomy."},
+    name="Fetch NCBI reference taxonomy",
+    description="Downloads NCBI reference taxonomy from the NCBI FTP server. "
+                "The resulting artifact is required by the "
+                "build-custom-diamond-db action if one wishes to "
+                "create a Diamond database with taxonomy features. "
+                "At least 30 GB of "
+                "storage space is required to run this action.",
+    citations=[citations["NCBI"]]
+)
+
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.build_eggnog_diamond_db,
     inputs={

diff --git a/q2_moshpit/tests/data/md5/a.txt b/q2_moshpit/tests/data/md5/a.txt
@@ -0,0 +1 @@
+I am a text file. Calculate an MD% hash from me.
diff --git a/q2_moshpit/tests/data/md5/b.txt b/q2_moshpit/tests/data/md5/b.txt
@@ -0,0 +1 @@
+I am a another text file. 
diff --git a/q2_moshpit/tests/test_utils.py b/q2_moshpit/tests/test_utils.py
@@ -5,12 +5,12 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
 import unittest
-
 from qiime2.plugin.testing import TestPluginBase
-
-from .._utils import _construct_param, _process_common_input_params
+from .._utils import (
+    _construct_param, _process_common_input_params,
+    _calculate_md5_from_file
+)
 
 
 def fake_processing_func(key, val):
@@ -113,6 +113,16 @@ def test_process_common_inputs_mix_with_falsy_values(self):
         ]
         self.assertSetEqual(set(observed), set(expected))
 
+    def test_calculate_md5_from_pass(self):
+        # The digest must match the known MD5 of md5/a.txt
+        digest = _calculate_md5_from_file(self.get_data_path("md5/a.txt"))
+        self.assertEqual(digest, "a583054a9831a6e7cc56ea5cd9cac40a")
+
+    def test_calculate_md5_from_fail(self):
+        # A different file must not hash to the digest of md5/a.txt
+        digest = _calculate_md5_from_file(self.get_data_path("md5/b.txt"))
+        self.assertNotEqual(digest, "a583054a9831a6e7cc56ea5cd9cac40a")
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/setup.py b/setup.py
@@ -28,6 +28,7 @@
         'q2_moshpit': [
             'citations.bib',
             'tests/data/*',
+            'tests/data/md5/*',
             "assets/busco/*",
             "assets/busco/js/*",
             "assets/busco/css/*",
@@ -47,6 +48,7 @@
         ],
         'q2_moshpit.eggnog': [
             'tests/data/*',
+            'tests/data/md5/*',
             'tests/data/build_eggnog_diamond_db/*',
             'tests/data/contig-sequences-1/*',
             'tests/data/mag-sequences/*',
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		I am a text file. Calculate an MD% hash from me.