From 194a54539cf2a1d994b4822f9a641d53e3816019 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Fri, 22 Dec 2023 15:02:10 +0100
Subject: [PATCH 01/24] Add colorify to utils

---
 q2_moshpit/_utils.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py
index 331f592d..941bed2d 100644
--- a/q2_moshpit/_utils.py
+++ b/q2_moshpit/_utils.py
@@ -8,6 +8,27 @@
 import subprocess
 from typing import List
 
+# CONVERT shell colors to the same curses palette
+SHELL_COLORS = {
+    "wr": '\033[1;37;41m',  # white on red
+    "wo": '\033[1;37;43m',  # white on orange
+    "wm": '\033[1;37;45m',  # white on magenta
+    "wb": '\033[1;37;46m',  # white on blue
+    "bw": '\033[1;37;40m',  # black on white
+    "lblue": '\033[1;34m',  # light blue
+    "lred": '\033[1;31m',  # light red
+    "lgreen": '\033[1;32m',  # light green
+    "yellow": '\033[1;33m',  # yellow
+    "cyan": '\033[36m',  # cyan
+    "blue": '\033[34m',  # blue
+    "green": '\033[32m',  # green
+    "orange": '\033[33m',  # orange
+    "red": '\033[31m',  # red
+    "magenta": "\033[35m",  # magenta
+    "white": "\033[0m",  # white
+    None: "\033[0m",  # end
+}
+
 
 def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs):
     if verbose:

From ea08a403eb6338976013c62113ab75599b03d25c Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Fri, 22 Dec 2023 15:19:32 +0100
Subject: [PATCH 02/24] Register action

---
 q2_moshpit/plugin_setup.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index fc18b0ec..fa1d4751 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -591,6 +591,25 @@
                 "storage space is required to run this action. "
 )
 
+plugin.methods.register_function(
+    function=q2_moshpit.eggnog.fetch_eggnog_fasta,
+    inputs={},
+    parameters={},
+    outputs=[("eggnog_fasta", ReferenceDB[EggnogSequenceTaxa])],
+    output_descriptions={
+        "eggnog_fasta": "Artifact containing the eggNOG database "
+                        "of protein sequences and their corresponding"
+                        "taxonomy information."
+    },
+    name="Fetch the databases necessary to run to run the "
+         "build-eggnog-diamond-db action.",
+    description="Downloads eggnog proteome database  "
+                "This script downloads 2 files: "
+                "(e5.proteomes.faa and e5.taxid_info.tsv) "
+                "and creates and artifact with them. At least 18 Gb of "
+                "storage space is required to run this action. "
+)
+
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.eggnog_diamond_search,
     inputs={

From 471aaad4863d61fa9b36a055b73fffabe9fcd2ff Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Thu, 11 Jan 2024 15:05:20 +0100
Subject: [PATCH 03/24] colorify only with green

---
 q2_moshpit/_utils.py | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py
index 941bed2d..331f592d 100644
--- a/q2_moshpit/_utils.py
+++ b/q2_moshpit/_utils.py
@@ -8,27 +8,6 @@
 import subprocess
 from typing import List
 
-# CONVERT shell colors to the same curses palette
-SHELL_COLORS = {
-    "wr": '\033[1;37;41m',  # white on red
-    "wo": '\033[1;37;43m',  # white on orange
-    "wm": '\033[1;37;45m',  # white on magenta
-    "wb": '\033[1;37;46m',  # white on blue
-    "bw": '\033[1;37;40m',  # black on white
-    "lblue": '\033[1;34m',  # light blue
-    "lred": '\033[1;31m',  # light red
-    "lgreen": '\033[1;32m',  # light green
-    "yellow": '\033[1;33m',  # yellow
-    "cyan": '\033[36m',  # cyan
-    "blue": '\033[34m',  # blue
-    "green": '\033[32m',  # green
-    "orange": '\033[33m',  # orange
-    "red": '\033[31m',  # red
-    "magenta": "\033[35m",  # magenta
-    "white": "\033[0m",  # white
-    None: "\033[0m",  # end
-}
-
 
 def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs):
     if verbose:

From 4fd93d8f9c6e6cee55524bc29ac70e5c0096450a Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <54123712+Sann5@users.noreply.github.com>
Date: Thu, 11 Jan 2024 15:06:52 +0100
Subject: [PATCH 04/24] Update q2_moshpit/plugin_setup.py

Co-authored-by: Michal Ziemski <mziemski@ethz.ch>
---
 q2_moshpit/plugin_setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index fa1d4751..a450dc25 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -604,9 +604,9 @@
     name="Fetch the databases necessary to run to run the "
          "build-eggnog-diamond-db action.",
     description="Downloads eggnog proteome database  "
-                "This script downloads 2 files: "
+                "This script downloads 2 files "
                 "(e5.proteomes.faa and e5.taxid_info.tsv) "
-                "and creates and artifact with them. At least 18 Gb of "
+                "and creates and artifact with them. At least 18 GB of "
                 "storage space is required to run this action. "
 )
 

From bc9f7c64faa6ffc195da95551743d18443d00c83 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Fri, 12 Jan 2024 14:23:53 +0100
Subject: [PATCH 05/24] Updated the output name and description

---
 q2_moshpit/plugin_setup.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index a450dc25..f0b84141 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -597,9 +597,8 @@
     parameters={},
     outputs=[("eggnog_fasta", ReferenceDB[EggnogSequenceTaxa])],
     output_descriptions={
-        "eggnog_fasta": "Artifact containing the eggNOG database "
-                        "of protein sequences and their corresponding"
-                        "taxonomy information."
+        "eggnog_proteins": "eggNOG database of protein sequences and "
+                           "their corresponding taxonomy information."
     },
     name="Fetch the databases necessary to run to run the "
          "build-eggnog-diamond-db action.",

From 452824f2a675718a737cfd7314b75b4850c365e6 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Fri, 12 Jan 2024 14:17:36 +0100
Subject: [PATCH 06/24] Implement build_eggnog_diamond_db action

---
 q2_moshpit/eggnog/__init__.py |  5 +++--
 q2_moshpit/eggnog/_dbs.py     | 32 ++++++++++++++++++++++++++++++++
 q2_moshpit/plugin_setup.py    | 28 ++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py
index c6e7f8cc..9a176a43 100644
--- a/q2_moshpit/eggnog/__init__.py
+++ b/q2_moshpit/eggnog/__init__.py
@@ -8,11 +8,12 @@
 from ._method import eggnog_diamond_search, eggnog_annotate
 from ._dbs import (
     fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
-    fetch_eggnog_proteins
+    fetch_eggnog_proteins, build_eggnog_diamond_db
 )
 
 
 __all__ = [
     'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
-    'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins'
+    'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins',
+    'build_eggnog_diamond_db',
 ]
diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index ea7382fe..8bc71dd8 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -7,6 +7,7 @@
 # ----------------------------------------------------------------------------
 import os
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
+import shutil
 from q2_types_genomics.reference_db import (
     EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
     EggnogProteinSequencesDirFmt
@@ -174,3 +175,34 @@ def fetch_eggnog_proteins() -> EggnogProteinSequencesDirFmt:
     ))
 
     return eggnog_fa
+
+
+def build_eggnog_diamond_db(
+        eggnog_proteins: EggnogProteinSequencesDirFmt,
+        taxon: str
+) -> DiamondDatabaseDirFmt:
+    """
+    Creates an DIAMOND database which contains the protein
+    sequences that belong to the specified taxon.
+    """
+
+    # Initialize output objects
+    diamond_db = DiamondDatabaseDirFmt()
+
+    # Define command.
+    cmd = [
+        "create_dbs.py",
+        "--data_dir", str(eggnog_proteins),
+        "--taxids", taxon,
+        "--dbname", "ref_db"
+    ]
+    run_command(cmd)
+
+    # The script will create the diamond DB in side the directory of
+    # eggnog_proteins object, so we need to move it to diamond_db
+    source_path = os.path.join(str(eggnog_proteins), "ref_db.dmnd")
+    destination_path = os.path.join(str(diamond_db), "ref_db.dmnd")
+    shutil.move(source_path, destination_path)
+
+    # Return objects
+    return diamond_db
diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index f0b84141..e518da8b 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -609,6 +609,34 @@
                 "storage space is required to run this action. "
 )
 
+plugin.methods.register_function(
+    function=q2_moshpit.eggnog.build_eggnog_diamond_db,
+    inputs={
+        'eggnog_proteins': ReferenceDB[EggnogSequenceTaxa],
+    },
+    input_descriptions={
+        'eggnog_proteins': "eggNOG database of protein sequences and "
+                           "their corresponding taxonomy information "
+                           "(generated through the fetch-eggnog-proteins "
+                           "action)."
+    },
+    parameters={
+        'taxon': Int % Range(2, 1579337)
+    },
+    parameter_descriptions={
+        'taxon': "Taxon ID number."
+    },
+    outputs=[("diamond_db", ReferenceDB[Diamond])],
+    output_descriptions={
+        "diamond_db": "Complete Diamond reference database for the "
+                      "specified taxon."
+    },
+    name="Create a DIAMOND formatted reference database for the "
+         "specified taxon.",
+    description="Creates an DIAMOND database which contains the protein "
+                "sequences that belong to the specified taxon.",
+)
+
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.eggnog_diamond_search,
     inputs={

From f4e711f8d63cf027b93e38dc3d97c5d86598299d Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Fri, 12 Jan 2024 15:06:39 +0100
Subject: [PATCH 07/24] Add test

---
 q2_moshpit/eggnog/_dbs.py           |  4 ++--
 q2_moshpit/eggnog/tests/test_dbs.py | 32 +++++++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index 8bc71dd8..f6744d78 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -179,7 +179,7 @@ def fetch_eggnog_proteins() -> EggnogProteinSequencesDirFmt:
 
 def build_eggnog_diamond_db(
         eggnog_proteins: EggnogProteinSequencesDirFmt,
-        taxon: str
+        taxon: int
 ) -> DiamondDatabaseDirFmt:
     """
     Creates an DIAMOND database which contains the protein
@@ -193,7 +193,7 @@ def build_eggnog_diamond_db(
     cmd = [
         "create_dbs.py",
         "--data_dir", str(eggnog_proteins),
-        "--taxids", taxon,
+        "--taxids", str(taxon),
         "--dbname", "ref_db"
     ]
     run_command(cmd)
diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
index 33d4304e..f9280d86 100644
--- a/q2_moshpit/eggnog/tests/test_dbs.py
+++ b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -10,10 +10,12 @@
 from qiime2.plugin.testing import TestPluginBase
 from .._dbs import (
     fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins,
-    fetch_diamond_db
+    fetch_diamond_db, build_eggnog_diamond_db
 )
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
-from q2_types_genomics.reference_db import NCBITaxonomyDirFmt
+from q2_types_genomics.reference_db import (
+    NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt
+)
 
 
 class TestFetchDB(TestPluginBase):
@@ -147,3 +149,29 @@ def test_fetch_eggnog_fasta(self, subp_run):
 
         # Check that commands are ran as expected
         subp_run.assert_has_calls([first_call, second_call], any_order=False)
+
+    @patch("subprocess.run")
+    @patch("shutil.move")
+    def test_build_eggnog_diamond_db(self, shut_mv, subp_run):
+        # Instantiate input
+        proteins_and_taxa = EggnogProteinSequencesDirFmt()
+
+        # Call function. Patching will make sure nothing is
+        # actually ran
+        diamond_db = build_eggnog_diamond_db(proteins_and_taxa, taxon=2)
+
+        # Check that command was called in the expected way
+        cmd = [
+            "create_dbs.py",
+            "--data_dir", str(proteins_and_taxa),
+            "--taxids", "2",
+            "--dbname", "ref_db"
+        ]
+
+        # Check that subprocess.run is run as expected
+        subp_run.assert_called_once_with(cmd, check=True)
+
+        # Check that shutil.move is run as expected
+        source_path = os.path.join(str(proteins_and_taxa), "ref_db.dmnd")
+        destination_path = os.path.join(str(diamond_db), "ref_db.dmnd")
+        shut_mv.assert_called_once_with(source_path, destination_path)

From 9b0a4c07f7b5d87a339a981533a849a9643c131e Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Mon, 15 Jan 2024 11:50:02 +0100
Subject: [PATCH 08/24] Further refactor EggnogSequenceTaxa to
 EggnogProteinSequences

---
 q2_moshpit/plugin_setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index e518da8b..cf933cc1 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -612,7 +612,7 @@
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.build_eggnog_diamond_db,
     inputs={
-        'eggnog_proteins': ReferenceDB[EggnogSequenceTaxa],
+        'eggnog_proteins': ReferenceDB[EggnogProteinSequences],
     },
     input_descriptions={
         'eggnog_proteins': "eggNOG database of protein sequences and "

From db0a0c831a55b3f3097be0a60d3cf045ebb8ae78 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Mon, 15 Jan 2024 14:37:06 +0100
Subject: [PATCH 09/24] Add validation for taxon IDs and corresponding test

---
 q2_moshpit/eggnog/_dbs.py                     |  29 +++++
 .../build_eggnog_diamond_db/e5.taxid_info.tsv | 100 ++++++++++++++++++
 q2_moshpit/eggnog/tests/test_dbs.py           |  15 ++-
 setup.py                                      |   1 +
 4 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv

diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index f6744d78..49442342 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -6,6 +6,7 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
+import pandas as pd
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
 import shutil
 from q2_types_genomics.reference_db import (
@@ -185,6 +186,8 @@ def build_eggnog_diamond_db(
     Creates an DIAMOND database which contains the protein
     sequences that belong to the specified taxon.
     """
+    # Validate taxon ID
+    _validate_taxon_id(eggnog_proteins, taxon)
 
     # Initialize output objects
     diamond_db = DiamondDatabaseDirFmt()
@@ -206,3 +209,29 @@ def build_eggnog_diamond_db(
 
     # Return objects
     return diamond_db
+
+
+def _validate_taxon_id(eggnog_proteins, taxon):
+    """Raise ValueError unless `taxon` occurs in a "Taxid Lineage" entry
+    of the e5.taxid_info.tsv file located in `eggnog_proteins`."""
+    taxid_info = pd.read_csv(
+        os.path.join(str(eggnog_proteins), "e5.taxid_info.tsv"),
+        sep="\t"
+    )
+
+    # Each lineage is a comma-separated string of taxon IDs; gather
+    # every ID that appears in any lineage.
+    tax_ids = set()
+    for lineage in taxid_info["Taxid Lineage"]:
+        tax_ids.update(lineage.strip().split(","))
+
+    # Compare the whole ID. set(str(taxon)) would split it into single
+    # digits, so e.g. 12 would pass merely because "1" and "2" are
+    # valid IDs, while a valid ID without those digits would be
+    # rejected.
+    if str(taxon) not in tax_ids:
+        raise ValueError(
+            f"'{taxon}' is not valid taxon ID. "
+            "To view all valid taxon IDs inspect e5.taxid_info.tsv "
+            "file in the input eggnog_proteins input."
+        )
diff --git a/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv b/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv
new file mode 100644
index 00000000..e3e30994
--- /dev/null
+++ b/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv
@@ -0,0 +1,100 @@
+# Taxid	Sci.Name	Rank	Named Lineage	Taxid Lineage
+679937	Bacteroides coprosuis DSM 18011	no rank	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides coprosuis,Bacteroides coprosuis DSM 18011	1,131567,2,68336,976,200643,171549,815,816,151276,679937
+1146883	Blastococcus saxobsidens DD2	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Frankineae,Geodermatophilaceae,Blastococcus,Blastococcus saxobsidens,Blastococcus saxobsidens DD2	1,131567,2,201174,1760,85003,2037,85013,85030,38501,138336,1146883
+1497679	Listeriaceae bacterium FSL A5-0209	species	root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Listeriaceae,unclassified Listeriaceae,Listeriaceae bacterium FSL A5-0209	1,131567,2,1239,91061,1385,186820,1081735,1497679
+69014	Thermococcus kodakarensis KOD1	no rank	root,cellular organisms,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Thermococcus,Thermococcus kodakarensis,Thermococcus kodakarensis KOD1	1,131567,2157,28890,183968,2258,2259,2263,311400,69014
+888833	Streptococcus australis ATCC 700641	no rank	root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus australis,Streptococcus australis ATCC 700641	1,131567,2,1239,91061,186826,1300,1301,113107,888833
+1089544	Amycolatopsis benzoatilytica AK 16/65	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis benzoatilytica,Amycolatopsis benzoatilytica AK 16/65	1,131567,2,201174,1760,85003,2037,85010,2070,1813,346045,1089544
+1089545	Amycolatopsis balhimycina FH 1894	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis balhimycina,Amycolatopsis balhimycina FH 1894	1,131567,2,201174,1760,85003,2037,85010,2070,1813,208443,1089545
+1089546	Actinopolyspora halophila DSM 43834	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinopolysporineae,Actinopolysporaceae,Actinopolyspora,Actinopolyspora halophila,Actinopolyspora halophila DSM 43834	1,131567,2,201174,1760,85003,2037,622450,622451,1849,1850,1089546
+521393	Actinomyces timonensis DSM 23838	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces timonensis,Actinomyces timonensis DSM 23838	1,131567,2,201174,1760,85003,2037,85005,2049,1654,1288391,521393
+1089548	Thermicanus aegyptius DSM 12793	no rank	root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillales incertae sedis,Bacillales Family X. Incertae Sedis,Thermicanus,Thermicanus aegyptius,Thermicanus aegyptius DSM 12793	1,131567,2,1239,91061,1385,539002,539003,94008,94009,1089548
+172045	Elizabethkingia miricola	species	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Elizabethkingia,Elizabethkingia miricola	1,131567,2,68336,976,117743,200644,49546,308865,172045
+1089550	Salisaeta longa DSM 21114	no rank	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidetes Order II. Incertae sedis,Rhodothermaceae,Salisaeta,Salisaeta longa,Salisaeta longa DSM 21114	1,131567,2,68336,976,1100069,563843,689697,503170,1089550
+1089551	Geminicoccus roseus DSM 18922	no rank	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,unclassified Alphaproteobacteria,Geminicoccus,Geminicoccus roseus,Geminicoccus roseus DSM 18922	1,131567,2,1224,28211,82117,489140,404900,1089551
+1089552	Rhodovibrio salinarum DSM 9154	no rank	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,Rhodovibrio,Rhodovibrio salinarum,Rhodovibrio salinarum DSM 9154	1,131567,2,1224,28211,204441,41295,85274,1087,1089552
+1089553	Thermacetogenium phaeum DSM 12270	no rank	root,cellular organisms,Bacteria,Firmicutes,Clostridia,Thermoanaerobacterales,Thermoanaerobacteraceae,Thermacetogenium,Thermacetogenium phaeum,Thermacetogenium phaeum DSM 12270	1,131567,2,1239,186801,68295,186814,140458,85874,1089553
+196627	Corynebacterium glutamicum ATCC 13032	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Corynebacterineae,Corynebacteriaceae,Corynebacterium,Corynebacterium glutamicum,Corynebacterium glutamicum ATCC 13032	1,131567,2,201174,1760,85003,2037,85007,1653,1716,1718,196627
+1161902	Eubacterium nodatum ATCC 33099	no rank	root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Clostridiales incertae sedis,Clostridiales Family XIII. Incertae Sedis,[Eubacterium] nodatum,Eubacterium nodatum ATCC 33099	1,131567,2,1239,186801,186802,538999,543314,35518,1161902
+446468	Nocardiopsis dassonvillei subsp. dassonvillei DSM 43111	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Streptosporangineae,Nocardiopsaceae,Nocardiopsis,Nocardiopsis dassonvillei,Nocardiopsis dassonvillei subsp. dassonvillei,Nocardiopsis dassonvillei subsp. dassonvillei DSM 43111	1,131567,2,201174,1760,85003,2037,85012,83676,2013,2014,568208,446468
+1286170	Raoultella ornithinolytica B6	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Raoultella,Raoultella ornithinolytica,Raoultella ornithinolytica B6	1,131567,2,1224,1236,91347,543,160674,54291,1286170
+1286171	Eubacterium acidaminophilum DSM 3953	no rank	root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Eubacteriaceae,Eubacterium,Eubacterium acidaminophilum,Eubacterium acidaminophilum DSM 3953	1,131567,2,1239,186801,186802,186806,1730,1731,1286171
+446469	Sanguibacter keddieii DSM 10542	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Micrococcineae,Sanguibacteraceae,Sanguibacter,Sanguibacter keddieii,Sanguibacter keddieii DSM 10542	1,131567,2,201174,1760,85003,2037,85006,145360,60919,60920,446469
+1384484	Adlercreutzia equolifaciens DSM 19450	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Coriobacteridae,Coriobacteriales,Coriobacterineae,Coriobacteriaceae,Adlercreutzia,Adlercreutzia equolifaciens,Adlercreutzia equolifaciens DSM 19450	1,131567,2,201174,1760,84998,84999,255727,84107,447020,446660,1384484
+446470	Stackebrandtia nassauensis DSM 44728	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Glycomycineae,Glycomycetaceae,Stackebrandtia,Stackebrandtia nassauensis,Stackebrandtia nassauensis DSM 44728	1,131567,2,201174,1760,85003,2037,85014,85034,283810,283811,446470
+270374	Marinobacter sp. ELB17	species	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Marinobacter,Marinobacter sp. ELB17	1,131567,2,1224,1236,135622,72275,2742,270374
+237609	Pseudomonas alkylphenolia	species	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas alkylphenolia	1,131567,2,1224,1236,72274,135621,286,237609
+622637	Methylocystis sp. ATCC 49242	species	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylocystaceae,Methylocystis,Methylocystis sp. ATCC 49242	1,131567,2,1224,28211,356,31993,133,622637
+536019	Mesorhizobium opportunistum WSM2075	no rank	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Phyllobacteriaceae,Mesorhizobium,Mesorhizobium opportunistum,Mesorhizobium opportunistum WSM2075	1,131567,2,1224,28211,356,69277,68287,593909,536019
+46429	Sphingobium chlorophenolicum	species	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingobium,Sphingobium chlorophenolicum	1,131567,2,1224,28211,204457,41297,165695,46429
+1056816	Nocardia sp. BMG51109	species	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Corynebacterineae,Nocardiaceae,Nocardia,Nocardia sp. BMG51109	1,131567,2,201174,1760,85003,2037,85007,85025,1817,1056816
+589873	Alteromonas australica	species	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas australica	1,131567,2,1224,1236,135622,72275,226,589873
+1120947	Actinomyces vaccimaxillae DSM 15804	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces vaccimaxillae,Actinomyces vaccimaxillae DSM 15804	1,131567,2,201174,1760,85003,2037,85005,2049,1654,183916,1120947
+1056820	Teredinibacter turnerae T7902	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadales genera incertae sedis,Teredinibacter,Teredinibacter turnerae,Teredinibacter turnerae T7902	1,131567,2,1224,1236,135622,256005,2425,2426,1056820
+1269813	Thioalkalivibrio sp. ALR17-21	species	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Chromatiales,Ectothiorhodospiraceae,Thioalkalivibrio,Thioalkalivibrio sp. ALR17-21	1,131567,2,1224,1236,135613,72276,106633,1269813
+639030	Acidobacteria bacterium KBS 146	species	root,cellular organisms,Bacteria,Fibrobacteres/Acidobacteria group,Acidobacteria,Acidobacteriia,Acidobacteriales,Acidobacteriaceae,unclassified Acidobacteriaceae,Acidobacteria bacterium KBS 146	1,131567,2,131550,57723,204432,204433,204434,112074,639030
+172088	Bradyrhizobium sp. th.b2	species	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Bradyrhizobiaceae,Bradyrhizobium,Bradyrhizobium sp. th.b2	1,131567,2,1224,28211,356,41294,374,172088
+180281	Cyanobium sp. PCC 7001	species	root,cellular organisms,Bacteria,Cyanobacteria,Oscillatoriophycideae,Chroococcales,Cyanobium,Cyanobium sp. PCC 7001	1,131567,2,1117,1301283,1118,167375,180281
+663610	Methylocapsa aurea	species	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Beijerinckiaceae,Methylocapsa,Methylocapsa aurea	1,131567,2,1224,28211,356,45404,184923,663610
+1045855	Pseudoxanthomonas spadix BD-a59	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Pseudoxanthomonas,Pseudoxanthomonas spadix,Pseudoxanthomonas spadix BD-a59	1,131567,2,1224,1236,135614,32033,83618,415229,1045855
+1120949	Actinoplanes globisporus DSM 43857	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Micromonosporineae,Micromonosporaceae,Actinoplanes,Actinoplanes globisporus,Actinoplanes globisporus DSM 43857	1,131567,2,201174,1760,85003,2037,85008,28056,1865,113565,1120949
+81985	Capsella rubella	species	root,cellular organisms,Eukaryota,Viridiplantae,Streptophyta,Streptophytina,Embryophyta,Tracheophyta,Euphyllophyta,Spermatophyta,Magnoliophyta,Mesangiospermae,eudicotyledons,Gunneridae,Pentapetalae,rosids,malvids,Brassicales,Brassicaceae,Camelineae,Capsella,Capsella rubella	1,131567,2759,33090,35493,131221,3193,58023,78536,58024,3398,1437183,71240,91827,1437201,71275,91836,3699,3700,980083,3718,81985
+393283	Pestalotiopsis fici	species	root,cellular organisms,Eukaryota,Opisthokonta,Fungi,Dikarya,Ascomycota,saccharomyceta,Pezizomycotina,leotiomyceta,sordariomyceta,Sordariomycetes,Xylariomycetidae,Xylariales,Amphisphaeriaceae,Pestalotiopsis,Pestalotiopsis fici	1,131567,2759,33154,4751,451864,4890,716545,147538,716546,715989,147550,222545,37989,54958,37840,393283
+163908	Anabaena sp. PCC 7108	species	root,cellular organisms,Bacteria,Cyanobacteria,Nostocales,Nostocaceae,Anabaena,Anabaena sp. PCC 7108	1,131567,2,1117,1161,1162,1163,163908
+1120950	Actinopolymorpha alba DSM 45243	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Propionibacterineae,Nocardioidaceae,Actinopolymorpha,Actinopolymorpha alba,Actinopolymorpha alba DSM 45243	1,131567,2,201174,1760,85003,2037,85009,85015,117156,533267,1120950
+1144325	Pseudomonas sp. GM21	species	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas sp. GM21	1,131567,2,1224,1236,72274,135621,286,1144325
+1045858	Brachyspira intermedia PWS/A	no rank	root,cellular organisms,Bacteria,Spirochaetes,Spirochaetia,Spirochaetales,Brachyspiraceae,Brachyspira,Brachyspira intermedia,Brachyspira intermedia PWS/A	1,131567,2,203691,203692,136,143786,29521,84377,1045858
+925775	Xanthomonas vesicatoria ATCC 35937	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xanthomonas,Xanthomonas vesicatoria,Xanthomonas vesicatoria ATCC 35937	1,131567,2,1224,1236,135614,32033,338,56460,925775
+1417296	Defluviimonas sp. 20V17	species	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Defluviimonas,Defluviimonas sp. 20V17	1,131567,2,1224,28211,204455,31989,1097466,1417296
+1417230	Borrelia persica No12	no rank	root,cellular organisms,Bacteria,Spirochaetes,Spirochaetia,Spirochaetales,Spirochaetaceae,Borrelia,Borrelia persica,Borrelia persica No12	1,131567,2,203691,203692,136,137,138,44448,1417230
+106582	Maylandia zebra	species	root,cellular organisms,Eukaryota,Opisthokonta,Metazoa,Eumetazoa,Bilateria,Deuterostomia,Chordata,Craniata,Vertebrata,Gnathostomata,Teleostomi,Euteleostomi,Actinopterygii,Actinopteri,Neopterygii,Teleostei,Osteoglossocephalai,Clupeocephala,Euteleosteomorpha,Neoteleostei,Eurypterygia,Ctenosquamata,Acanthomorphata,Euacanthomorphacea,Percomorphaceae,Ovalentaria,Cichlomorphae,Cichliformes,Cichlidae,African cichlids,Pseudocrenilabrinae,Haplochromini,Maylandia,Maylandia zebra complex,Maylandia zebra	1,131567,2759,33154,33208,6072,33213,33511,7711,89593,7742,7776,117570,117571,7898,186623,41665,32443,1489341,186625,1489388,123365,123366,123367,123368,123369,1489872,1489908,1489910,1489911,8113,319095,318546,319058,143623,57445,106582
+1120953	Aestuariibacter salexigens DSM 15300	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Aestuariibacter,Aestuariibacter salexigens,Aestuariibacter salexigens DSM 15300	1,131567,2,1224,1236,135622,72275,249523,226010,1120953
+393305	Yersinia enterocolitica subsp. enterocolitica 8081	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Yersinia,Yersinia enterocolitica,Yersinia enterocolitica subsp. enterocolitica,Yersinia enterocolitica subsp. enterocolitica 8081	1,131567,2,1224,1236,91347,543,629,630,150052,393305
+1280706	Selenomonas ruminantium subsp. ruminantium ATCC 12561	no rank	root,cellular organisms,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Veillonellaceae,Selenomonas,Selenomonas ruminantium,Selenomonas ruminantium subsp. ruminantium,Selenomonas ruminantium subsp. ruminantium ATCC 12561	1,131567,2,1239,909932,909929,31977,970,971,114196,1280706
+1515613	Porphyromonas sp. COT-239_OH1446	species	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,Porphyromonas sp. COT-239_OH1446	1,131567,2,68336,976,200643,171549,171551,836,1515613
+1123257	Solimonas flava DSM 18980	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Sinobacteraceae,Solimonas,Solimonas flava,Solimonas flava DSM 18980	1,131567,2,1224,1236,135614,568386,413435,415849,1123257
+1515615	Porphyromonas sp. COT-290_OH860	species	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,Porphyromonas sp. COT-290_OH860	1,131567,2,68336,976,200643,171549,171551,836,1515615
+715451	Alteromonas sp. SN2	species	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas sp. SN2	1,131567,2,1224,1236,135622,72275,226,715451
+589924	Ferroglobus placidus DSM 10642	no rank	root,cellular organisms,Archaea,Euryarchaeota,Archaeoglobi,Archaeoglobales,Archaeoglobaceae,Ferroglobus,Ferroglobus placidus,Ferroglobus placidus DSM 10642	1,131567,2157,28890,183980,2231,2232,54260,54261,589924
+221288	Mastigocladopsis repens PCC 10914	no rank	root,cellular organisms,Bacteria,Cyanobacteria,Stigonematales,Mastigocladopsis,Mastigocladopsis repens,Mastigocladopsis repens PCC 10914	1,131567,2,1117,1189,221282,221287,221288
+862908	Bacteriovorax marinus SJ	no rank	root,cellular organisms,Bacteria,Proteobacteria,delta/epsilon subdivisions,Deltaproteobacteria,Bdellovibrionales,Bacteriovoracaceae,Bacteriovorax,Bacteriovorax marinus,Bacteriovorax marinus SJ	1,131567,2,1224,68525,28221,213481,263369,146784,97084,862908
+311402	Agrobacterium vitis S4	no rank	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Rhizobium/Agrobacterium group,Agrobacterium,Agrobacterium vitis,Agrobacterium vitis S4	1,131567,2,1224,28211,356,82115,227290,357,373,311402
+311403	Agrobacterium radiobacter K84	no rank	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Rhizobium/Agrobacterium group,Agrobacterium,Agrobacterium tumefaciens complex,Agrobacterium tumefaciens,Agrobacterium radiobacter K84	1,131567,2,1224,28211,356,82115,227290,357,1183400,358,311403
+180332	Robinsoniella peoriensis	species	root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Lachnospiraceae,Robinsoniella,Robinsoniella peoriensis	1,131567,2,1239,186801,186802,186803,588605,180332
+1227453	Haloarcula japonica DSM 6131	no rank	root,cellular organisms,Archaea,Euryarchaeota,Halobacteria,Halobacteriales,Halobacteriaceae,Haloarcula,Haloarcula japonica,Haloarcula japonica DSM 6131	1,131567,2157,28890,183963,2235,2236,2237,29282,1227453
+1150600	Arcticibacter svalbardensis MN12-7	no rank	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Sphingobacteriia,Sphingobacteriales,Sphingobacteriaceae,Arcticibacter,Arcticibacter svalbardensis,Arcticibacter svalbardensis MN12-7	1,131567,2,68336,976,117747,200666,84566,1288026,1288027,1150600
+1406840	Flavobacterium beibuense F44-8	no rank	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium beibuense,Flavobacterium beibuense F44-8	1,131567,2,68336,976,117743,200644,49546,237,657326,1406840
+688245	Comamonas testosteroni CNB-2	no rank	root,cellular organisms,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Comamonas,Comamonas testosteroni,Comamonas testosteroni CNB-1,Comamonas testosteroni CNB-2	1,131567,2,1224,28216,80840,80864,283,285,543891,688245
+401526	Thermosinus carboxydivorans Nor1	no rank	root,cellular organisms,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Veillonellaceae,Thermosinus,Thermosinus carboxydivorans,Thermosinus carboxydivorans Nor1	1,131567,2,1239,909932,909929,31977,261684,261685,401526
+335992	Candidatus Pelagibacter ubique HTCC1062	no rank	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,unclassified Alphaproteobacteria,SAR11 cluster,Candidatus Pelagibacter,Candidatus Pelagibacter ubique,Candidatus Pelagibacter ubique HTCC1062	1,131567,2,1224,28211,82117,54526,198251,198252,335992
+1163385	Peanut witches'-broom phytoplasma NTU2011	no rank	root,cellular organisms,Bacteria,Tenericutes,Mollicutes,Acholeplasmatales,Acholeplasmataceae,Candidatus Phytoplasma,16SrII (Peanut WB group),Peanut witches'-broom phytoplasma,Peanut witches'-broom phytoplasma NTU2011	1,131567,2,544448,31969,186329,2146,33926,85621,35772,1163385
+999547	Leisingera daeponensis DSM 23529	no rank	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Leisingera,Leisingera daeponensis,Leisingera daeponensis DSM 23529	1,131567,2,1224,28211,204455,31989,191028,405746,999547
+1288083	Streptomyces sp. TAA040	species	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Streptomycineae,Streptomycetaceae,Streptomyces,Streptomyces sp. TAA040	1,131567,2,201174,1760,85003,2037,85011,2062,1883,1288083
+999549	Leisingera caerulea DSM 24564	no rank	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Leisingera,Leisingera caerulea,Leisingera caerulea DSM 24564	1,131567,2,1224,28211,204455,31989,191028,506591,999549
+999550	Pseudophaeobacter arcticus DSM 23566	no rank	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Pseudophaeobacter,Pseudophaeobacter arcticus,Pseudophaeobacter arcticus DSM 23566	1,131567,2,1224,28211,204455,31989,1541822,385492,999550
+1216362	Fusobacterium nucleatum ChDC F128	no rank	root,cellular organisms,Bacteria,Fusobacteria,Fusobacteriia,Fusobacteriales,Fusobacteriaceae,Fusobacterium,Fusobacterium nucleatum,unclassified Fusobacterium nucleatum,Fusobacterium nucleatum ChDC F128	1,131567,2,32066,203490,203491,203492,848,851,189727,1216362
+311424	Dehalococcoides mccartyi VS	no rank	root,cellular organisms,Bacteria,Chloroflexi,Dehalococcoidia,Dehalococcoidales,Dehalococcoidaceae,Dehalococcoides,Dehalococcoides mccartyi,Dehalococcoides mccartyi VS	1,131567,2,200795,301297,1202465,1202464,61434,61435,311424
+573569	Francisella sp. TX077308	species	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Thiotrichales,Francisellaceae,Francisella,Francisella sp. TX077308	1,131567,2,1224,1236,72273,34064,262,573569
+426114	Thiomonas arsenitoxydans	species	root,cellular organisms,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,unclassified Burkholderiales,Burkholderiales Genera incertae sedis,Thiomonas,Thiomonas arsenitoxydans	1,131567,2,1224,28216,80840,119065,224471,32012,426114
+1212548	Pseudomonas stutzeri NF13	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas stutzeri group,Pseudomonas stutzeri subgroup,Pseudomonas stutzeri,Pseudomonas stutzeri NF13	1,131567,2,1224,1236,72274,135621,286,136846,578833,316,1212548
+426117	Methylobacterium sp. 4-46	species	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylobacteriaceae,Methylobacterium,Methylobacterium sp. 4-46	1,131567,2,1224,28211,356,119045,407,426117
+1138822	Lactobacillus curieae	species	root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus curieae	1,131567,2,1239,91061,186826,33958,1578,1138822
+98439	Fischerella thermalis PCC 7521	no rank	root,cellular organisms,Bacteria,Cyanobacteria,Stigonematales,Fischerella,Fischerella thermalis,Fischerella thermalis PCC 7521	1,131567,2,1117,1189,1190,372787,98439
+65672	Piriformospora indica	species	root,cellular organisms,Eukaryota,Opisthokonta,Fungi,Dikarya,Basidiomycota,Agaricomycotina,Agaricomycetes,Agaricomycetes incertae sedis,Sebacinales,Sebacinales group B,Piriformospora,Piriformospora indica	1,131567,2759,33154,4751,451864,5204,5302,155619,355688,297313,1506295,65702,65672
+1441930	Serratia fonticola RB-25	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Serratia,Serratia fonticola,Serratia fonticola RB-25	1,131567,2,1224,1236,91347,543,613,47917,1441930
+688269	Thermotoga thermarum DSM 5069	no rank	root,cellular organisms,Bacteria,Thermotogae,Thermotogae,Thermotogales,Thermotogaceae,Thermotoga,Thermotoga thermarum,Thermotoga thermarum DSM 5069	1,131567,2,200918,188708,2419,188709,2335,119394,688269
+688270	Cellulophaga algicola DSM 14237	no rank	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Cellulophaga,Cellulophaga algicola,Cellulophaga algicola DSM 14237	1,131567,2,68336,976,117743,200644,49546,104264,59600,688270
+1163407	Rhodanobacter spathiphylli B39	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Rhodanobacter,Rhodanobacter spathiphylli,Rhodanobacter spathiphylli B39	1,131567,2,1224,1236,135614,32033,75309,347483,1163407
+1163408	Rhodanobacter fulvus Jip2	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Rhodanobacter,Rhodanobacter fulvus,Rhodanobacter fulvus Jip2	1,131567,2,1224,1236,135614,32033,75309,219571,1163408
+1069080	Succinispira mobilis DSM 6222	no rank	root,cellular organisms,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Acidaminococcaceae,Succinispira,Succinispira mobilis,Succinispira mobilis DSM 6222	1,131567,2,1239,909932,909929,909930,78119,78120,1069080
+1120963	Algicola sagamiensis DSM 14643	no rank	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Pseudoalteromonadaceae,Algicola,Algicola sagamiensis,Algicola sagamiensis DSM 14643	1,131567,2,1224,1236,135622,267888,296014,163869,1120963
+561177	Anaerococcus hydrogenalis DSM 7454	no rank	root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Peptoniphilaceae,Anaerococcus,Anaerococcus hydrogenalis,Anaerococcus hydrogenalis DSM 7454	1,131567,2,1239,186801,186802,1570339,165779,33029,561177
+106648	Acinetobacter bereziniae	species	root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Moraxellaceae,Acinetobacter,Acinetobacter bereziniae	1,131567,2,1224,1236,72274,468,469,106648
+1107311	Flavobacterium enshiense DK69	no rank	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium enshiense,Flavobacterium enshiense DK69	1,131567,2,68336,976,117743,200644,49546,237,1341165,1107311
+1136417	Salinispora pacifica CNT003	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Micromonosporineae,Micromonosporaceae,Salinispora,Salinispora pacifica,Salinispora pacifica CNT003	1,131567,2,201174,1760,85003,2037,85008,28056,168694,351187,1136417
+237727	Erythrobacter sp. NAP1	species	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Erythrobacteraceae,Erythrobacter,Erythrobacter sp. NAP1	1,131567,2,1224,28211,204457,335929,1041,237727
+1506583	Flavobacterium sp. Fl	species	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium sp. Fl	1,131567,2,68336,976,117743,200644,49546,237,1506583
+1540257	Clostridium sp. KNHs214	species	root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Clostridiaceae,Clostridium,Clostridium sp. KNHs214	1,131567,2,1239,186801,186802,31979,1485,1540257
+1120966	Algoriphagus marincola DSM 16067	no rank	root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Cytophagia,Cytophagales,Cyclobacteriaceae,Algoriphagus,Algoriphagus marincola,Algoriphagus marincola DSM 16067	1,131567,2,68336,976,768503,768507,563798,246875,264027,1120966
+1437610	Bifidobacterium reuteri DSM 23975	no rank	root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Bifidobacteriales,Bifidobacteriaceae,Bifidobacterium,Bifidobacterium reuteri,Bifidobacterium reuteri DSM 23975	1,131567,2,201174,1760,85003,85004,31953,1678,983706,1437610
+1380380	Ahrensia sp. 13_GOM-1096m	species	root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Ahrensia,Ahrensia sp. 13_GOM-1096m	1,131567,2,1224,28211,204455,31989,152180,1380380
diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
index f9280d86..31ac4c8c 100644
--- a/q2_moshpit/eggnog/tests/test_dbs.py
+++ b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -150,9 +150,10 @@ def test_fetch_eggnog_fasta(self, subp_run):
         # Check that commands are ran as expected
         subp_run.assert_has_calls([first_call, second_call], any_order=False)
 
+    @patch("q2_moshpit.eggnog._dbs._validate_taxon_id")
     @patch("subprocess.run")
     @patch("shutil.move")
-    def test_build_eggnog_diamond_db(self, shut_mv, subp_run):
+    def test_build_eggnog_diamond_db(self, shut_mv, subp_run, _val):
         # Instantiate input
         proteins_and_taxa = EggnogProteinSequencesDirFmt()
 
@@ -175,3 +176,15 @@ def test_build_eggnog_diamond_db(self, shut_mv, subp_run):
         source_path = os.path.join(str(proteins_and_taxa), "ref_db.dmnd")
         destination_path = os.path.join(str(diamond_db), "ref_db.dmnd")
         shut_mv.assert_called_once_with(source_path, destination_path)
+
+    def test_build_eggnog_diamond_db_invalid_taxon_id(self):
+        # Init input data
+        path_to_data = self.get_data_path('build_eggnog_diamond_db/')
+        eggnog_proteins = EggnogProteinSequencesDirFmt(path_to_data, 'r')
+
+        # Calling the function must raise an error since taxon 0 is invalid
+        with self.assertRaisesRegex(
+            ValueError,
+            "'0' is not valid taxon ID. "
+        ):
+            _ = build_eggnog_diamond_db(eggnog_proteins, taxon=0)
diff --git a/setup.py b/setup.py
index 2a286e79..b7cf194e 100644
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,7 @@
         ],
         'q2_moshpit.eggnog': [
             'tests/data/*',
+            'tests/data/build_eggnog_diamond_db/*',
             'tests/data/contig-sequences-1/*',
             'tests/data/mag-sequences/*',
             'tests/data/random-db-1/*',

From 527bfecca1aa11766a29b9da741e27eae6715719 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Thu, 18 Jan 2024 13:04:34 +0100
Subject: [PATCH 10/24] Implement fetch-ncbi-taxonomy

---
 q2_moshpit/citations.bib      |  6 +++
 q2_moshpit/eggnog/__init__.py |  4 +-
 q2_moshpit/eggnog/_dbs.py     | 92 +++++++++++++++++++++++++++++++++++
 q2_moshpit/plugin_setup.py    | 18 +++++++
 4 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib
index 02c36349..64751cf4 100644
--- a/q2_moshpit/citations.bib
+++ b/q2_moshpit/citations.bib
@@ -123,3 +123,9 @@ @article{buchfink_sensitive_2021
 	keywords = {Computational biology and bioinformatics, Genome informatics, Genomic analysis, Sequencing, Software},
 	pages = {366--368},
 }
+
+@misc{NCBI,
+  title = {National Center for Biotechnology Information (NCBI)},
+  url = {https://www.ncbi.nlm.nih.gov/},
+  note = {Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information;},
+}
diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py
index 9a176a43..2383b6f1 100644
--- a/q2_moshpit/eggnog/__init__.py
+++ b/q2_moshpit/eggnog/__init__.py
@@ -8,12 +8,12 @@
 from ._method import eggnog_diamond_search, eggnog_annotate
 from ._dbs import (
     fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
-    fetch_eggnog_proteins, build_eggnog_diamond_db
+    fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy
 )
 
 
 __all__ = [
     'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
     'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins',
-    'build_eggnog_diamond_db',
+    'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy'
 ]
diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index 49442342..fe1a4b8a 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -6,6 +6,7 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
+import datetime
 import pandas as pd
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
 import shutil
@@ -235,3 +236,94 @@ def _validate_taxon_id(eggnog_proteins, taxon):
             "To view all valid taxon IDs inspect e5.taxid_info.tsv "
             "file in the input eggnog_proteins input."
         )
+
+
+def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
+    """
+    Fetch names.dmp, nodes.dmp and prot.accession2taxid.gz from the NCBI FTP
+    server into a new NCBITaxonomyDirFmt and write their dates to version.tsv.
+    """
+    # Initialize output object and paths
+    ncbi_data = NCBITaxonomyDirFmt()
+    zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
+    proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz")
+    version_path = os.path.join(str(ncbi_data), "version.tsv")
+
+    # Download zip file
+    print(colorify("Downloading *.dmp files"))
+    run_command(
+        cmd=[
+            "wget", "-O", zip_path,
+            "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
+        ]
+    )
+
+    # Unzip
+    run_command(
+        cmd=[
+            "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
+            "-d", str(ncbi_data)
+        ]
+    )
+
+    # Remove zip file
+    run_command(
+        cmd=[
+            "rm", zip_path
+        ]
+    )
+
+    # Download proteins
+    print(colorify("Downloading proteins file (~15 GB)"))
+    run_command(
+        cmd=[
+            "wget", "-O", proteins_path,
+            "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+            "prot.accession2taxid.gz"
+        ]
+    )
+
+    # Get last modification times
+    print(colorify("Constructing version file"))
+    names_time = _get_last_modified_time(str(ncbi_data), "names.dmp")
+    nodes_time = _get_last_modified_time(str(ncbi_data), "nodes.dmp")
+    proteins_time = _get_last_modified_time(
+        str(ncbi_data), "prot.accession2taxid.gz"
+    )
+
+    # Create a DataFrame with file names and last modification times
+    data = {'file_name': [
+                'names.dmp',
+                'nodes.dmp',
+                'prot.accession2taxid.gz'
+                ],
+            'date': [
+                names_time.strftime('%d/%m/%Y'),
+                nodes_time.strftime('%d/%m/%Y'),
+                proteins_time.strftime('%d/%m/%Y')
+                ],
+            'time': [
+                names_time.strftime('%H:%M:%S'),
+                nodes_time.strftime('%H:%M:%S'),
+                proteins_time.strftime('%H:%M:%S')
+                ]
+            }
+    version = pd.DataFrame(data)
+
+    # Write version file
+    version.to_csv(version_path, sep='\t', index=False)
+
+    # Return completed object
+    print(colorify(
+        "Done! Moving data from temporary directory to final location."
+    ))
+    return ncbi_data
+
+
+# Get the date and time modified of a file
+def _get_last_modified_time(dir, file):
+    return datetime.date.fromtimestamp(
+        os.path.getmtime(
+            os.path.join(dir, file)
+        )
+    )
diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index cf933cc1..e00a0feb 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -609,6 +609,24 @@
                 "storage space is required to run this action. "
 )
 
+plugin.methods.register_function(
+    function=q2_moshpit.eggnog.fetch_ncbi_taxonomy,
+    inputs={},
+    parameters={},
+    outputs=[("taxonomy", ReferenceDB[NCBITaxonomy])],
+    output_descriptions={
+        "taxonomy": "NCBI reference taxonomy."
+    },
+    name="Fetch NCBI reference taxonomy",
+    description="Downloads NCBI reference taxonomy from the NCBI FTP server. "
+                "The resulting artifact is required in the "
+                "build-custom-diamond-db action if one wishes to "
+                "create a Diamond database with taxonomy features. "
+                "At least 30 GB of "
+                "storage space is required to run this action.",
+    citations=[citations["NCBI"]]
+)
+
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.build_eggnog_diamond_db,
     inputs={

From 92cb2f5f5e4bca9e80efcfb1bea6dbc15ad53878 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Thu, 18 Jan 2024 14:04:39 +0100
Subject: [PATCH 11/24] Move _write_version_tsv functionality to a separate function.

---
 q2_moshpit/eggnog/_dbs.py | 48 +++++++++++++++------------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index fe1a4b8a..4fbe663b 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -246,6 +246,8 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     # Initialize output object and paths
     ncbi_data = NCBITaxonomyDirFmt()
     zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
+    nodes_path = os.path.join(str(ncbi_data), "nodes.dmp")
+    names_path = os.path.join(str(ncbi_data), "names.dmp")
     proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz")
     version_path = os.path.join(str(ncbi_data), "version.tsv")
 
@@ -267,11 +269,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     )
 
     # Remove zip file
-    run_command(
-        cmd=[
-            "rm", zip_path
-        ]
-    )
+    run_command(cmd=["rm", zip_path])
 
     # Download proteins
     print(colorify("Downloading proteins file (~15 GB)"))
@@ -283,13 +281,21 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
         ]
     )
 
-    # Get last modification times
+    # Constructing version file
     print(colorify("Constructing version file"))
-    names_time = _get_last_modified_time(str(ncbi_data), "names.dmp")
-    nodes_time = _get_last_modified_time(str(ncbi_data), "nodes.dmp")
-    proteins_time = _get_last_modified_time(
-        str(ncbi_data), "prot.accession2taxid.gz"
-    )
+    _write_version_tsv(nodes_path, names_path, proteins_path, version_path)
+
+    # Return object
+    print(colorify(
+        "Done! Moving data from temporary directory to final location."
+    ))
+    return ncbi_data
+
+
+def _write_version_tsv(nodes, names, proteins, version):
+    names_time = datetime.datetime.fromtimestamp(os.path.getmtime(names))
+    nodes_time = datetime.datetime.fromtimestamp(os.path.getmtime(nodes))
+    proteins_time = datetime.datetime.fromtimestamp(os.path.getmtime(proteins))
 
     # Create a DataFrame with file names and last modification times
     data = {'file_name': [
@@ -308,22 +314,4 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
                 proteins_time.strftime('%H:%M:%S')
                 ]
             }
-    version = pd.DataFrame(data)
-
-    # Write version file
-    version.to_csv(version_path, sep='\t', index=False)
-
-    # Return completed object
-    print(colorify(
-        "Done! Moving data from temporary directory to final location."
-    ))
-    return ncbi_data
-
-
-# Get the date and time modified of a file
-def _get_last_modified_time(dir, file):
-    return datetime.date.fromtimestamp(
-        os.path.getmtime(
-            os.path.join(dir, file)
-        )
-    )
+    pd.DataFrame(data).to_csv(version, sep='\t', index=False)

From 2cdae75aec641e72ed6a4fbe80d9a16267afbd3f Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Thu, 18 Jan 2024 14:04:48 +0100
Subject: [PATCH 12/24] implement tests

---
 q2_moshpit/eggnog/tests/data/ncbi/names.dmp   |  11 +++
 q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp   |  11 +++
 .../tests/data/ncbi/prot.accession2taxid.gz   | Bin 0 -> 10956 bytes
 q2_moshpit/eggnog/tests/test_dbs.py           |  68 +++++++++++++++++-
 4 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 q2_moshpit/eggnog/tests/data/ncbi/names.dmp
 create mode 100644 q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp
 create mode 100644 q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz

diff --git a/q2_moshpit/eggnog/tests/data/ncbi/names.dmp b/q2_moshpit/eggnog/tests/data/ncbi/names.dmp
new file mode 100644
index 00000000..b89e8a2b
--- /dev/null
+++ b/q2_moshpit/eggnog/tests/data/ncbi/names.dmp
@@ -0,0 +1,11 @@
+1	|	all	|		|	synonym	|
+1	|	root	|		|	scientific name	|
+2	|	Bacteria	|	Bacteria <bacteria>	|	scientific name	|
+2	|	bacteria	|		|	blast name	|
+2	|	eubacteria	|		|	genbank common name	|
+2	|	Monera	|	Monera <bacteria>	|	in-part	|
+2	|	Procaryotae	|	Procaryotae <bacteria>	|	in-part	|
+2	|	Prokaryotae	|	Prokaryotae <bacteria>	|	in-part	|
+2	|	Prokaryota	|	Prokaryota <bacteria>	|	in-part	|
+2	|	prokaryote	|	prokaryote <bacteria>	|	in-part	|
+2	|	prokaryotes	|	prokaryotes <bacteria>	|	in-part	|
diff --git a/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp b/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp
new file mode 100644
index 00000000..61a662a0
--- /dev/null
+++ b/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp
@@ -0,0 +1,11 @@
+1	|	1	|	no rank	|		|	8	|	0	|	1	|	0	|	0	|	0	|	0	|	0	|		|
+2	|	131567	|	superkingdom	|		|	0	|	0	|	11	|	0	|	0	|	0	|	0	|	0	|		|
+6	|	335928	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+7	|	6	|	species	|	AC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+9	|	32199	|	species	|	BA	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+10	|	1706371	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+11	|	1707	|	species	|	CG	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|	effective current name;	|
+13	|	203488	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+14	|	13	|	species	|	DT	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+16	|	32011	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+17	|	16	|	species	|	MM	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
diff --git a/q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz b/q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz
new file mode 100644
index 0000000000000000000000000000000000000000..8b78ac0fc0030af4e68ead0ebb547dfb55ae34c2
GIT binary patch
literal 10956
zcmV;-Dl^p|iwFod4|`<*18{P0bS`0IV`X!5X>V>abYXaDWB_fQ%Wea)a&5<*k1?8W
zawu63OR@}EvSmxNgCq#fBmt5jxSxMnB&&A2*%M>KTqu&oH=CsX@vr~-@Bi~Z|NO82
z9R9)T<^TTuzsrCA_aFc7pZ_-e$3OpNHLk`n9o~N80QsT@Wz^`E8B{bbX{~)Is8XM&
zm<Gs<I<3v<2dzirR2qY?WVmgk8X$|&7=zEKQ9c@Pbpq;ux&_o&L3d_=TtbbYzhQt}
zN&wRO&;#U>Rn6#b9S6uIbYyh@DhbGP2~~_<+&Dl^QLAh)Dmh8(<z3R6<%^!7dK?3z
zpH_+>%LP<PTCY|TkmW)GQY@%8yY-J9ATw&LAH$%MNikaGEx-UUbYKb$ZXR)foP#-;
z4DMQ;s<N}_l7MSBK<1qPwb=4ap__4l?4vdeep#`YWd^lVCMl4*k$b=P17wanlXe_I
zaw_-fBMp!}sl?{xz#bw=v76lNGCAsywDCF$c^n7GY%?hDay<y%`AChL!J8J7SyIk}
z^KMW+Bol&-319-4Ixw*>5B8NUlBB+2jJeJw<#MK?R>E!k06FJW6mF*m$d2QBb_iSH
zvdo-~CP~Hh?8L**hBHEv-0ii_#hR<Sb-9H(iTP(c4v-Xc98GqEGQrpw$8iLyKvLX3
z1Z@a(Zp=m<Kn<{VF*<huBcWPlk9rT(05yMN*UC9>5&Iso?^T~pg}m<({a$zJ^a>y%
z-_Ik@$+U3N-h#B%<QS1k<mUkJU>CeOAstB9dlPXVJV5_9?(nB1rQwkL>-J(Z1VK;8
z`N>Re)>KYF<mY6ksjN;n0Fj-OYZ*5qOeT=X&&f|s=KR<gP>!bnFwCnFxgE(oBbjI0
z=Lbc8Ml#Q?%g+dBo{`G4U&s$~env9S;SX5~D4<=OQ9$Q7BPl<e%**eJlSz`uJU2<X
zgVS*&7bOAL;{e%#(F26Vb!X9O3orppokfqRU-ba_7j&bE8ejkz<`(npK){U|AbX2Q
z;|7`8xDJ$ho<otDP3kQ=0k^QY?JY{jpF<<S*n`OR4({1??y(~*?hy5^i`tGz;|@{p
zx~Q!}?b*YR`_7M6Mx7jrlG%q2)JWqSGV|7F#-N(LS%9{yJppC?hFrgO_0dNZ;hP8O
z`!u3HzM-t&`iiSz@qjjS82ikinjMgtL!X&R#XTT1hrS36(s)1-9%7$Hl=TrscwA@-
zi28^kJT5dY)W;Kw@YFZgfJ$+K#Zy-)%%n?sR|90<6i^@EVe!4Um|*dYG@cjQ3@n~u
z@!VHj1&4pYj~@$70q*%g?fK|>iaga9@PbNlv3-3+8ZSuWWubAQntj6JXK&G<6@DU(
zpM4sQ)Skzyw3#gXRw%iC$dZ6$U&CP$UGrn!H4`|o7yvHR$BD%Ruy3wW{P@_4d$R1?
zi~$IX+l3AQY23l$u5-8v9w0KaTWAVMqfAao_Fw|&04|I(aL*e`=B?|bOoBypq>p(=
zN^Qmhi~tvg52<F4gS5gddk|?vxA$1+_C}=fh|C=O9#_NRN0iKQp>aikD4BV;M@<2h
z0^Q!D@Aj+_i_aucXW7*rw8Cdf02eB*IQ+SS#Z_m~qa%H;U~$!342XIytuV{ZqMv{f
zU_Z`8fUmIl+PCYF00w}4Q<#A0Ql9fJg_=SD=m0J>1u4R(tUF2eAS}wdlVtC3lr_4P
z=e$dyimL&7fD0X|0tkoCr-jt_psdkJJ?EViMUBj0{`j2FA1O1aW;+z&ZefUnA2HE<
z&L^5Qvxj?jNaJoHjmXR%Wxel*50A|3;qd)JyN1KjKR)OEBUzk8jWq82G-4+8d_(Pd
zTj-k66fkvt&Zn*f!aeA5p9?*%L8ZWq^f{lAQpH6P9+1X^n&;YK{`ka<^f{lAl0~HP
zfT#}(MTkm)iRQDPXxb4LkFa=Lm>?lDC#3Pzm$gErIKkp+p$L)2cQl3fz7NOD{)vg^
zv!7`4&NhXfk?V7xYr_`5SDHbR(~X6?qr$oVUaz<VAo*u)U&$!suOA?3f0MW78aWF2
zkjS&_wuoe&^8LCJlUXk8qDeAeU)2ElhpaZ)>G$6iJDnu$zT{pfCw0&|>X2svvIe<7
zW%H6GrBjzLD)~<uotN(P{*+A>l5^0>q~`v-O5|Bm<YUTSf=!u^|Ij`uWN$MFn8pFJ
zPo)x2HnmB1pn^piJV|z-^lCrjH=_Pth<X(8j;QYxb;{id6%U)VAQQ;c$Bo?l0Q^b4
zNm(oXD5>itdrvw6tz<{pE<3>+fDWMRLCKABVSwy&BMwp?Y4i&&N7Oz7Y@4CU*h#**
zLH^`5+W=Y6Tve2|(Wo&<$M$%L%SjTKk(iyPvv(A2^|fT$wdQgx@`b)5*ib(scw=}J
zvuneXr6M~{3&+D^ksW85%hUY<vg3?!TX5T+bIgT<q%V}MlHkHf-`14R-G(uek?^Ai
zNb1yRJsPP0GttlrC@}iPEA1jPYHiSN2((r)j{9i4iw{mCNpg4gurhv7xiiR}mEzTa
z-6h4zB)Nd{o=y%w0zEFGiqVzjJds>V09*Zhe6?y>WJcqtvztcxM5+@{W$jg6TdgdT
z8l$HV%GQl3=-`A~^LnyK=9HyRp3X=TufG?umu0EC*5<J|SIs<u&RwQdjoWKrl;i}Z
z!6&aL6;EdE?M<_fNcL70PZe``@B`#R0(d26ZwFqy6gfp{VdqqU)$?5skbh7W1AWsJ
z^kW?c$c41>pyGcNj}%3gI;wKCx%bdv40$Qob}@|l*|hdiWf}Yu`9F7AF%JN>om+d|
zs_N7#OLs({?Nm}eY1`Vl%N4D)-zrPjp>jFK+T>jB5iFzbq6f$+S`H|v)P~a>ezbmo
zT(asIJy|<IE}^NRU;2$vl9Z=B-j@!MzQEex*yz{m$VnwhEm=|LP`~Jotv<Q*okmV4
z$r8EI+7D5?T$T76Uk=@FIcky}*eXa4J+v;n$_{i^Kr;@Ib1;q^_0Rh=43GtkxuIz)
zd7ZUti<un$xNvqzQaz&nONEf<Qn|Wuwx4&;i6lvmG$D6{RNYc=c>+~k?#GpHHM2^Z
zl_jOP8ab;Qx5?FSfFy^zkh|?X_~hvue^T({>CvV-RT4MGY|p2tb1&1nVn4pCk>e#v
z-4<<h<eKIZ&2!r-A&1(woT#z~ErY9f0kiDDw&&r;TVB8pkd)=z2+Pr$XdU<0e*F1d
znE_IIR_+Nog}?sEStj|SD;INHHF?BRjD5DUtFklKJgru*-_0{cmWFK-S8dZefE>6T
z-2lm4?oo4<Nvf^za^T}3_8#TbjqxR7n)Jr#+X&&+di-RNWCyYX)n?Zg997b>4{zy+
zMs9hzg0;p+foE{#qvM2A)^fy~vkC)b$8o&7^$#bI<c#CZm>jsdr~$I$*fYBm*O(&z
zaAO`4lyL16Z<TX<TDbR>q%X^vU!0Wi?!4oYBRP*(N_hA6D)zGcV{c4J{Bp=^>H)GW
zS;n%kaymEWBn5Z41_84au$<wrxnkB%&YM^D)2eil1S&dZ;dvwo?ZZh9U$^bhDyL{|
z^S*r?^x^f~Hu9>Z8k<~nBM$_oO`(0bZC_i-6j=rW=VIhjDSV0}2fwd{d|yvVUVk>P
zhu?i@wW7$EcDRRn+dKoZA-3-W6#N?p$ob%6g?n>mfb2NS-1#K-vMe3EPTuP_L(Oe4
z3flG?s_am!gT#LRNCV`8)hJm%U&aBFPN@C0%3CNUTme-JRvprML0T_*ZZ+2tX9WEc
zPo2vowex@i>VUch546#+ga@FZOF$eQoOMT9?+bo4Z6;8{k=A>kR;wo-xLcrhAzMi6
z$D5=z%Y_7J@xaGH()w{&ssqLW^+5fCRU_H@I7%J(sH=}zT7#6<)r$wNWu;4P-tpm6
zt-OFIse;$CjWz>0osad%jv`55OnLH~>qv~sdT^g+fb2M~rH`|;P?CMb?Dl6`vx7(y
zn6ny^tdH6{*T+?!at@HCA2Z6vGD4RVw>&YRa0ApVpj?^aYSorLRW4Z#f+nD85%r9&
z+sdQLB{VVm`YmbAatVz<6VSATwi*)G<FmQ7NSezeOZ$jzmBJM?uI<P+NfwZ1lDzP1
z%gG$-KgD8}0_LSz(77*h0`A-Sq$<ncsRKG)&dvJJb_T1Y+<2#P>T?u}uOGimlB9tl
z4@!Ad%oALlM>5U6+#OC_MM##Yjg!NnilaWpwn<fGnFBl9M!CQrgMIvTpPUtv#o;M?
zNwuhLic!$JA0W%fo!3NxkB&QW<fk9Ss4vsPIcY#9kO?I1Ddnldlw9ipav`B^6!aZv
z9l%=jWksyhC`N;3x0CEh-W{ai`#3-@WT|;Ht<GFRMPl)sSH#Yy_C3m{*Ke-sMYb(E
zO;ZtW!L<el{q@#%^QtUeQsh;IPwjM84t&20m!)uZ!V;P<bKvzV-1R!KSF_S4HbH-z
zF$|EicwOLlzpvg6kiz8};dEZIi@mQ)8X)KP+GIEAdupqhDv^(vn~mlJ3OR7oJ`z^R
z94|kVaNCR6%W|GR$?ttXaz03kJ(pK_dCYH9(;WDAR0HHZADaDqKb*zh;XL<?ZXa%P
z;JJN;s#3T%K7hMCx!g!j@=N<?v3>Ll=5^avja8Pm;>RJcplqIvw&O3stzA1QTqSL9
z=1wG(j?s>K<apiMw~0fQvLDx${B5yS0wcKi;-?(#h=NPtlGw|$fRvYbMiS`T{%e&l
z;V;)_tR+x^Q=L7e?_j{07xOoLZNw;n7M$(uP2%<7eCHPr4*omPB=K*&BRKfwkY4O8
zzsScC`91deg`bZaobKX<pO47L(a-I*!KVZwea9sB8W)g{6Uyy0b{x`oLi$d+<B+}+
z%Hw1^4t{B;u&b1Ov`-lp<#s}TPhEZ|<?)X6y(4|CKOH9=?7e%br;Te18w&hDxqT?H
zXOhn2jT@1V55)VSC4H5%X<`rcxg78Nvrit8z7P2C!*(3(UEse9{Fhhg%Ii^~KllXq
zIpvbGE(O$=E9${@6s~UOgh@TPBEQ$J9z?XGE9&1hb?puH<%W1~sDCxyI4QRq;=Spv
z-or09`0v*B2O8yegI{iL&Q1JxLw&jVxt}NP_lEr5lGH2iPm#Va*!zOLviFi_DH9I;
z&=>ll#wq0E3;p<4?CgR2hF`wzj2kEY^BsP<yEzw;-yd*49T(ugUvR%2m%u&HJ|0H$
zTkE}@^lK0J`GJ0|TzfKnEI9|}CO<$5S9f<K+Ibi~IFI^Q$9<$P81Y}0oDW`@jNqK4
zFH7cfUyv)or26G`$^ePxR-^S&?Qg$gE=yYH(6}!vA}tm-P6k4fWfkFRLZrzty!)2}
zz2lfk(hk@-=0&rBoRiI6MmUm`MCzMma~@;sE^L`3NgI!}-N-AQFbbFqH)4`y&pj^#
zQ(in+W2NudrKcGn%SM?tJn2<2auRnd$DK6E9+ZezGAfbmL5X_3XYY`lgGt)mZv7=`
zTz_?GOo~hJ?c<lDCizkpLDu7Lu5|sMpCdPAlBEPw5YBOCzK%TgEs{P+j6R#qH7Ae5
z+HuwX;%&StDdxPcu&87TU<4RDP$p$xSIyob*@4okzOKd$kR2G4fL0BVy+tSBDh-gm
zMK9nQQLp=`6~I?<PnNw!@i%W<RoS~|6!0Au-+PN1xqf#5U0*UhI)42=#XVW}X$$}j
zz=d2JfDvF{*6rX$@Vi|*sZb>?V3KhoE^r0`dF51O4>pH~&90qWu9DU;$+_BMXg`s;
z?JcyJi=-tHt#RnnQp~+*?xeV$<?tKO+al^1{fT~nTtc;=ZO^F6DcZhsa?=Uh$9o(g
zm#j92hEr^JovJLJr}KHa)daf!#(6p=p79phah##|3(q);ByQBw4e&kg5FTr;l|@o_
z5vjYVCxJZIwYdKLXpOtb7ydxV-H#Wp?c-O;Ty*pS#d0qdU-#`ma+OpqC>wdrZPV!6
zo$wIfBJY<J$ynRpFeAr$xOp~5Qn)ss<PIeV9>E<uuI;}>yV~t5S7q7Xj-%5?zInG}
z1|>WI4GU<PSdBpA5~_eEplJy;g0>qLRW6~H(MwxiSGkle15^XmOX$R^0cw^|2h;+!
zOQ;9xfVw3#0QEo@ODi^XogVC0l1t9gf<DIqatU=n70@NW1`kvNUGi%PKn>6(zXoLM
zX@M^JH6)-8=#pRK2-E{z@@rH;1JI>b5GQ_(K$lv9XLPTB7$upfj=6%;?1<-4Qdr05
zib;|*9UXn7FSljOKQD=*kUKp<{v}mjCn(PTRlRZcN!EnsHMWw3PEcuBBp1+81GPZy
z5(=wFSUoOU4L~(ey@Xnz0ccpt7Seh`0!~W_Pzop#a9T=$GC<+i(~@7u1QcmKEv0pY
z)ibP~7p;1ruzFs!8U_78wmug9>VQU|aS3hC@;lRBi>R{93+ULOk;P6cSO4YO45CVE
z0i&hkgJX<&*lkmt7=Avut`<q}WF%{&>{xII7=AA8DqxlLnR^_|p}2Z<klSAPx&4j<
zWVf~EglorJs{~iSvT_BXKo8E#mNIXN2H~EFH+Q>p4)Hz{IBX&DR)oa+HL3x!<3?~A
zoPL>am1uAVoas1m^w%5WeUo_UGDWb-!d$zAQ>E-Ml=GpZZu6=f_{|leNZC0^F;AFM
z0(KYgecz?j_+W0^pvi%EC%L#g<f8VLBfNa~;*~6=G}=a|0=I*`o!ASOHtKkPD~J6=
zseM^?`@c!Jl{ha;Y=8t4Tv9>o{RZ($mhv4y*=T&={ZI-#OJ&KD<89N;AZ7M@19nUE
zT(FaWNA8u16sPbmu@>&^2FT8NVlTIq0dgL#6YN#YW%;*8MP~%t>j83Zt+lhbzO?mL
zm8B1&ua@PDsY}^^t!2dXj_W{?FYOd&`#7h$D#!By+@WxEt-vQK4z4XZ_-h;>7f4R!
z`MqsmRypmis7sY@Bi{BBIY}~^=lyi-wcG{Jd~-3T-A8*OqsqD{3%Z<Vtw+7NKBV0z
zPb`b<QKJRDUIe{f7Es+b`*ZlMUA(Pw$!ZX^UA?Vx0p%%3+JD97(;}C$6@XfxOICRR
zPM;^9*A+>7a>f|V;}nk;>GQhgJdvbhtA(pO;klq{?M&nEs=XUm<x6?Ii!Q$Kb6|nN
zb^EkfWe<wQb^B&hWe+L_x9v<!l^tknrjuKp+aqXImXVy!qtKw6TU8szlY2F~<)uif
zbIAtU+-Ub`S~({F{?QJ?RVn>hD;Z8=4s848RdPSYO|6|k%Ym0s>|G{<THOUmO5(lP
zG(b*A4d?hsAB8!IHp@SQOCE_3>jBuo&Bh=P?1V^97@6Cd<JPYve?GR4SyhsoN2NTU
zolb2BnL-}`9{_pI610GA?oj0%<a4Gu^kl;TS-O=HbvYm)H~y2q@FjvODKCz8rRpiq
zkODXl&Uf6zUH}(5j&pK}?VXA$X*X;v^SO3RIC4&I*E5$1$(MGNKKq&;Ahevm``a&9
z7?OmgayhKp(@6pJ0Qn+}0cZq#gaPs|s1qL=qtGX!elA3<0V3+M+?;`2aem{GqbA8)
zh3y+fw4B6WRIuM#!CV-9+kDf3A8bng$tQ%XZ1ZuORGo(zAH_e<?PeSxm(b?(a8T_d
zXO#u@qboO+>aJOvzUOe=UbU)n33Y;U0*YKhTipnkueK<zlJak?ES@8`hH$&K^I%n$
zW@B?-$p`S-9GOCY%>W6Ix+kysYIzBb0Apt{F!;Q%=ZmaE5gobY%T#=6gu*v=fLuZY
zPzTg4p(D@;G%leEDAKxFOsf-gI}VTws6)2eV-Hm>S@l2-P_u*@pm5f9$yp8_*m|IT
z355r?aMpIwSt!`H&a83?wSw;9f&HQfT3-@hpQ@on(uk{*Gluum`2$6Yo3Wi_t8zl-
z4uamN(L(ZmNR<<^RaUNnIrszQM_7nm3CZJqkv-Ygbvf9#E7es_$hsy9ZHAeH<4VXZ
z7fANfW^<6(H-~BL_2^>cvZr7R(k_sa_hS_D7$;=ybs`^!%)#fakmv2xW}9aj@zUiX
zIm)stBmP{FgRdv?{Ph%iGDwcDpA!4^(=OPQ*uSFK%yNNj-AUZ4!2TpTEzNe`v68Ft
z{ryqNDrt)8U6L71>-JXT`P!z{Rn9k8THP#x5nSvxLp-<yu0Mw-+^wB^sPe_l&Y@^<
z3Y?mqm-1~oQvx+OY??F<`MtGGlPc$28(>PH1!udJwF2kBxz65%^Wc1EPyBpSh*wQE
zvD!H(_(dUJ)t{G={C*jUH_NVk#okwt^2qY<c@?>bUjkt-Ozh>6Kjbxu0~f)?`S}+k
ze&M=b<vd=8e8dC}=V_>gsL_nzlK3ynE+5G63f!vWTmpymtwNU%_RIA=GDniyn^vAK
zP17NYyd-IZ%xb)Q!!?j34KjI3nO94t5eJ&0kAiwVKrWz>+tYo0<@!K!2@Qh&+5xgI
zIZUqZbQn77z)wuC)T?Qq-l}~0bK>9dJ@6cu+MVDkXB^*{U#abiD`Sx~4RoX2w&hm~
zbKoaCv&gzWPIB8eu)Y0h#pxqimd83+XLu#3IfeU=osb{91(FL=y@fNUkz|S6jIQ>u
zR`Inz3T<=xDoYPrW(wsn5r4bNf$<<*EF&Aec;B`yxZSI_757U;&aDN`$iZ*fH6&|B
za{77mt!=GPXuC&OWe>I%qYvwL;<L&ZIcZT}_23ucD70-XRoQ_?z)f4KR@qy03?45M
zb(X!wmTP@fPmY?TfOMO}r*V`t9^H2wAbX3WfbINXl|ASgTyqCiB%(ICdgt~ZB#jp*
zQD@nKQKJ5d0%qB}CkU9-0NJNe12g~^{0NI}$yQ|_bpq%BF1W`s_|O7AbPsCr<HJbQ
z57S4@BSH>6NoF3Wh0L@jqfhS2m4ajsay2`hpIjeF_G#qIoW49~h9ruxzR$$30j~Vx
zw;gw^vaBlWW^lc8%fqyPh<l2hfaUzSPh#=$3h>p*e2CA{I^o*dvr2GnC*KO!mNHd>
zD>F)&hf$ybXF4tj*VZOgcIk`4wGXvbc3c97e4LOEo)5t<?GtU4ojv5^g!G;I^dTSb
z$nX2)qginD99HCfJa<=4AIX>5WjF4(bKnEqqvLq~99FCLXjGLi<qD{ktDPz^xcT5-
zw#d?N=jmL`>7>mKzK*vF+e<qyRpm=LL8KYu0uI~Hs~I3E!((}bqihm|u^k^;h5eRu
zQX~z|-i>DRGcK)#hPQQlhNH^PH=4O!;HXKKjX65w)E-{&9$W&K<{XbyIdCiS?zXD4
zXTf=Je$H`69NzXTvA18rUh5XD<oB>+Pm}C8v3K|tdxzgHeNx0Z(1YvJ$0JI3JGO@f
zs-zOBpWMs#9v{-qf#-JTpvoD?PeHE2`}LFUk)+*9o806~HBbD)@5hQuj%4Y!yr(iA
zQ=*du?X&DE9^<VIW<?U2hwWUo<ql>0+C<^)Dsq)eXk;|ClZ{m_p^|{q-s`WjLmmIz
z28G|l0J&t<GrAfjt*db{tp=zE>X*>K==xKfmE{tu1ieZE*4M=Z)WTYipT~CGqsp>t
zWi-7Iq(^NmJwiANdQ}4?P5u9%mKXf%@wRVY@v1CVLl~W^qoE19ebZj+pW{a@ttv}x
ziFqYzaI9A~^mpG*!dID5pVxR+&h@IcfTZB|%1V_JvV9e9-NM>#*KL|qW$7^kUDmFn
ziO)-r!04@AkgbwYt8hrVWy<bUyC_)Y)G8lp){MH={;FKC+V=Nz_^SQ<QI)+_mBi}%
zTCBdW^+E#L1Sf}ILKq<X1n{#tIsEI}50DF16+3J9(;j22a%z?R$_dz;Pfh^ICDb$O
zM)m;7IZAsqf~Fmp7D-7f?w=^G1N+Ul8z2`v5ZD9z_qCw!Yqwz4u?O}aZKhu36y<Cs
zM!&3hAj^e<4dSftBxsg>0u;9_`=*cV0g?--=j!ricU)gdiU-gLTi=qy<B=N-$%XnV
zmGJHCxDJr)t*XRT`|b0MYZl3c1Zeia+x?rXHpvC6QU?y|$VEVM!KwlJt_H}3g7sWl
zhh5^dlAKy?1=|MowLPlH^_67VSn<Jm!!s=(h1`MMS()osj<@+Ka6b-^SQFGeIVa9P
zs&|0z6Ohj_BIXY>Kmu%!N^<v`L%#ujW46z`xE36;GP#Avw=5kv4{mZxu&y5TB;Ek7
z+hrcX-I3$_<fvcU)kM5^1x|IG2dBa5jzfO$$j5!YOX||Z9`bSTZms(u+z+@P;o2|v
z@hXi1f87At#S44C5nSx@0qz0r5$Aq36E1<9@6z+Tv=sP+|DOGAR^jw0<(8#zyqhZV
zY9sYZ<94Fl##G=OxcO~NzQtGq1Gq4^$5SK<jNs<CPsbolQ(NT2jvc4LDRA?noNDyo
zkYC&7H^P4w`L%t1C+R~z>?9xMsUyrta$o}2A4!E@yp*r@{ZZ2qoB=mKYC6hzl>?Dq
z-{sd&;zfRapWg}R!Od?Wj}i62!!N!o-zXiW4k)+KmmA6>z%OC$moZH2!7rir3(6zF
zFQKpZ@LzymLVpVx^(w$Gq4P_C{{q@a=-Nk^*h75@U422nt0US+?Ak|!y%qeo694h-
zVpOXQvb#Zc>t$*^sV*BNZ_|BLv~b@R$=i<I<`|mM4%|+%k#ux18IMxiAs0K;0-GxU
zF4s}u8_c~;My+!2+@RDpC#ij#Q{6X7<x%5uxPTuiS=*CO95G2+sB3x*icgtw#HS#$
zGm^rMCXdtJrYPJ|?1dxjnUIHG%a2&(!06b|B!z3|<)j-w@vE{`GAF;{%jI#}Ke(Eb
z6pn8Ii@m1|ZzRcqNz)^zd~IFkPVef6Ns<dB`{sQoLmbIz8e=(t+_>-b&K60&(9QMH
z$vJhYgU_!|_902yZnIH7d`i4-HS!9#$T`<GKyzSdS3jyGE>9|zd`abLWe)uigIQ9R
zgQBN=8pw^4LLETYg8^U!SeDi{S#Xk-&~8vxS-D9HJ#(>LS*?=3k<x{K&4psDSd8tb
zzN(yCY&&x$FoKJnbAmW0eo7MKXP-n14oNKUu+fu~8U`ii^+ueyE?rVK=#WAUE`dw)
zh=WAD-b#+w+je5q&{<zjq66tc`UTQ3`Kl!DENdgE{;uCCqLlP2O6Ij(CjRShDVNvw
zym6KDT!#eCfpeX^HMj^ac2&_!0yph>f+{;VdT<F`>g%@nX7ern-hA6GCz8U=U2?o>
zkFVt=dO#96&)f5!qbB*nAFb(0TTk!!Hg13Axn7a<x<tJXT{`W=YaDm_R^0GOlD!Pb
zv%tKg;7*S(wnuW#iEG|n<T4>S=XfFO-b3IrA=$+%_CizxWM@wcceMi~wNd)iA8|UJ
z%jA9SxRH|VVvgV(I9INi371DmPT}?A$T5?o#!S~X@rVe|tKIc`&819ofo!?)ulsh$
zzsi^Hb%18l-w(%@$RbO1Pzk@K&1-Fc|Kz!Pk!6n|2HFs+C-3-2L)W=$ZBGVP`69oa
z)8<P&W`v7rkMUJW4ib>k$o*%yB;H~uQU<@W$?H?AW3Os`Z}Z$L%Z$WCEtl@wLMw;<
zv`-OL0&E}P+nDjHUX8N@WN$GtXf<0T*;~{N+^Y7Rb(OtE&!E1sMUuV6kwJIlT$Ai9
zstFhY#y)DtAir)@Wf!$)hi`Xn3Rq=tv4Q%g+i8L-sl&?^L2H?0wSFfDU)#CODi=rv
z(t&gfWXq5K@$Ktzl|9Kdh=M&xzhHA@^7J7#Pag|5ThCI01IVyI)}H0n?D#85l`rLo
z>T{Qe2en%THgCuF-M31zsp$9TYY~;F>l%5zMz%>ZH?m-zGyEv1vu(BeYVOB2o2l}}
zOy6t_`~%RhcJrniS7f(~GJ<p9y0_BWxsVd*!S!zhPT~#VCXY9@T|5sif{PtDv6sM2
zmSJ@liYs{zY)3t+>~_gu&nR&HE*|{So@1&KcQxz9-ie>v$+{{#dkP%N!_3RWwWUD`
zM0rehp*SB79LmFV<>BFH13#N?cW#{I7k)OKpW9rA0xkSE*@fbKpgb)6XXm?kF2R2m
z{<H93d+SicFAjcjuG>XM`W*7%Cc6scmpo;KJ&*KFcA?r#%%R*o>a*{5krm3r!_U6+
zvzo+<^!aXEL806N(igg2PzC=5_%E1w`{lvZ?rr3FNz%+9_ZeoC!ywiR`K|`Y1(Nf!
zcPDOhBo|1o4fn^LZIUF?(Gfa}-`Hplrzc!n#V%4ZRF97ETWzoI_1GT&t@4GgAI7$w
z!ry0J-R9gjYpX0{pPL+`Xw}AD-aeTlStd@#>hh~X^_F6r5asaery3y1q_gzAcKbro
z?%_~qJAPATZ&Nw8`DsQ*N%kl|KA*$wgZ)FYP8xmS(cJKO>OQY+AH7O)kdO7Vdn4IN
z-`5-Gh$O9Fd{A=DmHT4%dus3AS6ODZd2XKY(vW8IJ#yiYTp*j{{pQ-<a;%bao~V>v
zJ3}1#{W%J6-?6JCG?ptEbr+^BKy%=C``}dNf=RBIe)H2_u&i<cb!>Gz9yp;S7f>Up
zYbPG7TtK<Dr<-lZnyMt%CyUD>Z4>!5-8bzEc9muMotN>LbhLqHqt@T;lW3Jx>?Y7Y
z1{ZtV@uT2&QKrg{4DD0<M!l-`Aaa!;>q8AHwThgBecQgNa$>XYf$?K&IXJ$G%`7Km
z?Ga4WLbjunRsOYEuKbNmZW8C6<UDrE3*sESYuDhb?DKBL=C1AAR=MCM%jC7)=Bskz
zrS@Kr5u4YI*t~AWo@~QZ4!++co%frYd7e&1N$mGu@qCu^yvx1*9Q<g<zN_qP^3Rv&
z;HMYz>8E7d6&BBL&PVn<$<9mssB#GoAVWv;(wu@%dVuV0N?qFU&kj`C+jJvH4{{-$
zifi5GdgQ#5?ENV<Wb@@EonO9Br=5}xWMAueAzy+c$k>s(ZSd#dpHGSX=W`OfoHQ0M
z{X8Z1pQr2EW|VT@{H`VT-*xO-96!aIgC8jO$F$tJCL{09yTtyyyPlNN+>VM#lD+4p
z#clP4V<$OJCw*68(%ghJw}o_SA^n=mo#ccJidKaD%f}S_t_R3z4_1EzO8U>Oze?=>
z+VxAhZA#wLn)?uv^PW}~WI1?s<#t4JA)T!q>G#ln-muEDv~_uP$zw-s8vq;qy^k&L
zMbf4a{b)y&Lj>G^>fbxpr6Os7*WMDzo^N^A58IxhtP*5>#VPVd>>TXcwWlg4HtAi0
zyku^t@N@9)Ddg@MrZ(FKX#g2P#szX>Gl5JC<jCZHlpO8L^T~fn{tb;1tU#*XrcWR>
zNWDOMkOrh#APq<d(k+ma9C?twCtU!UKrVQxy|7Ec3SLqRUSiLi^RYd{US(MvOg?3D
zk8IB0M!0WNf3C*yQb&FzlLFm1K+d_gm9jCPw{|A3$`^XwB`-3|W1WG21*U{WVDS$|
zr&EC?U}<KJ2Xwo>wXfq<mboFl8{K}myWKdZ!1leY%8oO_wd+h(3df6)<`lb0?e2J$
z<P=38{J0C^AA{Z4pRcx7tFl=1*2%3x8`<X1U2Kx1?jq35c95;oy61)`dquv?Z!EMT
ziR&Nk(2JzFEj?M$uDI|3mCo1pBhFP4XHs4W(06T3-?#W@a5w3u+T;CI=6dMqDs8#g
zr@Z{AT?w@S%U2u$=?35wOaN04O4K{{Pm!{erEmE;=WT7i9k2Y0YE{1UxAjLRzuK9x
zD(RuSzqjC97ny^9TW4D3#AdtK?<ZsenHEUZ$lGgcSXC~h(=mB&XZfoHDfi_)lOOF-
zg(^ETwk|RUU)njEDi>_l8obqqyK1o)Nws54E?{23C%(646WeP&RhGV@bQbOB#aj)g
z&~`4e$_{LwD@&*XsOBKw1}>o*pzgq_MFX&m%43jUK<Ck>1UhhT#@Y6vrUZI$zR$A-
z2<N<a&fyE%6naLU&kK2m#WV7J?(*y>X*?s(=RVI9&VsXD%nnXG!#U@<bNKhUN+6tb
z?wrF<1eZ`ar#!0n$Dl{VAE-DVV^=B)9DMlE`Or^6Wcj1ZGQVq8LXnk^g>ptCXx~1n
z?2?EqUyzkcpA|%WK?*MmDV!KYp<G;FdJ$j*7<-EWAWG%3P-Wr8Pq_PYp~}KJpQy5*
zx~n*B@W_uDR9U*G800CDz|J|X-?(Oz>^Sz#$9BsJBuVjVQ|_X)&Q^-=V|QaOlBAkA
zBe@g1-Pi0!l4ZBa(bi@Ar8Wm}8wSW8<hZ}gn&T!(ZY)p0m3}tx3%xneIrbvSj^l{G
z^@%x>FMO@X%7HMT*Eh}#$%2NE?Bqx5+2{A;52GXr^<yme-}nu>R@3Zi+xFe7<V{8y
z+k<csT=`=6himgV3T$6#tL*Gq=63CMqbfTt3D+K6-^zpQe>f*x+Xk$%<2-Zsi^O{`
z^T%X^qDQ9rr+g)a4=WZQ^>^xLPSilRlE_DWFyJrB+x<EGxfB0nxrB-<pWOWvxs(6{
u)B?2&R=Fdv>t}nZwn}n<v$7X9j_lCgX~%gYxny$um;VpZ{NS*|pa1|-0PhI^

literal 0
HcmV?d00001

diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
index 31ac4c8c..66e657cd 100644
--- a/q2_moshpit/eggnog/tests/test_dbs.py
+++ b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -6,15 +6,17 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
+import tempfile
 from unittest.mock import patch, call
 from qiime2.plugin.testing import TestPluginBase
 from .._dbs import (
     fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins,
-    fetch_diamond_db, build_eggnog_diamond_db
+    fetch_diamond_db, build_eggnog_diamond_db, fetch_ncbi_taxonomy,
+    _write_version_tsv
 )
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
 from q2_types_genomics.reference_db import (
-    NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt
+    NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt, NCBITaxonomyVersionFormat
 )
 
 
@@ -150,6 +152,68 @@ def test_fetch_eggnog_fasta(self, subp_run):
         # Check that commands are ran as expected
         subp_run.assert_has_calls([first_call, second_call], any_order=False)
 
+    @patch("q2_moshpit.eggnog._dbs._make_version_df")
+    @patch("subprocess.run")
+    def test_fetch_ncbi_taxonomy(self, subp_run, mk_v_df):
+        # Call function. Patching will make sure nothing is actually ran
+        ncbi_data = fetch_ncbi_taxonomy()
+        zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
+        nodes_path = os.path.join(str(ncbi_data), "nodes.dmp")
+        names_path = os.path.join(str(ncbi_data), "names.dmp")
+        proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz")
+        version_path = os.path.join(str(ncbi_data), "version.tsv")
+
+        # Check that command was called in the expected way
+        first_call = call(
+            [
+                "wget", "-O", zip_path,
+                "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
+            ],
+            check=True
+        )
+        second_call = call(
+            [
+                "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
+                "-d", str(ncbi_data)
+            ],
+            check=True,
+        )
+        third_call = call(
+            ["rm", zip_path],
+            check=True,
+        )
+        forth_call = call(
+            [
+                "wget", "-O", proteins_path,
+                "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+                "prot.accession2taxid.gz"
+            ],
+            check=True,
+        )
+
+        # Check that commands are ran as expected
+        subp_run.assert_has_calls(
+            [first_call, second_call, third_call, forth_call],
+            any_order=False
+        )
+        mk_v_df.assert_called_once_with(
+            nodes_path,
+            names_path,
+            proteins_path,
+            version_path
+        )
+
+    def test_make_version_df(self):
+        nodes = self.get_data_path('ncbi/nodes.dmp')
+        names = self.get_data_path('ncbi/names.dmp')
+        proteins = self.get_data_path('ncbi/prot.accession2taxid.gz')
+
+        with tempfile.TemporaryDirectory() as tmp:
+            version = os.path.join(tmp, 'version.tsv')
+            _write_version_tsv(nodes, names, proteins, version)
+            format = NCBITaxonomyVersionFormat(version, mode="r")
+            format.validate()
+
     @patch("q2_moshpit.eggnog._dbs._validate_taxon_id")
     @patch("subprocess.run")
     @patch("shutil.move")

From 205913ab11a74cfb0b7121037a9a8132b8d670fc Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Thu, 18 Jan 2024 14:07:10 +0100
Subject: [PATCH 13/24] fix bug in tests

---
 q2_moshpit/eggnog/tests/test_dbs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
index 66e657cd..c73d41f8 100644
--- a/q2_moshpit/eggnog/tests/test_dbs.py
+++ b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -152,9 +152,9 @@ def test_fetch_eggnog_fasta(self, subp_run):
         # Check that commands are ran as expected
         subp_run.assert_has_calls([first_call, second_call], any_order=False)
 
-    @patch("q2_moshpit.eggnog._dbs._make_version_df")
+    @patch("q2_moshpit.eggnog._dbs._write_version_tsv")
     @patch("subprocess.run")
-    def test_fetch_ncbi_taxonomy(self, subp_run, mk_v_df):
+    def test_fetch_ncbi_taxonomy(self, subp_run, w_v_tsv):
         # Call function. Patching will make sure nothing is actually ran
         ncbi_data = fetch_ncbi_taxonomy()
         zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
@@ -196,7 +196,7 @@ def test_fetch_ncbi_taxonomy(self, subp_run, mk_v_df):
             [first_call, second_call, third_call, forth_call],
             any_order=False
         )
-        mk_v_df.assert_called_once_with(
+        w_v_tsv.assert_called_once_with(
             nodes_path,
             names_path,
             proteins_path,

From 8233505d345ee20d3936abb6b2025e1fb29374db Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Thu, 18 Jan 2024 14:12:48 +0100
Subject: [PATCH 14/24] Add ellipsis to green prompts

---
 q2_moshpit/eggnog/_dbs.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index 4fbe663b..1264eb19 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -252,7 +252,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     version_path = os.path.join(str(ncbi_data), "version.tsv")
 
     # Download zip file
-    print(colorify("Downloading *.dmp files"))
+    print(colorify("Downloading *.dmp files..."))
     run_command(
         cmd=[
             "wget", "-O", zip_path,
@@ -272,7 +272,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     run_command(cmd=["rm", zip_path])
 
     # Download proteins
-    print(colorify("Downloading proteins file (~15 GB)"))
+    print(colorify("Downloading proteins file (~15 GB)..."))
     run_command(
         cmd=[
             "wget", "-O", proteins_path,
@@ -282,12 +282,12 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     )
 
     # Constructing version file
-    print(colorify("Constructing version file"))
+    print(colorify("Constructing version file..."))
     _write_version_tsv(nodes_path, names_path, proteins_path, version_path)
 
     # Return object
     print(colorify(
-        "Done! Moving data from temporary directory to final location."
+        "Done! Moving data from temporary directory to final location..."
     ))
     return ncbi_data
 

From 650160d8746ab4f0a3aa1df305a3f47dbecd2e74 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Fri, 19 Jan 2024 12:42:36 +0100
Subject: [PATCH 15/24] remove duplicate action

---
 q2_moshpit/plugin_setup.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index e00a0feb..e708300e 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -591,23 +591,6 @@
                 "storage space is required to run this action. "
 )
 
-plugin.methods.register_function(
-    function=q2_moshpit.eggnog.fetch_eggnog_fasta,
-    inputs={},
-    parameters={},
-    outputs=[("eggnog_fasta", ReferenceDB[EggnogSequenceTaxa])],
-    output_descriptions={
-        "eggnog_proteins": "eggNOG database of protein sequences and "
-                           "their corresponding taxonomy information."
-    },
-    name="Fetch the databases necessary to run to run the "
-         "build-eggnog-diamond-db action.",
-    description="Downloads eggnog proteome database  "
-                "This script downloads 2 files "
-                "(e5.proteomes.faa and e5.taxid_info.tsv) "
-                "and creates and artifact with them. At least 18 GB of "
-                "storage space is required to run this action. "
-)
 
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.fetch_ncbi_taxonomy,

From 5cfb55b9d1361d6b453b9e3d2edf665f54a80218 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <54123712+Sann5@users.noreply.github.com>
Date: Fri, 19 Jan 2024 14:42:35 +0100
Subject: [PATCH 16/24] Update q2_moshpit/plugin_setup.py

Co-authored-by: Michal Ziemski <mziemski@ethz.ch>
---
 q2_moshpit/plugin_setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index e708300e..46660aeb 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -601,9 +601,9 @@
         "taxonomy": "NCBI reference taxonomy."
     },
     name="Fetch NCBI reference taxonomy",
-    description="Downloads NCBI reference taxonomy for the NCBI ftp server. "
-                "The resulting artifact is required in the "
-                "build-custom-diamond-db action if one whished to "
+    description="Downloads NCBI reference taxonomy from the NCBI FTP server. "
+                "The resulting artifact is required by the "
+                "build-custom-diamond-db action if one wished to "
                 "create a Diamond data base with taxonomy features. "
                 "At least 30 GB of "
                 "storage space is required to run this action.",

From 1633654f7cca5aa7d8febdc111c586d7d3e55a71 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Mon, 22 Jan 2024 07:37:02 +0100
Subject: [PATCH 17/24] correct indentation

---
 q2_moshpit/eggnog/_dbs.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index 2c751a21..5281c7d3 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -231,11 +231,11 @@ def _validate_taxon_id(eggnog_proteins, taxon):
 
     # Check for overlap with provided taxon id
         if not str(taxon) in tax_ids:
-        raise ValueError(
-            f"'{taxon}' is not valid taxon ID. "
-            "To view all valid taxon IDs inspect e5.taxid_info.tsv "
-            "file in the eggnog_proteins input."
-        )
+            raise ValueError(
+                f"'{taxon}' is not valid taxon ID. "
+                "To view all valid taxon IDs inspect e5.taxid_info.tsv "
+                "file in the eggnog_proteins input."
+            )
 
 
 def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:

From 1ef872d7ee1e2d7a50ea994d02e9a0ac73167cef Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Tue, 23 Jan 2024 10:21:13 +0100
Subject: [PATCH 18/24] Remove version file + adjust tests

---
 q2_moshpit/eggnog/_dbs.py                     |  33 ------------------
 q2_moshpit/eggnog/tests/data/ncbi/names.dmp   |  11 ------
 q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp   |  11 ------
 .../tests/data/ncbi/prot.accession2taxid.gz   | Bin 10956 -> 0 bytes
 q2_moshpit/eggnog/tests/test_dbs.py           |  28 ++-------------
 5 files changed, 3 insertions(+), 80 deletions(-)
 delete mode 100644 q2_moshpit/eggnog/tests/data/ncbi/names.dmp
 delete mode 100644 q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp
 delete mode 100644 q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz

diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index 5281c7d3..07a2fb09 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -6,7 +6,6 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
-import datetime
 import pandas as pd
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
 import shutil
@@ -246,10 +245,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     # Initialize output object and paths
     ncbi_data = NCBITaxonomyDirFmt()
     zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
-    nodes_path = os.path.join(str(ncbi_data), "nodes.dmp")
-    names_path = os.path.join(str(ncbi_data), "names.dmp")
     proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz")
-    version_path = os.path.join(str(ncbi_data), "version.tsv")
 
     # Download zip file
     print(colorify("Downloading *.dmp files..."))
@@ -281,37 +277,8 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
         ]
     )
 
-    # Constructing version file
-    print(colorify("Constructing version file..."))
-    _write_version_tsv(nodes_path, names_path, proteins_path, version_path)
-
     # Return object
     print(colorify(
         "Done! Moving data from temporary directory to final location..."
     ))
     return ncbi_data
-
-
-def _write_version_tsv(nodes, names, proteins, version):
-    names_time = datetime.date.fromtimestamp(os.path.getmtime(nodes))
-    nodes_time = datetime.date.fromtimestamp(os.path.getmtime(names))
-    proteins_time = datetime.date.fromtimestamp(os.path.getmtime(proteins))
-
-    # Create a DataFrame with file names and last modification times
-    data = {'file_name': [
-                'names.dmp',
-                'nodes.dmp',
-                'prot.accession2taxid.gz'
-                ],
-            'date': [
-                names_time.strftime('%d/%m/%Y'),
-                nodes_time.strftime('%d/%m/%Y'),
-                proteins_time.strftime('%d/%m/%Y')
-                ],
-            'time': [
-                names_time.strftime('%H:%M:%S'),
-                nodes_time.strftime('%H:%M:%S'),
-                proteins_time.strftime('%H:%M:%S')
-                ]
-            }
-    pd.DataFrame(data).to_csv(version, sep='\t', index=False)
diff --git a/q2_moshpit/eggnog/tests/data/ncbi/names.dmp b/q2_moshpit/eggnog/tests/data/ncbi/names.dmp
deleted file mode 100644
index b89e8a2b..00000000
--- a/q2_moshpit/eggnog/tests/data/ncbi/names.dmp
+++ /dev/null
@@ -1,11 +0,0 @@
-1	|	all	|		|	synonym	|
-1	|	root	|		|	scientific name	|
-2	|	Bacteria	|	Bacteria <bacteria>	|	scientific name	|
-2	|	bacteria	|		|	blast name	|
-2	|	eubacteria	|		|	genbank common name	|
-2	|	Monera	|	Monera <bacteria>	|	in-part	|
-2	|	Procaryotae	|	Procaryotae <bacteria>	|	in-part	|
-2	|	Prokaryotae	|	Prokaryotae <bacteria>	|	in-part	|
-2	|	Prokaryota	|	Prokaryota <bacteria>	|	in-part	|
-2	|	prokaryote	|	prokaryote <bacteria>	|	in-part	|
-2	|	prokaryotes	|	prokaryotes <bacteria>	|	in-part	|
diff --git a/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp b/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp
deleted file mode 100644
index 61a662a0..00000000
--- a/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp
+++ /dev/null
@@ -1,11 +0,0 @@
-1	|	1	|	no rank	|		|	8	|	0	|	1	|	0	|	0	|	0	|	0	|	0	|		|
-2	|	131567	|	superkingdom	|		|	0	|	0	|	11	|	0	|	0	|	0	|	0	|	0	|		|
-6	|	335928	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
-7	|	6	|	species	|	AC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
-9	|	32199	|	species	|	BA	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
-10	|	1706371	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
-11	|	1707	|	species	|	CG	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|	effective current name;	|
-13	|	203488	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
-14	|	13	|	species	|	DT	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
-16	|	32011	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
-17	|	16	|	species	|	MM	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
diff --git a/q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz b/q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz
deleted file mode 100644
index 8b78ac0fc0030af4e68ead0ebb547dfb55ae34c2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10956
zcmV;-Dl^p|iwFod4|`<*18{P0bS`0IV`X!5X>V>abYXaDWB_fQ%Wea)a&5<*k1?8W
zawu63OR@}EvSmxNgCq#fBmt5jxSxMnB&&A2*%M>KTqu&oH=CsX@vr~-@Bi~Z|NO82
z9R9)T<^TTuzsrCA_aFc7pZ_-e$3OpNHLk`n9o~N80QsT@Wz^`E8B{bbX{~)Is8XM&
zm<Gs<I<3v<2dzirR2qY?WVmgk8X$|&7=zEKQ9c@Pbpq;ux&_o&L3d_=TtbbYzhQt}
zN&wRO&;#U>Rn6#b9S6uIbYyh@DhbGP2~~_<+&Dl^QLAh)Dmh8(<z3R6<%^!7dK?3z
zpH_+>%LP<PTCY|TkmW)GQY@%8yY-J9ATw&LAH$%MNikaGEx-UUbYKb$ZXR)foP#-;
z4DMQ;s<N}_l7MSBK<1qPwb=4ap__4l?4vdeep#`YWd^lVCMl4*k$b=P17wanlXe_I
zaw_-fBMp!}sl?{xz#bw=v76lNGCAsywDCF$c^n7GY%?hDay<y%`AChL!J8J7SyIk}
z^KMW+Bol&-319-4Ixw*>5B8NUlBB+2jJeJw<#MK?R>E!k06FJW6mF*m$d2QBb_iSH
zvdo-~CP~Hh?8L**hBHEv-0ii_#hR<Sb-9H(iTP(c4v-Xc98GqEGQrpw$8iLyKvLX3
z1Z@a(Zp=m<Kn<{VF*<huBcWPlk9rT(05yMN*UC9>5&Iso?^T~pg}m<({a$zJ^a>y%
z-_Ik@$+U3N-h#B%<QS1k<mUkJU>CeOAstB9dlPXVJV5_9?(nB1rQwkL>-J(Z1VK;8
z`N>Re)>KYF<mY6ksjN;n0Fj-OYZ*5qOeT=X&&f|s=KR<gP>!bnFwCnFxgE(oBbjI0
z=Lbc8Ml#Q?%g+dBo{`G4U&s$~env9S;SX5~D4<=OQ9$Q7BPl<e%**eJlSz`uJU2<X
zgVS*&7bOAL;{e%#(F26Vb!X9O3orppokfqRU-ba_7j&bE8ejkz<`(npK){U|AbX2Q
z;|7`8xDJ$ho<otDP3kQ=0k^QY?JY{jpF<<S*n`OR4({1??y(~*?hy5^i`tGz;|@{p
zx~Q!}?b*YR`_7M6Mx7jrlG%q2)JWqSGV|7F#-N(LS%9{yJppC?hFrgO_0dNZ;hP8O
z`!u3HzM-t&`iiSz@qjjS82ikinjMgtL!X&R#XTT1hrS36(s)1-9%7$Hl=TrscwA@-
zi28^kJT5dY)W;Kw@YFZgfJ$+K#Zy-)%%n?sR|90<6i^@EVe!4Um|*dYG@cjQ3@n~u
z@!VHj1&4pYj~@$70q*%g?fK|>iaga9@PbNlv3-3+8ZSuWWubAQntj6JXK&G<6@DU(
zpM4sQ)Skzyw3#gXRw%iC$dZ6$U&CP$UGrn!H4`|o7yvHR$BD%Ruy3wW{P@_4d$R1?
zi~$IX+l3AQY23l$u5-8v9w0KaTWAVMqfAao_Fw|&04|I(aL*e`=B?|bOoBypq>p(=
zN^Qmhi~tvg52<F4gS5gddk|?vxA$1+_C}=fh|C=O9#_NRN0iKQp>aikD4BV;M@<2h
z0^Q!D@Aj+_i_aucXW7*rw8Cdf02eB*IQ+SS#Z_m~qa%H;U~$!342XIytuV{ZqMv{f
zU_Z`8fUmIl+PCYF00w}4Q<#A0Ql9fJg_=SD=m0J>1u4R(tUF2eAS}wdlVtC3lr_4P
z=e$dyimL&7fD0X|0tkoCr-jt_psdkJJ?EViMUBj0{`j2FA1O1aW;+z&ZefUnA2HE<
z&L^5Qvxj?jNaJoHjmXR%Wxel*50A|3;qd)JyN1KjKR)OEBUzk8jWq82G-4+8d_(Pd
zTj-k66fkvt&Zn*f!aeA5p9?*%L8ZWq^f{lAQpH6P9+1X^n&;YK{`ka<^f{lAl0~HP
zfT#}(MTkm)iRQDPXxb4LkFa=Lm>?lDC#3Pzm$gErIKkp+p$L)2cQl3fz7NOD{)vg^
zv!7`4&NhXfk?V7xYr_`5SDHbR(~X6?qr$oVUaz<VAo*u)U&$!suOA?3f0MW78aWF2
zkjS&_wuoe&^8LCJlUXk8qDeAeU)2ElhpaZ)>G$6iJDnu$zT{pfCw0&|>X2svvIe<7
zW%H6GrBjzLD)~<uotN(P{*+A>l5^0>q~`v-O5|Bm<YUTSf=!u^|Ij`uWN$MFn8pFJ
zPo)x2HnmB1pn^piJV|z-^lCrjH=_Pth<X(8j;QYxb;{id6%U)VAQQ;c$Bo?l0Q^b4
zNm(oXD5>itdrvw6tz<{pE<3>+fDWMRLCKABVSwy&BMwp?Y4i&&N7Oz7Y@4CU*h#**
zLH^`5+W=Y6Tve2|(Wo&<$M$%L%SjTKk(iyPvv(A2^|fT$wdQgx@`b)5*ib(scw=}J
zvuneXr6M~{3&+D^ksW85%hUY<vg3?!TX5T+bIgT<q%V}MlHkHf-`14R-G(uek?^Ai
zNb1yRJsPP0GttlrC@}iPEA1jPYHiSN2((r)j{9i4iw{mCNpg4gurhv7xiiR}mEzTa
z-6h4zB)Nd{o=y%w0zEFGiqVzjJds>V09*Zhe6?y>WJcqtvztcxM5+@{W$jg6TdgdT
z8l$HV%GQl3=-`A~^LnyK=9HyRp3X=TufG?umu0EC*5<J|SIs<u&RwQdjoWKrl;i}Z
z!6&aL6;EdE?M<_fNcL70PZe``@B`#R0(d26ZwFqy6gfp{VdqqU)$?5skbh7W1AWsJ
z^kW?c$c41>pyGcNj}%3gI;wKCx%bdv40$Qob}@|l*|hdiWf}Yu`9F7AF%JN>om+d|
zs_N7#OLs({?Nm}eY1`Vl%N4D)-zrPjp>jFK+T>jB5iFzbq6f$+S`H|v)P~a>ezbmo
zT(asIJy|<IE}^NRU;2$vl9Z=B-j@!MzQEex*yz{m$VnwhEm=|LP`~Jotv<Q*okmV4
z$r8EI+7D5?T$T76Uk=@FIcky}*eXa4J+v;n$_{i^Kr;@Ib1;q^_0Rh=43GtkxuIz)
zd7ZUti<un$xNvqzQaz&nONEf<Qn|Wuwx4&;i6lvmG$D6{RNYc=c>+~k?#GpHHM2^Z
zl_jOP8ab;Qx5?FSfFy^zkh|?X_~hvue^T({>CvV-RT4MGY|p2tb1&1nVn4pCk>e#v
z-4<<h<eKIZ&2!r-A&1(woT#z~ErY9f0kiDDw&&r;TVB8pkd)=z2+Pr$XdU<0e*F1d
znE_IIR_+Nog}?sEStj|SD;INHHF?BRjD5DUtFklKJgru*-_0{cmWFK-S8dZefE>6T
z-2lm4?oo4<Nvf^za^T}3_8#TbjqxR7n)Jr#+X&&+di-RNWCyYX)n?Zg997b>4{zy+
zMs9hzg0;p+foE{#qvM2A)^fy~vkC)b$8o&7^$#bI<c#CZm>jsdr~$I$*fYBm*O(&z
zaAO`4lyL16Z<TX<TDbR>q%X^vU!0Wi?!4oYBRP*(N_hA6D)zGcV{c4J{Bp=^>H)GW
zS;n%kaymEWBn5Z41_84au$<wrxnkB%&YM^D)2eil1S&dZ;dvwo?ZZh9U$^bhDyL{|
z^S*r?^x^f~Hu9>Z8k<~nBM$_oO`(0bZC_i-6j=rW=VIhjDSV0}2fwd{d|yvVUVk>P
zhu?i@wW7$EcDRRn+dKoZA-3-W6#N?p$ob%6g?n>mfb2NS-1#K-vMe3EPTuP_L(Oe4
z3flG?s_am!gT#LRNCV`8)hJm%U&aBFPN@C0%3CNUTme-JRvprML0T_*ZZ+2tX9WEc
zPo2vowex@i>VUch546#+ga@FZOF$eQoOMT9?+bo4Z6;8{k=A>kR;wo-xLcrhAzMi6
z$D5=z%Y_7J@xaGH()w{&ssqLW^+5fCRU_H@I7%J(sH=}zT7#6<)r$wNWu;4P-tpm6
zt-OFIse;$CjWz>0osad%jv`55OnLH~>qv~sdT^g+fb2M~rH`|;P?CMb?Dl6`vx7(y
zn6ny^tdH6{*T+?!at@HCA2Z6vGD4RVw>&YRa0ApVpj?^aYSorLRW4Z#f+nD85%r9&
z+sdQLB{VVm`YmbAatVz<6VSATwi*)G<FmQ7NSezeOZ$jzmBJM?uI<P+NfwZ1lDzP1
z%gG$-KgD8}0_LSz(77*h0`A-Sq$<ncsRKG)&dvJJb_T1Y+<2#P>T?u}uOGimlB9tl
z4@!Ad%oALlM>5U6+#OC_MM##Yjg!NnilaWpwn<fGnFBl9M!CQrgMIvTpPUtv#o;M?
zNwuhLic!$JA0W%fo!3NxkB&QW<fk9Ss4vsPIcY#9kO?I1Ddnldlw9ipav`B^6!aZv
z9l%=jWksyhC`N;3x0CEh-W{ai`#3-@WT|;Ht<GFRMPl)sSH#Yy_C3m{*Ke-sMYb(E
zO;ZtW!L<el{q@#%^QtUeQsh;IPwjM84t&20m!)uZ!V;P<bKvzV-1R!KSF_S4HbH-z
zF$|EicwOLlzpvg6kiz8};dEZIi@mQ)8X)KP+GIEAdupqhDv^(vn~mlJ3OR7oJ`z^R
z94|kVaNCR6%W|GR$?ttXaz03kJ(pK_dCYH9(;WDAR0HHZADaDqKb*zh;XL<?ZXa%P
z;JJN;s#3T%K7hMCx!g!j@=N<?v3>Ll=5^avja8Pm;>RJcplqIvw&O3stzA1QTqSL9
z=1wG(j?s>K<apiMw~0fQvLDx${B5yS0wcKi;-?(#h=NPtlGw|$fRvYbMiS`T{%e&l
z;V;)_tR+x^Q=L7e?_j{07xOoLZNw;n7M$(uP2%<7eCHPr4*omPB=K*&BRKfwkY4O8
zzsScC`91deg`bZaobKX<pO47L(a-I*!KVZwea9sB8W)g{6Uyy0b{x`oLi$d+<B+}+
z%Hw1^4t{B;u&b1Ov`-lp<#s}TPhEZ|<?)X6y(4|CKOH9=?7e%br;Te18w&hDxqT?H
zXOhn2jT@1V55)VSC4H5%X<`rcxg78Nvrit8z7P2C!*(3(UEse9{Fhhg%Ii^~KllXq
zIpvbGE(O$=E9${@6s~UOgh@TPBEQ$J9z?XGE9&1hb?puH<%W1~sDCxyI4QRq;=Spv
z-or09`0v*B2O8yegI{iL&Q1JxLw&jVxt}NP_lEr5lGH2iPm#Va*!zOLviFi_DH9I;
z&=>ll#wq0E3;p<4?CgR2hF`wzj2kEY^BsP<yEzw;-yd*49T(ugUvR%2m%u&HJ|0H$
zTkE}@^lK0J`GJ0|TzfKnEI9|}CO<$5S9f<K+Ibi~IFI^Q$9<$P81Y}0oDW`@jNqK4
zFH7cfUyv)or26G`$^ePxR-^S&?Qg$gE=yYH(6}!vA}tm-P6k4fWfkFRLZrzty!)2}
zz2lfk(hk@-=0&rBoRiI6MmUm`MCzMma~@;sE^L`3NgI!}-N-AQFbbFqH)4`y&pj^#
zQ(in+W2NudrKcGn%SM?tJn2<2auRnd$DK6E9+ZezGAfbmL5X_3XYY`lgGt)mZv7=`
zTz_?GOo~hJ?c<lDCizkpLDu7Lu5|sMpCdPAlBEPw5YBOCzK%TgEs{P+j6R#qH7Ae5
z+HuwX;%&StDdxPcu&87TU<4RDP$p$xSIyob*@4okzOKd$kR2G4fL0BVy+tSBDh-gm
zMK9nQQLp=`6~I?<PnNw!@i%W<RoS~|6!0Au-+PN1xqf#5U0*UhI)42=#XVW}X$$}j
zz=d2JfDvF{*6rX$@Vi|*sZb>?V3KhoE^r0`dF51O4>pH~&90qWu9DU;$+_BMXg`s;
z?JcyJi=-tHt#RnnQp~+*?xeV$<?tKO+al^1{fT~nTtc;=ZO^F6DcZhsa?=Uh$9o(g
zm#j92hEr^JovJLJr}KHa)daf!#(6p=p79phah##|3(q);ByQBw4e&kg5FTr;l|@o_
z5vjYVCxJZIwYdKLXpOtb7ydxV-H#Wp?c-O;Ty*pS#d0qdU-#`ma+OpqC>wdrZPV!6
zo$wIfBJY<J$ynRpFeAr$xOp~5Qn)ss<PIeV9>E<uuI;}>yV~t5S7q7Xj-%5?zInG}
z1|>WI4GU<PSdBpA5~_eEplJy;g0>qLRW6~H(MwxiSGkle15^XmOX$R^0cw^|2h;+!
zOQ;9xfVw3#0QEo@ODi^XogVC0l1t9gf<DIqatU=n70@NW1`kvNUGi%PKn>6(zXoLM
zX@M^JH6)-8=#pRK2-E{z@@rH;1JI>b5GQ_(K$lv9XLPTB7$upfj=6%;?1<-4Qdr05
zib;|*9UXn7FSljOKQD=*kUKp<{v}mjCn(PTRlRZcN!EnsHMWw3PEcuBBp1+81GPZy
z5(=wFSUoOU4L~(ey@Xnz0ccpt7Seh`0!~W_Pzop#a9T=$GC<+i(~@7u1QcmKEv0pY
z)ibP~7p;1ruzFs!8U_78wmug9>VQU|aS3hC@;lRBi>R{93+ULOk;P6cSO4YO45CVE
z0i&hkgJX<&*lkmt7=Avut`<q}WF%{&>{xII7=AA8DqxlLnR^_|p}2Z<klSAPx&4j<
zWVf~EglorJs{~iSvT_BXKo8E#mNIXN2H~EFH+Q>p4)Hz{IBX&DR)oa+HL3x!<3?~A
zoPL>am1uAVoas1m^w%5WeUo_UGDWb-!d$zAQ>E-Ml=GpZZu6=f_{|leNZC0^F;AFM
z0(KYgecz?j_+W0^pvi%EC%L#g<f8VLBfNa~;*~6=G}=a|0=I*`o!ASOHtKkPD~J6=
zseM^?`@c!Jl{ha;Y=8t4Tv9>o{RZ($mhv4y*=T&={ZI-#OJ&KD<89N;AZ7M@19nUE
zT(FaWNA8u16sPbmu@>&^2FT8NVlTIq0dgL#6YN#YW%;*8MP~%t>j83Zt+lhbzO?mL
zm8B1&ua@PDsY}^^t!2dXj_W{?FYOd&`#7h$D#!By+@WxEt-vQK4z4XZ_-h;>7f4R!
z`MqsmRypmis7sY@Bi{BBIY}~^=lyi-wcG{Jd~-3T-A8*OqsqD{3%Z<Vtw+7NKBV0z
zPb`b<QKJRDUIe{f7Es+b`*ZlMUA(Pw$!ZX^UA?Vx0p%%3+JD97(;}C$6@XfxOICRR
zPM;^9*A+>7a>f|V;}nk;>GQhgJdvbhtA(pO;klq{?M&nEs=XUm<x6?Ii!Q$Kb6|nN
zb^EkfWe<wQb^B&hWe+L_x9v<!l^tknrjuKp+aqXImXVy!qtKw6TU8szlY2F~<)uif
zbIAtU+-Ub`S~({F{?QJ?RVn>hD;Z8=4s848RdPSYO|6|k%Ym0s>|G{<THOUmO5(lP
zG(b*A4d?hsAB8!IHp@SQOCE_3>jBuo&Bh=P?1V^97@6Cd<JPYve?GR4SyhsoN2NTU
zolb2BnL-}`9{_pI610GA?oj0%<a4Gu^kl;TS-O=HbvYm)H~y2q@FjvODKCz8rRpiq
zkODXl&Uf6zUH}(5j&pK}?VXA$X*X;v^SO3RIC4&I*E5$1$(MGNKKq&;Ahevm``a&9
z7?OmgayhKp(@6pJ0Qn+}0cZq#gaPs|s1qL=qtGX!elA3<0V3+M+?;`2aem{GqbA8)
zh3y+fw4B6WRIuM#!CV-9+kDf3A8bng$tQ%XZ1ZuORGo(zAH_e<?PeSxm(b?(a8T_d
zXO#u@qboO+>aJOvzUOe=UbU)n33Y;U0*YKhTipnkueK<zlJak?ES@8`hH$&K^I%n$
zW@B?-$p`S-9GOCY%>W6Ix+kysYIzBb0Apt{F!;Q%=ZmaE5gobY%T#=6gu*v=fLuZY
zPzTg4p(D@;G%leEDAKxFOsf-gI}VTws6)2eV-Hm>S@l2-P_u*@pm5f9$yp8_*m|IT
z355r?aMpIwSt!`H&a83?wSw;9f&HQfT3-@hpQ@on(uk{*Gluum`2$6Yo3Wi_t8zl-
z4uamN(L(ZmNR<<^RaUNnIrszQM_7nm3CZJqkv-Ygbvf9#E7es_$hsy9ZHAeH<4VXZ
z7fANfW^<6(H-~BL_2^>cvZr7R(k_sa_hS_D7$;=ybs`^!%)#fakmv2xW}9aj@zUiX
zIm)stBmP{FgRdv?{Ph%iGDwcDpA!4^(=OPQ*uSFK%yNNj-AUZ4!2TpTEzNe`v68Ft
z{ryqNDrt)8U6L71>-JXT`P!z{Rn9k8THP#x5nSvxLp-<yu0Mw-+^wB^sPe_l&Y@^<
z3Y?mqm-1~oQvx+OY??F<`MtGGlPc$28(>PH1!udJwF2kBxz65%^Wc1EPyBpSh*wQE
zvD!H(_(dUJ)t{G={C*jUH_NVk#okwt^2qY<c@?>bUjkt-Ozh>6Kjbxu0~f)?`S}+k
ze&M=b<vd=8e8dC}=V_>gsL_nzlK3ynE+5G63f!vWTmpymtwNU%_RIA=GDniyn^vAK
zP17NYyd-IZ%xb)Q!!?j34KjI3nO94t5eJ&0kAiwVKrWz>+tYo0<@!K!2@Qh&+5xgI
zIZUqZbQn77z)wuC)T?Qq-l}~0bK>9dJ@6cu+MVDkXB^*{U#abiD`Sx~4RoX2w&hm~
zbKoaCv&gzWPIB8eu)Y0h#pxqimd83+XLu#3IfeU=osb{91(FL=y@fNUkz|S6jIQ>u
zR`Inz3T<=xDoYPrW(wsn5r4bNf$<<*EF&Aec;B`yxZSI_757U;&aDN`$iZ*fH6&|B
za{77mt!=GPXuC&OWe>I%qYvwL;<L&ZIcZT}_23ucD70-XRoQ_?z)f4KR@qy03?45M
zb(X!wmTP@fPmY?TfOMO}r*V`t9^H2wAbX3WfbINXl|ASgTyqCiB%(ICdgt~ZB#jp*
zQD@nKQKJ5d0%qB}CkU9-0NJNe12g~^{0NI}$yQ|_bpq%BF1W`s_|O7AbPsCr<HJbQ
z57S4@BSH>6NoF3Wh0L@jqfhS2m4ajsay2`hpIjeF_G#qIoW49~h9ruxzR$$30j~Vx
zw;gw^vaBlWW^lc8%fqyPh<l2hfaUzSPh#=$3h>p*e2CA{I^o*dvr2GnC*KO!mNHd>
zD>F)&hf$ybXF4tj*VZOgcIk`4wGXvbc3c97e4LOEo)5t<?GtU4ojv5^g!G;I^dTSb
z$nX2)qginD99HCfJa<=4AIX>5WjF4(bKnEqqvLq~99FCLXjGLi<qD{ktDPz^xcT5-
zw#d?N=jmL`>7>mKzK*vF+e<qyRpm=LL8KYu0uI~Hs~I3E!((}bqihm|u^k^;h5eRu
zQX~z|-i>DRGcK)#hPQQlhNH^PH=4O!;HXKKjX65w)E-{&9$W&K<{XbyIdCiS?zXD4
zXTf=Je$H`69NzXTvA18rUh5XD<oB>+Pm}C8v3K|tdxzgHeNx0Z(1YvJ$0JI3JGO@f
zs-zOBpWMs#9v{-qf#-JTpvoD?PeHE2`}LFUk)+*9o806~HBbD)@5hQuj%4Y!yr(iA
zQ=*du?X&DE9^<VIW<?U2hwWUo<ql>0+C<^)Dsq)eXk;|ClZ{m_p^|{q-s`WjLmmIz
z28G|l0J&t<GrAfjt*db{tp=zE>X*>K==xKfmE{tu1ieZE*4M=Z)WTYipT~CGqsp>t
zWi-7Iq(^NmJwiANdQ}4?P5u9%mKXf%@wRVY@v1CVLl~W^qoE19ebZj+pW{a@ttv}x
ziFqYzaI9A~^mpG*!dID5pVxR+&h@IcfTZB|%1V_JvV9e9-NM>#*KL|qW$7^kUDmFn
ziO)-r!04@AkgbwYt8hrVWy<bUyC_)Y)G8lp){MH={;FKC+V=Nz_^SQ<QI)+_mBi}%
zTCBdW^+E#L1Sf}ILKq<X1n{#tIsEI}50DF16+3J9(;j22a%z?R$_dz;Pfh^ICDb$O
zM)m;7IZAsqf~Fmp7D-7f?w=^G1N+Ul8z2`v5ZD9z_qCw!Yqwz4u?O}aZKhu36y<Cs
zM!&3hAj^e<4dSftBxsg>0u;9_`=*cV0g?--=j!ricU)gdiU-gLTi=qy<B=N-$%XnV
zmGJHCxDJr)t*XRT`|b0MYZl3c1Zeia+x?rXHpvC6QU?y|$VEVM!KwlJt_H}3g7sWl
zhh5^dlAKy?1=|MowLPlH^_67VSn<Jm!!s=(h1`MMS()osj<@+Ka6b-^SQFGeIVa9P
zs&|0z6Ohj_BIXY>Kmu%!N^<v`L%#ujW46z`xE36;GP#Avw=5kv4{mZxu&y5TB;Ek7
z+hrcX-I3$_<fvcU)kM5^1x|IG2dBa5jzfO$$j5!YOX||Z9`bSTZms(u+z+@P;o2|v
z@hXi1f87At#S44C5nSx@0qz0r5$Aq36E1<9@6z+Tv=sP+|DOGAR^jw0<(8#zyqhZV
zY9sYZ<94Fl##G=OxcO~NzQtGq1Gq4^$5SK<jNs<CPsbolQ(NT2jvc4LDRA?noNDyo
zkYC&7H^P4w`L%t1C+R~z>?9xMsUyrta$o}2A4!E@yp*r@{ZZ2qoB=mKYC6hzl>?Dq
z-{sd&;zfRapWg}R!Od?Wj}i62!!N!o-zXiW4k)+KmmA6>z%OC$moZH2!7rir3(6zF
zFQKpZ@LzymLVpVx^(w$Gq4P_C{{q@a=-Nk^*h75@U422nt0US+?Ak|!y%qeo694h-
zVpOXQvb#Zc>t$*^sV*BNZ_|BLv~b@R$=i<I<`|mM4%|+%k#ux18IMxiAs0K;0-GxU
zF4s}u8_c~;My+!2+@RDpC#ij#Q{6X7<x%5uxPTuiS=*CO95G2+sB3x*icgtw#HS#$
zGm^rMCXdtJrYPJ|?1dxjnUIHG%a2&(!06b|B!z3|<)j-w@vE{`GAF;{%jI#}Ke(Eb
z6pn8Ii@m1|ZzRcqNz)^zd~IFkPVef6Ns<dB`{sQoLmbIz8e=(t+_>-b&K60&(9QMH
z$vJhYgU_!|_902yZnIH7d`i4-HS!9#$T`<GKyzSdS3jyGE>9|zd`abLWe)uigIQ9R
zgQBN=8pw^4LLETYg8^U!SeDi{S#Xk-&~8vxS-D9HJ#(>LS*?=3k<x{K&4psDSd8tb
zzN(yCY&&x$FoKJnbAmW0eo7MKXP-n14oNKUu+fu~8U`ii^+ueyE?rVK=#WAUE`dw)
zh=WAD-b#+w+je5q&{<zjq66tc`UTQ3`Kl!DENdgE{;uCCqLlP2O6Ij(CjRShDVNvw
zym6KDT!#eCfpeX^HMj^ac2&_!0yph>f+{;VdT<F`>g%@nX7ern-hA6GCz8U=U2?o>
zkFVt=dO#96&)f5!qbB*nAFb(0TTk!!Hg13Axn7a<x<tJXT{`W=YaDm_R^0GOlD!Pb
zv%tKg;7*S(wnuW#iEG|n<T4>S=XfFO-b3IrA=$+%_CizxWM@wcceMi~wNd)iA8|UJ
z%jA9SxRH|VVvgV(I9INi371DmPT}?A$T5?o#!S~X@rVe|tKIc`&819ofo!?)ulsh$
zzsi^Hb%18l-w(%@$RbO1Pzk@K&1-Fc|Kz!Pk!6n|2HFs+C-3-2L)W=$ZBGVP`69oa
z)8<P&W`v7rkMUJW4ib>k$o*%yB;H~uQU<@W$?H?AW3Os`Z}Z$L%Z$WCEtl@wLMw;<
zv`-OL0&E}P+nDjHUX8N@WN$GtXf<0T*;~{N+^Y7Rb(OtE&!E1sMUuV6kwJIlT$Ai9
zstFhY#y)DtAir)@Wf!$)hi`Xn3Rq=tv4Q%g+i8L-sl&?^L2H?0wSFfDU)#CODi=rv
z(t&gfWXq5K@$Ktzl|9Kdh=M&xzhHA@^7J7#Pag|5ThCI01IVyI)}H0n?D#85l`rLo
z>T{Qe2en%THgCuF-M31zsp$9TYY~;F>l%5zMz%>ZH?m-zGyEv1vu(BeYVOB2o2l}}
zOy6t_`~%RhcJrniS7f(~GJ<p9y0_BWxsVd*!S!zhPT~#VCXY9@T|5sif{PtDv6sM2
zmSJ@liYs{zY)3t+>~_gu&nR&HE*|{So@1&KcQxz9-ie>v$+{{#dkP%N!_3RWwWUD`
zM0rehp*SB79LmFV<>BFH13#N?cW#{I7k)OKpW9rA0xkSE*@fbKpgb)6XXm?kF2R2m
z{<H93d+SicFAjcjuG>XM`W*7%Cc6scmpo;KJ&*KFcA?r#%%R*o>a*{5krm3r!_U6+
zvzo+<^!aXEL806N(igg2PzC=5_%E1w`{lvZ?rr3FNz%+9_ZeoC!ywiR`K|`Y1(Nf!
zcPDOhBo|1o4fn^LZIUF?(Gfa}-`Hplrzc!n#V%4ZRF97ETWzoI_1GT&t@4GgAI7$w
z!ry0J-R9gjYpX0{pPL+`Xw}AD-aeTlStd@#>hh~X^_F6r5asaery3y1q_gzAcKbro
z?%_~qJAPATZ&Nw8`DsQ*N%kl|KA*$wgZ)FYP8xmS(cJKO>OQY+AH7O)kdO7Vdn4IN
z-`5-Gh$O9Fd{A=DmHT4%dus3AS6ODZd2XKY(vW8IJ#yiYTp*j{{pQ-<a;%bao~V>v
zJ3}1#{W%J6-?6JCG?ptEbr+^BKy%=C``}dNf=RBIe)H2_u&i<cb!>Gz9yp;S7f>Up
zYbPG7TtK<Dr<-lZnyMt%CyUD>Z4>!5-8bzEc9muMotN>LbhLqHqt@T;lW3Jx>?Y7Y
z1{ZtV@uT2&QKrg{4DD0<M!l-`Aaa!;>q8AHwThgBecQgNa$>XYf$?K&IXJ$G%`7Km
z?Ga4WLbjunRsOYEuKbNmZW8C6<UDrE3*sESYuDhb?DKBL=C1AAR=MCM%jC7)=Bskz
zrS@Kr5u4YI*t~AWo@~QZ4!++co%frYd7e&1N$mGu@qCu^yvx1*9Q<g<zN_qP^3Rv&
z;HMYz>8E7d6&BBL&PVn<$<9mssB#GoAVWv;(wu@%dVuV0N?qFU&kj`C+jJvH4{{-$
zifi5GdgQ#5?ENV<Wb@@EonO9Br=5}xWMAueAzy+c$k>s(ZSd#dpHGSX=W`OfoHQ0M
z{X8Z1pQr2EW|VT@{H`VT-*xO-96!aIgC8jO$F$tJCL{09yTtyyyPlNN+>VM#lD+4p
z#clP4V<$OJCw*68(%ghJw}o_SA^n=mo#ccJidKaD%f}S_t_R3z4_1EzO8U>Oze?=>
z+VxAhZA#wLn)?uv^PW}~WI1?s<#t4JA)T!q>G#ln-muEDv~_uP$zw-s8vq;qy^k&L
zMbf4a{b)y&Lj>G^>fbxpr6Os7*WMDzo^N^A58IxhtP*5>#VPVd>>TXcwWlg4HtAi0
zyku^t@N@9)Ddg@MrZ(FKX#g2P#szX>Gl5JC<jCZHlpO8L^T~fn{tb;1tU#*XrcWR>
zNWDOMkOrh#APq<d(k+ma9C?twCtU!UKrVQxy|7Ec3SLqRUSiLi^RYd{US(MvOg?3D
zk8IB0M!0WNf3C*yQb&FzlLFm1K+d_gm9jCPw{|A3$`^XwB`-3|W1WG21*U{WVDS$|
zr&EC?U}<KJ2Xwo>wXfq<mboFl8{K}myWKdZ!1leY%8oO_wd+h(3df6)<`lb0?e2J$
z<P=38{J0C^AA{Z4pRcx7tFl=1*2%3x8`<X1U2Kx1?jq35c95;oy61)`dquv?Z!EMT
ziR&Nk(2JzFEj?M$uDI|3mCo1pBhFP4XHs4W(06T3-?#W@a5w3u+T;CI=6dMqDs8#g
zr@Z{AT?w@S%U2u$=?35wOaN04O4K{{Pm!{erEmE;=WT7i9k2Y0YE{1UxAjLRzuK9x
zD(RuSzqjC97ny^9TW4D3#AdtK?<ZsenHEUZ$lGgcSXC~h(=mB&XZfoHDfi_)lOOF-
zg(^ETwk|RUU)njEDi>_l8obqqyK1o)Nws54E?{23C%(646WeP&RhGV@bQbOB#aj)g
z&~`4e$_{LwD@&*XsOBKw1}>o*pzgq_MFX&m%43jUK<Ck>1UhhT#@Y6vrUZI$zR$A-
z2<N<a&fyE%6naLU&kK2m#WV7J?(*y>X*?s(=RVI9&VsXD%nnXG!#U@<bNKhUN+6tb
z?wrF<1eZ`ar#!0n$Dl{VAE-DVV^=B)9DMlE`Or^6Wcj1ZGQVq8LXnk^g>ptCXx~1n
z?2?EqUyzkcpA|%WK?*MmDV!KYp<G;FdJ$j*7<-EWAWG%3P-Wr8Pq_PYp~}KJpQy5*
zx~n*B@W_uDR9U*G800CDz|J|X-?(Oz>^Sz#$9BsJBuVjVQ|_X)&Q^-=V|QaOlBAkA
zBe@g1-Pi0!l4ZBa(bi@Ar8Wm}8wSW8<hZ}gn&T!(ZY)p0m3}tx3%xneIrbvSj^l{G
z^@%x>FMO@X%7HMT*Eh}#$%2NE?Bqx5+2{A;52GXr^<yme-}nu>R@3Zi+xFe7<V{8y
z+k<csT=`=6himgV3T$6#tL*Gq=63CMqbfTt3D+K6-^zpQe>f*x+Xk$%<2-Zsi^O{`
z^T%X^qDQ9rr+g)a4=WZQ^>^xLPSilRlE_DWFyJrB+x<EGxfB0nxrB-<pWOWvxs(6{
u)B?2&R=Fdv>t}nZwn}n<v$7X9j_lCgX~%gYxny$um;VpZ{NS*|pa1|-0PhI^

diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
index 59fde66d..1b3923d4 100644
--- a/q2_moshpit/eggnog/tests/test_dbs.py
+++ b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -6,17 +6,16 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
-import tempfile
 from unittest.mock import patch, call
 from qiime2.plugin.testing import TestPluginBase
 from .._dbs import (
     fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins,
     fetch_diamond_db, build_eggnog_diamond_db, fetch_ncbi_taxonomy,
-    _write_version_tsv, _validate_taxon_id
+    _validate_taxon_id
 )
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
 from q2_types_genomics.reference_db import (
-    NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt, NCBITaxonomyVersionFormat
+    NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt
 )
 
 
@@ -152,16 +151,12 @@ def test_fetch_eggnog_fasta(self, subp_run):
         # Check that commands are ran as expected
         subp_run.assert_has_calls([first_call, second_call], any_order=False)
 
-    @patch("q2_moshpit.eggnog._dbs._write_version_tsv")
     @patch("subprocess.run")
-    def test_fetch_ncbi_taxonomy(self, subp_run, w_v_tsv):
+    def test_fetch_ncbi_taxonomy(self, subp_run):
         # Call function. Patching will make sure nothing is actually ran
         ncbi_data = fetch_ncbi_taxonomy()
         zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
-        nodes_path = os.path.join(str(ncbi_data), "nodes.dmp")
-        names_path = os.path.join(str(ncbi_data), "names.dmp")
         proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz")
-        version_path = os.path.join(str(ncbi_data), "version.tsv")
 
         # Check that command was called in the expected way
         first_call = call(
@@ -196,23 +191,6 @@ def test_fetch_ncbi_taxonomy(self, subp_run, w_v_tsv):
             [first_call, second_call, third_call, forth_call],
             any_order=False
         )
-        w_v_tsv.assert_called_once_with(
-            nodes_path,
-            names_path,
-            proteins_path,
-            version_path
-        )
-
-    def test_make_version_df(self):
-        nodes = self.get_data_path('ncbi/nodes.dmp')
-        names = self.get_data_path('ncbi/names.dmp')
-        proteins = self.get_data_path('ncbi/prot.accession2taxid.gz')
-
-        with tempfile.TemporaryDirectory() as tmp:
-            version = os.path.join(tmp, 'version.tsv')
-            _write_version_tsv(nodes, names, proteins, version)
-            format = NCBITaxonomyVersionFormat(version, mode="r")
-            format.validate()
 
     @patch("q2_moshpit.eggnog._dbs._validate_taxon_id")
     @patch("subprocess.run")

From a0bf4579e76060361f4b45c6812605a1da7b2e33 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Tue, 23 Jan 2024 10:31:06 +0100
Subject: [PATCH 19/24] Adjust file size in prompt

---
 q2_moshpit/eggnog/_dbs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index 07a2fb09..a9f8a344 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -268,7 +268,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     run_command(cmd=["rm", zip_path])
 
     # Download proteins
-    print(colorify("Downloading proteins file (~15 GB)..."))
+    print(colorify("Downloading proteins file (~8 GB)..."))
     run_command(
         cmd=[
             "wget", "-O", proteins_path,

From f6805677de9be7215feadecb7fa6c388b12192ba Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Tue, 23 Jan 2024 14:24:14 +0100
Subject: [PATCH 20/24] Reorganize fetch_ncbi_taxonomy

---
 q2_moshpit/eggnog/_dbs.py | 56 ++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index a9f8a344..4de3ac2a 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -13,7 +13,9 @@
     EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
     EggnogProteinSequencesDirFmt
 )
-from .._utils import run_command, _process_common_input_params, colorify
+from .._utils import (
+    run_command, _process_common_input_params, colorify, compare_md5_hashes
+)
 from ._utils import _parse_build_diamond_db_params
 
 
@@ -247,14 +249,19 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
     proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz")
 
-    # Download zip file
+    # Download zip file + MD5 file
     print(colorify("Downloading *.dmp files..."))
-    run_command(
-        cmd=[
-            "wget", "-O", zip_path,
-            "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
-        ]
-    )
+    for ext in ["", ".md5"]:
+        # Fetch the archive (ext == ""), then its .md5 checksum file
+        run_command(
+            cmd=[
+                "wget", "-O", f"{zip_path}{ext}",
+                f"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip{ext}"
+            ]
+        )
+
+    # Collect and compare md5 hashes
+    _collect_and_compare_md5(f"{zip_path}.md5", zip_path)
 
     # Unzip
     run_command(
@@ -267,18 +274,35 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     # Remove zip file
     run_command(cmd=["rm", zip_path])
 
-    # Download proteins
+    # Download proteins + MD5 file
     print(colorify("Downloading proteins file (~8 GB)..."))
-    run_command(
-        cmd=[
-            "wget", "-O", proteins_path,
-            "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
-            "prot.accession2taxid.gz"
-        ]
-    )
+    for ext in ["", ".md5"]:
+        run_command(
+            cmd=[
+                "wget", "-O", f"{proteins_path}{ext}",
+                "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+                f"prot.accession2taxid.gz{ext}"
+            ]
+        )
+
+    # Collect and compare md5 hashes
+    _collect_and_compare_md5(f"{proteins_path}.md5", proteins_path)
 
     # Return object
     print(colorify(
         "Done! Moving data from temporary directory to final location..."
     ))
     return ncbi_data
+
+
+def _collect_and_compare_md5(path_to_md5: str, path_to_file: str):
+    with open(path_to_md5, 'r') as f:
+        # Read the first line
+        first_line = f.readline().strip()
+        # Split the line into hash and file name
+        md5_hash, _ = first_line.split(' ', 1)
+        # Compare
+        compare_md5_hashes(md5_hash, path_to_file)
+
+    # If no exception is raised, remove md5 file
+    run_command(cmd=["rm", path_to_md5])

From af0f652b780c1a859054cc32102c6342daac9eb1 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Tue, 23 Jan 2024 14:37:36 +0100
Subject: [PATCH 21/24] Add tests

---
 q2_moshpit/_utils.py                       | 24 +++++++
 q2_moshpit/eggnog/tests/data/md5/a.txt     |  1 +
 q2_moshpit/eggnog/tests/data/md5/a.txt.md5 |  1 +
 q2_moshpit/eggnog/tests/data/md5/b.txt     |  1 +
 q2_moshpit/eggnog/tests/test_dbs.py        | 81 ++++++++++++++++------
 q2_moshpit/tests/data/md5/a.txt            |  1 +
 q2_moshpit/tests/data/md5/b.txt            |  1 +
 q2_moshpit/tests/test_utils.py             | 33 +++++++--
 8 files changed, 116 insertions(+), 27 deletions(-)
 create mode 100644 q2_moshpit/eggnog/tests/data/md5/a.txt
 create mode 100644 q2_moshpit/eggnog/tests/data/md5/a.txt.md5
 create mode 100644 q2_moshpit/eggnog/tests/data/md5/b.txt
 create mode 100644 q2_moshpit/tests/data/md5/a.txt
 create mode 100644 q2_moshpit/tests/data/md5/b.txt

diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py
index 331f592d..0bce8a0a 100644
--- a/q2_moshpit/_utils.py
+++ b/q2_moshpit/_utils.py
@@ -5,7 +5,9 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+from qiime2.core.exceptions import ValidationError
 import subprocess
+import hashlib
 from typing import List
 
 
@@ -74,3 +76,25 @@ def _process_common_input_params(processing_func, params: dict) -> List[str]:
 
 def colorify(string):
     return "%s%s%s" % ('\033[1;32m', string, "\033[0m")
+
+
+def compare_md5_hashes(expected_hash: str, path_to_file: str):
+    observed_hash = calculate_md5_from_file(path_to_file)
+    if observed_hash != expected_hash:
+        raise ValidationError(
+            "Download error. Data possibly corrupted.\n"
+            f"{path_to_file} has an unexpected MD5 hash.\n\n"
+            "Expected hash:\n"
+            f"{expected_hash}\n\n"
+            "Observed hash:\n"
+            f"{observed_hash}"
+        )
+
+
+def calculate_md5_from_file(file_path):
+    md5_hash = hashlib.md5()
+    with open(file_path, 'rb') as f:
+        # Read the file in chunks to handle large files
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5_hash.update(chunk)
+    return md5_hash.hexdigest()
diff --git a/q2_moshpit/eggnog/tests/data/md5/a.txt b/q2_moshpit/eggnog/tests/data/md5/a.txt
new file mode 100644
index 00000000..348608f1
--- /dev/null
+++ b/q2_moshpit/eggnog/tests/data/md5/a.txt
@@ -0,0 +1 @@
+I am a text file. Calculate an MD% hash from me.
\ No newline at end of file
diff --git a/q2_moshpit/eggnog/tests/data/md5/a.txt.md5 b/q2_moshpit/eggnog/tests/data/md5/a.txt.md5
new file mode 100644
index 00000000..f9f80c4b
--- /dev/null
+++ b/q2_moshpit/eggnog/tests/data/md5/a.txt.md5
@@ -0,0 +1 @@
+a583054a9831a6e7cc56ea5cd9cac40a a.txt
\ No newline at end of file
diff --git a/q2_moshpit/eggnog/tests/data/md5/b.txt b/q2_moshpit/eggnog/tests/data/md5/b.txt
new file mode 100644
index 00000000..6af1c12b
--- /dev/null
+++ b/q2_moshpit/eggnog/tests/data/md5/b.txt
@@ -0,0 +1 @@
+I am a another text file. 
\ No newline at end of file
diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
index 1b3923d4..a32fb702 100644
--- a/q2_moshpit/eggnog/tests/test_dbs.py
+++ b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -8,10 +8,11 @@
 import os
 from unittest.mock import patch, call
 from qiime2.plugin.testing import TestPluginBase
+from qiime2.core.exceptions import ValidationError
 from .._dbs import (
     fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins,
     fetch_diamond_db, build_eggnog_diamond_db, fetch_ncbi_taxonomy,
-    _validate_taxon_id
+    _validate_taxon_id, _collect_and_compare_md5
 )
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
 from q2_types_genomics.reference_db import (
@@ -151,46 +152,80 @@ def test_fetch_eggnog_fasta(self, subp_run):
         # Check that commands are ran as expected
         subp_run.assert_has_calls([first_call, second_call], any_order=False)
 
+    @patch("q2_moshpit.eggnog._dbs._collect_and_compare_md5")
     @patch("subprocess.run")
-    def test_fetch_ncbi_taxonomy(self, subp_run):
+    def test_fetch_ncbi_taxonomy(self, subp_run, cc_md5):
         # Call function. Patching will make sure nothing is actually ran
         ncbi_data = fetch_ncbi_taxonomy()
         zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
         proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz")
 
         # Check that command was called in the expected way
-        first_call = call(
-            [
-                "wget", "-O", zip_path,
-                "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
-            ],
-            check=True
-        )
-        second_call = call(
+        I_call, II_call = [
+            call(
+                [
+                    "wget", "-O", f"{zip_path}{ext}",
+                    f"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip{ext}"
+                ],
+                check=True
+            )
+            for ext in ["", ".md5"]
+        ]
+        III_call = call(f"{zip_path}.md5", zip_path)
+        IV_call = call(
             [
                 "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
                 "-d", str(ncbi_data)
             ],
             check=True,
         )
-        third_call = call(
-            ["rm", zip_path],
-            check=True,
-        )
-        forth_call = call(
-            [
-                "wget", "-O", proteins_path,
-                "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
-                "prot.accession2taxid.gz"
-            ],
-            check=True,
-        )
+        V_call = call(["rm", zip_path], check=True)
+        VI_call, VII_call = [
+            call(
+                [
+                    "wget", "-O", f"{proteins_path}{ext}",
+                    "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+                    f"prot.accession2taxid.gz{ext}"
+                ],
+                check=True
+            )
+            for ext in ["", ".md5"]
+        ]
+        VIII_call = call(f"{proteins_path}.md5", proteins_path)
 
         # Check that commands are ran as expected
         subp_run.assert_has_calls(
-            [first_call, second_call, third_call, forth_call],
+            [I_call, II_call, IV_call, V_call, VI_call, VII_call],
             any_order=False
         )
+        cc_md5.assert_has_calls([III_call, VIII_call], any_order=False)
+
+    @patch("subprocess.run")
+    def test_collect_and_compare_md5_valid(self, subp_run):
+        path_to_file = self.get_data_path("md5/a.txt")
+
+        # Should raise no errors
+        _collect_and_compare_md5(f"{path_to_file}.md5", path_to_file)
+
+        # Check rm is called as expected
+        subp_run.assert_called_once_with(
+            ["rm", f"{path_to_file}.md5"], check=True
+        )
+
+    @patch("subprocess.run")
+    def test_collect_and_compare_md5_invalid(self, subp_run):
+        path_to_file = self.get_data_path("md5/b.txt")
+        path_to_wrong_md5 = self.get_data_path("md5/a.txt.md5")
+
+        # Check that expected exception is raised
+        with self.assertRaisesRegex(
+            ValidationError,
+            "has an unexpected MD5 hash"
+        ):
+            _collect_and_compare_md5(path_to_wrong_md5, path_to_file)
+
+        # check that rm is not called
+        subp_run.assert_not_called()
 
     @patch("q2_moshpit.eggnog._dbs._validate_taxon_id")
     @patch("subprocess.run")
diff --git a/q2_moshpit/tests/data/md5/a.txt b/q2_moshpit/tests/data/md5/a.txt
new file mode 100644
index 00000000..348608f1
--- /dev/null
+++ b/q2_moshpit/tests/data/md5/a.txt
@@ -0,0 +1 @@
+I am a text file. Calculate an MD% hash from me.
\ No newline at end of file
diff --git a/q2_moshpit/tests/data/md5/b.txt b/q2_moshpit/tests/data/md5/b.txt
new file mode 100644
index 00000000..6af1c12b
--- /dev/null
+++ b/q2_moshpit/tests/data/md5/b.txt
@@ -0,0 +1 @@
+I am a another text file. 
\ No newline at end of file
diff --git a/q2_moshpit/tests/test_utils.py b/q2_moshpit/tests/test_utils.py
index 77f9f37c..98a6858d 100644
--- a/q2_moshpit/tests/test_utils.py
+++ b/q2_moshpit/tests/test_utils.py
@@ -5,12 +5,13 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
 import unittest
-
 from qiime2.plugin.testing import TestPluginBase
-
-from .._utils import _construct_param, _process_common_input_params
+from qiime2.core.exceptions import ValidationError
+from .._utils import (
+    _construct_param, _process_common_input_params, compare_md5_hashes,
+    calculate_md5_from_file
+)
 
 
 def fake_processing_func(key, val):
@@ -113,6 +114,30 @@ def test_process_common_inputs_mix_with_falsy_values(self):
         ]
         self.assertSetEqual(set(observed), set(expected))
 
+    def test_compare_md5_hashes_pass(self):
+        path_to_file = self.get_data_path("md5/a.txt")
+        compare_md5_hashes("a583054a9831a6e7cc56ea5cd9cac40a", path_to_file)
+
+    def test_compare_md5_hashes_fail(self):
+        path_to_file = self.get_data_path("md5/b.txt")
+        with self.assertRaisesRegex(
+            ValidationError,
+            "has an unexpected MD5 hash"
+        ):
+            compare_md5_hashes(
+                "a583054a9831a6e7cc56ea5cd9cac40a", path_to_file
+            )
+
+    def test_calculate_md5_from_pass(self):
+        path_to_file = self.get_data_path("md5/a.txt")
+        observed_hash = calculate_md5_from_file(path_to_file)
+        self.assertEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a")
+
+    def test_calculate_md5_from_fail(self):
+        path_to_file = self.get_data_path("md5/b.txt")
+        observed_hash = calculate_md5_from_file(path_to_file)
+        self.assertNotEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a")
+
 
 if __name__ == '__main__':
     unittest.main()

From c4e28f0d1ba578d6cb8a72e9af95ab70b1abf85c Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <54123712+Sann5@users.noreply.github.com>
Date: Wed, 24 Jan 2024 16:29:05 +0100
Subject: [PATCH 22/24] Update q2_moshpit/eggnog/_dbs.py

Co-authored-by: Michal Ziemski <mziemski@ethz.ch>
---
 q2_moshpit/eggnog/_dbs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index 4de3ac2a..dec0e953 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -241,7 +241,7 @@ def _validate_taxon_id(eggnog_proteins, taxon):
 
 def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     """
-    Script fetches 3 files from the internet and puts them into the folder of
+    Script fetches 3 files from the NCBI server and puts them into the folder of
     a NCBITaxonomyDirFmt object.
     """
     # Initialize output object and paths

From 301ae006bf84a733e0943ce4704ebe7b6fada916 Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Wed, 24 Jan 2024 17:08:59 +0100
Subject: [PATCH 23/24] Eliminate duplicated action

---
 q2_moshpit/plugin_setup.py | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index c1833089..885cd394 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -610,34 +610,6 @@
     citations=[citations["NCBI"]]
 )
 
-plugin.methods.register_function(
-    function=q2_moshpit.eggnog.build_eggnog_diamond_db,
-    inputs={
-        'eggnog_proteins': ReferenceDB[EggnogProteinSequences],
-    },
-    input_descriptions={
-        'eggnog_proteins': "eggNOG database of protein sequences and "
-                           "their corresponding taxonomy information "
-                           "(generated through the fetch-eggnog-proteins "
-                           "action)."
-    },
-    parameters={
-        'taxon': Int % Range(2, 1579337)
-    },
-    parameter_descriptions={
-        'taxon': "Taxon ID number."
-    },
-    outputs=[("diamond_db", ReferenceDB[Diamond])],
-    output_descriptions={
-        "diamond_db": "Complete Diamond reference database for the"
-                      "specified taxon."
-    },
-    name="Create a DIAMOND formatted reference database for the"
-         "specified taxon.",
-    description="Creates an DIAMOND database which contains the protein "
-                "sequences that belong to the specified taxon.",
-)
-
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.build_eggnog_diamond_db,
     inputs={

From 9a2de5791b3d366a15ee3d08023870bd2e1fc87d Mon Sep 17 00:00:00 2001
From: Santiago Castro Dau <sanntiago5@gmail.com>
Date: Wed, 24 Jan 2024 17:09:52 +0100
Subject: [PATCH 24/24] Review comments Michal

---
 q2_moshpit/_utils.py                | 18 +-----
 q2_moshpit/eggnog/_dbs.py           | 85 +++++++++++++++++------------
 q2_moshpit/eggnog/tests/test_dbs.py | 78 +++++++++++++++-----------
 q2_moshpit/tests/test_utils.py      | 23 ++------
 setup.py                            |  2 +
 5 files changed, 104 insertions(+), 102 deletions(-)

diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py
index 0bce8a0a..f93efe17 100644
--- a/q2_moshpit/_utils.py
+++ b/q2_moshpit/_utils.py
@@ -5,7 +5,6 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-from qiime2.core.exceptions import ValidationError
 import subprocess
 import hashlib
 from typing import List
@@ -74,24 +73,11 @@ def _process_common_input_params(processing_func, params: dict) -> List[str]:
     return processed_args
 
 
-def colorify(string):
+def colorify(string: str):
     return "%s%s%s" % ('\033[1;32m', string, "\033[0m")
 
 
-def compare_md5_hashes(expected_hash: str, path_to_file: str):
-    observed_hash = calculate_md5_from_file(path_to_file)
-    if observed_hash != expected_hash:
-        raise ValidationError(
-            "Download error. Data possibly corrupted.\n"
-            f"{path_to_file} has an unexpected MD5 hash.\n\n"
-            "Expected hash:\n"
-            f"{expected_hash}\n\n"
-            "Observed hash:\n"
-            f"{observed_hash}"
-        )
-
-
-def calculate_md5_from_file(file_path):
+def _calculate_md5_from_file(file_path: str) -> str:
     md5_hash = hashlib.md5()
     with open(file_path, 'rb') as f:
         # Read the file in chunks to handle large files
diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
index dec0e953..65e244f7 100644
--- a/q2_moshpit/eggnog/_dbs.py
+++ b/q2_moshpit/eggnog/_dbs.py
@@ -6,15 +6,17 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
+import shutil
 import pandas as pd
+from qiime2.core.exceptions import ValidationError
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
-import shutil
 from q2_types_genomics.reference_db import (
     EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
     EggnogProteinSequencesDirFmt
 )
 from .._utils import (
-    run_command, _process_common_input_params, colorify, compare_md5_hashes
+    run_command, _process_common_input_params, colorify,
+    _calculate_md5_from_file
 )
 from ._utils import _parse_build_diamond_db_params
 
@@ -241,29 +243,30 @@ def _validate_taxon_id(eggnog_proteins, taxon):
 
 def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
     """
-    Script fetches 3 files from the NCBI server and puts them into the folder of
-    a NCBITaxonomyDirFmt object.
+    Script fetches 3 files from the NCBI server and puts them into the folder
+    of a NCBITaxonomyDirFmt object.
     """
-    # Initialize output object and paths
     ncbi_data = NCBITaxonomyDirFmt()
     zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
     proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz")
 
-    # Download zip file + MD5 file
+    # Download dump zip file + MD5 file
     print(colorify("Downloading *.dmp files..."))
-    for ext in ["", ".md5"]:
-        # Download MD5
-        run_command(
-            cmd=[
-                "wget", "-O", f"{zip_path}{ext}",
-                f"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip{ext}"
-            ]
-        )
+    run_command(
+        cmd=[
+            "wget", "-O", f"{zip_path}",
+            "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
+        ]
+    )
+    run_command(
+        cmd=[
+            "wget", "-O", f"{zip_path}.md5",
+            "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip.md5"
+        ]
+    )
 
-    # Collect and compare md5 hashes
     _collect_and_compare_md5(f"{zip_path}.md5", zip_path)
 
-    # Unzip
     run_command(
         cmd=[
             "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
@@ -271,24 +274,27 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
         ]
     )
 
-    # Remove zip file
-    run_command(cmd=["rm", zip_path])
+    os.remove(zip_path)
 
     # Download proteins + MD5 file
     print(colorify("Downloading proteins file (~8 GB)..."))
-    for ext in ["", ".md5"]:
-        run_command(
-            cmd=[
-                "wget", "-O", f"{proteins_path}{ext}",
-                "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
-                f"prot.accession2taxid.gz{ext}"
-            ]
-        )
+    run_command(
+        cmd=[
+            "wget", "-O", f"{proteins_path}",
+            "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+            "prot.accession2taxid.gz"
+        ]
+    )
+    run_command(
+        cmd=[
+            "wget", "-O", f"{proteins_path}.md5",
+            "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+            "prot.accession2taxid.gz.md5"
+        ]
+    )
 
-    # Collect and compare md5 hashes
     _collect_and_compare_md5(f"{proteins_path}.md5", proteins_path)
 
-    # Return object
     print(colorify(
         "Done! Moving data from temporary directory to final location..."
     ))
@@ -296,13 +302,22 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
 
 
 def _collect_and_compare_md5(path_to_md5: str, path_to_file: str):
+    # Read in hash from md5 file
     with open(path_to_md5, 'r') as f:
-        # Read the first line
-        first_line = f.readline().strip()
-        # Split the line into hash and file name
-        md5_hash, _ = first_line.split(' ', 1)
-        # Compare
-        compare_md5_hashes(md5_hash, path_to_file)
+        expected_hash = f.readline().strip().split(maxsplit=1)[0]
+
+    # Calculate hash from file
+    observed_hash = _calculate_md5_from_file(path_to_file)
+
+    if observed_hash != expected_hash:
+        raise ValidationError(
+            "Download error. Data possibly corrupted.\n"
+            f"{path_to_file} has an unexpected MD5 hash.\n\n"
+            "Expected hash:\n"
+            f"{expected_hash}\n\n"
+            "Observed hash:\n"
+            f"{observed_hash}"
+        )
 
     # If no exception is raised, remove md5 file
-    run_command(cmd=["rm", path_to_md5])
+    os.remove(path_to_md5)
diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
index a32fb702..6529d675 100644
--- a/q2_moshpit/eggnog/tests/test_dbs.py
+++ b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -154,66 +154,80 @@ def test_fetch_eggnog_fasta(self, subp_run):
 
     @patch("q2_moshpit.eggnog._dbs._collect_and_compare_md5")
     @patch("subprocess.run")
-    def test_fetch_ncbi_taxonomy(self, subp_run, cc_md5):
+    @patch("os.remove")
+    def test_fetch_ncbi_taxonomy(self, mock_os_rm, mock_run, mock_md5):
         # Call function. Patching will make sure nothing is actually ran
         ncbi_data = fetch_ncbi_taxonomy()
         zip_path = os.path.join(str(ncbi_data), "taxdmp.zip")
         proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz")
 
         # Check that command was called in the expected way
-        I_call, II_call = [
+        expected_calls = [
             call(
                 [
-                    "wget", "-O", f"{zip_path}{ext}",
-                    f"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip{ext}"
+                    "wget", "-O", f"{zip_path}",
+                    "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
                 ],
                 check=True
-            )
-            for ext in ["", ".md5"]
-        ]
-        III_call = call(f"{zip_path}.md5", zip_path)
-        IV_call = call(
-            [
-                "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
-                "-d", str(ncbi_data)
-            ],
-            check=True,
-        )
-        V_call = call(["rm", zip_path], check=True)
-        VI_call, VII_call = [
+            ),
+            call(
+                [
+                    "wget", "-O", f"{zip_path}.md5",
+                    "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip.md5"
+                ],
+                check=True
+            ),
+            call(
+                [
+                    "unzip", "-j", zip_path, "names.dmp", "nodes.dmp",
+                    "-d", str(ncbi_data)
+                ],
+                check=True,
+            ),
             call(
                 [
-                    "wget", "-O", f"{proteins_path}{ext}",
+                    "wget", "-O", f"{proteins_path}",
                     "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
-                    f"prot.accession2taxid.gz{ext}"
+                    "prot.accession2taxid.gz"
+                ],
+                check=True
+            ),
+            call(
+                [
+                    "wget", "-O", f"{proteins_path}.md5",
+                    "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+                    "prot.accession2taxid.gz.md5"
                 ],
                 check=True
             )
-            for ext in ["", ".md5"]
         ]
-        VIII_call = call(f"{proteins_path}.md5", proteins_path)
 
         # Check that commands are ran as expected
-        subp_run.assert_has_calls(
-            [I_call, II_call, IV_call, V_call, VI_call, VII_call],
+        mock_os_rm.assert_called_once_with(zip_path)
+        mock_run.assert_has_calls(
+            expected_calls,
+            any_order=False
+        )
+        mock_md5.assert_has_calls(
+            [
+                call(f"{zip_path}.md5", zip_path),
+                call(f"{proteins_path}.md5", proteins_path),
+            ],
             any_order=False
         )
-        cc_md5.assert_has_calls([III_call, VIII_call], any_order=False)
 
-    @patch("subprocess.run")
-    def test_collect_and_compare_md5_valid(self, subp_run):
+    @patch("os.remove")
+    def test_collect_and_compare_md5_valid(self, mock_os_rm):
         path_to_file = self.get_data_path("md5/a.txt")
 
         # Should raise no errors
         _collect_and_compare_md5(f"{path_to_file}.md5", path_to_file)
 
         # Check rm is called as expected
-        subp_run.assert_called_once_with(
-            ["rm", f"{path_to_file}.md5"], check=True
-        )
+        mock_os_rm.assert_called_once_with(f"{path_to_file}.md5")
 
-    @patch("subprocess.run")
-    def test_collect_and_compare_md5_invalid(self, subp_run):
+    @patch("os.remove")
+    def test_collect_and_compare_md5_invalid(self, mock_os_rm):
         path_to_file = self.get_data_path("md5/b.txt")
         path_to_wrong_md5 = self.get_data_path("md5/a.txt.md5")
 
@@ -225,7 +239,7 @@ def test_collect_and_compare_md5_invalid(self, subp_run):
             _collect_and_compare_md5(path_to_wrong_md5, path_to_file)
 
         # check that rm is not called
-        subp_run.assert_not_called()
+        mock_os_rm.assert_not_called()
 
     @patch("q2_moshpit.eggnog._dbs._validate_taxon_id")
     @patch("subprocess.run")
diff --git a/q2_moshpit/tests/test_utils.py b/q2_moshpit/tests/test_utils.py
index 98a6858d..780a10c7 100644
--- a/q2_moshpit/tests/test_utils.py
+++ b/q2_moshpit/tests/test_utils.py
@@ -7,10 +7,9 @@
 # ----------------------------------------------------------------------------
 import unittest
 from qiime2.plugin.testing import TestPluginBase
-from qiime2.core.exceptions import ValidationError
 from .._utils import (
-    _construct_param, _process_common_input_params, compare_md5_hashes,
-    calculate_md5_from_file
+    _construct_param, _process_common_input_params,
+    _calculate_md5_from_file
 )
 
 
@@ -114,28 +113,14 @@ def test_process_common_inputs_mix_with_falsy_values(self):
         ]
         self.assertSetEqual(set(observed), set(expected))
 
-    def test_compare_md5_hashes_pass(self):
-        path_to_file = self.get_data_path("md5/a.txt")
-        compare_md5_hashes("a583054a9831a6e7cc56ea5cd9cac40a", path_to_file)
-
-    def test_compare_md5_hashes_fail(self):
-        path_to_file = self.get_data_path("md5/b.txt")
-        with self.assertRaisesRegex(
-            ValidationError,
-            "has an unexpected MD5 hash"
-        ):
-            compare_md5_hashes(
-                "a583054a9831a6e7cc56ea5cd9cac40a", path_to_file
-            )
-
     def test_calculate_md5_from_pass(self):
         path_to_file = self.get_data_path("md5/a.txt")
-        observed_hash = calculate_md5_from_file(path_to_file)
+        observed_hash = _calculate_md5_from_file(path_to_file)
         self.assertEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a")
 
     def test_calculate_md5_from_fail(self):
         path_to_file = self.get_data_path("md5/b.txt")
-        observed_hash = calculate_md5_from_file(path_to_file)
+        observed_hash = _calculate_md5_from_file(path_to_file)
         self.assertNotEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a")
 
 
diff --git a/setup.py b/setup.py
index b7cf194e..1f048bff 100644
--- a/setup.py
+++ b/setup.py
@@ -28,6 +28,7 @@
         'q2_moshpit': [
             'citations.bib',
             'tests/data/*',
+            'tests/data/md5/*',
             "assets/busco/*",
             "assets/busco/js/*",
             "assets/busco/css/*",
@@ -47,6 +48,7 @@
         ],
         'q2_moshpit.eggnog': [
             'tests/data/*',
+            'tests/data/md5/*',
             'tests/data/build_eggnog_diamond_db/*',
             'tests/data/contig-sequences-1/*',
             'tests/data/mag-sequences/*',