From 194a54539cf2a1d994b4822f9a641d53e3816019 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Fri, 22 Dec 2023 15:02:10 +0100 Subject: [PATCH 01/24] Add colorify to utils --- q2_moshpit/_utils.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py index 331f592d..941bed2d 100644 --- a/q2_moshpit/_utils.py +++ b/q2_moshpit/_utils.py @@ -8,6 +8,27 @@ import subprocess from typing import List +# CONVERT shell colors to the same curses palette +SHELL_COLORS = { + "wr": '\033[1;37;41m', # white on red + "wo": '\033[1;37;43m', # white on orange + "wm": '\033[1;37;45m', # white on magenta + "wb": '\033[1;37;46m', # white on blue + "bw": '\033[1;37;40m', # black on white + "lblue": '\033[1;34m', # light blue + "lred": '\033[1;31m', # light red + "lgreen": '\033[1;32m', # light green + "yellow": '\033[1;33m', # yellow + "cyan": '\033[36m', # cyan + "blue": '\033[34m', # blue + "green": '\033[32m', # green + "orange": '\033[33m', # orange + "red": '\033[31m', # red + "magenta": "\033[35m", # magenta + "white": "\033[0m", # white + None: "\033[0m", # end +} + def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs): if verbose: From ea08a403eb6338976013c62113ab75599b03d25c Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Fri, 22 Dec 2023 15:19:32 +0100 Subject: [PATCH 02/24] Register action --- q2_moshpit/plugin_setup.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index fc18b0ec..fa1d4751 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -591,6 +591,25 @@ "storage space is required to run this action. " ) +plugin.methods.register_function( + function=q2_moshpit.eggnog.fetch_eggnog_fasta, + inputs={}, + parameters={}, + outputs=[("eggnog_fasta", ReferenceDB[EggnogSequenceTaxa])], + output_descriptions={ + "eggnog_fasta": "Artifact containing the eggNOG database " + "of protein sequences and their corresponding" + "taxonomy information." + }, + name="Fetch the databases necessary to run to run the " + "build-eggnog-diamond-db action.", + description="Downloads eggnog proteome database " + "This script downloads 2 files: " + "(e5.proteomes.faa and e5.taxid_info.tsv) " + "and creates and artifact with them. At least 18 Gb of " + "storage space is required to run this action. " +) + plugin.methods.register_function( function=q2_moshpit.eggnog.eggnog_diamond_search, inputs={ From 471aaad4863d61fa9b36a055b73fffabe9fcd2ff Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Thu, 11 Jan 2024 15:05:20 +0100 Subject: [PATCH 03/24] colorify only with green --- q2_moshpit/_utils.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py index 941bed2d..331f592d 100644 --- a/q2_moshpit/_utils.py +++ b/q2_moshpit/_utils.py @@ -8,27 +8,6 @@ import subprocess from typing import List -# CONVERT shell colors to the same curses palette -SHELL_COLORS = { - "wr": '\033[1;37;41m', # white on red - "wo": '\033[1;37;43m', # white on orange - "wm": '\033[1;37;45m', # white on magenta - "wb": '\033[1;37;46m', # white on blue - "bw": '\033[1;37;40m', # black on white - "lblue": '\033[1;34m', # light blue - "lred": '\033[1;31m', # light red - "lgreen": '\033[1;32m', # light green - "yellow": '\033[1;33m', # yellow - "cyan": '\033[36m', # cyan - "blue": '\033[34m', # blue - "green": '\033[32m', # green - "orange": '\033[33m', # orange - "red": '\033[31m', # red - "magenta": "\033[35m", # magenta - "white": "\033[0m", # white - None: "\033[0m", # end -} - def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs): if verbose: From 4fd93d8f9c6e6cee55524bc29ac70e5c0096450a Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau <54123712+Sann5@users.noreply.github.com> Date: Thu, 11 Jan 2024 15:06:52 +0100 Subject: [PATCH 04/24] Update q2_moshpit/plugin_setup.py Co-authored-by: Michal Ziemski --- q2_moshpit/plugin_setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index fa1d4751..a450dc25 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -604,9 +604,9 @@ name="Fetch the databases necessary to run to run the " "build-eggnog-diamond-db action.", description="Downloads eggnog proteome database " - "This script downloads 2 files: " + "This script downloads 2 files " "(e5.proteomes.faa and e5.taxid_info.tsv) " - "and creates and artifact with them. At least 18 Gb of " + "and creates and artifact with them. At least 18 GB of " "storage space is required to run this action. " ) From bc9f7c64faa6ffc195da95551743d18443d00c83 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Fri, 12 Jan 2024 14:23:53 +0100 Subject: [PATCH 05/24] Updated the output name and description --- q2_moshpit/plugin_setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index a450dc25..f0b84141 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -597,9 +597,8 @@ parameters={}, outputs=[("eggnog_fasta", ReferenceDB[EggnogSequenceTaxa])], output_descriptions={ - "eggnog_fasta": "Artifact containing the eggNOG database " - "of protein sequences and their corresponding" - "taxonomy information." + "eggnog_proteins": "eggNOG database of protein sequences and " + "their corresponding taxonomy information." }, name="Fetch the databases necessary to run to run the " "build-eggnog-diamond-db action.", From 452824f2a675718a737cfd7314b75b4850c365e6 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Fri, 12 Jan 2024 14:17:36 +0100 Subject: [PATCH 06/24] Implement build_eggnog_diamond_db action --- q2_moshpit/eggnog/__init__.py | 5 +++-- q2_moshpit/eggnog/_dbs.py | 32 ++++++++++++++++++++++++++++++++ q2_moshpit/plugin_setup.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py index c6e7f8cc..9a176a43 100644 --- a/q2_moshpit/eggnog/__init__.py +++ b/q2_moshpit/eggnog/__init__.py @@ -8,11 +8,12 @@ from ._method import eggnog_diamond_search, eggnog_annotate from ._dbs import ( fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db, - fetch_eggnog_proteins + fetch_eggnog_proteins, build_eggnog_diamond_db ) __all__ = [ 'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db', - 'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins' + 'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins', + 'build_eggnog_diamond_db', ] diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index ea7382fe..8bc71dd8 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -7,6 +7,7 @@ # ---------------------------------------------------------------------------- import os from q2_types.feature_data import ProteinSequencesDirectoryFormat +import shutil from q2_types_genomics.reference_db import ( EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt @@ -174,3 +175,34 @@ def fetch_eggnog_proteins() -> EggnogProteinSequencesDirFmt: )) return eggnog_fa + + +def build_eggnog_diamond_db( + eggnog_proteins: EggnogProteinSequencesDirFmt, + taxon: str +) -> DiamondDatabaseDirFmt: + """ + Creates an DIAMOND database which contains the protein + sequences that belong to the specified taxon. + """ + + # Initialize output objects + diamond_db = DiamondDatabaseDirFmt() + + # Define command. + cmd = [ + "create_dbs.py", + "--data_dir", str(eggnog_proteins), + "--taxids", taxon, + "--dbname", "ref_db" + ] + run_command(cmd) + + # The script will create the diamond DB in side the directory of + # eggnog_proteins object, so we need to move it to diamond_db + source_path = os.path.join(str(eggnog_proteins), "ref_db.dmnd") + destination_path = os.path.join(str(diamond_db), "ref_db.dmnd") + shutil.move(source_path, destination_path) + + # Return objects + return diamond_db diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index f0b84141..e518da8b 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -609,6 +609,34 @@ "storage space is required to run this action. " ) +plugin.methods.register_function( + function=q2_moshpit.eggnog.build_eggnog_diamond_db, + inputs={ + 'eggnog_proteins': ReferenceDB[EggnogSequenceTaxa], + }, + input_descriptions={ + 'eggnog_proteins': "eggNOG database of protein sequences and " + "their corresponding taxonomy information " + "(generated through the fetch-eggnog-proteins " + "action)." + }, + parameters={ + 'taxon': Int % Range(2, 1579337) + }, + parameter_descriptions={ + 'taxon': "Taxon ID number." + }, + outputs=[("diamond_db", ReferenceDB[Diamond])], + output_descriptions={ + "diamond_db": "Complete Diamond reference database for the" + "specified taxon." + }, + name="Create a DIAMOND formatted reference database for the" + "specified taxon.", + description="Creates an DIAMOND database which contains the protein " + "sequences that belong to the specified taxon.", +) + plugin.methods.register_function( function=q2_moshpit.eggnog.eggnog_diamond_search, inputs={ From f4e711f8d63cf027b93e38dc3d97c5d86598299d Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Fri, 12 Jan 2024 15:06:39 +0100 Subject: [PATCH 07/24] Add test --- q2_moshpit/eggnog/_dbs.py | 4 ++-- q2_moshpit/eggnog/tests/test_dbs.py | 32 +++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index 8bc71dd8..f6744d78 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -179,7 +179,7 @@ def fetch_eggnog_proteins() -> EggnogProteinSequencesDirFmt: def build_eggnog_diamond_db( eggnog_proteins: EggnogProteinSequencesDirFmt, - taxon: str + taxon: int ) -> DiamondDatabaseDirFmt: """ Creates an DIAMOND database which contains the protein @@ -193,7 +193,7 @@ def build_eggnog_diamond_db( cmd = [ "create_dbs.py", "--data_dir", str(eggnog_proteins), - "--taxids", taxon, + "--taxids", str(taxon), "--dbname", "ref_db" ] run_command(cmd) diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py index 33d4304e..f9280d86 100644 --- a/q2_moshpit/eggnog/tests/test_dbs.py +++ b/q2_moshpit/eggnog/tests/test_dbs.py @@ -10,10 +10,12 @@ from qiime2.plugin.testing import TestPluginBase from .._dbs import ( fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins, - fetch_diamond_db + fetch_diamond_db, build_eggnog_diamond_db ) from q2_types.feature_data import ProteinSequencesDirectoryFormat -from q2_types_genomics.reference_db import NCBITaxonomyDirFmt +from q2_types_genomics.reference_db import ( + NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt +) class TestFetchDB(TestPluginBase): @@ -147,3 +149,29 @@ def test_fetch_eggnog_fasta(self, subp_run): # Check that commands are ran as expected subp_run.assert_has_calls([first_call, second_call], any_order=False) + + @patch("subprocess.run") + @patch("shutil.move") + def test_build_eggnog_diamond_db(self, shut_mv, subp_run): + # Instantiate input + proteins_and_taxa = EggnogProteinSequencesDirFmt() + + # Call function. Patching will make sure nothing is + # actually ran + diamond_db = build_eggnog_diamond_db(proteins_and_taxa, taxon=2) + + # Check that command was called in the expected way + cmd = [ + "create_dbs.py", + "--data_dir", str(proteins_and_taxa), + "--taxids", "2", + "--dbname", "ref_db" + ] + + # Check that subprocess.run is run as expected + subp_run.assert_called_once_with(cmd, check=True) + + # Check that shutil.move is run as expected + source_path = os.path.join(str(proteins_and_taxa), "ref_db.dmnd") + destination_path = os.path.join(str(diamond_db), "ref_db.dmnd") + shut_mv.assert_called_once_with(source_path, destination_path) From 9b0a4c07f7b5d87a339a981533a849a9643c131e Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Mon, 15 Jan 2024 11:50:02 +0100 Subject: [PATCH 08/24] Further refactor EggnogSequenceTaxa to EggnogProteinSequences --- q2_moshpit/plugin_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index e518da8b..cf933cc1 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -612,7 +612,7 @@ plugin.methods.register_function( function=q2_moshpit.eggnog.build_eggnog_diamond_db, inputs={ - 'eggnog_proteins': ReferenceDB[EggnogSequenceTaxa], + 'eggnog_proteins': ReferenceDB[EggnogProteinSequences], }, input_descriptions={ 'eggnog_proteins': "eggNOG database of protein sequences and " From db0a0c831a55b3f3097be0a60d3cf045ebb8ae78 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Mon, 15 Jan 2024 14:37:06 +0100 Subject: [PATCH 09/24] Add validation for taxon IDs and corresponding test --- q2_moshpit/eggnog/_dbs.py | 29 +++++ .../build_eggnog_diamond_db/e5.taxid_info.tsv | 100 ++++++++++++++++++ q2_moshpit/eggnog/tests/test_dbs.py | 15 ++- setup.py | 1 + 4 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index f6744d78..49442342 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -6,6 +6,7 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import os +import pandas as pd from q2_types.feature_data import ProteinSequencesDirectoryFormat import shutil from q2_types_genomics.reference_db import ( @@ -185,6 +186,8 @@ def build_eggnog_diamond_db( Creates an DIAMOND database which contains the protein sequences that belong to the specified taxon. """ + # Validate taxon ID + _validate_taxon_id(eggnog_proteins, taxon) # Initialize output objects diamond_db = DiamondDatabaseDirFmt() @@ -206,3 +209,29 @@ def build_eggnog_diamond_db( # Return objects return diamond_db + + +def _validate_taxon_id(eggnog_proteins, taxon): + # Validate taxon id number + # Read in valid taxon ids + taxid_info = pd.read_csv( + os.path.join(str(eggnog_proteins), "e5.taxid_info.tsv"), + sep="\t" + ) + + # Convert them into a set + tax_ids = set() + for lineage in taxid_info["Taxid Lineage"]: + tax_ids.update( + set( + lineage.strip().split(",") + ) + ) + + # Check for overlap with provided taxon id + if not tax_ids.intersection(set(str(taxon))): + raise ValueError( + f"'{taxon}' is not valid taxon ID. " + "To view all valid taxon IDs inspect e5.taxid_info.tsv " + "file in the input eggnog_proteins input." + ) diff --git a/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv b/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv new file mode 100644 index 00000000..e3e30994 --- /dev/null +++ b/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv @@ -0,0 +1,100 @@ +# Taxid Sci.Name Rank Named Lineage Taxid Lineage +679937 Bacteroides coprosuis DSM 18011 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides coprosuis,Bacteroides coprosuis DSM 18011 1,131567,2,68336,976,200643,171549,815,816,151276,679937 +1146883 Blastococcus saxobsidens DD2 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Frankineae,Geodermatophilaceae,Blastococcus,Blastococcus saxobsidens,Blastococcus saxobsidens DD2 1,131567,2,201174,1760,85003,2037,85013,85030,38501,138336,1146883 +1497679 Listeriaceae bacterium FSL A5-0209 species root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Listeriaceae,unclassified Listeriaceae,Listeriaceae bacterium FSL A5-0209 1,131567,2,1239,91061,1385,186820,1081735,1497679 +69014 Thermococcus kodakarensis KOD1 no rank root,cellular organisms,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Thermococcus,Thermococcus kodakarensis,Thermococcus kodakarensis KOD1 1,131567,2157,28890,183968,2258,2259,2263,311400,69014 +888833 Streptococcus australis ATCC 700641 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus australis,Streptococcus australis ATCC 700641 1,131567,2,1239,91061,186826,1300,1301,113107,888833 +1089544 Amycolatopsis benzoatilytica AK 16/65 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis benzoatilytica,Amycolatopsis benzoatilytica AK 16/65 1,131567,2,201174,1760,85003,2037,85010,2070,1813,346045,1089544 +1089545 Amycolatopsis balhimycina FH 1894 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis balhimycina,Amycolatopsis balhimycina FH 1894 1,131567,2,201174,1760,85003,2037,85010,2070,1813,208443,1089545 +1089546 Actinopolyspora halophila DSM 43834 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinopolysporineae,Actinopolysporaceae,Actinopolyspora,Actinopolyspora halophila,Actinopolyspora halophila DSM 43834 1,131567,2,201174,1760,85003,2037,622450,622451,1849,1850,1089546 +521393 Actinomyces timonensis DSM 23838 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces timonensis,Actinomyces timonensis DSM 23838 1,131567,2,201174,1760,85003,2037,85005,2049,1654,1288391,521393 +1089548 Thermicanus aegyptius DSM 12793 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillales incertae sedis,Bacillales Family X. Incertae Sedis,Thermicanus,Thermicanus aegyptius,Thermicanus aegyptius DSM 12793 1,131567,2,1239,91061,1385,539002,539003,94008,94009,1089548 +172045 Elizabethkingia miricola species root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Elizabethkingia,Elizabethkingia miricola 1,131567,2,68336,976,117743,200644,49546,308865,172045 +1089550 Salisaeta longa DSM 21114 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidetes Order II. Incertae sedis,Rhodothermaceae,Salisaeta,Salisaeta longa,Salisaeta longa DSM 21114 1,131567,2,68336,976,1100069,563843,689697,503170,1089550 +1089551 Geminicoccus roseus DSM 18922 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,unclassified Alphaproteobacteria,Geminicoccus,Geminicoccus roseus,Geminicoccus roseus DSM 18922 1,131567,2,1224,28211,82117,489140,404900,1089551 +1089552 Rhodovibrio salinarum DSM 9154 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,Rhodovibrio,Rhodovibrio salinarum,Rhodovibrio salinarum DSM 9154 1,131567,2,1224,28211,204441,41295,85274,1087,1089552 +1089553 Thermacetogenium phaeum DSM 12270 no rank root,cellular organisms,Bacteria,Firmicutes,Clostridia,Thermoanaerobacterales,Thermoanaerobacteraceae,Thermacetogenium,Thermacetogenium phaeum,Thermacetogenium phaeum DSM 12270 1,131567,2,1239,186801,68295,186814,140458,85874,1089553 +196627 Corynebacterium glutamicum ATCC 13032 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Corynebacterineae,Corynebacteriaceae,Corynebacterium,Corynebacterium glutamicum,Corynebacterium glutamicum ATCC 13032 1,131567,2,201174,1760,85003,2037,85007,1653,1716,1718,196627 +1161902 Eubacterium nodatum ATCC 33099 no rank root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Clostridiales incertae sedis,Clostridiales Family XIII. Incertae Sedis,[Eubacterium] nodatum,Eubacterium nodatum ATCC 33099 1,131567,2,1239,186801,186802,538999,543314,35518,1161902 +446468 Nocardiopsis dassonvillei subsp. dassonvillei DSM 43111 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Streptosporangineae,Nocardiopsaceae,Nocardiopsis,Nocardiopsis dassonvillei,Nocardiopsis dassonvillei subsp. dassonvillei,Nocardiopsis dassonvillei subsp. dassonvillei DSM 43111 1,131567,2,201174,1760,85003,2037,85012,83676,2013,2014,568208,446468 +1286170 Raoultella ornithinolytica B6 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Raoultella,Raoultella ornithinolytica,Raoultella ornithinolytica B6 1,131567,2,1224,1236,91347,543,160674,54291,1286170 +1286171 Eubacterium acidaminophilum DSM 3953 no rank root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Eubacteriaceae,Eubacterium,Eubacterium acidaminophilum,Eubacterium acidaminophilum DSM 3953 1,131567,2,1239,186801,186802,186806,1730,1731,1286171 +446469 Sanguibacter keddieii DSM 10542 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Micrococcineae,Sanguibacteraceae,Sanguibacter,Sanguibacter keddieii,Sanguibacter keddieii DSM 10542 1,131567,2,201174,1760,85003,2037,85006,145360,60919,60920,446469 +1384484 Adlercreutzia equolifaciens DSM 19450 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Coriobacteridae,Coriobacteriales,Coriobacterineae,Coriobacteriaceae,Adlercreutzia,Adlercreutzia equolifaciens,Adlercreutzia equolifaciens DSM 19450 1,131567,2,201174,1760,84998,84999,255727,84107,447020,446660,1384484 +446470 Stackebrandtia nassauensis DSM 44728 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Glycomycineae,Glycomycetaceae,Stackebrandtia,Stackebrandtia nassauensis,Stackebrandtia nassauensis DSM 44728 1,131567,2,201174,1760,85003,2037,85014,85034,283810,283811,446470 +270374 Marinobacter sp. ELB17 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Marinobacter,Marinobacter sp. ELB17 1,131567,2,1224,1236,135622,72275,2742,270374 +237609 Pseudomonas alkylphenolia species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas alkylphenolia 1,131567,2,1224,1236,72274,135621,286,237609 +622637 Methylocystis sp. ATCC 49242 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylocystaceae,Methylocystis,Methylocystis sp. ATCC 49242 1,131567,2,1224,28211,356,31993,133,622637 +536019 Mesorhizobium opportunistum WSM2075 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Phyllobacteriaceae,Mesorhizobium,Mesorhizobium opportunistum,Mesorhizobium opportunistum WSM2075 1,131567,2,1224,28211,356,69277,68287,593909,536019 +46429 Sphingobium chlorophenolicum species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingobium,Sphingobium chlorophenolicum 1,131567,2,1224,28211,204457,41297,165695,46429 +1056816 Nocardia sp. BMG51109 species root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Corynebacterineae,Nocardiaceae,Nocardia,Nocardia sp. BMG51109 1,131567,2,201174,1760,85003,2037,85007,85025,1817,1056816 +589873 Alteromonas australica species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas australica 1,131567,2,1224,1236,135622,72275,226,589873 +1120947 Actinomyces vaccimaxillae DSM 15804 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces vaccimaxillae,Actinomyces vaccimaxillae DSM 15804 1,131567,2,201174,1760,85003,2037,85005,2049,1654,183916,1120947 +1056820 Teredinibacter turnerae T7902 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadales genera incertae sedis,Teredinibacter,Teredinibacter turnerae,Teredinibacter turnerae T7902 1,131567,2,1224,1236,135622,256005,2425,2426,1056820 +1269813 Thioalkalivibrio sp. ALR17-21 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Chromatiales,Ectothiorhodospiraceae,Thioalkalivibrio,Thioalkalivibrio sp. ALR17-21 1,131567,2,1224,1236,135613,72276,106633,1269813 +639030 Acidobacteria bacterium KBS 146 species root,cellular organisms,Bacteria,Fibrobacteres/Acidobacteria group,Acidobacteria,Acidobacteriia,Acidobacteriales,Acidobacteriaceae,unclassified Acidobacteriaceae,Acidobacteria bacterium KBS 146 1,131567,2,131550,57723,204432,204433,204434,112074,639030 +172088 Bradyrhizobium sp. th.b2 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Bradyrhizobiaceae,Bradyrhizobium,Bradyrhizobium sp. th.b2 1,131567,2,1224,28211,356,41294,374,172088 +180281 Cyanobium sp. PCC 7001 species root,cellular organisms,Bacteria,Cyanobacteria,Oscillatoriophycideae,Chroococcales,Cyanobium,Cyanobium sp. PCC 7001 1,131567,2,1117,1301283,1118,167375,180281 +663610 Methylocapsa aurea species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Beijerinckiaceae,Methylocapsa,Methylocapsa aurea 1,131567,2,1224,28211,356,45404,184923,663610 +1045855 Pseudoxanthomonas spadix BD-a59 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Pseudoxanthomonas,Pseudoxanthomonas spadix,Pseudoxanthomonas spadix BD-a59 1,131567,2,1224,1236,135614,32033,83618,415229,1045855 +1120949 Actinoplanes globisporus DSM 43857 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Micromonosporineae,Micromonosporaceae,Actinoplanes,Actinoplanes globisporus,Actinoplanes globisporus DSM 43857 1,131567,2,201174,1760,85003,2037,85008,28056,1865,113565,1120949 +81985 Capsella rubella species root,cellular organisms,Eukaryota,Viridiplantae,Streptophyta,Streptophytina,Embryophyta,Tracheophyta,Euphyllophyta,Spermatophyta,Magnoliophyta,Mesangiospermae,eudicotyledons,Gunneridae,Pentapetalae,rosids,malvids,Brassicales,Brassicaceae,Camelineae,Capsella,Capsella rubella 1,131567,2759,33090,35493,131221,3193,58023,78536,58024,3398,1437183,71240,91827,1437201,71275,91836,3699,3700,980083,3718,81985 +393283 Pestalotiopsis fici species root,cellular organisms,Eukaryota,Opisthokonta,Fungi,Dikarya,Ascomycota,saccharomyceta,Pezizomycotina,leotiomyceta,sordariomyceta,Sordariomycetes,Xylariomycetidae,Xylariales,Amphisphaeriaceae,Pestalotiopsis,Pestalotiopsis fici 1,131567,2759,33154,4751,451864,4890,716545,147538,716546,715989,147550,222545,37989,54958,37840,393283 +163908 Anabaena sp. PCC 7108 species root,cellular organisms,Bacteria,Cyanobacteria,Nostocales,Nostocaceae,Anabaena,Anabaena sp. PCC 7108 1,131567,2,1117,1161,1162,1163,163908 +1120950 Actinopolymorpha alba DSM 45243 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Propionibacterineae,Nocardioidaceae,Actinopolymorpha,Actinopolymorpha alba,Actinopolymorpha alba DSM 45243 1,131567,2,201174,1760,85003,2037,85009,85015,117156,533267,1120950 +1144325 Pseudomonas sp. GM21 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas sp. GM21 1,131567,2,1224,1236,72274,135621,286,1144325 +1045858 Brachyspira intermedia PWS/A no rank root,cellular organisms,Bacteria,Spirochaetes,Spirochaetia,Spirochaetales,Brachyspiraceae,Brachyspira,Brachyspira intermedia,Brachyspira intermedia PWS/A 1,131567,2,203691,203692,136,143786,29521,84377,1045858 +925775 Xanthomonas vesicatoria ATCC 35937 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xanthomonas,Xanthomonas vesicatoria,Xanthomonas vesicatoria ATCC 35937 1,131567,2,1224,1236,135614,32033,338,56460,925775 +1417296 Defluviimonas sp. 20V17 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Defluviimonas,Defluviimonas sp. 20V17 1,131567,2,1224,28211,204455,31989,1097466,1417296 +1417230 Borrelia persica No12 no rank root,cellular organisms,Bacteria,Spirochaetes,Spirochaetia,Spirochaetales,Spirochaetaceae,Borrelia,Borrelia persica,Borrelia persica No12 1,131567,2,203691,203692,136,137,138,44448,1417230 +106582 Maylandia zebra species root,cellular organisms,Eukaryota,Opisthokonta,Metazoa,Eumetazoa,Bilateria,Deuterostomia,Chordata,Craniata,Vertebrata,Gnathostomata,Teleostomi,Euteleostomi,Actinopterygii,Actinopteri,Neopterygii,Teleostei,Osteoglossocephalai,Clupeocephala,Euteleosteomorpha,Neoteleostei,Eurypterygia,Ctenosquamata,Acanthomorphata,Euacanthomorphacea,Percomorphaceae,Ovalentaria,Cichlomorphae,Cichliformes,Cichlidae,African cichlids,Pseudocrenilabrinae,Haplochromini,Maylandia,Maylandia zebra complex,Maylandia zebra 1,131567,2759,33154,33208,6072,33213,33511,7711,89593,7742,7776,117570,117571,7898,186623,41665,32443,1489341,186625,1489388,123365,123366,123367,123368,123369,1489872,1489908,1489910,1489911,8113,319095,318546,319058,143623,57445,106582 +1120953 Aestuariibacter salexigens DSM 15300 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Aestuariibacter,Aestuariibacter salexigens,Aestuariibacter salexigens DSM 15300 1,131567,2,1224,1236,135622,72275,249523,226010,1120953 +393305 Yersinia enterocolitica subsp. enterocolitica 8081 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Yersinia,Yersinia enterocolitica,Yersinia enterocolitica subsp. enterocolitica,Yersinia enterocolitica subsp. enterocolitica 8081 1,131567,2,1224,1236,91347,543,629,630,150052,393305 +1280706 Selenomonas ruminantium subsp. ruminantium ATCC 12561 no rank root,cellular organisms,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Veillonellaceae,Selenomonas,Selenomonas ruminantium,Selenomonas ruminantium subsp. ruminantium,Selenomonas ruminantium subsp. ruminantium ATCC 12561 1,131567,2,1239,909932,909929,31977,970,971,114196,1280706 +1515613 Porphyromonas sp. COT-239_OH1446 species root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,Porphyromonas sp. COT-239_OH1446 1,131567,2,68336,976,200643,171549,171551,836,1515613 +1123257 Solimonas flava DSM 18980 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Sinobacteraceae,Solimonas,Solimonas flava,Solimonas flava DSM 18980 1,131567,2,1224,1236,135614,568386,413435,415849,1123257 +1515615 Porphyromonas sp. COT-290_OH860 species root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,Porphyromonas sp. COT-290_OH860 1,131567,2,68336,976,200643,171549,171551,836,1515615 +715451 Alteromonas sp. SN2 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas sp. SN2 1,131567,2,1224,1236,135622,72275,226,715451 +589924 Ferroglobus placidus DSM 10642 no rank root,cellular organisms,Archaea,Euryarchaeota,Archaeoglobi,Archaeoglobales,Archaeoglobaceae,Ferroglobus,Ferroglobus placidus,Ferroglobus placidus DSM 10642 1,131567,2157,28890,183980,2231,2232,54260,54261,589924 +221288 Mastigocladopsis repens PCC 10914 no rank root,cellular organisms,Bacteria,Cyanobacteria,Stigonematales,Mastigocladopsis,Mastigocladopsis repens,Mastigocladopsis repens PCC 10914 1,131567,2,1117,1189,221282,221287,221288 +862908 Bacteriovorax marinus SJ no rank root,cellular organisms,Bacteria,Proteobacteria,delta/epsilon subdivisions,Deltaproteobacteria,Bdellovibrionales,Bacteriovoracaceae,Bacteriovorax,Bacteriovorax marinus,Bacteriovorax marinus SJ 1,131567,2,1224,68525,28221,213481,263369,146784,97084,862908 +311402 Agrobacterium vitis S4 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Rhizobium/Agrobacterium group,Agrobacterium,Agrobacterium vitis,Agrobacterium vitis S4 1,131567,2,1224,28211,356,82115,227290,357,373,311402 +311403 Agrobacterium radiobacter K84 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Rhizobium/Agrobacterium group,Agrobacterium,Agrobacterium tumefaciens complex,Agrobacterium tumefaciens,Agrobacterium radiobacter K84 1,131567,2,1224,28211,356,82115,227290,357,1183400,358,311403 +180332 Robinsoniella peoriensis species root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Lachnospiraceae,Robinsoniella,Robinsoniella peoriensis 1,131567,2,1239,186801,186802,186803,588605,180332 +1227453 Haloarcula japonica DSM 6131 no rank root,cellular organisms,Archaea,Euryarchaeota,Halobacteria,Halobacteriales,Halobacteriaceae,Haloarcula,Haloarcula japonica,Haloarcula japonica DSM 6131 1,131567,2157,28890,183963,2235,2236,2237,29282,1227453 +1150600 Arcticibacter svalbardensis MN12-7 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Sphingobacteriia,Sphingobacteriales,Sphingobacteriaceae,Arcticibacter,Arcticibacter svalbardensis,Arcticibacter svalbardensis MN12-7 1,131567,2,68336,976,117747,200666,84566,1288026,1288027,1150600 +1406840 Flavobacterium beibuense F44-8 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium beibuense,Flavobacterium beibuense F44-8 1,131567,2,68336,976,117743,200644,49546,237,657326,1406840 +688245 Comamonas testosteroni CNB-2 no rank root,cellular organisms,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Comamonas,Comamonas testosteroni,Comamonas testosteroni CNB-1,Comamonas testosteroni CNB-2 1,131567,2,1224,28216,80840,80864,283,285,543891,688245 +401526 Thermosinus carboxydivorans Nor1 no rank root,cellular organisms,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Veillonellaceae,Thermosinus,Thermosinus carboxydivorans,Thermosinus carboxydivorans Nor1 1,131567,2,1239,909932,909929,31977,261684,261685,401526 +335992 Candidatus Pelagibacter ubique HTCC1062 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,unclassified Alphaproteobacteria,SAR11 cluster,Candidatus Pelagibacter,Candidatus Pelagibacter ubique,Candidatus Pelagibacter ubique HTCC1062 1,131567,2,1224,28211,82117,54526,198251,198252,335992 +1163385 Peanut witches'-broom phytoplasma NTU2011 no rank root,cellular organisms,Bacteria,Tenericutes,Mollicutes,Acholeplasmatales,Acholeplasmataceae,Candidatus Phytoplasma,16SrII (Peanut WB group),Peanut witches'-broom phytoplasma,Peanut witches'-broom phytoplasma NTU2011 1,131567,2,544448,31969,186329,2146,33926,85621,35772,1163385 +999547 Leisingera daeponensis DSM 23529 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Leisingera,Leisingera daeponensis,Leisingera daeponensis DSM 23529 1,131567,2,1224,28211,204455,31989,191028,405746,999547 +1288083 Streptomyces sp. TAA040 species root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Streptomycineae,Streptomycetaceae,Streptomyces,Streptomyces sp. TAA040 1,131567,2,201174,1760,85003,2037,85011,2062,1883,1288083 +999549 Leisingera caerulea DSM 24564 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Leisingera,Leisingera caerulea,Leisingera caerulea DSM 24564 1,131567,2,1224,28211,204455,31989,191028,506591,999549 +999550 Pseudophaeobacter arcticus DSM 23566 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Pseudophaeobacter,Pseudophaeobacter arcticus,Pseudophaeobacter arcticus DSM 23566 1,131567,2,1224,28211,204455,31989,1541822,385492,999550 +1216362 Fusobacterium nucleatum ChDC F128 no rank root,cellular organisms,Bacteria,Fusobacteria,Fusobacteriia,Fusobacteriales,Fusobacteriaceae,Fusobacterium,Fusobacterium nucleatum,unclassified Fusobacterium nucleatum,Fusobacterium nucleatum ChDC F128 1,131567,2,32066,203490,203491,203492,848,851,189727,1216362 +311424 Dehalococcoides mccartyi VS no rank root,cellular organisms,Bacteria,Chloroflexi,Dehalococcoidia,Dehalococcoidales,Dehalococcoidaceae,Dehalococcoides,Dehalococcoides mccartyi,Dehalococcoides mccartyi VS 1,131567,2,200795,301297,1202465,1202464,61434,61435,311424 +573569 Francisella sp. TX077308 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Thiotrichales,Francisellaceae,Francisella,Francisella sp. TX077308 1,131567,2,1224,1236,72273,34064,262,573569 +426114 Thiomonas arsenitoxydans species root,cellular organisms,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,unclassified Burkholderiales,Burkholderiales Genera incertae sedis,Thiomonas,Thiomonas arsenitoxydans 1,131567,2,1224,28216,80840,119065,224471,32012,426114 +1212548 Pseudomonas stutzeri NF13 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas stutzeri group,Pseudomonas stutzeri subgroup,Pseudomonas stutzeri,Pseudomonas stutzeri NF13 1,131567,2,1224,1236,72274,135621,286,136846,578833,316,1212548 +426117 Methylobacterium sp. 4-46 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylobacteriaceae,Methylobacterium,Methylobacterium sp. 4-46 1,131567,2,1224,28211,356,119045,407,426117 +1138822 Lactobacillus curieae species root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus curieae 1,131567,2,1239,91061,186826,33958,1578,1138822 +98439 Fischerella thermalis PCC 7521 no rank root,cellular organisms,Bacteria,Cyanobacteria,Stigonematales,Fischerella,Fischerella thermalis,Fischerella thermalis PCC 7521 1,131567,2,1117,1189,1190,372787,98439 +65672 Piriformospora indica species root,cellular organisms,Eukaryota,Opisthokonta,Fungi,Dikarya,Basidiomycota,Agaricomycotina,Agaricomycetes,Agaricomycetes incertae sedis,Sebacinales,Sebacinales group B,Piriformospora,Piriformospora indica 1,131567,2759,33154,4751,451864,5204,5302,155619,355688,297313,1506295,65702,65672 +1441930 Serratia fonticola RB-25 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Serratia,Serratia fonticola,Serratia fonticola RB-25 1,131567,2,1224,1236,91347,543,613,47917,1441930 +688269 Thermotoga thermarum DSM 5069 no rank root,cellular organisms,Bacteria,Thermotogae,Thermotogae,Thermotogales,Thermotogaceae,Thermotoga,Thermotoga thermarum,Thermotoga thermarum DSM 5069 1,131567,2,200918,188708,2419,188709,2335,119394,688269 +688270 Cellulophaga algicola DSM 14237 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Cellulophaga,Cellulophaga algicola,Cellulophaga algicola DSM 14237 1,131567,2,68336,976,117743,200644,49546,104264,59600,688270 +1163407 Rhodanobacter spathiphylli B39 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Rhodanobacter,Rhodanobacter spathiphylli,Rhodanobacter spathiphylli B39 1,131567,2,1224,1236,135614,32033,75309,347483,1163407 +1163408 Rhodanobacter fulvus Jip2 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Rhodanobacter,Rhodanobacter fulvus,Rhodanobacter fulvus Jip2 1,131567,2,1224,1236,135614,32033,75309,219571,1163408 +1069080 Succinispira mobilis DSM 6222 no rank root,cellular organisms,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Acidaminococcaceae,Succinispira,Succinispira mobilis,Succinispira mobilis DSM 6222 1,131567,2,1239,909932,909929,909930,78119,78120,1069080 +1120963 Algicola sagamiensis DSM 14643 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Pseudoalteromonadaceae,Algicola,Algicola sagamiensis,Algicola sagamiensis DSM 14643 1,131567,2,1224,1236,135622,267888,296014,163869,1120963 +561177 Anaerococcus hydrogenalis DSM 7454 no rank root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Peptoniphilaceae,Anaerococcus,Anaerococcus hydrogenalis,Anaerococcus hydrogenalis DSM 7454 1,131567,2,1239,186801,186802,1570339,165779,33029,561177 +106648 Acinetobacter bereziniae species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Moraxellaceae,Acinetobacter,Acinetobacter bereziniae 1,131567,2,1224,1236,72274,468,469,106648 +1107311 Flavobacterium enshiense DK69 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium enshiense,Flavobacterium enshiense DK69 1,131567,2,68336,976,117743,200644,49546,237,1341165,1107311 +1136417 Salinispora pacifica CNT003 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Micromonosporineae,Micromonosporaceae,Salinispora,Salinispora pacifica,Salinispora pacifica CNT003 1,131567,2,201174,1760,85003,2037,85008,28056,168694,351187,1136417 +237727 Erythrobacter sp. NAP1 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Erythrobacteraceae,Erythrobacter,Erythrobacter sp. NAP1 1,131567,2,1224,28211,204457,335929,1041,237727 +1506583 Flavobacterium sp. Fl species root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium sp. Fl 1,131567,2,68336,976,117743,200644,49546,237,1506583 +1540257 Clostridium sp. KNHs214 species root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Clostridiaceae,Clostridium,Clostridium sp. KNHs214 1,131567,2,1239,186801,186802,31979,1485,1540257 +1120966 Algoriphagus marincola DSM 16067 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Cytophagia,Cytophagales,Cyclobacteriaceae,Algoriphagus,Algoriphagus marincola,Algoriphagus marincola DSM 16067 1,131567,2,68336,976,768503,768507,563798,246875,264027,1120966 +1437610 Bifidobacterium reuteri DSM 23975 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Bifidobacteriales,Bifidobacteriaceae,Bifidobacterium,Bifidobacterium reuteri,Bifidobacterium reuteri DSM 23975 1,131567,2,201174,1760,85003,85004,31953,1678,983706,1437610 +1380380 Ahrensia sp. 13_GOM-1096m species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Ahrensia,Ahrensia sp. 13_GOM-1096m 1,131567,2,1224,28211,204455,31989,152180,1380380 diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py index f9280d86..31ac4c8c 100644 --- a/q2_moshpit/eggnog/tests/test_dbs.py +++ b/q2_moshpit/eggnog/tests/test_dbs.py @@ -150,9 +150,10 @@ def test_fetch_eggnog_fasta(self, subp_run): # Check that commands are ran as expected subp_run.assert_has_calls([first_call, second_call], any_order=False) + @patch("q2_moshpit.eggnog._dbs._validate_taxon_id") @patch("subprocess.run") @patch("shutil.move") - def test_build_eggnog_diamond_db(self, shut_mv, subp_run): + def test_build_eggnog_diamond_db(self, shut_mv, subp_run, _val): # Instantiate input proteins_and_taxa = EggnogProteinSequencesDirFmt() @@ -175,3 +176,15 @@ def test_build_eggnog_diamond_db(self, shut_mv, subp_run): source_path = os.path.join(str(proteins_and_taxa), "ref_db.dmnd") destination_path = os.path.join(str(diamond_db), "ref_db.dmnd") shut_mv.assert_called_once_with(source_path, destination_path) + + def test_build_eggnog_diamond_db_invalid_taxon_id(self): + # Init input data + path_to_data = self.get_data_path('build_eggnog_diamond_db/') + eggnog_proteins = EggnogProteinSequencesDirFmt(path_to_data, 'r') + + # Call function exception error since taxon 0 is invalid + with self.assertRaisesRegex( + ValueError, + "'0' is not valid taxon ID. " + ): + _ = build_eggnog_diamond_db(eggnog_proteins, taxon=0) diff --git a/setup.py b/setup.py index 2a286e79..b7cf194e 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ ], 'q2_moshpit.eggnog': [ 'tests/data/*', + 'tests/data/build_eggnog_diamond_db/*', 'tests/data/contig-sequences-1/*', 'tests/data/mag-sequences/*', 'tests/data/random-db-1/*', From 527bfecca1aa11766a29b9da741e27eae6715719 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Thu, 18 Jan 2024 13:04:34 +0100 Subject: [PATCH 10/24] Implement fetch-ncbi-taxonomy --- q2_moshpit/citations.bib | 6 +++ q2_moshpit/eggnog/__init__.py | 4 +- q2_moshpit/eggnog/_dbs.py | 92 +++++++++++++++++++++++++++++++++++ q2_moshpit/plugin_setup.py | 18 +++++++ 4 files changed, 118 insertions(+), 2 deletions(-) diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib index 02c36349..64751cf4 100644 --- a/q2_moshpit/citations.bib +++ b/q2_moshpit/citations.bib @@ -123,3 +123,9 @@ @article{buchfink_sensitive_2021 keywords = {Computational biology and bioinformatics, Genome informatics, Genomic analysis, Sequencing, Software}, pages = {366--368}, } + +@misc{NCBI, + title = {National Center for Biotechnology Information (NCBI)}, + url = {https://www.ncbi.nlm.nih.gov/}, + note = {Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information;}, +} diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py index 9a176a43..2383b6f1 100644 --- a/q2_moshpit/eggnog/__init__.py +++ b/q2_moshpit/eggnog/__init__.py @@ -8,12 +8,12 @@ from ._method import eggnog_diamond_search, eggnog_annotate from ._dbs import ( fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db, - fetch_eggnog_proteins, build_eggnog_diamond_db + fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy ) __all__ = [ 'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db', 'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins', - 'build_eggnog_diamond_db', + 'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy' ] diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index 49442342..fe1a4b8a 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -6,6 +6,7 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import os +import datetime import pandas as pd from q2_types.feature_data import ProteinSequencesDirectoryFormat import shutil @@ -235,3 +236,94 @@ def _validate_taxon_id(eggnog_proteins, taxon): "To view all valid taxon IDs inspect e5.taxid_info.tsv " "file in the input eggnog_proteins input." ) + + +def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: + """ + Script fetches 3 files from the internet and puts them into the folder of + a NCBITaxonomyDirFmt object. + """ + # Initialize output object and paths + ncbi_data = NCBITaxonomyDirFmt() + zip_path = os.path.join(str(ncbi_data), "taxdmp.zip") + proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz") + version_path = os.path.join(str(ncbi_data), "version.tsv") + + # Download zip file + print(colorify("Downloading *.dmp files")) + run_command( + cmd=[ + "wget", "-O", zip_path, + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip" + ] + ) + + # Unzip + run_command( + cmd=[ + "unzip", "-j", zip_path, "names.dmp", "nodes.dmp", + "-d", str(ncbi_data) + ] + ) + + # Remove zip file + run_command( + cmd=[ + "rm", zip_path + ] + ) + + # Download proteins + print(colorify("Downloading proteins file (~15 GB)")) + run_command( + cmd=[ + "wget", "-O", proteins_path, + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" + "prot.accession2taxid.gz" + ] + ) + + # Get last modification times + print(colorify("Constructing version file")) + names_time = _get_last_modified_time(str(ncbi_data), "names.dmp") + nodes_time = _get_last_modified_time(str(ncbi_data), "nodes.dmp") + proteins_time = _get_last_modified_time( + str(ncbi_data), "prot.accession2taxid.gz" + ) + + # Create a DataFrame with file names and last modification times + data = {'file_name': [ + 'names.dmp', + 'nodes.dmp', + 'prot.accession2taxid.gz' + ], + 'date': [ + names_time.strftime('%d/%m/%Y'), + nodes_time.strftime('%d/%m/%Y'), + proteins_time.strftime('%d/%m/%Y') + ], + 'time': [ + names_time.strftime('%H:%M:%S'), + nodes_time.strftime('%H:%M:%S'), + proteins_time.strftime('%H:%M:%S') + ] + } + version = pd.DataFrame(data) + + # Write version file + version.to_csv(version_path, sep='\t', index=False) + + # Return completed object + print(colorify( + "Done! Moving data from temporary directory to final location." + )) + return ncbi_data + + +# Get the date and time modified of a file +def _get_last_modified_time(dir, file): + return datetime.date.fromtimestamp( + os.path.getmtime( + os.path.join(dir, file) + ) + ) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index cf933cc1..e00a0feb 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -609,6 +609,24 @@ "storage space is required to run this action. " ) +plugin.methods.register_function( + function=q2_moshpit.eggnog.fetch_ncbi_taxonomy, + inputs={}, + parameters={}, + outputs=[("taxonomy", ReferenceDB[NCBITaxonomy])], + output_descriptions={ + "taxonomy": "NCBI reference taxonomy." + }, + name="Fetch NCBI reference taxonomy", + description="Downloads NCBI reference taxonomy for the NCBI ftp server. " + "The resulting artifact is required in the " + "build-custom-diamond-db action if one whished to " + "create a Diamond data base with taxonomy features. " + "At least 30 GB of " + "storage space is required to run this action.", + citations=[citations["NCBI"]] +) + plugin.methods.register_function( function=q2_moshpit.eggnog.build_eggnog_diamond_db, inputs={ From 92cb2f5f5e4bca9e80efcfb1bea6dbc15ad53878 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Thu, 18 Jan 2024 14:04:39 +0100 Subject: [PATCH 11/24] _write_version_tsv functionality to separate function. --- q2_moshpit/eggnog/_dbs.py | 48 +++++++++++++++------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index fe1a4b8a..4fbe663b 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -246,6 +246,8 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: # Initialize output object and paths ncbi_data = NCBITaxonomyDirFmt() zip_path = os.path.join(str(ncbi_data), "taxdmp.zip") + nodes_path = os.path.join(str(ncbi_data), "nodes.dmp") + names_path = os.path.join(str(ncbi_data), "names.dmp") proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz") version_path = os.path.join(str(ncbi_data), "version.tsv") @@ -267,11 +269,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: ) # Remove zip file - run_command( - cmd=[ - "rm", zip_path - ] - ) + run_command(cmd=["rm", zip_path]) # Download proteins print(colorify("Downloading proteins file (~15 GB)")) @@ -283,13 +281,21 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: ] ) - # Get last modification times + # Constructing version file print(colorify("Constructing version file")) - names_time = _get_last_modified_time(str(ncbi_data), "names.dmp") - nodes_time = _get_last_modified_time(str(ncbi_data), "nodes.dmp") - proteins_time = _get_last_modified_time( - str(ncbi_data), "prot.accession2taxid.gz" - ) + _write_version_tsv(nodes_path, names_path, proteins_path, version_path) + + # Return object + print(colorify( + "Done! Moving data from temporary directory to final location." + )) + return ncbi_data + + +def _write_version_tsv(nodes, names, proteins, version): + names_time = datetime.date.fromtimestamp(os.path.getmtime(nodes)) + nodes_time = datetime.date.fromtimestamp(os.path.getmtime(names)) + proteins_time = datetime.date.fromtimestamp(os.path.getmtime(proteins)) # Create a DataFrame with file names and last modification times data = {'file_name': [ @@ -308,22 +314,4 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: proteins_time.strftime('%H:%M:%S') ] } - version = pd.DataFrame(data) - - # Write version file - version.to_csv(version_path, sep='\t', index=False) - - # Return completed object - print(colorify( - "Done! Moving data from temporary directory to final location." - )) - return ncbi_data - - -# Get the date and time modified of a file -def _get_last_modified_time(dir, file): - return datetime.date.fromtimestamp( - os.path.getmtime( - os.path.join(dir, file) - ) - ) + pd.DataFrame(data).to_csv(version, sep='\t', index=False) From 2cdae75aec641e72ed6a4fbe80d9a16267afbd3f Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Thu, 18 Jan 2024 14:04:48 +0100 Subject: [PATCH 12/24] implement tests --- q2_moshpit/eggnog/tests/data/ncbi/names.dmp | 11 +++ q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp | 11 +++ .../tests/data/ncbi/prot.accession2taxid.gz | Bin 0 -> 10956 bytes q2_moshpit/eggnog/tests/test_dbs.py | 68 +++++++++++++++++- 4 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 q2_moshpit/eggnog/tests/data/ncbi/names.dmp create mode 100644 q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp create mode 100644 q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz diff --git a/q2_moshpit/eggnog/tests/data/ncbi/names.dmp b/q2_moshpit/eggnog/tests/data/ncbi/names.dmp new file mode 100644 index 00000000..b89e8a2b --- /dev/null +++ b/q2_moshpit/eggnog/tests/data/ncbi/names.dmp @@ -0,0 +1,11 @@ +1 | all | | synonym | +1 | root | | scientific name | +2 | Bacteria | Bacteria | scientific name | +2 | bacteria | | blast name | +2 | eubacteria | | genbank common name | +2 | Monera | Monera | in-part | +2 | Procaryotae | Procaryotae | in-part | +2 | Prokaryotae | Prokaryotae | in-part | +2 | Prokaryota | Prokaryota | in-part | +2 | prokaryote | prokaryote | in-part | +2 | prokaryotes | prokaryotes | in-part | diff --git a/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp b/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp new file mode 100644 index 00000000..61a662a0 --- /dev/null +++ b/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp @@ -0,0 +1,11 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +10 | 1706371 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | effective current name; | +13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +17 | 16 | species | MM | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff --git a/q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz b/q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz new file mode 100644 index 0000000000000000000000000000000000000000..8b78ac0fc0030af4e68ead0ebb547dfb55ae34c2 GIT binary patch literal 10956 zcmV;-Dl^p|iwFod4|`<*18{P0bS`0IV`X!5X>V>abYXaDWB_fQ%Wea)a&5<*k1?8W zawu63OR@}EvSmxNgCq#fBmt5jxSxMnB&&A2*%M>KTqu&oH=CsX@vr~-@Bi~Z|NO82 z9R9)T<^TTuzsrCA_aFc7pZ_-e$3OpNHLk`n9o~N80QsT@Wz^`E8B{bbX{~)Is8XM& zmRn6#b9S6uIbYyh@DhbGP2~~_<+&Dl^QLAh)Dmh8(%LP5B8NUlBB+2jJeJw<#MK?R>E!k06FJW6mF*m$d2QBb_iSH zvdo-~CP~Hh?8L**hBHEv-0ii_#hR5&Iso?^T~pg}m<({a$zJ^a>y% z-_Ik@$+U3N-h#B%CeOAstB9dlPXVJV5_9?(nB1rQwkL>-J(Z1VK;8 z`N>Re)>KYF!bnFwCnFxgE(oBbjI0 z=Lbc8Ml#Q?%g+dBo{`G4U&s$~env9S;SX5~D4<=OQ9$Q7BPliaga9@PbNlv3-3+8ZSuWWubAQntj6JXK&G<6@DU( zpM4sQ)Skzyw3#gXRw%iC$dZ6$U&CP$UGrn!H4`|o7yvHR$BD%Ruy3wW{P@_4d$R1? zi~$IX+l3AQY23l$u5-8v9w0KaTWAVMqfAao_Fw|&04|I(aL*e`=B?|bOoBypq>p(= zN^Qmhi~tvg52aikD4BV;M@<2h z0^Q!D@Aj+_i_aucXW7*rw8Cdf02eB*IQ+SS#Z_m~qa%H;U~$!342XIytuV{ZqMv{f zU_Z`8fUmIl+PCYF00w}4Q<#A0Ql9fJg_=SD=m0J>1u4R(tUF2eAS}wdlVtC3lr_4P z=e$dyimL&7fD0X|0tkoCr-jt_psdkJJ?EViMUBj0{`j2FA1O1aW;+z&ZefUnA2HE< z&L^5Qvxj?jNaJoHjmXR%Wxel*50A|3;qd)JyN1KjKR)OEBUzk8jWq82G-4+8d_(Pd zTj-k66fkvt&Zn*f!aeA5p9?*%L8ZWq^f{lAQpH6P9+1X^n&;YK{`ka<^f{lAl0~HP zfT#}(MTkm)iRQDPXxb4LkFa=Lm>?lDC#3Pzm$gErIKkp+p$L)2cQl3fz7NOD{)vg^ zv!7`4&NhXfk?V7xYr_`5SDHbR(~X6?qr$oVUazG$6iJDnu$zT{pfCw0&|>X2svvIe<7 zW%H6GrBjzLD)~l5^0>q~`v-O5|Bmitdrvw6tz<{pE<3>+fDWMRLCKABVSwy&BMwp?Y4i&&N7Oz7Y@4CU*h#** zLH^`5+W=Y6Tve2|(Wo&<$M$%L%SjTKk(iyPvv(A2^|fT$wdQgx@`b)5*ib(scw=}J zvuneXr6M~{3&+D^ksW85%hUYV09*Zhe6?y>WJcqtvztcxM5+@{W$jg6TdgdT z8l$HV%GQl3=-`A~^LnyK=9HyRp3X=TufG?umu0EC*5pyGcNj}%3gI;wKCx%bdv40$Qob}@|l*|hdiWf}Yu`9F7AF%JN>om+d| zs_N7#OLs({?Nm}eY1`Vl%N4D)-zrPjp>jFK+T>jB5iFzbq6f$+S`H|v)P~a>ezbmo zT(asIJy|+~k?#GpHHM2^Z zl_jOP8ab;Qx5?FSfFy^zkh|?X_~hvue^T({>CvV-RT4MGY|p2tb1&1nVn4pCk>e#v z-4<6T z-2lm4?oo44{zy+ zMs9hzg0;p+foE{#qvM2A)^fy~vkC)b$8o&7^$#bIjsdr~$I$*fYBm*O(&z zaAO`4lyL16Zq%X^vU!0Wi?!4oYBRP*(N_hA6D)zGcV{c4J{Bp=^>H)GW zS;n%kaymEWBn5Z41_84au$Z8k<~nBM$_oO`(0bZC_i-6j=rW=VIhjDSV0}2fwd{d|yvVUVk>P zhu?i@wW7$EcDRRn+dKoZA-3-W6#N?p$ob%6g?n>mfb2NS-1#K-vMe3EPTuP_L(Oe4 z3flG?s_am!gT#LRNCV`8)hJm%U&aBFPN@C0%3CNUTme-JRvprML0T_*ZZ+2tX9WEc zPo2vowex@i>VUch546#+ga@FZOF$eQoOMT9?+bo4Z6;8{k=A>kR;wo-xLcrhAzMi6 z$D5=z%Y_7J@xaGH()w{&ssqLW^+5fCRU_H@I7%J(sH=}zT7#6<)r$wNWu;4P-tpm6 zt-OFIse;$CjWz>0osad%jv`55OnLH~>qv~sdT^g+fb2M~rH`|;P?CMb?Dl6`vx7(y zn6ny^tdH6{*T+?!at@HCA2Z6vGD4RVw>&YRa0ApVpj?^aYSorLRW4Z#f+nD85%r9& z+sdQLB{VVm`YmbAatVz<6VSATwi*)GT?u}uOGimlB9tl z4@!Ad%oALlM>5U6+#OC_MM##Yjg!NnilaWpwnLl=5^avja8Pm;>RJcplqIvw&O3stzA1QTqSL9 z=1wG(j?s>Kll#wq0E3;p<4?CgR2hF`wzj2kEY^BsP?V3KhoE^r0`dF51O4>pH~&90qWu9DU;$+_BMXg`s; z?JcyJi=-tHt#RnnQp~+*?xeV$wdrZPV!6 zo$wIfBJYEyV~t5S7q7Xj-%5?zInG} z1|>WI4GUqLRW6~H(MwxiSGkle15^XmOX$R^0cw^|2h;+! zOQ;9xfVw3#0QEo@ODi^XogVC0l1t9gf6(zXoLM zX@M^JH6)-8=#pRK2-E{z@@rH;1JI>b5GQ_(K$lv9XLPTB7$upfj=6%;?1<-4Qdr05 zib;|*9UXn7FSljOKQD=*kUKp<{v}mjCn(PTRlRZcN!EnsHMWw3PEcuBBp1+81GPZy z5(=wFSUoOU4L~(ey@Xnz0ccpt7Seh`0!~W_Pzop#a9T=$GC<+i(~@7u1QcmKEv0pY z)ibP~7p;1ruzFs!8U_78wmug9>VQU|aS3hC@;lRBi>R{93+ULOk;P6cSO4YO45CVE z0i&hkgJX<&*lkmt7=Avut`{xII7=AA8DqxlLnR^_|p}2Zp4)Hz{IBX&DR)oa+HL3x!<3?~A zoPL>am1uAVoas1m^w%5WeUo_UGDWb-!d$zAQ>E-Ml=GpZZu6=f_{|leNZC0^F;AFM z0(KYgecz?j_+W0^pvi%EC%L#go{RZ($mhv4y*=T&={ZI-#OJ&KD<89N;AZ7M@19nUE zT(FaWNA8u16sPbmu@>&^2FT8NVlTIq0dgL#6YN#YW%;*8MP~%t>j83Zt+lhbzO?mL zm8B1&ua@PDsY}^^t!2dXj_W{?FYOd&`#7h$D#!By+@WxEt-vQK4z4XZ_-h;>7f4R! z`MqsmRypmis7sY@Bi{BBIY}~^=lyi-wcG{Jd~-3T-A8*OqsqD{3%Z7a>f|V;}nk;>GQhgJdvbhtA(pO;klq{?M&nEs=XUmhD;Z8=4s848RdPSYO|6|k%Ym0s>|G{jBuo&Bh=P?1V^97@6CdaJOvzUOe=UbU)n33Y;U0*YKhTipnkueKW6Ix+kysYIzBb0Apt{F!;Q%=ZmaE5gobY%T#=6gu*v=fLuZY zPzTg4p(D@;G%leEDAKxFOsf-gI}VTws6)2eV-Hm>S@l2-P_u*@pm5f9$yp8_*m|IT z355r?aMpIwSt!`H&a83?wSw;9f&HQfT3-@hpQ@on(uk{*Gluum`2$6Yo3Wi_t8zl- z4uamN(L(ZmNR<<^RaUNnIrszQM_7nm3CZJqkv-Ygbvf9#E7es_$hsy9ZHAeH<4VXZ z7fANfW^<6(H-~BL_2^>cvZr7R(k_sa_hS_D7$;=ybs`^!%)#fakmv2xW}9aj@zUiX zIm)stBmP{FgRdv?{Ph%iGDwcDpA!4^(=OPQ*uSFK%yNNj-AUZ4!2TpTEzNe`v68Ft z{ryqNDrt)8U6L71>-JXT`P!z{Rn9k8THP#x5nSvxLp-PH1!udJwF2kBxz65%^Wc1EPyBpSh*wQE zvD!H(_(dUJ)t{G={C*jUH_NVk#okwt^2qYbUjkt-Ozh>6Kjbxu0~f)?`S}+k ze&M=bgsL_nzlK3ynE+5G63f!vWTmpymtwNU%_RIA=GDniyn^vAK zP17NYyd-IZ%xb)Q!!?j34KjI3nO94t5eJ&0kAiwVKrWz>+tYo0<@!K!2@Qh&+5xgI zIZUqZbQn77z)wuC)T?Qq-l}~0bK>9dJ@6cu+MVDkXB^*{U#abiD`Sx~4RoX2w&hm~ zbKoaCv&gzWPIB8eu)Y0h#pxqimd83+XLu#3IfeU=osb{91(FL=y@fNUkz|S6jIQ>u zR`Inz3T<=xDoYPrW(wsn5r4bNf$<<*EF&Aec;B`yxZSI_757U;&aDN`$iZ*fH6&|B za{77mt!=GPXuC&OWe>I%qYvwL;6NoF3Wh0L@jqfhS2m4ajsay2`hpIjeF_G#qIoW49~h9ruxzR$$30j~Vx zw;gw^vaBlWW^lc8%fqyPhp*e2CA{I^o*dvr2GnC*KO!mNHd> zD>F)&hf$ybXF4tj*VZOgcIk`4wGXvbc3c97e4LOEo)5t5WjF4(bKnEqqvLq~99FCLXjGLi7>mKzK*vF+eDRGcK)#hPQQlhNH^PH=4O!;HXKKjX65w)E-{&9$W&K<{XbyIdCiS?zXD4 zXTf=Je$H`69NzXTvA18rUh5XD+Pm}C8v3K|tdxzgHeNx0Z(1YvJ$0JI3JGO@f zs-zOBpWMs#9v{-qf#-JTpvoD?PeHE2`}LFUk)+*9o806~HBbD)@5hQuj%4Y!yr(iA zQ=*du?X&DE9^0+C<^)Dsq)eXk;|ClZ{m_p^|{q-s`WjLmmIz z28G|l0J&tX*>K==xKfmE{tu1ieZE*4M=Z)WTYipT~CGqsp>t zWi-7Iq(^NmJwiANdQ}4?P5u9%mKXf%@wRVY@v1CVLl~W^qoE19ebZj+pW{a@ttv}x ziFqYzaI9A~^mpG*!dID5pVxR+&h@IcfTZB|%1V_JvV9e9-NM>#*KL|qW$7^kUDmFn ziO)-r!04@AkgbwYt8hrVWy0u;9_`=*cV0g?--=j!ricU)gdiU-gLTi=qyKmu%!N^?9xMsUyrta$o}2A4!E@yp*r@{ZZ2qoB=mKYC6hzl>?Dq z-{sd&;zfRapWg}R!Od?Wj}i62!!N!o-zXiW4k)+KmmA6>z%OC$moZH2!7rir3(6zF zFQKpZ@LzymLVpVx^(w$Gq4P_C{{q@a=-Nk^*h75@U422nt0US+?Ak|!y%qeo694h- zVpOXQvb#Zc>t$*^sV*BNZ_|BLv~b@R$=i-b&K60&(9QMH z$vJhYgU_!|_902yZnIH7d`i4-HS!9#$T`9|zd`abLWe)uigIQ9R zgQBN=8pw^4LLETYg8^U!SeDi{S#Xk-&~8vxS-D9HJ#(>LS*?=3kf+{;VdTg%@nX7ern-hA6GCz8U=U2?o> zkFVt=dO#96&)f5!qbB*nAFb(0TTk!!Hg13Axn7aS=XfFO-b3IrA=$+%_CizxWM@wcceMi~wNd)iA8|UJ z%jA9SxRH|VVvgV(I9INi371DmPT}?A$T5?o#!S~X@rVe|tKIc`&819ofo!?)ulsh$ zzsi^Hb%18l-w(%@$RbO1Pzk@K&1-Fc|Kz!Pk!6n|2HFs+C-3-2L)W=$ZBGVP`69oa z)8k$o*%yB;H~uQU<@W$?H?AW3Os`Z}Z$L%Z$WCEtl@wLMw;< zv`-OL0&E}P+nDjHUX8N@WN$GtXf<0T*;~{N+^Y7Rb(OtE&!E1sMUuV6kwJIlT$Ai9 zstFhY#y)DtAir)@Wf!$)hi`Xn3Rq=tv4Q%g+i8L-sl&?^L2H?0wSFfDU)#CODi=rv z(t&gfWXq5K@$Ktzl|9Kdh=M&xzhHA@^7J7#Pag|5ThCI01IVyI)}H0n?D#85l`rLo z>T{Qe2en%THgCuF-M31zsp$9TYY~;F>l%5zMz%>ZH?m-zGyEv1vu(BeYVOB2o2l}} zOy6t_`~%RhcJrniS7f(~GJ~_gu&nR&HE*|{So@1&KcQxz9-ie>v$+{{#dkP%N!_3RWwWUD` zM0rehp*SB79LmFV<>BFH13#N?cW#{I7k)OKpW9rA0xkSE*@fbKpgb)6XXm?kF2R2m z{XM`W*7%Cc6scmpo;KJ&*KFcA?r#%%R*o>a*{5krm3r!_U6+ zvzo+<^!aXEL806N(igg2PzC=5_%E1w`{lvZ?rr3FNz%+9_ZeoC!ywiR`K|`Y1(Nf! zcPDOhBo|1o4fn^LZIUF?(Gfa}-`Hplrzc!n#V%4ZRF97ETWzoI_1GT&t@4GgAI7$w z!ry0J-R9gjYpX0{pPL+`Xw}AD-aeTlStd@#>hh~X^_F6r5asaery3y1q_gzAcKbro z?%_~qJAPATZ&Nw8`DsQ*N%kl|KA*$wgZ)FYP8xmS(cJKO>OQY+AH7O)kdO7Vdn4IN z-`5-Gh$O9Fd{A=DmHT4%dus3AS6ODZd2XKY(vW8IJ#yiYTp*j{{pQ-v zJ3}1#{W%J6-?6JCG?ptEbr+^BKy%=C``}dNf=RBIe)H2_u&iGz9yp;S7f>Up zYbPG7TtKZ4>!5-8bzEc9muMotN>LbhLqHqt@T;lW3Jx>?Y7Y z1{ZtV@uT2&QKrg{4DD0q8AHwThgBecQgNa$>XYf$?K&IXJ$G%`7Km z?Ga4WLbjunRsOYEuKbNmZW8C68E7d6&BBL&PVn<$<9mssB#GoAVWv;(wu@%dVuV0N?qFU&kj`C+jJvH4{{-$ zifi5GdgQ#5?ENVs(ZSd#dpHGSX=W`OfoHQ0M z{X8Z1pQr2EW|VT@{H`VT-*xO-96!aIgC8jO$F$tJCL{09yTtyyyPlNN+>VM#lD+4p z#clP4V<$OJCw*68(%ghJw}o_SA^n=mo#ccJidKaD%f}S_t_R3z4_1EzO8U>Oze?=> z+VxAhZA#wLn)?uv^PW}~WI1?s<#t4JA)T!q>G#ln-muEDv~_uP$zw-s8vq;qy^k&L zMbf4a{b)y&Lj>G^>fbxpr6Os7*WMDzo^N^A58IxhtP*5>#VPVd>>TXcwWlg4HtAi0 zyku^t@N@9)Ddg@MrZ(FKX#g2P#szX>Gl5JC zNWDOMkOrh#APqwXfq_l8obqqyK1o)Nws54E?{23C%(646WeP&RhGV@bQbOB#aj)g z&~`4e$_{LwD@&*XsOBKw1}>o*pzgq_MFX&m%43jUK1UhhT#@Y6vrUZI$zR$A- z2X*?s(=RVI9&VsXD%nnXG!#U@ptCXx~1n z?2?EqUyzkcpA|%WK?*MmDV!KYp^Sz#$9BsJBuVjVQ|_X)&Q^-=V|QaOlBAkA zBe@g1-Pi0!l4ZBa(bi@Ar8Wm}8wSW8FMO@X%7HMT*Eh}#$%2NE?Bqx5+2{A;52GXr^R@3Zi+xFe7f*x+Xk$%<2-Zsi^O{` z^T%X^qDQ9rr+g)a4=WZQ^>^xLPSilRlE_DWFyJrB+xt}nZwn}n Date: Thu, 18 Jan 2024 14:07:10 +0100 Subject: [PATCH 13/24] fix bug in tests --- q2_moshpit/eggnog/tests/test_dbs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py index 66e657cd..c73d41f8 100644 --- a/q2_moshpit/eggnog/tests/test_dbs.py +++ b/q2_moshpit/eggnog/tests/test_dbs.py @@ -152,9 +152,9 @@ def test_fetch_eggnog_fasta(self, subp_run): # Check that commands are ran as expected subp_run.assert_has_calls([first_call, second_call], any_order=False) - @patch("q2_moshpit.eggnog._dbs._make_version_df") + @patch("q2_moshpit.eggnog._dbs._write_version_tsv") @patch("subprocess.run") - def test_fetch_ncbi_taxonomy(self, subp_run, mk_v_df): + def test_fetch_ncbi_taxonomy(self, subp_run, w_v_tsv): # Call function. Patching will make sure nothing is actually ran ncbi_data = fetch_ncbi_taxonomy() zip_path = os.path.join(str(ncbi_data), "taxdmp.zip") @@ -196,7 +196,7 @@ def test_fetch_ncbi_taxonomy(self, subp_run, mk_v_df): [first_call, second_call, third_call, forth_call], any_order=False ) - mk_v_df.assert_called_once_with( + w_v_tsv.assert_called_once_with( nodes_path, names_path, proteins_path, From 8233505d345ee20d3936abb6b2025e1fb29374db Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Thu, 18 Jan 2024 14:12:48 +0100 Subject: [PATCH 14/24] Add ellipsis to green prompts --- q2_moshpit/eggnog/_dbs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index 4fbe663b..1264eb19 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -252,7 +252,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: version_path = os.path.join(str(ncbi_data), "version.tsv") # Download zip file - print(colorify("Downloading *.dmp files")) + print(colorify("Downloading *.dmp files...")) run_command( cmd=[ "wget", "-O", zip_path, @@ -272,7 +272,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: run_command(cmd=["rm", zip_path]) # Download proteins - print(colorify("Downloading proteins file (~15 GB)")) + print(colorify("Downloading proteins file (~15 GB)...")) run_command( cmd=[ "wget", "-O", proteins_path, @@ -282,12 +282,12 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: ) # Constructing version file - print(colorify("Constructing version file")) + print(colorify("Constructing version file...")) _write_version_tsv(nodes_path, names_path, proteins_path, version_path) # Return object print(colorify( - "Done! Moving data from temporary directory to final location." + "Done! Moving data from temporary directory to final location..." )) return ncbi_data From 650160d8746ab4f0a3aa1df305a3f47dbecd2e74 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Fri, 19 Jan 2024 12:42:36 +0100 Subject: [PATCH 15/24] remove duplicate action --- q2_moshpit/plugin_setup.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index e00a0feb..e708300e 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -591,23 +591,6 @@ "storage space is required to run this action. " ) -plugin.methods.register_function( - function=q2_moshpit.eggnog.fetch_eggnog_fasta, - inputs={}, - parameters={}, - outputs=[("eggnog_fasta", ReferenceDB[EggnogSequenceTaxa])], - output_descriptions={ - "eggnog_proteins": "eggNOG database of protein sequences and " - "their corresponding taxonomy information." - }, - name="Fetch the databases necessary to run to run the " - "build-eggnog-diamond-db action.", - description="Downloads eggnog proteome database " - "This script downloads 2 files " - "(e5.proteomes.faa and e5.taxid_info.tsv) " - "and creates and artifact with them. At least 18 GB of " - "storage space is required to run this action. " -) plugin.methods.register_function( function=q2_moshpit.eggnog.fetch_ncbi_taxonomy, From 5cfb55b9d1361d6b453b9e3d2edf665f54a80218 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau <54123712+Sann5@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:42:35 +0100 Subject: [PATCH 16/24] Update q2_moshpit/plugin_setup.py Co-authored-by: Michal Ziemski --- q2_moshpit/plugin_setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index e708300e..46660aeb 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -601,9 +601,9 @@ "taxonomy": "NCBI reference taxonomy." }, name="Fetch NCBI reference taxonomy", - description="Downloads NCBI reference taxonomy for the NCBI ftp server. " - "The resulting artifact is required in the " - "build-custom-diamond-db action if one whished to " + description="Downloads NCBI reference taxonomy from the NCBI FTP server. " + "The resulting artifact is required by the " + "build-custom-diamond-db action if one wished to " "create a Diamond data base with taxonomy features. " "At least 30 GB of " "storage space is required to run this action.", From 1633654f7cca5aa7d8febdc111c586d7d3e55a71 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Mon, 22 Jan 2024 07:37:02 +0100 Subject: [PATCH 17/24] correct indentation --- q2_moshpit/eggnog/_dbs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index 2c751a21..5281c7d3 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -231,11 +231,11 @@ def _validate_taxon_id(eggnog_proteins, taxon): # Check for overlap with provided taxon id if not str(taxon) in tax_ids: - raise ValueError( - f"'{taxon}' is not valid taxon ID. " - "To view all valid taxon IDs inspect e5.taxid_info.tsv " - "file in the eggnog_proteins input." - ) + raise ValueError( + f"'{taxon}' is not valid taxon ID. " + "To view all valid taxon IDs inspect e5.taxid_info.tsv " + "file in the eggnog_proteins input." + ) def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: From 1ef872d7ee1e2d7a50ea994d02e9a0ac73167cef Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Tue, 23 Jan 2024 10:21:13 +0100 Subject: [PATCH 18/24] Remove version file + adjust tests --- q2_moshpit/eggnog/_dbs.py | 33 ------------------ q2_moshpit/eggnog/tests/data/ncbi/names.dmp | 11 ------ q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp | 11 ------ .../tests/data/ncbi/prot.accession2taxid.gz | Bin 10956 -> 0 bytes q2_moshpit/eggnog/tests/test_dbs.py | 28 ++------------- 5 files changed, 3 insertions(+), 80 deletions(-) delete mode 100644 q2_moshpit/eggnog/tests/data/ncbi/names.dmp delete mode 100644 q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp delete mode 100644 q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index 5281c7d3..07a2fb09 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -6,7 +6,6 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import os -import datetime import pandas as pd from q2_types.feature_data import ProteinSequencesDirectoryFormat import shutil @@ -246,10 +245,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: # Initialize output object and paths ncbi_data = NCBITaxonomyDirFmt() zip_path = os.path.join(str(ncbi_data), "taxdmp.zip") - nodes_path = os.path.join(str(ncbi_data), "nodes.dmp") - names_path = os.path.join(str(ncbi_data), "names.dmp") proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz") - version_path = os.path.join(str(ncbi_data), "version.tsv") # Download zip file print(colorify("Downloading *.dmp files...")) @@ -281,37 +277,8 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: ] ) - # Constructing version file - print(colorify("Constructing version file...")) - _write_version_tsv(nodes_path, names_path, proteins_path, version_path) - # Return object print(colorify( "Done! Moving data from temporary directory to final location..." )) return ncbi_data - - -def _write_version_tsv(nodes, names, proteins, version): - names_time = datetime.date.fromtimestamp(os.path.getmtime(nodes)) - nodes_time = datetime.date.fromtimestamp(os.path.getmtime(names)) - proteins_time = datetime.date.fromtimestamp(os.path.getmtime(proteins)) - - # Create a DataFrame with file names and last modification times - data = {'file_name': [ - 'names.dmp', - 'nodes.dmp', - 'prot.accession2taxid.gz' - ], - 'date': [ - names_time.strftime('%d/%m/%Y'), - nodes_time.strftime('%d/%m/%Y'), - proteins_time.strftime('%d/%m/%Y') - ], - 'time': [ - names_time.strftime('%H:%M:%S'), - nodes_time.strftime('%H:%M:%S'), - proteins_time.strftime('%H:%M:%S') - ] - } - pd.DataFrame(data).to_csv(version, sep='\t', index=False) diff --git a/q2_moshpit/eggnog/tests/data/ncbi/names.dmp b/q2_moshpit/eggnog/tests/data/ncbi/names.dmp deleted file mode 100644 index b89e8a2b..00000000 --- a/q2_moshpit/eggnog/tests/data/ncbi/names.dmp +++ /dev/null @@ -1,11 +0,0 @@ -1 | all | | synonym | -1 | root | | scientific name | -2 | Bacteria | Bacteria | scientific name | -2 | bacteria | | blast name | -2 | eubacteria | | genbank common name | -2 | Monera | Monera | in-part | -2 | Procaryotae | Procaryotae | in-part | -2 | Prokaryotae | Prokaryotae | in-part | -2 | Prokaryota | Prokaryota | in-part | -2 | prokaryote | prokaryote | in-part | -2 | prokaryotes | prokaryotes | in-part | diff --git a/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp b/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp deleted file mode 100644 index 61a662a0..00000000 --- a/q2_moshpit/eggnog/tests/data/ncbi/nodes.dmp +++ /dev/null @@ -1,11 +0,0 @@ -1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | -2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | -6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | -7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | -9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | -10 | 1706371 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | -11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | effective current name; | -13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | -14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | -16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | -17 | 16 | species | MM | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff --git a/q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz b/q2_moshpit/eggnog/tests/data/ncbi/prot.accession2taxid.gz deleted file mode 100644 index 8b78ac0fc0030af4e68ead0ebb547dfb55ae34c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10956 zcmV;-Dl^p|iwFod4|`<*18{P0bS`0IV`X!5X>V>abYXaDWB_fQ%Wea)a&5<*k1?8W zawu63OR@}EvSmxNgCq#fBmt5jxSxMnB&&A2*%M>KTqu&oH=CsX@vr~-@Bi~Z|NO82 z9R9)T<^TTuzsrCA_aFc7pZ_-e$3OpNHLk`n9o~N80QsT@Wz^`E8B{bbX{~)Is8XM& zmRn6#b9S6uIbYyh@DhbGP2~~_<+&Dl^QLAh)Dmh8(%LP5B8NUlBB+2jJeJw<#MK?R>E!k06FJW6mF*m$d2QBb_iSH zvdo-~CP~Hh?8L**hBHEv-0ii_#hR5&Iso?^T~pg}m<({a$zJ^a>y% z-_Ik@$+U3N-h#B%CeOAstB9dlPXVJV5_9?(nB1rQwkL>-J(Z1VK;8 z`N>Re)>KYF!bnFwCnFxgE(oBbjI0 z=Lbc8Ml#Q?%g+dBo{`G4U&s$~env9S;SX5~D4<=OQ9$Q7BPliaga9@PbNlv3-3+8ZSuWWubAQntj6JXK&G<6@DU( zpM4sQ)Skzyw3#gXRw%iC$dZ6$U&CP$UGrn!H4`|o7yvHR$BD%Ruy3wW{P@_4d$R1? zi~$IX+l3AQY23l$u5-8v9w0KaTWAVMqfAao_Fw|&04|I(aL*e`=B?|bOoBypq>p(= zN^Qmhi~tvg52aikD4BV;M@<2h z0^Q!D@Aj+_i_aucXW7*rw8Cdf02eB*IQ+SS#Z_m~qa%H;U~$!342XIytuV{ZqMv{f zU_Z`8fUmIl+PCYF00w}4Q<#A0Ql9fJg_=SD=m0J>1u4R(tUF2eAS}wdlVtC3lr_4P z=e$dyimL&7fD0X|0tkoCr-jt_psdkJJ?EViMUBj0{`j2FA1O1aW;+z&ZefUnA2HE< z&L^5Qvxj?jNaJoHjmXR%Wxel*50A|3;qd)JyN1KjKR)OEBUzk8jWq82G-4+8d_(Pd zTj-k66fkvt&Zn*f!aeA5p9?*%L8ZWq^f{lAQpH6P9+1X^n&;YK{`ka<^f{lAl0~HP zfT#}(MTkm)iRQDPXxb4LkFa=Lm>?lDC#3Pzm$gErIKkp+p$L)2cQl3fz7NOD{)vg^ zv!7`4&NhXfk?V7xYr_`5SDHbR(~X6?qr$oVUazG$6iJDnu$zT{pfCw0&|>X2svvIe<7 zW%H6GrBjzLD)~l5^0>q~`v-O5|Bmitdrvw6tz<{pE<3>+fDWMRLCKABVSwy&BMwp?Y4i&&N7Oz7Y@4CU*h#** zLH^`5+W=Y6Tve2|(Wo&<$M$%L%SjTKk(iyPvv(A2^|fT$wdQgx@`b)5*ib(scw=}J zvuneXr6M~{3&+D^ksW85%hUYV09*Zhe6?y>WJcqtvztcxM5+@{W$jg6TdgdT z8l$HV%GQl3=-`A~^LnyK=9HyRp3X=TufG?umu0EC*5pyGcNj}%3gI;wKCx%bdv40$Qob}@|l*|hdiWf}Yu`9F7AF%JN>om+d| zs_N7#OLs({?Nm}eY1`Vl%N4D)-zrPjp>jFK+T>jB5iFzbq6f$+S`H|v)P~a>ezbmo zT(asIJy|+~k?#GpHHM2^Z zl_jOP8ab;Qx5?FSfFy^zkh|?X_~hvue^T({>CvV-RT4MGY|p2tb1&1nVn4pCk>e#v z-4<6T z-2lm4?oo44{zy+ zMs9hzg0;p+foE{#qvM2A)^fy~vkC)b$8o&7^$#bIjsdr~$I$*fYBm*O(&z zaAO`4lyL16Zq%X^vU!0Wi?!4oYBRP*(N_hA6D)zGcV{c4J{Bp=^>H)GW zS;n%kaymEWBn5Z41_84au$Z8k<~nBM$_oO`(0bZC_i-6j=rW=VIhjDSV0}2fwd{d|yvVUVk>P zhu?i@wW7$EcDRRn+dKoZA-3-W6#N?p$ob%6g?n>mfb2NS-1#K-vMe3EPTuP_L(Oe4 z3flG?s_am!gT#LRNCV`8)hJm%U&aBFPN@C0%3CNUTme-JRvprML0T_*ZZ+2tX9WEc zPo2vowex@i>VUch546#+ga@FZOF$eQoOMT9?+bo4Z6;8{k=A>kR;wo-xLcrhAzMi6 z$D5=z%Y_7J@xaGH()w{&ssqLW^+5fCRU_H@I7%J(sH=}zT7#6<)r$wNWu;4P-tpm6 zt-OFIse;$CjWz>0osad%jv`55OnLH~>qv~sdT^g+fb2M~rH`|;P?CMb?Dl6`vx7(y zn6ny^tdH6{*T+?!at@HCA2Z6vGD4RVw>&YRa0ApVpj?^aYSorLRW4Z#f+nD85%r9& z+sdQLB{VVm`YmbAatVz<6VSATwi*)GT?u}uOGimlB9tl z4@!Ad%oALlM>5U6+#OC_MM##Yjg!NnilaWpwnLl=5^avja8Pm;>RJcplqIvw&O3stzA1QTqSL9 z=1wG(j?s>Kll#wq0E3;p<4?CgR2hF`wzj2kEY^BsP?V3KhoE^r0`dF51O4>pH~&90qWu9DU;$+_BMXg`s; z?JcyJi=-tHt#RnnQp~+*?xeV$wdrZPV!6 zo$wIfBJYEyV~t5S7q7Xj-%5?zInG} z1|>WI4GUqLRW6~H(MwxiSGkle15^XmOX$R^0cw^|2h;+! zOQ;9xfVw3#0QEo@ODi^XogVC0l1t9gf6(zXoLM zX@M^JH6)-8=#pRK2-E{z@@rH;1JI>b5GQ_(K$lv9XLPTB7$upfj=6%;?1<-4Qdr05 zib;|*9UXn7FSljOKQD=*kUKp<{v}mjCn(PTRlRZcN!EnsHMWw3PEcuBBp1+81GPZy z5(=wFSUoOU4L~(ey@Xnz0ccpt7Seh`0!~W_Pzop#a9T=$GC<+i(~@7u1QcmKEv0pY z)ibP~7p;1ruzFs!8U_78wmug9>VQU|aS3hC@;lRBi>R{93+ULOk;P6cSO4YO45CVE z0i&hkgJX<&*lkmt7=Avut`{xII7=AA8DqxlLnR^_|p}2Zp4)Hz{IBX&DR)oa+HL3x!<3?~A zoPL>am1uAVoas1m^w%5WeUo_UGDWb-!d$zAQ>E-Ml=GpZZu6=f_{|leNZC0^F;AFM z0(KYgecz?j_+W0^pvi%EC%L#go{RZ($mhv4y*=T&={ZI-#OJ&KD<89N;AZ7M@19nUE zT(FaWNA8u16sPbmu@>&^2FT8NVlTIq0dgL#6YN#YW%;*8MP~%t>j83Zt+lhbzO?mL zm8B1&ua@PDsY}^^t!2dXj_W{?FYOd&`#7h$D#!By+@WxEt-vQK4z4XZ_-h;>7f4R! z`MqsmRypmis7sY@Bi{BBIY}~^=lyi-wcG{Jd~-3T-A8*OqsqD{3%Z7a>f|V;}nk;>GQhgJdvbhtA(pO;klq{?M&nEs=XUmhD;Z8=4s848RdPSYO|6|k%Ym0s>|G{jBuo&Bh=P?1V^97@6CdaJOvzUOe=UbU)n33Y;U0*YKhTipnkueKW6Ix+kysYIzBb0Apt{F!;Q%=ZmaE5gobY%T#=6gu*v=fLuZY zPzTg4p(D@;G%leEDAKxFOsf-gI}VTws6)2eV-Hm>S@l2-P_u*@pm5f9$yp8_*m|IT z355r?aMpIwSt!`H&a83?wSw;9f&HQfT3-@hpQ@on(uk{*Gluum`2$6Yo3Wi_t8zl- z4uamN(L(ZmNR<<^RaUNnIrszQM_7nm3CZJqkv-Ygbvf9#E7es_$hsy9ZHAeH<4VXZ z7fANfW^<6(H-~BL_2^>cvZr7R(k_sa_hS_D7$;=ybs`^!%)#fakmv2xW}9aj@zUiX zIm)stBmP{FgRdv?{Ph%iGDwcDpA!4^(=OPQ*uSFK%yNNj-AUZ4!2TpTEzNe`v68Ft z{ryqNDrt)8U6L71>-JXT`P!z{Rn9k8THP#x5nSvxLp-PH1!udJwF2kBxz65%^Wc1EPyBpSh*wQE zvD!H(_(dUJ)t{G={C*jUH_NVk#okwt^2qYbUjkt-Ozh>6Kjbxu0~f)?`S}+k ze&M=bgsL_nzlK3ynE+5G63f!vWTmpymtwNU%_RIA=GDniyn^vAK zP17NYyd-IZ%xb)Q!!?j34KjI3nO94t5eJ&0kAiwVKrWz>+tYo0<@!K!2@Qh&+5xgI zIZUqZbQn77z)wuC)T?Qq-l}~0bK>9dJ@6cu+MVDkXB^*{U#abiD`Sx~4RoX2w&hm~ zbKoaCv&gzWPIB8eu)Y0h#pxqimd83+XLu#3IfeU=osb{91(FL=y@fNUkz|S6jIQ>u zR`Inz3T<=xDoYPrW(wsn5r4bNf$<<*EF&Aec;B`yxZSI_757U;&aDN`$iZ*fH6&|B za{77mt!=GPXuC&OWe>I%qYvwL;6NoF3Wh0L@jqfhS2m4ajsay2`hpIjeF_G#qIoW49~h9ruxzR$$30j~Vx zw;gw^vaBlWW^lc8%fqyPhp*e2CA{I^o*dvr2GnC*KO!mNHd> zD>F)&hf$ybXF4tj*VZOgcIk`4wGXvbc3c97e4LOEo)5t5WjF4(bKnEqqvLq~99FCLXjGLi7>mKzK*vF+eDRGcK)#hPQQlhNH^PH=4O!;HXKKjX65w)E-{&9$W&K<{XbyIdCiS?zXD4 zXTf=Je$H`69NzXTvA18rUh5XD+Pm}C8v3K|tdxzgHeNx0Z(1YvJ$0JI3JGO@f zs-zOBpWMs#9v{-qf#-JTpvoD?PeHE2`}LFUk)+*9o806~HBbD)@5hQuj%4Y!yr(iA zQ=*du?X&DE9^0+C<^)Dsq)eXk;|ClZ{m_p^|{q-s`WjLmmIz z28G|l0J&tX*>K==xKfmE{tu1ieZE*4M=Z)WTYipT~CGqsp>t zWi-7Iq(^NmJwiANdQ}4?P5u9%mKXf%@wRVY@v1CVLl~W^qoE19ebZj+pW{a@ttv}x ziFqYzaI9A~^mpG*!dID5pVxR+&h@IcfTZB|%1V_JvV9e9-NM>#*KL|qW$7^kUDmFn ziO)-r!04@AkgbwYt8hrVWy0u;9_`=*cV0g?--=j!ricU)gdiU-gLTi=qyKmu%!N^?9xMsUyrta$o}2A4!E@yp*r@{ZZ2qoB=mKYC6hzl>?Dq z-{sd&;zfRapWg}R!Od?Wj}i62!!N!o-zXiW4k)+KmmA6>z%OC$moZH2!7rir3(6zF zFQKpZ@LzymLVpVx^(w$Gq4P_C{{q@a=-Nk^*h75@U422nt0US+?Ak|!y%qeo694h- zVpOXQvb#Zc>t$*^sV*BNZ_|BLv~b@R$=i-b&K60&(9QMH z$vJhYgU_!|_902yZnIH7d`i4-HS!9#$T`9|zd`abLWe)uigIQ9R zgQBN=8pw^4LLETYg8^U!SeDi{S#Xk-&~8vxS-D9HJ#(>LS*?=3kf+{;VdTg%@nX7ern-hA6GCz8U=U2?o> zkFVt=dO#96&)f5!qbB*nAFb(0TTk!!Hg13Axn7aS=XfFO-b3IrA=$+%_CizxWM@wcceMi~wNd)iA8|UJ z%jA9SxRH|VVvgV(I9INi371DmPT}?A$T5?o#!S~X@rVe|tKIc`&819ofo!?)ulsh$ zzsi^Hb%18l-w(%@$RbO1Pzk@K&1-Fc|Kz!Pk!6n|2HFs+C-3-2L)W=$ZBGVP`69oa z)8k$o*%yB;H~uQU<@W$?H?AW3Os`Z}Z$L%Z$WCEtl@wLMw;< zv`-OL0&E}P+nDjHUX8N@WN$GtXf<0T*;~{N+^Y7Rb(OtE&!E1sMUuV6kwJIlT$Ai9 zstFhY#y)DtAir)@Wf!$)hi`Xn3Rq=tv4Q%g+i8L-sl&?^L2H?0wSFfDU)#CODi=rv z(t&gfWXq5K@$Ktzl|9Kdh=M&xzhHA@^7J7#Pag|5ThCI01IVyI)}H0n?D#85l`rLo z>T{Qe2en%THgCuF-M31zsp$9TYY~;F>l%5zMz%>ZH?m-zGyEv1vu(BeYVOB2o2l}} zOy6t_`~%RhcJrniS7f(~GJ~_gu&nR&HE*|{So@1&KcQxz9-ie>v$+{{#dkP%N!_3RWwWUD` zM0rehp*SB79LmFV<>BFH13#N?cW#{I7k)OKpW9rA0xkSE*@fbKpgb)6XXm?kF2R2m z{XM`W*7%Cc6scmpo;KJ&*KFcA?r#%%R*o>a*{5krm3r!_U6+ zvzo+<^!aXEL806N(igg2PzC=5_%E1w`{lvZ?rr3FNz%+9_ZeoC!ywiR`K|`Y1(Nf! zcPDOhBo|1o4fn^LZIUF?(Gfa}-`Hplrzc!n#V%4ZRF97ETWzoI_1GT&t@4GgAI7$w z!ry0J-R9gjYpX0{pPL+`Xw}AD-aeTlStd@#>hh~X^_F6r5asaery3y1q_gzAcKbro z?%_~qJAPATZ&Nw8`DsQ*N%kl|KA*$wgZ)FYP8xmS(cJKO>OQY+AH7O)kdO7Vdn4IN z-`5-Gh$O9Fd{A=DmHT4%dus3AS6ODZd2XKY(vW8IJ#yiYTp*j{{pQ-v zJ3}1#{W%J6-?6JCG?ptEbr+^BKy%=C``}dNf=RBIe)H2_u&iGz9yp;S7f>Up zYbPG7TtKZ4>!5-8bzEc9muMotN>LbhLqHqt@T;lW3Jx>?Y7Y z1{ZtV@uT2&QKrg{4DD0q8AHwThgBecQgNa$>XYf$?K&IXJ$G%`7Km z?Ga4WLbjunRsOYEuKbNmZW8C68E7d6&BBL&PVn<$<9mssB#GoAVWv;(wu@%dVuV0N?qFU&kj`C+jJvH4{{-$ zifi5GdgQ#5?ENVs(ZSd#dpHGSX=W`OfoHQ0M z{X8Z1pQr2EW|VT@{H`VT-*xO-96!aIgC8jO$F$tJCL{09yTtyyyPlNN+>VM#lD+4p z#clP4V<$OJCw*68(%ghJw}o_SA^n=mo#ccJidKaD%f}S_t_R3z4_1EzO8U>Oze?=> z+VxAhZA#wLn)?uv^PW}~WI1?s<#t4JA)T!q>G#ln-muEDv~_uP$zw-s8vq;qy^k&L zMbf4a{b)y&Lj>G^>fbxpr6Os7*WMDzo^N^A58IxhtP*5>#VPVd>>TXcwWlg4HtAi0 zyku^t@N@9)Ddg@MrZ(FKX#g2P#szX>Gl5JC zNWDOMkOrh#APqwXfq_l8obqqyK1o)Nws54E?{23C%(646WeP&RhGV@bQbOB#aj)g z&~`4e$_{LwD@&*XsOBKw1}>o*pzgq_MFX&m%43jUK1UhhT#@Y6vrUZI$zR$A- z2X*?s(=RVI9&VsXD%nnXG!#U@ptCXx~1n z?2?EqUyzkcpA|%WK?*MmDV!KYp^Sz#$9BsJBuVjVQ|_X)&Q^-=V|QaOlBAkA zBe@g1-Pi0!l4ZBa(bi@Ar8Wm}8wSW8FMO@X%7HMT*Eh}#$%2NE?Bqx5+2{A;52GXr^R@3Zi+xFe7f*x+Xk$%<2-Zsi^O{` z^T%X^qDQ9rr+g)a4=WZQ^>^xLPSilRlE_DWFyJrB+xt}nZwn}n Date: Tue, 23 Jan 2024 10:31:06 +0100 Subject: [PATCH 19/24] Adjust file size in prompt --- q2_moshpit/eggnog/_dbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index 07a2fb09..a9f8a344 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -268,7 +268,7 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: run_command(cmd=["rm", zip_path]) # Download proteins - print(colorify("Downloading proteins file (~15 GB)...")) + print(colorify("Downloading proteins file (~8 GB)...")) run_command( cmd=[ "wget", "-O", proteins_path, From f6805677de9be7215feadecb7fa6c388b12192ba Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Tue, 23 Jan 2024 14:24:14 +0100 Subject: [PATCH 20/24] Reorganize fetch_ncbi_taxonomy --- q2_moshpit/eggnog/_dbs.py | 56 ++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index a9f8a344..4de3ac2a 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -13,7 +13,9 @@ EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt ) -from .._utils import run_command, _process_common_input_params, colorify +from .._utils import ( + run_command, _process_common_input_params, colorify, compare_md5_hashes +) from ._utils import _parse_build_diamond_db_params @@ -247,14 +249,19 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: zip_path = os.path.join(str(ncbi_data), "taxdmp.zip") proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz") - # Download zip file + # Download zip file + MD5 file print(colorify("Downloading *.dmp files...")) - run_command( - cmd=[ - "wget", "-O", zip_path, - "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip" - ] - ) + for ext in ["", ".md5"]: + # Download MD5 + run_command( + cmd=[ + "wget", "-O", f"{zip_path}{ext}", + f"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip{ext}" + ] + ) + + # Collect and compare md5 hashes + _collect_and_compare_md5(f"{zip_path}.md5", zip_path) # Unzip run_command( @@ -267,18 +274,35 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: # Remove zip file run_command(cmd=["rm", zip_path]) - # Download proteins + # Download proteins + MD5 file print(colorify("Downloading proteins file (~8 GB)...")) - run_command( - cmd=[ - "wget", "-O", proteins_path, - "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" - "prot.accession2taxid.gz" - ] - ) + for ext in ["", ".md5"]: + run_command( + cmd=[ + "wget", "-O", f"{proteins_path}{ext}", + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" + f"prot.accession2taxid.gz{ext}" + ] + ) + + # Collect and compare md5 hashes + _collect_and_compare_md5(f"{proteins_path}.md5", proteins_path) # Return object print(colorify( "Done! Moving data from temporary directory to final location..." )) return ncbi_data + + +def _collect_and_compare_md5(path_to_md5: str, path_to_file: str): + with open(path_to_md5, 'r') as f: + # Read the first line + first_line = f.readline().strip() + # Split the line into hash and file name + md5_hash, _ = first_line.split(' ', 1) + # Compare + compare_md5_hashes(md5_hash, path_to_file) + + # If no exception is raised, remove md5 file + run_command(cmd=["rm", path_to_md5]) From af0f652b780c1a859054cc32102c6342daac9eb1 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Tue, 23 Jan 2024 14:37:36 +0100 Subject: [PATCH 21/24] Add tests --- q2_moshpit/_utils.py | 24 +++++++ q2_moshpit/eggnog/tests/data/md5/a.txt | 1 + q2_moshpit/eggnog/tests/data/md5/a.txt.md5 | 1 + q2_moshpit/eggnog/tests/data/md5/b.txt | 1 + q2_moshpit/eggnog/tests/test_dbs.py | 81 ++++++++++++++++------ q2_moshpit/tests/data/md5/a.txt | 1 + q2_moshpit/tests/data/md5/b.txt | 1 + q2_moshpit/tests/test_utils.py | 33 +++++++-- 8 files changed, 116 insertions(+), 27 deletions(-) create mode 100644 q2_moshpit/eggnog/tests/data/md5/a.txt create mode 100644 q2_moshpit/eggnog/tests/data/md5/a.txt.md5 create mode 100644 q2_moshpit/eggnog/tests/data/md5/b.txt create mode 100644 q2_moshpit/tests/data/md5/a.txt create mode 100644 q2_moshpit/tests/data/md5/b.txt diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py index 331f592d..0bce8a0a 100644 --- a/q2_moshpit/_utils.py +++ b/q2_moshpit/_utils.py @@ -5,7 +5,9 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +from qiime2.core.exceptions import ValidationError import subprocess +import hashlib from typing import List @@ -74,3 +76,25 @@ def _process_common_input_params(processing_func, params: dict) -> List[str]: def colorify(string): return "%s%s%s" % ('\033[1;32m', string, "\033[0m") + + +def compare_md5_hashes(expected_hash: str, path_to_file: str): + observed_hash = calculate_md5_from_file(path_to_file) + if observed_hash != expected_hash: + raise ValidationError( + "Download error. Data possibly corrupted.\n" + f"{path_to_file} has an unexpected MD5 hash.\n\n" + "Expected hash:\n" + f"{expected_hash}\n\n" + "Observed hash:\n" + f"{observed_hash}" + ) + + +def calculate_md5_from_file(file_path): + md5_hash = hashlib.md5() + with open(file_path, 'rb') as f: + # Read the file in chunks to handle large files + for chunk in iter(lambda: f.read(4096), b""): + md5_hash.update(chunk) + return md5_hash.hexdigest() diff --git a/q2_moshpit/eggnog/tests/data/md5/a.txt b/q2_moshpit/eggnog/tests/data/md5/a.txt new file mode 100644 index 00000000..348608f1 --- /dev/null +++ b/q2_moshpit/eggnog/tests/data/md5/a.txt @@ -0,0 +1 @@ +I am a text file. Calculate an MD% hash from me. \ No newline at end of file diff --git a/q2_moshpit/eggnog/tests/data/md5/a.txt.md5 b/q2_moshpit/eggnog/tests/data/md5/a.txt.md5 new file mode 100644 index 00000000..f9f80c4b --- /dev/null +++ b/q2_moshpit/eggnog/tests/data/md5/a.txt.md5 @@ -0,0 +1 @@ +a583054a9831a6e7cc56ea5cd9cac40a a.txt \ No newline at end of file diff --git a/q2_moshpit/eggnog/tests/data/md5/b.txt b/q2_moshpit/eggnog/tests/data/md5/b.txt new file mode 100644 index 00000000..6af1c12b --- /dev/null +++ b/q2_moshpit/eggnog/tests/data/md5/b.txt @@ -0,0 +1 @@ +I am a another text file. \ No newline at end of file diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py index 1b3923d4..a32fb702 100644 --- a/q2_moshpit/eggnog/tests/test_dbs.py +++ b/q2_moshpit/eggnog/tests/test_dbs.py @@ -8,10 +8,11 @@ import os from unittest.mock import patch, call from qiime2.plugin.testing import TestPluginBase +from qiime2.core.exceptions import ValidationError from .._dbs import ( fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins, fetch_diamond_db, build_eggnog_diamond_db, fetch_ncbi_taxonomy, - _validate_taxon_id + _validate_taxon_id, _collect_and_compare_md5 ) from q2_types.feature_data import ProteinSequencesDirectoryFormat from q2_types_genomics.reference_db import ( @@ -151,46 +152,80 @@ def test_fetch_eggnog_fasta(self, subp_run): # Check that commands are ran as expected subp_run.assert_has_calls([first_call, second_call], any_order=False) + @patch("q2_moshpit.eggnog._dbs._collect_and_compare_md5") @patch("subprocess.run") - def test_fetch_ncbi_taxonomy(self, subp_run): + def test_fetch_ncbi_taxonomy(self, subp_run, cc_md5): # Call function. Patching will make sure nothing is actually ran ncbi_data = fetch_ncbi_taxonomy() zip_path = os.path.join(str(ncbi_data), "taxdmp.zip") proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz") # Check that command was called in the expected way - first_call = call( - [ - "wget", "-O", zip_path, - "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip" - ], - check=True - ) - second_call = call( + I_call, II_call = [ + call( + [ + "wget", "-O", f"{zip_path}{ext}", + f"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip{ext}" + ], + check=True + ) + for ext in ["", ".md5"] + ] + III_call = call(f"{zip_path}.md5", zip_path) + IV_call = call( [ "unzip", "-j", zip_path, "names.dmp", "nodes.dmp", "-d", str(ncbi_data) ], check=True, ) - third_call = call( - ["rm", zip_path], - check=True, - ) - forth_call = call( - [ - "wget", "-O", proteins_path, - "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" - "prot.accession2taxid.gz" - ], - check=True, - ) + V_call = call(["rm", zip_path], check=True) + VI_call, VII_call = [ + call( + [ + "wget", "-O", f"{proteins_path}{ext}", + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" + f"prot.accession2taxid.gz{ext}" + ], + check=True + ) + for ext in ["", ".md5"] + ] + VIII_call = call(f"{proteins_path}.md5", proteins_path) # Check that commands are ran as expected subp_run.assert_has_calls( - [first_call, second_call, third_call, forth_call], + [I_call, II_call, IV_call, V_call, VI_call, VII_call], any_order=False ) + cc_md5.assert_has_calls([III_call, VIII_call], any_order=False) + + @patch("subprocess.run") + def test_collect_and_compare_md5_valid(self, subp_run): + path_to_file = self.get_data_path("md5/a.txt") + + # Should raise no errors + _collect_and_compare_md5(f"{path_to_file}.md5", path_to_file) + + # Check rm is called as expected + subp_run.assert_called_once_with( + ["rm", f"{path_to_file}.md5"], check=True + ) + + @patch("subprocess.run") + def test_collect_and_compare_md5_invalid(self, subp_run): + path_to_file = self.get_data_path("md5/b.txt") + path_to_wrong_md5 = self.get_data_path("md5/a.txt.md5") + + # Check that expected exception is raised + with self.assertRaisesRegex( + ValidationError, + "has an unexpected MD5 hash" + ): + _collect_and_compare_md5(path_to_wrong_md5, path_to_file) + + # check that rm is not called + subp_run.assert_not_called() @patch("q2_moshpit.eggnog._dbs._validate_taxon_id") @patch("subprocess.run") diff --git a/q2_moshpit/tests/data/md5/a.txt b/q2_moshpit/tests/data/md5/a.txt new file mode 100644 index 00000000..348608f1 --- /dev/null +++ b/q2_moshpit/tests/data/md5/a.txt @@ -0,0 +1 @@ +I am a text file. Calculate an MD% hash from me. \ No newline at end of file diff --git a/q2_moshpit/tests/data/md5/b.txt b/q2_moshpit/tests/data/md5/b.txt new file mode 100644 index 00000000..6af1c12b --- /dev/null +++ b/q2_moshpit/tests/data/md5/b.txt @@ -0,0 +1 @@ +I am a another text file. \ No newline at end of file diff --git a/q2_moshpit/tests/test_utils.py b/q2_moshpit/tests/test_utils.py index 77f9f37c..98a6858d 100644 --- a/q2_moshpit/tests/test_utils.py +++ b/q2_moshpit/tests/test_utils.py @@ -5,12 +5,13 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - import unittest - from qiime2.plugin.testing import TestPluginBase - -from .._utils import _construct_param, _process_common_input_params +from qiime2.core.exceptions import ValidationError +from .._utils import ( + _construct_param, _process_common_input_params, compare_md5_hashes, + calculate_md5_from_file +) def fake_processing_func(key, val): @@ -113,6 +114,30 @@ def test_process_common_inputs_mix_with_falsy_values(self): ] self.assertSetEqual(set(observed), set(expected)) + def test_compare_md5_hashes_pass(self): + path_to_file = self.get_data_path("md5/a.txt") + compare_md5_hashes("a583054a9831a6e7cc56ea5cd9cac40a", path_to_file) + + def test_compare_md5_hashes_fail(self): + path_to_file = self.get_data_path("md5/b.txt") + with self.assertRaisesRegex( + ValidationError, + "has an unexpected MD5 hash" + ): + compare_md5_hashes( + "a583054a9831a6e7cc56ea5cd9cac40a", path_to_file + ) + + def test_calculate_md5_from_pass(self): + path_to_file = self.get_data_path("md5/a.txt") + observed_hash = calculate_md5_from_file(path_to_file) + self.assertEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a") + + def test_calculate_md5_from_fail(self): + path_to_file = self.get_data_path("md5/b.txt") + observed_hash = calculate_md5_from_file(path_to_file) + self.assertNotEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a") + if __name__ == '__main__': unittest.main() From c4e28f0d1ba578d6cb8a72e9af95ab70b1abf85c Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau <54123712+Sann5@users.noreply.github.com> Date: Wed, 24 Jan 2024 16:29:05 +0100 Subject: [PATCH 22/24] Update q2_moshpit/eggnog/_dbs.py Co-authored-by: Michal Ziemski --- q2_moshpit/eggnog/_dbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index 4de3ac2a..dec0e953 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -241,7 +241,7 @@ def _validate_taxon_id(eggnog_proteins, taxon): def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: """ - Script fetches 3 files from the internet and puts them into the folder of + Script fetches 3 files from the NCBI server and puts them into the folder of a NCBITaxonomyDirFmt object. """ # Initialize output object and paths From 301ae006bf84a733e0943ce4704ebe7b6fada916 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Wed, 24 Jan 2024 17:08:59 +0100 Subject: [PATCH 23/24] Eliminate duplicated action --- q2_moshpit/plugin_setup.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index c1833089..885cd394 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -610,34 +610,6 @@ citations=[citations["NCBI"]] ) -plugin.methods.register_function( - function=q2_moshpit.eggnog.build_eggnog_diamond_db, - inputs={ - 'eggnog_proteins': ReferenceDB[EggnogProteinSequences], - }, - input_descriptions={ - 'eggnog_proteins': "eggNOG database of protein sequences and " - "their corresponding taxonomy information " - "(generated through the fetch-eggnog-proteins " - "action)." - }, - parameters={ - 'taxon': Int % Range(2, 1579337) - }, - parameter_descriptions={ - 'taxon': "Taxon ID number." - }, - outputs=[("diamond_db", ReferenceDB[Diamond])], - output_descriptions={ - "diamond_db": "Complete Diamond reference database for the" - "specified taxon." - }, - name="Create a DIAMOND formatted reference database for the" - "specified taxon.", - description="Creates an DIAMOND database which contains the protein " - "sequences that belong to the specified taxon.", -) - plugin.methods.register_function( function=q2_moshpit.eggnog.build_eggnog_diamond_db, inputs={ From 9a2de5791b3d366a15ee3d08023870bd2e1fc87d Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Wed, 24 Jan 2024 17:09:52 +0100 Subject: [PATCH 24/24] Reveiw comments Michal --- q2_moshpit/_utils.py | 18 +----- q2_moshpit/eggnog/_dbs.py | 85 +++++++++++++++++------------ q2_moshpit/eggnog/tests/test_dbs.py | 78 +++++++++++++++----------- q2_moshpit/tests/test_utils.py | 23 ++------ setup.py | 2 + 5 files changed, 104 insertions(+), 102 deletions(-) diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py index 0bce8a0a..f93efe17 100644 --- a/q2_moshpit/_utils.py +++ b/q2_moshpit/_utils.py @@ -5,7 +5,6 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -from qiime2.core.exceptions import ValidationError import subprocess import hashlib from typing import List @@ -74,24 +73,11 @@ def _process_common_input_params(processing_func, params: dict) -> List[str]: return processed_args -def colorify(string): +def colorify(string: str): return "%s%s%s" % ('\033[1;32m', string, "\033[0m") -def compare_md5_hashes(expected_hash: str, path_to_file: str): - observed_hash = calculate_md5_from_file(path_to_file) - if observed_hash != expected_hash: - raise ValidationError( - "Download error. Data possibly corrupted.\n" - f"{path_to_file} has an unexpected MD5 hash.\n\n" - "Expected hash:\n" - f"{expected_hash}\n\n" - "Observed hash:\n" - f"{observed_hash}" - ) - - -def calculate_md5_from_file(file_path): +def _calculate_md5_from_file(file_path: str) -> str: md5_hash = hashlib.md5() with open(file_path, 'rb') as f: # Read the file in chunks to handle large files diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index dec0e953..65e244f7 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -6,15 +6,17 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import os +import shutil import pandas as pd +from qiime2.core.exceptions import ValidationError from q2_types.feature_data import ProteinSequencesDirectoryFormat -import shutil from q2_types_genomics.reference_db import ( EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt ) from .._utils import ( - run_command, _process_common_input_params, colorify, compare_md5_hashes + run_command, _process_common_input_params, colorify, + _calculate_md5_from_file ) from ._utils import _parse_build_diamond_db_params @@ -241,29 +243,30 @@ def _validate_taxon_id(eggnog_proteins, taxon): def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: """ - Script fetches 3 files from the NCBI server and puts them into the folder of - a NCBITaxonomyDirFmt object. + Script fetches 3 files from the NCBI server and puts them into the folder + of a NCBITaxonomyDirFmt object. """ - # Initialize output object and paths ncbi_data = NCBITaxonomyDirFmt() zip_path = os.path.join(str(ncbi_data), "taxdmp.zip") proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz") - # Download zip file + MD5 file + # Download dump zip file + MD5 file print(colorify("Downloading *.dmp files...")) - for ext in ["", ".md5"]: - # Download MD5 - run_command( - cmd=[ - "wget", "-O", f"{zip_path}{ext}", - f"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip{ext}" - ] - ) + run_command( + cmd=[ + "wget", "-O", f"{zip_path}", + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip" + ] + ) + run_command( + cmd=[ + "wget", "-O", f"{zip_path}.md5", + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip.md5" + ] + ) - # Collect and compare md5 hashes _collect_and_compare_md5(f"{zip_path}.md5", zip_path) - # Unzip run_command( cmd=[ "unzip", "-j", zip_path, "names.dmp", "nodes.dmp", @@ -271,24 +274,27 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: ] ) - # Remove zip file - run_command(cmd=["rm", zip_path]) + os.remove(zip_path) # Download proteins + MD5 file print(colorify("Downloading proteins file (~8 GB)...")) - for ext in ["", ".md5"]: - run_command( - cmd=[ - "wget", "-O", f"{proteins_path}{ext}", - "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" - f"prot.accession2taxid.gz{ext}" - ] - ) + run_command( + cmd=[ + "wget", "-O", f"{proteins_path}", + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" + "prot.accession2taxid.gz" + ] + ) + run_command( + cmd=[ + "wget", "-O", f"{proteins_path}.md5", + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" + "prot.accession2taxid.gz.md5" + ] + ) - # Collect and compare md5 hashes _collect_and_compare_md5(f"{proteins_path}.md5", proteins_path) - # Return object print(colorify( "Done! Moving data from temporary directory to final location..." )) @@ -296,13 +302,22 @@ def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt: def _collect_and_compare_md5(path_to_md5: str, path_to_file: str): + # Read in hash from md5 file with open(path_to_md5, 'r') as f: - # Read the first line - first_line = f.readline().strip() - # Split the line into hash and file name - md5_hash, _ = first_line.split(' ', 1) - # Compare - compare_md5_hashes(md5_hash, path_to_file) + expected_hash = f.readline().strip().split(maxsplit=1)[0] + + # Calculate hash from file + observed_hash = _calculate_md5_from_file(path_to_file) + + if observed_hash != expected_hash: + raise ValidationError( + "Download error. Data possibly corrupted.\n" + f"{path_to_file} has an unexpected MD5 hash.\n\n" + "Expected hash:\n" + f"{expected_hash}\n\n" + "Observed hash:\n" + f"{observed_hash}" + ) # If no exception is raised, remove md5 file - run_command(cmd=["rm", path_to_md5]) + os.remove(path_to_md5) diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py index a32fb702..6529d675 100644 --- a/q2_moshpit/eggnog/tests/test_dbs.py +++ b/q2_moshpit/eggnog/tests/test_dbs.py @@ -154,66 +154,80 @@ def test_fetch_eggnog_fasta(self, subp_run): @patch("q2_moshpit.eggnog._dbs._collect_and_compare_md5") @patch("subprocess.run") - def test_fetch_ncbi_taxonomy(self, subp_run, cc_md5): + @patch("os.remove") + def test_fetch_ncbi_taxonomy(self, mock_os_rm, mock_run, mock_md5): # Call function. Patching will make sure nothing is actually ran ncbi_data = fetch_ncbi_taxonomy() zip_path = os.path.join(str(ncbi_data), "taxdmp.zip") proteins_path = os.path.join(str(ncbi_data), "prot.accession2taxid.gz") # Check that command was called in the expected way - I_call, II_call = [ + expected_calls = [ call( [ - "wget", "-O", f"{zip_path}{ext}", - f"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip{ext}" + "wget", "-O", f"{zip_path}", + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip" ], check=True - ) - for ext in ["", ".md5"] - ] - III_call = call(f"{zip_path}.md5", zip_path) - IV_call = call( - [ - "unzip", "-j", zip_path, "names.dmp", "nodes.dmp", - "-d", str(ncbi_data) - ], - check=True, - ) - V_call = call(["rm", zip_path], check=True) - VI_call, VII_call = [ + ), + call( + [ + "wget", "-O", f"{zip_path}.md5", + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip.md5" + ], + check=True + ), + call( + [ + "unzip", "-j", zip_path, "names.dmp", "nodes.dmp", + "-d", str(ncbi_data) + ], + check=True, + ), call( [ - "wget", "-O", f"{proteins_path}{ext}", + "wget", "-O", f"{proteins_path}", "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" - f"prot.accession2taxid.gz{ext}" + "prot.accession2taxid.gz" + ], + check=True + ), + call( + [ + "wget", "-O", f"{proteins_path}.md5", + "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/" + "prot.accession2taxid.gz.md5" ], check=True ) - for ext in ["", ".md5"] ] - VIII_call = call(f"{proteins_path}.md5", proteins_path) # Check that commands are ran as expected - subp_run.assert_has_calls( - [I_call, II_call, IV_call, V_call, VI_call, VII_call], + mock_os_rm.assert_called_once_with(zip_path) + mock_run.assert_has_calls( + expected_calls, + any_order=False + ) + mock_md5.assert_has_calls( + [ + call(f"{zip_path}.md5", zip_path), + call(f"{proteins_path}.md5", proteins_path), + ], any_order=False ) - cc_md5.assert_has_calls([III_call, VIII_call], any_order=False) - @patch("subprocess.run") - def test_collect_and_compare_md5_valid(self, subp_run): + @patch("os.remove") + def test_collect_and_compare_md5_valid(self, mock_os_rm): path_to_file = self.get_data_path("md5/a.txt") # Should raise no errors _collect_and_compare_md5(f"{path_to_file}.md5", path_to_file) # Check rm is called as expected - subp_run.assert_called_once_with( - ["rm", f"{path_to_file}.md5"], check=True - ) + mock_os_rm.assert_called_once_with(f"{path_to_file}.md5") - @patch("subprocess.run") - def test_collect_and_compare_md5_invalid(self, subp_run): + @patch("os.remove") + def test_collect_and_compare_md5_invalid(self, mock_os_rm): path_to_file = self.get_data_path("md5/b.txt") path_to_wrong_md5 = self.get_data_path("md5/a.txt.md5") @@ -225,7 +239,7 @@ def test_collect_and_compare_md5_invalid(self, subp_run): _collect_and_compare_md5(path_to_wrong_md5, path_to_file) # check that rm is not called - subp_run.assert_not_called() + mock_os_rm.assert_not_called() @patch("q2_moshpit.eggnog._dbs._validate_taxon_id") @patch("subprocess.run") diff --git a/q2_moshpit/tests/test_utils.py b/q2_moshpit/tests/test_utils.py index 98a6858d..780a10c7 100644 --- a/q2_moshpit/tests/test_utils.py +++ b/q2_moshpit/tests/test_utils.py @@ -7,10 +7,9 @@ # ---------------------------------------------------------------------------- import unittest from qiime2.plugin.testing import TestPluginBase -from qiime2.core.exceptions import ValidationError from .._utils import ( - _construct_param, _process_common_input_params, compare_md5_hashes, - calculate_md5_from_file + _construct_param, _process_common_input_params, + _calculate_md5_from_file ) @@ -114,28 +113,14 @@ def test_process_common_inputs_mix_with_falsy_values(self): ] self.assertSetEqual(set(observed), set(expected)) - def test_compare_md5_hashes_pass(self): - path_to_file = self.get_data_path("md5/a.txt") - compare_md5_hashes("a583054a9831a6e7cc56ea5cd9cac40a", path_to_file) - - def test_compare_md5_hashes_fail(self): - path_to_file = self.get_data_path("md5/b.txt") - with self.assertRaisesRegex( - ValidationError, - "has an unexpected MD5 hash" - ): - compare_md5_hashes( - "a583054a9831a6e7cc56ea5cd9cac40a", path_to_file - ) - def test_calculate_md5_from_pass(self): path_to_file = self.get_data_path("md5/a.txt") - observed_hash = calculate_md5_from_file(path_to_file) + observed_hash = _calculate_md5_from_file(path_to_file) self.assertEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a") def test_calculate_md5_from_fail(self): path_to_file = self.get_data_path("md5/b.txt") - observed_hash = calculate_md5_from_file(path_to_file) + observed_hash = _calculate_md5_from_file(path_to_file) self.assertNotEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a") diff --git a/setup.py b/setup.py index b7cf194e..1f048bff 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,7 @@ 'q2_moshpit': [ 'citations.bib', 'tests/data/*', + 'tests/data/md5/*', "assets/busco/*", "assets/busco/js/*", "assets/busco/css/*", @@ -47,6 +48,7 @@ ], 'q2_moshpit.eggnog': [ 'tests/data/*', + 'tests/data/md5/*', 'tests/data/build_eggnog_diamond_db/*', 'tests/data/contig-sequences-1/*', 'tests/data/mag-sequences/*',