From f4b9644b3235e2c84894913378bd31fd670ef742 Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau <54123712+Sann5@users.noreply.github.com> Date: Fri, 19 Jan 2024 17:15:12 +0100 Subject: [PATCH] ENH: add a `build-eggnog-diamond-db` action (#116) * Ignore runinfo * Add colorify to utils * Register action * colorify only with green * Update q2_moshpit/plugin_setup.py Co-authored-by: Michal Ziemski * Updated the output name and description * Implement build_eggnog_diamond_db action * Add test * Further refactor EggnogSequenceTaxa to EggnogProteinSequences * Add validation for taxon IDs and corresponding test * Eliminate duplicated method * Apply suggestions from code review Co-authored-by: Michal Ziemski * unit tests for _validate_taxon_id function --------- Co-authored-by: Michal Ziemski --- .gitignore | 2 +- q2_moshpit/eggnog/__init__.py | 5 +- q2_moshpit/eggnog/_dbs.py | 61 +++++++++++ .../build_eggnog_diamond_db/e5.taxid_info.tsv | 100 ++++++++++++++++++ q2_moshpit/eggnog/tests/test_dbs.py | 51 ++++++++- q2_moshpit/plugin_setup.py | 28 +++++ setup.py | 1 + 7 files changed, 243 insertions(+), 5 deletions(-) create mode 100644 q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv diff --git a/.gitignore b/.gitignore index cf09a48a..93cc4a13 100644 --- a/.gitignore +++ b/.gitignore @@ -139,5 +139,5 @@ dmypy.json # Ignore notebooks **/*.ipynb -# Ignore parsl stuff +# Ignore parsl dir runinfo \ No newline at end of file diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py index c6e7f8cc..9a176a43 100644 --- a/q2_moshpit/eggnog/__init__.py +++ b/q2_moshpit/eggnog/__init__.py @@ -8,11 +8,12 @@ from ._method import eggnog_diamond_search, eggnog_annotate from ._dbs import ( fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db, - fetch_eggnog_proteins + fetch_eggnog_proteins, build_eggnog_diamond_db ) __all__ = [ 'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db', - 'fetch_diamond_db', 'build_custom_diamond_db', 
'fetch_eggnog_proteins' + 'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins', + 'build_eggnog_diamond_db', ] diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py index ea7382fe..845b2b6d 100644 --- a/q2_moshpit/eggnog/_dbs.py +++ b/q2_moshpit/eggnog/_dbs.py @@ -6,7 +6,9 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import os +import pandas as pd from q2_types.feature_data import ProteinSequencesDirectoryFormat +import shutil from q2_types_genomics.reference_db import ( EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt @@ -174,3 +176,62 @@ def fetch_eggnog_proteins() -> EggnogProteinSequencesDirFmt: )) return eggnog_fa + + +def build_eggnog_diamond_db( + eggnog_proteins: EggnogProteinSequencesDirFmt, + taxon: int +) -> DiamondDatabaseDirFmt: + """ + Creates a DIAMOND database which contains the protein + sequences that belong to the specified taxon. + """ + # Validate taxon ID + _validate_taxon_id(eggnog_proteins, taxon) + + # Initialize output objects + diamond_db = DiamondDatabaseDirFmt() + + # Define command. 
+ cmd = [ + "create_dbs.py", + "--data_dir", str(eggnog_proteins), + "--taxids", str(taxon), + "--dbname", "ref_db" + ] + run_command(cmd) + + # The script will create the diamond DB inside the directory of + # eggnog_proteins object, so we need to move it to diamond_db + source_path = os.path.join(str(eggnog_proteins), "ref_db.dmnd") + destination_path = os.path.join(str(diamond_db), "ref_db.dmnd") + shutil.move(source_path, destination_path) + + # Return objects + return diamond_db + + +def _validate_taxon_id(eggnog_proteins, taxon): + # Validate taxon id number + # Read in valid taxon ids + taxid_info = pd.read_csv( + os.path.join(str(eggnog_proteins), "e5.taxid_info.tsv"), + sep="\t" + ) + + # Convert them into a set + tax_ids = set() + for lineage in taxid_info["Taxid Lineage"]: + tax_ids.update( + set( + lineage.strip().split(",") + ) + ) + + # Check for overlap with provided taxon id + if not str(taxon) in tax_ids: + raise ValueError( + f"'{taxon}' is not valid taxon ID. " + "To view all valid taxon IDs inspect e5.taxid_info.tsv " + "file in the eggnog_proteins input."
+ ) diff --git a/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv b/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv new file mode 100644 index 00000000..e3e30994 --- /dev/null +++ b/q2_moshpit/eggnog/tests/data/build_eggnog_diamond_db/e5.taxid_info.tsv @@ -0,0 +1,100 @@ +# Taxid Sci.Name Rank Named Lineage Taxid Lineage +679937 Bacteroides coprosuis DSM 18011 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides coprosuis,Bacteroides coprosuis DSM 18011 1,131567,2,68336,976,200643,171549,815,816,151276,679937 +1146883 Blastococcus saxobsidens DD2 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Frankineae,Geodermatophilaceae,Blastococcus,Blastococcus saxobsidens,Blastococcus saxobsidens DD2 1,131567,2,201174,1760,85003,2037,85013,85030,38501,138336,1146883 +1497679 Listeriaceae bacterium FSL A5-0209 species root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Listeriaceae,unclassified Listeriaceae,Listeriaceae bacterium FSL A5-0209 1,131567,2,1239,91061,1385,186820,1081735,1497679 +69014 Thermococcus kodakarensis KOD1 no rank root,cellular organisms,Archaea,Euryarchaeota,Thermococci,Thermococcales,Thermococcaceae,Thermococcus,Thermococcus kodakarensis,Thermococcus kodakarensis KOD1 1,131567,2157,28890,183968,2258,2259,2263,311400,69014 +888833 Streptococcus australis ATCC 700641 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Streptococcaceae,Streptococcus,Streptococcus australis,Streptococcus australis ATCC 700641 1,131567,2,1239,91061,186826,1300,1301,113107,888833 +1089544 Amycolatopsis benzoatilytica AK 16/65 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis benzoatilytica,Amycolatopsis benzoatilytica AK 16/65 
1,131567,2,201174,1760,85003,2037,85010,2070,1813,346045,1089544 +1089545 Amycolatopsis balhimycina FH 1894 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Pseudonocardineae,Pseudonocardiaceae,Amycolatopsis,Amycolatopsis balhimycina,Amycolatopsis balhimycina FH 1894 1,131567,2,201174,1760,85003,2037,85010,2070,1813,208443,1089545 +1089546 Actinopolyspora halophila DSM 43834 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinopolysporineae,Actinopolysporaceae,Actinopolyspora,Actinopolyspora halophila,Actinopolyspora halophila DSM 43834 1,131567,2,201174,1760,85003,2037,622450,622451,1849,1850,1089546 +521393 Actinomyces timonensis DSM 23838 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces timonensis,Actinomyces timonensis DSM 23838 1,131567,2,201174,1760,85003,2037,85005,2049,1654,1288391,521393 +1089548 Thermicanus aegyptius DSM 12793 no rank root,cellular organisms,Bacteria,Firmicutes,Bacilli,Bacillales,Bacillales incertae sedis,Bacillales Family X. Incertae Sedis,Thermicanus,Thermicanus aegyptius,Thermicanus aegyptius DSM 12793 1,131567,2,1239,91061,1385,539002,539003,94008,94009,1089548 +172045 Elizabethkingia miricola species root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Elizabethkingia,Elizabethkingia miricola 1,131567,2,68336,976,117743,200644,49546,308865,172045 +1089550 Salisaeta longa DSM 21114 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidetes Order II. 
Incertae sedis,Rhodothermaceae,Salisaeta,Salisaeta longa,Salisaeta longa DSM 21114 1,131567,2,68336,976,1100069,563843,689697,503170,1089550 +1089551 Geminicoccus roseus DSM 18922 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,unclassified Alphaproteobacteria,Geminicoccus,Geminicoccus roseus,Geminicoccus roseus DSM 18922 1,131567,2,1224,28211,82117,489140,404900,1089551 +1089552 Rhodovibrio salinarum DSM 9154 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodospirillales,Rhodospirillaceae,Rhodovibrio,Rhodovibrio salinarum,Rhodovibrio salinarum DSM 9154 1,131567,2,1224,28211,204441,41295,85274,1087,1089552 +1089553 Thermacetogenium phaeum DSM 12270 no rank root,cellular organisms,Bacteria,Firmicutes,Clostridia,Thermoanaerobacterales,Thermoanaerobacteraceae,Thermacetogenium,Thermacetogenium phaeum,Thermacetogenium phaeum DSM 12270 1,131567,2,1239,186801,68295,186814,140458,85874,1089553 +196627 Corynebacterium glutamicum ATCC 13032 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Corynebacterineae,Corynebacteriaceae,Corynebacterium,Corynebacterium glutamicum,Corynebacterium glutamicum ATCC 13032 1,131567,2,201174,1760,85003,2037,85007,1653,1716,1718,196627 +1161902 Eubacterium nodatum ATCC 33099 no rank root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Clostridiales incertae sedis,Clostridiales Family XIII. Incertae Sedis,[Eubacterium] nodatum,Eubacterium nodatum ATCC 33099 1,131567,2,1239,186801,186802,538999,543314,35518,1161902 +446468 Nocardiopsis dassonvillei subsp. dassonvillei DSM 43111 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Streptosporangineae,Nocardiopsaceae,Nocardiopsis,Nocardiopsis dassonvillei,Nocardiopsis dassonvillei subsp. dassonvillei,Nocardiopsis dassonvillei subsp. 
dassonvillei DSM 43111 1,131567,2,201174,1760,85003,2037,85012,83676,2013,2014,568208,446468 +1286170 Raoultella ornithinolytica B6 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Raoultella,Raoultella ornithinolytica,Raoultella ornithinolytica B6 1,131567,2,1224,1236,91347,543,160674,54291,1286170 +1286171 Eubacterium acidaminophilum DSM 3953 no rank root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Eubacteriaceae,Eubacterium,Eubacterium acidaminophilum,Eubacterium acidaminophilum DSM 3953 1,131567,2,1239,186801,186802,186806,1730,1731,1286171 +446469 Sanguibacter keddieii DSM 10542 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Micrococcineae,Sanguibacteraceae,Sanguibacter,Sanguibacter keddieii,Sanguibacter keddieii DSM 10542 1,131567,2,201174,1760,85003,2037,85006,145360,60919,60920,446469 +1384484 Adlercreutzia equolifaciens DSM 19450 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Coriobacteridae,Coriobacteriales,Coriobacterineae,Coriobacteriaceae,Adlercreutzia,Adlercreutzia equolifaciens,Adlercreutzia equolifaciens DSM 19450 1,131567,2,201174,1760,84998,84999,255727,84107,447020,446660,1384484 +446470 Stackebrandtia nassauensis DSM 44728 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Glycomycineae,Glycomycetaceae,Stackebrandtia,Stackebrandtia nassauensis,Stackebrandtia nassauensis DSM 44728 1,131567,2,201174,1760,85003,2037,85014,85034,283810,283811,446470 +270374 Marinobacter sp. ELB17 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Marinobacter,Marinobacter sp. 
ELB17 1,131567,2,1224,1236,135622,72275,2742,270374 +237609 Pseudomonas alkylphenolia species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas alkylphenolia 1,131567,2,1224,1236,72274,135621,286,237609 +622637 Methylocystis sp. ATCC 49242 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylocystaceae,Methylocystis,Methylocystis sp. ATCC 49242 1,131567,2,1224,28211,356,31993,133,622637 +536019 Mesorhizobium opportunistum WSM2075 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Phyllobacteriaceae,Mesorhizobium,Mesorhizobium opportunistum,Mesorhizobium opportunistum WSM2075 1,131567,2,1224,28211,356,69277,68287,593909,536019 +46429 Sphingobium chlorophenolicum species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Sphingomonadaceae,Sphingobium,Sphingobium chlorophenolicum 1,131567,2,1224,28211,204457,41297,165695,46429 +1056816 Nocardia sp. BMG51109 species root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Corynebacterineae,Nocardiaceae,Nocardia,Nocardia sp. 
BMG51109 1,131567,2,201174,1760,85003,2037,85007,85025,1817,1056816 +589873 Alteromonas australica species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas australica 1,131567,2,1224,1236,135622,72275,226,589873 +1120947 Actinomyces vaccimaxillae DSM 15804 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Actinomycineae,Actinomycetaceae,Actinomyces,Actinomyces vaccimaxillae,Actinomyces vaccimaxillae DSM 15804 1,131567,2,201174,1760,85003,2037,85005,2049,1654,183916,1120947 +1056820 Teredinibacter turnerae T7902 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadales genera incertae sedis,Teredinibacter,Teredinibacter turnerae,Teredinibacter turnerae T7902 1,131567,2,1224,1236,135622,256005,2425,2426,1056820 +1269813 Thioalkalivibrio sp. ALR17-21 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Chromatiales,Ectothiorhodospiraceae,Thioalkalivibrio,Thioalkalivibrio sp. ALR17-21 1,131567,2,1224,1236,135613,72276,106633,1269813 +639030 Acidobacteria bacterium KBS 146 species root,cellular organisms,Bacteria,Fibrobacteres/Acidobacteria group,Acidobacteria,Acidobacteriia,Acidobacteriales,Acidobacteriaceae,unclassified Acidobacteriaceae,Acidobacteria bacterium KBS 146 1,131567,2,131550,57723,204432,204433,204434,112074,639030 +172088 Bradyrhizobium sp. th.b2 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Bradyrhizobiaceae,Bradyrhizobium,Bradyrhizobium sp. th.b2 1,131567,2,1224,28211,356,41294,374,172088 +180281 Cyanobium sp. PCC 7001 species root,cellular organisms,Bacteria,Cyanobacteria,Oscillatoriophycideae,Chroococcales,Cyanobium,Cyanobium sp. 
PCC 7001 1,131567,2,1117,1301283,1118,167375,180281 +663610 Methylocapsa aurea species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Beijerinckiaceae,Methylocapsa,Methylocapsa aurea 1,131567,2,1224,28211,356,45404,184923,663610 +1045855 Pseudoxanthomonas spadix BD-a59 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Pseudoxanthomonas,Pseudoxanthomonas spadix,Pseudoxanthomonas spadix BD-a59 1,131567,2,1224,1236,135614,32033,83618,415229,1045855 +1120949 Actinoplanes globisporus DSM 43857 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Micromonosporineae,Micromonosporaceae,Actinoplanes,Actinoplanes globisporus,Actinoplanes globisporus DSM 43857 1,131567,2,201174,1760,85003,2037,85008,28056,1865,113565,1120949 +81985 Capsella rubella species root,cellular organisms,Eukaryota,Viridiplantae,Streptophyta,Streptophytina,Embryophyta,Tracheophyta,Euphyllophyta,Spermatophyta,Magnoliophyta,Mesangiospermae,eudicotyledons,Gunneridae,Pentapetalae,rosids,malvids,Brassicales,Brassicaceae,Camelineae,Capsella,Capsella rubella 1,131567,2759,33090,35493,131221,3193,58023,78536,58024,3398,1437183,71240,91827,1437201,71275,91836,3699,3700,980083,3718,81985 +393283 Pestalotiopsis fici species root,cellular organisms,Eukaryota,Opisthokonta,Fungi,Dikarya,Ascomycota,saccharomyceta,Pezizomycotina,leotiomyceta,sordariomyceta,Sordariomycetes,Xylariomycetidae,Xylariales,Amphisphaeriaceae,Pestalotiopsis,Pestalotiopsis fici 1,131567,2759,33154,4751,451864,4890,716545,147538,716546,715989,147550,222545,37989,54958,37840,393283 +163908 Anabaena sp. PCC 7108 species root,cellular organisms,Bacteria,Cyanobacteria,Nostocales,Nostocaceae,Anabaena,Anabaena sp. 
PCC 7108 1,131567,2,1117,1161,1162,1163,163908 +1120950 Actinopolymorpha alba DSM 45243 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Propionibacterineae,Nocardioidaceae,Actinopolymorpha,Actinopolymorpha alba,Actinopolymorpha alba DSM 45243 1,131567,2,201174,1760,85003,2037,85009,85015,117156,533267,1120950 +1144325 Pseudomonas sp. GM21 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas sp. GM21 1,131567,2,1224,1236,72274,135621,286,1144325 +1045858 Brachyspira intermedia PWS/A no rank root,cellular organisms,Bacteria,Spirochaetes,Spirochaetia,Spirochaetales,Brachyspiraceae,Brachyspira,Brachyspira intermedia,Brachyspira intermedia PWS/A 1,131567,2,203691,203692,136,143786,29521,84377,1045858 +925775 Xanthomonas vesicatoria ATCC 35937 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Xanthomonas,Xanthomonas vesicatoria,Xanthomonas vesicatoria ATCC 35937 1,131567,2,1224,1236,135614,32033,338,56460,925775 +1417296 Defluviimonas sp. 20V17 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Defluviimonas,Defluviimonas sp. 
20V17 1,131567,2,1224,28211,204455,31989,1097466,1417296 +1417230 Borrelia persica No12 no rank root,cellular organisms,Bacteria,Spirochaetes,Spirochaetia,Spirochaetales,Spirochaetaceae,Borrelia,Borrelia persica,Borrelia persica No12 1,131567,2,203691,203692,136,137,138,44448,1417230 +106582 Maylandia zebra species root,cellular organisms,Eukaryota,Opisthokonta,Metazoa,Eumetazoa,Bilateria,Deuterostomia,Chordata,Craniata,Vertebrata,Gnathostomata,Teleostomi,Euteleostomi,Actinopterygii,Actinopteri,Neopterygii,Teleostei,Osteoglossocephalai,Clupeocephala,Euteleosteomorpha,Neoteleostei,Eurypterygia,Ctenosquamata,Acanthomorphata,Euacanthomorphacea,Percomorphaceae,Ovalentaria,Cichlomorphae,Cichliformes,Cichlidae,African cichlids,Pseudocrenilabrinae,Haplochromini,Maylandia,Maylandia zebra complex,Maylandia zebra 1,131567,2759,33154,33208,6072,33213,33511,7711,89593,7742,7776,117570,117571,7898,186623,41665,32443,1489341,186625,1489388,123365,123366,123367,123368,123369,1489872,1489908,1489910,1489911,8113,319095,318546,319058,143623,57445,106582 +1120953 Aestuariibacter salexigens DSM 15300 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Aestuariibacter,Aestuariibacter salexigens,Aestuariibacter salexigens DSM 15300 1,131567,2,1224,1236,135622,72275,249523,226010,1120953 +393305 Yersinia enterocolitica subsp. enterocolitica 8081 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Yersinia,Yersinia enterocolitica,Yersinia enterocolitica subsp. enterocolitica,Yersinia enterocolitica subsp. enterocolitica 8081 1,131567,2,1224,1236,91347,543,629,630,150052,393305 +1280706 Selenomonas ruminantium subsp. ruminantium ATCC 12561 no rank root,cellular organisms,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Veillonellaceae,Selenomonas,Selenomonas ruminantium,Selenomonas ruminantium subsp. ruminantium,Selenomonas ruminantium subsp. 
ruminantium ATCC 12561 1,131567,2,1239,909932,909929,31977,970,971,114196,1280706 +1515613 Porphyromonas sp. COT-239_OH1446 species root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,Porphyromonas sp. COT-239_OH1446 1,131567,2,68336,976,200643,171549,171551,836,1515613 +1123257 Solimonas flava DSM 18980 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Sinobacteraceae,Solimonas,Solimonas flava,Solimonas flava DSM 18980 1,131567,2,1224,1236,135614,568386,413435,415849,1123257 +1515615 Porphyromonas sp. COT-290_OH860 species root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Bacteroidia,Bacteroidales,Porphyromonadaceae,Porphyromonas,Porphyromonas sp. COT-290_OH860 1,131567,2,68336,976,200643,171549,171551,836,1515615 +715451 Alteromonas sp. SN2 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas sp. 
SN2 1,131567,2,1224,1236,135622,72275,226,715451 +589924 Ferroglobus placidus DSM 10642 no rank root,cellular organisms,Archaea,Euryarchaeota,Archaeoglobi,Archaeoglobales,Archaeoglobaceae,Ferroglobus,Ferroglobus placidus,Ferroglobus placidus DSM 10642 1,131567,2157,28890,183980,2231,2232,54260,54261,589924 +221288 Mastigocladopsis repens PCC 10914 no rank root,cellular organisms,Bacteria,Cyanobacteria,Stigonematales,Mastigocladopsis,Mastigocladopsis repens,Mastigocladopsis repens PCC 10914 1,131567,2,1117,1189,221282,221287,221288 +862908 Bacteriovorax marinus SJ no rank root,cellular organisms,Bacteria,Proteobacteria,delta/epsilon subdivisions,Deltaproteobacteria,Bdellovibrionales,Bacteriovoracaceae,Bacteriovorax,Bacteriovorax marinus,Bacteriovorax marinus SJ 1,131567,2,1224,68525,28221,213481,263369,146784,97084,862908 +311402 Agrobacterium vitis S4 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Rhizobium/Agrobacterium group,Agrobacterium,Agrobacterium vitis,Agrobacterium vitis S4 1,131567,2,1224,28211,356,82115,227290,357,373,311402 +311403 Agrobacterium radiobacter K84 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Rhizobiaceae,Rhizobium/Agrobacterium group,Agrobacterium,Agrobacterium tumefaciens complex,Agrobacterium tumefaciens,Agrobacterium radiobacter K84 1,131567,2,1224,28211,356,82115,227290,357,1183400,358,311403 +180332 Robinsoniella peoriensis species root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Lachnospiraceae,Robinsoniella,Robinsoniella peoriensis 1,131567,2,1239,186801,186802,186803,588605,180332 +1227453 Haloarcula japonica DSM 6131 no rank root,cellular organisms,Archaea,Euryarchaeota,Halobacteria,Halobacteriales,Halobacteriaceae,Haloarcula,Haloarcula japonica,Haloarcula japonica DSM 6131 1,131567,2157,28890,183963,2235,2236,2237,29282,1227453 +1150600 Arcticibacter svalbardensis MN12-7 no rank root,cellular 
organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Sphingobacteriia,Sphingobacteriales,Sphingobacteriaceae,Arcticibacter,Arcticibacter svalbardensis,Arcticibacter svalbardensis MN12-7 1,131567,2,68336,976,117747,200666,84566,1288026,1288027,1150600 +1406840 Flavobacterium beibuense F44-8 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium beibuense,Flavobacterium beibuense F44-8 1,131567,2,68336,976,117743,200644,49546,237,657326,1406840 +688245 Comamonas testosteroni CNB-2 no rank root,cellular organisms,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,Comamonadaceae,Comamonas,Comamonas testosteroni,Comamonas testosteroni CNB-1,Comamonas testosteroni CNB-2 1,131567,2,1224,28216,80840,80864,283,285,543891,688245 +401526 Thermosinus carboxydivorans Nor1 no rank root,cellular organisms,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Veillonellaceae,Thermosinus,Thermosinus carboxydivorans,Thermosinus carboxydivorans Nor1 1,131567,2,1239,909932,909929,31977,261684,261685,401526 +335992 Candidatus Pelagibacter ubique HTCC1062 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,unclassified Alphaproteobacteria,SAR11 cluster,Candidatus Pelagibacter,Candidatus Pelagibacter ubique,Candidatus Pelagibacter ubique HTCC1062 1,131567,2,1224,28211,82117,54526,198251,198252,335992 +1163385 Peanut witches'-broom phytoplasma NTU2011 no rank root,cellular organisms,Bacteria,Tenericutes,Mollicutes,Acholeplasmatales,Acholeplasmataceae,Candidatus Phytoplasma,16SrII (Peanut WB group),Peanut witches'-broom phytoplasma,Peanut witches'-broom phytoplasma NTU2011 1,131567,2,544448,31969,186329,2146,33926,85621,35772,1163385 +999547 Leisingera daeponensis DSM 23529 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Leisingera,Leisingera daeponensis,Leisingera daeponensis DSM 23529 
1,131567,2,1224,28211,204455,31989,191028,405746,999547 +1288083 Streptomyces sp. TAA040 species root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Streptomycineae,Streptomycetaceae,Streptomyces,Streptomyces sp. TAA040 1,131567,2,201174,1760,85003,2037,85011,2062,1883,1288083 +999549 Leisingera caerulea DSM 24564 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Leisingera,Leisingera caerulea,Leisingera caerulea DSM 24564 1,131567,2,1224,28211,204455,31989,191028,506591,999549 +999550 Pseudophaeobacter arcticus DSM 23566 no rank root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Pseudophaeobacter,Pseudophaeobacter arcticus,Pseudophaeobacter arcticus DSM 23566 1,131567,2,1224,28211,204455,31989,1541822,385492,999550 +1216362 Fusobacterium nucleatum ChDC F128 no rank root,cellular organisms,Bacteria,Fusobacteria,Fusobacteriia,Fusobacteriales,Fusobacteriaceae,Fusobacterium,Fusobacterium nucleatum,unclassified Fusobacterium nucleatum,Fusobacterium nucleatum ChDC F128 1,131567,2,32066,203490,203491,203492,848,851,189727,1216362 +311424 Dehalococcoides mccartyi VS no rank root,cellular organisms,Bacteria,Chloroflexi,Dehalococcoidia,Dehalococcoidales,Dehalococcoidaceae,Dehalococcoides,Dehalococcoides mccartyi,Dehalococcoides mccartyi VS 1,131567,2,200795,301297,1202465,1202464,61434,61435,311424 +573569 Francisella sp. TX077308 species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Thiotrichales,Francisellaceae,Francisella,Francisella sp. 
TX077308 1,131567,2,1224,1236,72273,34064,262,573569 +426114 Thiomonas arsenitoxydans species root,cellular organisms,Bacteria,Proteobacteria,Betaproteobacteria,Burkholderiales,unclassified Burkholderiales,Burkholderiales Genera incertae sedis,Thiomonas,Thiomonas arsenitoxydans 1,131567,2,1224,28216,80840,119065,224471,32012,426114 +1212548 Pseudomonas stutzeri NF13 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Pseudomonadaceae,Pseudomonas,Pseudomonas stutzeri group,Pseudomonas stutzeri subgroup,Pseudomonas stutzeri,Pseudomonas stutzeri NF13 1,131567,2,1224,1236,72274,135621,286,136846,578833,316,1212548 +426117 Methylobacterium sp. 4-46 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhizobiales,Methylobacteriaceae,Methylobacterium,Methylobacterium sp. 4-46 1,131567,2,1224,28211,356,119045,407,426117 +1138822 Lactobacillus curieae species root,cellular organisms,Bacteria,Firmicutes,Bacilli,Lactobacillales,Lactobacillaceae,Lactobacillus,Lactobacillus curieae 1,131567,2,1239,91061,186826,33958,1578,1138822 +98439 Fischerella thermalis PCC 7521 no rank root,cellular organisms,Bacteria,Cyanobacteria,Stigonematales,Fischerella,Fischerella thermalis,Fischerella thermalis PCC 7521 1,131567,2,1117,1189,1190,372787,98439 +65672 Piriformospora indica species root,cellular organisms,Eukaryota,Opisthokonta,Fungi,Dikarya,Basidiomycota,Agaricomycotina,Agaricomycetes,Agaricomycetes incertae sedis,Sebacinales,Sebacinales group B,Piriformospora,Piriformospora indica 1,131567,2759,33154,4751,451864,5204,5302,155619,355688,297313,1506295,65702,65672 +1441930 Serratia fonticola RB-25 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacteriales,Enterobacteriaceae,Serratia,Serratia fonticola,Serratia fonticola RB-25 1,131567,2,1224,1236,91347,543,613,47917,1441930 +688269 Thermotoga thermarum DSM 5069 no rank root,cellular 
organisms,Bacteria,Thermotogae,Thermotogae,Thermotogales,Thermotogaceae,Thermotoga,Thermotoga thermarum,Thermotoga thermarum DSM 5069 1,131567,2,200918,188708,2419,188709,2335,119394,688269 +688270 Cellulophaga algicola DSM 14237 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Cellulophaga,Cellulophaga algicola,Cellulophaga algicola DSM 14237 1,131567,2,68336,976,117743,200644,49546,104264,59600,688270 +1163407 Rhodanobacter spathiphylli B39 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Rhodanobacter,Rhodanobacter spathiphylli,Rhodanobacter spathiphylli B39 1,131567,2,1224,1236,135614,32033,75309,347483,1163407 +1163408 Rhodanobacter fulvus Jip2 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Xanthomonadales,Xanthomonadaceae,Rhodanobacter,Rhodanobacter fulvus,Rhodanobacter fulvus Jip2 1,131567,2,1224,1236,135614,32033,75309,219571,1163408 +1069080 Succinispira mobilis DSM 6222 no rank root,cellular organisms,Bacteria,Firmicutes,Negativicutes,Selenomonadales,Acidaminococcaceae,Succinispira,Succinispira mobilis,Succinispira mobilis DSM 6222 1,131567,2,1239,909932,909929,909930,78119,78120,1069080 +1120963 Algicola sagamiensis DSM 14643 no rank root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Pseudoalteromonadaceae,Algicola,Algicola sagamiensis,Algicola sagamiensis DSM 14643 1,131567,2,1224,1236,135622,267888,296014,163869,1120963 +561177 Anaerococcus hydrogenalis DSM 7454 no rank root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Peptoniphilaceae,Anaerococcus,Anaerococcus hydrogenalis,Anaerococcus hydrogenalis DSM 7454 1,131567,2,1239,186801,186802,1570339,165779,33029,561177 +106648 Acinetobacter bereziniae species root,cellular organisms,Bacteria,Proteobacteria,Gammaproteobacteria,Pseudomonadales,Moraxellaceae,Acinetobacter,Acinetobacter 
bereziniae 1,131567,2,1224,1236,72274,468,469,106648 +1107311 Flavobacterium enshiense DK69 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium enshiense,Flavobacterium enshiense DK69 1,131567,2,68336,976,117743,200644,49546,237,1341165,1107311 +1136417 Salinispora pacifica CNT003 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Actinomycetales,Micromonosporineae,Micromonosporaceae,Salinispora,Salinispora pacifica,Salinispora pacifica CNT003 1,131567,2,201174,1760,85003,2037,85008,28056,168694,351187,1136417 +237727 Erythrobacter sp. NAP1 species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Sphingomonadales,Erythrobacteraceae,Erythrobacter,Erythrobacter sp. NAP1 1,131567,2,1224,28211,204457,335929,1041,237727 +1506583 Flavobacterium sp. Fl species root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Flavobacteriia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium sp. Fl 1,131567,2,68336,976,117743,200644,49546,237,1506583 +1540257 Clostridium sp. KNHs214 species root,cellular organisms,Bacteria,Firmicutes,Clostridia,Clostridiales,Clostridiaceae,Clostridium,Clostridium sp. KNHs214 1,131567,2,1239,186801,186802,31979,1485,1540257 +1120966 Algoriphagus marincola DSM 16067 no rank root,cellular organisms,Bacteria,Bacteroidetes/Chlorobi group,Bacteroidetes,Cytophagia,Cytophagales,Cyclobacteriaceae,Algoriphagus,Algoriphagus marincola,Algoriphagus marincola DSM 16067 1,131567,2,68336,976,768503,768507,563798,246875,264027,1120966 +1437610 Bifidobacterium reuteri DSM 23975 no rank root,cellular organisms,Bacteria,Actinobacteria,Actinobacteria,Actinobacteridae,Bifidobacteriales,Bifidobacteriaceae,Bifidobacterium,Bifidobacterium reuteri,Bifidobacterium reuteri DSM 23975 1,131567,2,201174,1760,85003,85004,31953,1678,983706,1437610 +1380380 Ahrensia sp. 
13_GOM-1096m species root,cellular organisms,Bacteria,Proteobacteria,Alphaproteobacteria,Rhodobacterales,Rhodobacteraceae,Ahrensia,Ahrensia sp. 13_GOM-1096m 1,131567,2,1224,28211,204455,31989,152180,1380380 diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py index 33d4304e..0a1c12db 100644 --- a/q2_moshpit/eggnog/tests/test_dbs.py +++ b/q2_moshpit/eggnog/tests/test_dbs.py @@ -10,10 +10,12 @@ from qiime2.plugin.testing import TestPluginBase from .._dbs import ( fetch_eggnog_db, build_custom_diamond_db, fetch_eggnog_proteins, - fetch_diamond_db + fetch_diamond_db, build_eggnog_diamond_db, _validate_taxon_id ) from q2_types.feature_data import ProteinSequencesDirectoryFormat -from q2_types_genomics.reference_db import NCBITaxonomyDirFmt +from q2_types_genomics.reference_db import ( + NCBITaxonomyDirFmt, EggnogProteinSequencesDirFmt +) class TestFetchDB(TestPluginBase): @@ -147,3 +149,48 @@ def test_fetch_eggnog_fasta(self, subp_run): # Check that commands are ran as expected subp_run.assert_has_calls([first_call, second_call], any_order=False) + + @patch("q2_moshpit.eggnog._dbs._validate_taxon_id") + @patch("subprocess.run") + @patch("shutil.move") + def test_build_eggnog_diamond_db(self, shut_mv, subp_run, _val): + # Instantiate input + proteins_and_taxa = EggnogProteinSequencesDirFmt() + + # Call function. 
Patching will make sure nothing is + # actually run + diamond_db = build_eggnog_diamond_db(proteins_and_taxa, taxon=2) + + # Check that command was called in the expected way + exp_cmd = [ + "create_dbs.py", + "--data_dir", str(proteins_and_taxa), + "--taxids", "2", + "--dbname", "ref_db" + ] + + # Check that subprocess.run is run as expected + subp_run.assert_called_once_with(exp_cmd, check=True) + + # Check that shutil.move is run as expected + source_path = os.path.join(str(proteins_and_taxa), "ref_db.dmnd") + destination_path = os.path.join(str(diamond_db), "ref_db.dmnd") + shut_mv.assert_called_once_with(source_path, destination_path) + + def test_validate_taxon_id_invalid(self): + # Init input data + path_to_data = self.get_data_path('build_eggnog_diamond_db/') + eggnog_proteins = EggnogProteinSequencesDirFmt(path_to_data, 'r') + + # Call function; expect an error since taxon 0 is invalid + with self.assertRaisesRegex( + ValueError, + "'0' is not valid taxon ID. " + ): + _validate_taxon_id(eggnog_proteins, 0) + + def test_validate_taxon_id_valid(self): + # Init input data + path_to_data = self.get_data_path('build_eggnog_diamond_db/') + eggnog_proteins = EggnogProteinSequencesDirFmt(path_to_data, 'r') + _validate_taxon_id(eggnog_proteins, 2) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 81eeaf2d..134b2c41 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -591,6 +591,34 @@ "storage space is required to run this action. " ) +plugin.methods.register_function( + function=q2_moshpit.eggnog.build_eggnog_diamond_db, + inputs={ + 'eggnog_proteins': ReferenceDB[EggnogProteinSequences], + }, + input_descriptions={ + 'eggnog_proteins': "eggNOG database of protein sequences and " + "their corresponding taxonomy information " + "(generated through the `fetch-eggnog-proteins` " + "action)." + }, + parameters={ + 'taxon': Int % Range(2, 1579337) + }, + parameter_descriptions={ + 'taxon': "Taxon ID number."
+ }, + outputs=[("diamond_db", ReferenceDB[Diamond])], + output_descriptions={ + "diamond_db": "Complete Diamond reference database for the " + "specified taxon." + }, + name="Create a DIAMOND formatted reference database for the " + "specified taxon.", + description="Creates a DIAMOND database which contains the protein " + "sequences that belong to the specified taxon.", +) + plugin.methods.register_function( function=q2_moshpit.eggnog.eggnog_diamond_search, inputs={ diff --git a/setup.py b/setup.py index 2a286e79..b7cf194e 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ ], 'q2_moshpit.eggnog': [ 'tests/data/*', + 'tests/data/build_eggnog_diamond_db/*', 'tests/data/contig-sequences-1/*', 'tests/data/mag-sequences/*', 'tests/data/random-db-1/*',