diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml index 0b6e2ec3..0a3d61a4 100644 --- a/ci/recipe/meta.yaml +++ b/ci/recipe/meta.yaml @@ -17,19 +17,20 @@ requirements: - setuptools run: + - altair + - beautifulsoup4 - bracken + - busco >=5.0.0 + - diamond + - eggnog-mapper >=2.1.10 - kraken2 - metabat2 - - samtools - qiime2 {{ qiime2_epoch }}.* - q2-types-genomics {{ qiime2_epoch }}.* - q2templates {{ qiime2_epoch }}.* - - eggnog-mapper >=2.1.10 - - diamond + - samtools - tqdm - xmltodict - - altair - - busco >=5.0.0 test: requires: diff --git a/q2_moshpit/__init__.py b/q2_moshpit/__init__.py index ee9d3e0d..cf919382 100644 --- a/q2_moshpit/__init__.py +++ b/q2_moshpit/__init__.py @@ -7,7 +7,10 @@ # ---------------------------------------------------------------------------- from .dereplication import dereplicate_mags -from .kraken2 import bracken, classification, database +from .kaiju import classification as kaiju_class, database as kaiju_db +from .kraken2 import ( + classification as kraken_class, database as kraken_db, bracken +) from .metabat2 import metabat2 from . import eggnog from . import busco @@ -18,6 +21,6 @@ del get_versions __all__ = [ - 'metabat2', 'bracken', 'classification', 'database', - 'dereplicate_mags', 'eggnog', 'busco', + 'metabat2', 'bracken', 'kraken_class', 'kraken_db', + 'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog', 'busco' ] diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib index 56d4110e..15a5f9b3 100644 --- a/q2_moshpit/citations.bib +++ b/q2_moshpit/citations.bib @@ -51,6 +51,21 @@ @article{kang2019 keywords = {Clustering,Metagenome binning,Metagenomics} } +@article{menzel2016, + title = {Fast and Sensitive Taxonomic Classification for Metagenomics with {{Kaiju}}}, + author = {Menzel, Peter and Ng, Kim Lee and Krogh, Anders}, + year = {2016}, + month = apr, + journal = {Nature Communications}, + volume = {7}, + number = {1}, + pages = {11257}, + publisher = {{Nature Publishing Group}}, + issn = {2041-1723}, + doi = {10.1038/ncomms11257}, + keywords = {Classification and taxonomy,Metagenomics} +} + @article{manni_busco_2021, title = {{BUSCO} {Update}: {Novel} and {Streamlined} {Workflows} along with {Broader} and {Deeper} {Phylogenetic} {Coverage} for {Scoring} of {Eukaryotic}, {Prokaryotic}, and {Viral} {Genomes}}, volume = {38}, @@ -68,5 +83,4 @@ @article{manni_busco_2021 month = sep, year = {2021}, pages = {4647--4654}, - file = {Manni et al. - 2021 - BUSCO Update Novel and Streamlined Workflows alon.pdf:/Users/santiago/Zotero/storage/SQ2VFGPF/Manni et al. - 2021 - BUSCO Update Novel and Streamlined Workflows alon.pdf:application/pdf}, } diff --git a/q2_moshpit/kaiju/__init__.py b/q2_moshpit/kaiju/__init__.py new file mode 100644 index 00000000..61212bf6 --- /dev/null +++ b/q2_moshpit/kaiju/__init__.py @@ -0,0 +1,12 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- + +from .database import fetch_kaiju_db +# from .classification import classify_kaiju + +__all__ = ["fetch_kaiju_db", ] diff --git a/q2_moshpit/kaiju/classification.py b/q2_moshpit/kaiju/classification.py new file mode 100644 index 00000000..16cef8fc --- /dev/null +++ b/q2_moshpit/kaiju/classification.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- diff --git a/q2_moshpit/kaiju/database.py b/q2_moshpit/kaiju/database.py new file mode 100644 index 00000000..6b2bcaa2 --- /dev/null +++ b/q2_moshpit/kaiju/database.py @@ -0,0 +1,110 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- +import os +import tarfile +from urllib.parse import urljoin + +from tqdm import tqdm + +from q2_types_genomics.kaiju import KaijuDBDirectoryFormat + + +from bs4 import BeautifulSoup +import requests +import pandas as pd + +CHUNK_SIZE = 8192 +KAIJU_SERVER_URL = "https://kaiju.binf.ku.dk/server" +ERR_MSG = ( + "Unable to connect to the Kaiju server. Please try again later. " + "The error was: {}" +) + + +def _fetch_and_extract_db(db_uri: str, db_dir: str): + latest_db = os.path.basename(db_uri) + db_path = os.path.join(db_dir, latest_db) + try: + response = requests.get(db_uri, stream=True) + response.raise_for_status() + total_size = int(response.headers.get("content-length", 0)) + if total_size > 0: + progress_bar = tqdm( + desc=f'Downloading the "{latest_db}" database', + total=total_size, + unit="B", + unit_scale=True, + unit_divisor=1024, + ) + + with open(db_path, "wb") as file: + for chunk in response.iter_content(chunk_size=CHUNK_SIZE): + file.write(chunk) if chunk else False + if total_size > 0: + progress_bar.update(len(chunk)) + progress_bar.close() if total_size > 0 else False + except requests.exceptions.ConnectionError as e: + raise Exception(ERR_MSG.format(e)) + + msg = "Download finished. Extracting database files..." 
+ print(f"{msg}", end="", flush=True) + with tarfile.open(db_path, "r:gz") as tar: + tar.extractall(path=db_dir) + print(f"\r{msg} Done.", flush=True) + + os.remove(db_path) + + +def _find_latest_db_url(database_type, sidebox_element, url): + # Extract the databases and dates + df = _find_all_dbs(sidebox_element) + + # Filter databases based on target_database type + filtered_df = df[df.index.str.contains(database_type)] + + # Find the latest database + latest_database = filtered_df["Date"].idxmax() + # latest_database = filtered_df.loc[latest_index, "Database"] + download_link = sidebox_element.find("a", string=latest_database)["href"] + download_link = urljoin(url, download_link) + + return download_link + + +def _find_all_dbs(sidebox_element): + databases, dates = [], [] + for link in sidebox_element.find_all("a"): + database = link.get_text() + date = database.split()[-2] # Last element is the date + databases.append(database) + dates.append(date) + df = pd.DataFrame({"Database": databases, "Date": dates}) + df.set_index("Database", inplace=True) + df.loc[:, "Date"] = pd.to_datetime(df.loc[:, "Date"]) + return df + + +def fetch_kaiju_db( + database_type: str, +) -> KaijuDBDirectoryFormat: + + try: + response = requests.get(KAIJU_SERVER_URL) + except requests.exceptions.RequestException as e: + raise Exception(ERR_MSG.format(e)) + soup = BeautifulSoup(response.content, "html.parser") + sidebox_db = soup.find("div", id="sidebox_db") + + download_link = _find_latest_db_url( + database_type, sidebox_db, KAIJU_SERVER_URL + ) + + db = KaijuDBDirectoryFormat() + _fetch_and_extract_db(download_link, str(db.path)) + + return db diff --git a/q2_moshpit/kaiju/tests/__init__.py b/q2_moshpit/kaiju/tests/__init__.py new file mode 100644 index 00000000..16cef8fc --- /dev/null +++ b/q2_moshpit/kaiju/tests/__init__.py @@ -0,0 +1,7 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- diff --git a/q2_moshpit/kaiju/tests/test_database.py b/q2_moshpit/kaiju/tests/test_database.py new file mode 100644 index 00000000..7de7d125 --- /dev/null +++ b/q2_moshpit/kaiju/tests/test_database.py @@ -0,0 +1,151 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2022-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ---------------------------------------------------------------------------- +import os +import tempfile +import unittest +from unittest.mock import patch, Mock + +import pandas as pd +from bs4 import BeautifulSoup +from qiime2.plugin.testing import TestPluginBase + +from q2_moshpit.kaiju.database import ( + _fetch_and_extract_db, _find_latest_db_url, _find_all_dbs, + fetch_kaiju_db, CHUNK_SIZE, ERR_MSG, KAIJU_SERVER_URL +) +from requests.exceptions import ConnectionError, RequestException + +from q2_types_genomics.kaiju import KaijuDBDirectoryFormat + + +class TestDatabaseFunctions(TestPluginBase): + package = 'q2_moshpit.kaiju.tests' + + @patch("requests.get") + @patch("q2_moshpit.kaiju.database.tqdm") + @patch("tarfile.open") + @patch("os.remove") + def test_fetch_and_extract_db( + self, mock_remove, mock_tarfile_open, + mock_progress, mock_requests + ): + response = mock_requests.return_value + response.headers = {"content-length": 1024} + response.iter_content.return_value = [b"test"] * 1024 + mock_tar = Mock() + mock_tarfile_open.return_value.__enter__.return_value = mock_tar + + with tempfile.TemporaryDirectory() as tmpdir: + _fetch_and_extract_db("http://a/b/db.tar.gz", tmpdir) + db_path = os.path.join(tmpdir, "db.tar.gz") + + mock_progress.assert_called_with( + desc='Downloading the "db.tar.gz" database', + total=1024, + unit="B", + unit_scale=True, + unit_divisor=1024 + ) + response.iter_content.assert_called_with(chunk_size=CHUNK_SIZE) + mock_tarfile_open.assert_called_with(db_path, "r:gz") + mock_tar.extractall.assert_called_with(path=tmpdir) + mock_remove.assert_called_with(db_path) + mock_requests.assert_called_with( + "http://a/b/db.tar.gz", stream=True + ) + + @patch("requests.get", side_effect=ConnectionError("some error")) + def test_fetch_and_extract_db_exception( + self, mock_requests + ): + exp_error = ERR_MSG.format("some error") + with self.assertRaisesRegex(Exception, exp_error): + with tempfile.TemporaryDirectory() as tmpdir: + _fetch_and_extract_db("http://a/b/db.tar.gz", tmpdir) + + mock_requests.assert_called_with( + "http://a/b/db.tar.gz", stream=True + ) + + def test_find_latest_db_url(self): + databases = [ + ('nr_euk 2021-02-24 (61GB)', + 'https://hello.com/nr_euk_2021-02-24.tar.gz'), + ('nr 2021-02-26 (52GB)', + 'https://hello.com/nr_2021-02-26.tar.gz'), + ('nr_euk 2022-01-11 (60GB)', + 'https://hello.com/nr_euk_2022-01-11.tar.gz') + ] + sidebox_element = BeautifulSoup( + '<div>{}</div>'.format( + ''.join('<a href="{}">{}</a>'.format(d[1], d[0]) + for d in databases) + ), 'html.parser') + url = _find_latest_db_url( + database_type='nr_euk', + sidebox_element=sidebox_element, + url='https://test.com' + ) + self.assertEqual(url, 'https://hello.com/nr_euk_2022-01-11.tar.gz') + + def test_find_all_dbs(self): + databases = ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)'] + sidebox_element = BeautifulSoup( + '<div>{}</div>'.format( + ''.join('<a href="#">{}</a>'.format(d) for d in databases) + ), 'html.parser') + df = _find_all_dbs(sidebox_element) + self.assertIsInstance(df, pd.DataFrame) + self.assertListEqual( + df.index.tolist(), + ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)'] + ) + self.assertListEqual( + df['Date'].tolist(), + [pd.to_datetime('2021-02-24'), pd.to_datetime('2021-02-26')] + ) + + @patch("requests.get") + @patch("q2_moshpit.kaiju.database._fetch_and_extract_db") + def test_fetch_kaiju_db(self, mock_fetch, mock_requests): + databases = [ + ('nr_euk 2021-02-24 (61GB)', + 'https://hello.com/nr_euk_2021-02-24.tar.gz'), + ('nr 2021-02-26 (52GB)', +
'https://hello.com/nr_2021-02-26.tar.gz'), + ('nr_euk 2022-01-11 (60GB)', + 'https://hello.com/nr_euk_2022-01-11.tar.gz') + ] + mock_requests.return_value = Mock( + content='<div id="sidebox_db">{}</div>' + .format( + ''.join('<a href="{}">{}</a>'.format(d[1], d[0]) + for d in databases) + ) + ) + + obs_db = fetch_kaiju_db('nr_euk') + self.assertIsInstance(obs_db, KaijuDBDirectoryFormat) + mock_requests.assert_called_with(KAIJU_SERVER_URL) + mock_fetch.assert_called_with( + 'https://hello.com/nr_euk_2022-01-11.tar.gz', + str(obs_db.path) + ) + + @patch("requests.get", side_effect=RequestException("some error")) + def test_fetch_kaiju_db_exception(self, mock_requests): + with self.assertRaisesRegex( + Exception, ERR_MSG.format("some error") + ): + fetch_kaiju_db('nr_euk') + + mock_requests.assert_called_with(KAIJU_SERVER_URL) + + +if __name__ == "__main__": + unittest.main() diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 37335e67..d5b88d79 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -11,8 +11,7 @@ from q2_types.feature_data import FeatureData, Sequence, Taxonomy from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence from q2_types.per_sample_sequences import ( - SequencesWithQuality, - PairedEndSequencesWithQuality, + SequencesWithQuality, PairedEndSequencesWithQuality ) from q2_types.sample_data import SampleData from q2_types.feature_map import FeatureMap, MAGtoContigs @@ -23,6 +22,7 @@ import q2_moshpit from q2_types_genomics.feature_data import NOG, MAG from q2_types_genomics.genome_data import BLAST6 +from q2_types_genomics.kaiju import KaijuDB from q2_types_genomics.kraken2 import ( Kraken2Reports, Kraken2Outputs, Kraken2DB ) @@ -31,107 +31,107 @@ from q2_types_genomics.per_sample_data._type import AlignmentMap from q2_types_genomics.reference_db import ReferenceDB, Diamond, Eggnog -citations = Citations.load("citations.bib", package="q2_moshpit") +citations = Citations.load('citations.bib', package='q2_moshpit') kraken2_params = { - "threads": Int % Range(1, None), - "confidence": Float % Range(0, 1, inclusive_end=True), - "minimum_base_quality": Int % Range(0, None), - "memory_mapping": Bool, - "minimum_hit_groups": Int % Range(1, None), - "quick": Bool, - "report_minimizer_data": Bool, + 'threads': Int % Range(1, None), + 'confidence': Float % Range(0, 1, inclusive_end=True), + 'minimum_base_quality': Int % Range(0, None), + 'memory_mapping': Bool, + 'minimum_hit_groups': Int % Range(1, None), + 'quick': Bool, + 'report_minimizer_data': Bool } kraken2_param_descriptions = { - "threads": "Number of threads.", - "confidence": "Confidence score threshold.", - "minimum_base_quality": "Minimum base quality used in classification. " - "Only applies when reads are used as input.", - "memory_mapping": "Avoids loading the database into RAM.", - "minimum_hit_groups": "Minimum number of hit groups (overlapping " - "k-mers sharing the same minimizer).", - "quick": "Quick operation (use first hit or hits).", - "report_minimizer_data": "Include number of read-minimizers per-taxon and " - "unique read-minimizers per-taxon in the repot.", + 'threads': 'Number of threads.', + 'confidence': 'Confidence score threshold.', + 'minimum_base_quality': 'Minimum base quality used in classification.'
+ ' Only applies when reads are used as input.', + 'memory_mapping': 'Avoids loading the database into RAM.', + 'minimum_hit_groups': 'Minimum number of hit groups (overlapping ' + 'k-mers sharing the same minimizer).', + 'quick': 'Quick operation (use first hit or hits).', + 'report_minimizer_data': 'Include number of read-minimizers per-taxon and' + ' unique read-minimizers per-taxon in the report.' } plugin = Plugin( - name="moshpit", + name='moshpit', version=q2_moshpit.__version__, website="https://github.com/bokulich-lab/q2-moshpit", - package="q2_moshpit", + package='q2_moshpit', description=( - "MOdular SHotgun metagenome Pipelines with Integrated " - "provenance Tracking: QIIME 2 plugin gor metagenome analysis with" - "tools for genome binning and functional annotation." - ), - short_description="QIIME 2 plugin for metagenome analysis.", + 'MOdular SHotgun metagenome Pipelines with Integrated ' + 'provenance Tracking: QIIME 2 plugin for metagenome analysis with ' + 'tools for genome binning and functional annotation.'), + short_description='QIIME 2 plugin for metagenome analysis.', ) -importlib.import_module("q2_moshpit.eggnog") -importlib.import_module("q2_moshpit.metabat2") +importlib.import_module('q2_moshpit.eggnog') +importlib.import_module('q2_moshpit.metabat2') plugin.methods.register_function( function=q2_moshpit.metabat2.bin_contigs_metabat, inputs={ - "contigs": SampleData[Contigs], - "alignment_maps": SampleData[AlignmentMap] + 'contigs': SampleData[Contigs], + 'alignment_maps': SampleData[AlignmentMap] }, parameters={ - "min_contig": Int % Range(1500, None), - "max_p": Int % Range(1, 100), - "min_s": Int % Range(1, 100), - "max_edges": Int % Range(1, None), - "p_tnf": Int % Range(0, 100), - "no_add": Bool, - "min_cv": Int % Range(1, None), - "min_cv_sum": Int % Range(1, None), - "min_cls_size": Int % Range(1, None), - "num_threads": Int % Range(0, None), - "seed": Int % Range(0, None), - "debug": Bool, - "verbose": Bool, + 'min_contig': Int % Range(1500, None), + 'max_p': Int % Range(1, 100), + 'min_s': Int % Range(1, 100), + 'max_edges': Int % Range(1, None), + 'p_tnf': Int % Range(0, 100), + 'no_add': Bool, + 'min_cv': Int % Range(1, None), + 'min_cv_sum': Int % Range(1, None), + 'min_cls_size': Int % Range(1, None), + 'num_threads': Int % Range(0, None), + 'seed': Int % Range(0, None), + 'debug': Bool, + 'verbose': Bool }, outputs=[ - ("mags", SampleData[MAGs]), - ("contig_map", FeatureMap[MAGtoContigs]), - ("unbinned_contigs", SampleData[Contigs % Properties("unbinned")]), + ('mags', SampleData[MAGs]), + ('contig_map', FeatureMap[MAGtoContigs]), + ('unbinned_contigs', SampleData[Contigs % Properties('unbinned')]) ], input_descriptions={ - "contigs": "Placeholder.", "alignment_maps": "Placeholder." + 'contigs': 'Placeholder.', + 'alignment_maps': 'Placeholder.' }, parameter_descriptions={ - "min_contig": "Minimum size of a contig for binning.", - "max_p": 'Percentage of "good" contigs considered for binning ' - "decided by connection among contigs. The greater, the " - "more sensitive.", - "min_s": "Minimum score of a edge for binning. The greater, the " - "more specific.", - "max_edges": "Maximum number of edges per node. The greater, the " - "more sensitive.", - "p_tnf": "TNF probability cutoff for building TNF graph. Use it to " - "skip the preparation step. 
(0: auto)", - "no_add": "Turning off additional binning for lost or small contigs.", - "min_cv": "Minimum mean coverage of a contig in each library for " - "binning.", - "min_cv_sum": "Minimum total effective mean coverage of a contig " - "(sum of depth over minCV) for binning.", - "min_cls_size": "Minimum size of a bin as the output.", - "num_threads": "Number of threads to use (0: use all cores).", - "seed": "For exact reproducibility. (0: use random seed)", - "debug": "Debug output.", - "verbose": "Verbose output.", + 'min_contig': 'Minimum size of a contig for binning.', + 'max_p': 'Percentage of "good" contigs considered for binning ' + 'decided by connection among contigs. The greater, the ' + 'more sensitive.', + 'min_s': 'Minimum score of a edge for binning. The greater, the ' + 'more specific.', + 'max_edges': 'Maximum number of edges per node. The greater, the ' + 'more sensitive.', + 'p_tnf': 'TNF probability cutoff for building TNF graph. Use it to ' + 'skip the preparation step. (0: auto)', + 'no_add': 'Turning off additional binning for lost or small contigs.', + 'min_cv': 'Minimum mean coverage of a contig in each library ' + 'for binning.', + 'min_cv_sum': 'Minimum total effective mean coverage of a contig ' + '(sum of depth over minCV) for binning.', + 'min_cls_size': 'Minimum size of a bin as the output.', + 'num_threads': 'Number of threads to use (0: use all cores).', + 'seed': 'For exact reproducibility. (0: use random seed)', + 'debug': 'Debug output.', + 'verbose': 'Verbose output.' }, output_descriptions={ - "mags": "The resulting MAGs.", - "contig_map": "Mapping of MAG identifiers to the contig identifiers " - "contained in each MAG.", - "unbinned_contigs": "Contigs that were not binned into any MAG.", - }, - name="Bin contigs into MAGs using MetaBAT 2.", - description="This method uses MetaBAT 2 to bin provided contigs " - "into MAGs.", - citations=[citations["kang2019"]], + 'mags': 'The resulting MAGs.', + 'contig_map': 'Mapping of MAG identifiers to the contig identifiers ' + 'contained in each MAG.', + 'unbinned_contigs': 'Contigs that were not binned into any MAG.' + }, + name='Bin contigs into MAGs using MetaBAT 2.', + description='This method uses MetaBAT 2 to bin provided contigs ' + 'into MAGs.', + citations=[citations["kang2019"]] ) T_kraken_in, T_kraken_out_rep, T_kraken_out_hits = TypeMap({ @@ -158,8 +158,8 @@ }, parameters=kraken2_params, outputs=[ - ("reports", T_kraken_out_rep), - ("hits", T_kraken_out_hits), + ('reports', T_kraken_out_rep), + ('hits', T_kraken_out_hits), ], input_descriptions={ "seqs": "The sequences to be classified. 
Single-end or paired-end " @@ -168,123 +168,113 @@ }, parameter_descriptions=kraken2_param_descriptions, output_descriptions={ - "reports": "Reports produced by Kraken2.", - "hits": "Output files produced by Kraken2.", + 'reports': 'Reports produced by Kraken2.', + 'hits': 'Output files produced by Kraken2.', }, - name="Perform taxonomic classification of reads or MAGs using Kraken 2.", - description="This method uses Kraken 2 to classify provided NGS reads " - "or MAGs into taxonomic groups.", - citations=[citations["wood2019"]], + name='Perform taxonomic classification of reads or MAGs using Kraken 2.', + description='This method uses Kraken 2 to classify provided NGS reads ' + 'or MAGs into taxonomic groups.', + citations=[citations["wood2019"]] ) plugin.methods.register_function( function=q2_moshpit.kraken2.bracken.estimate_bracken, inputs={ - "kraken_reports": SampleData[Kraken2Reports % Properties("reads")], - "bracken_db": BrackenDB, + "kraken_reports": SampleData[Kraken2Reports % Properties('reads')], + "bracken_db": BrackenDB }, parameters={ - "threshold": Int % Range(0, None), - "read_len": Int % Range(0, None), - "level": Str % Choices(["D", "P", "C", "O", "F", "G", "S"]), + 'threshold': Int % Range(0, None), + 'read_len': Int % Range(0, None), + 'level': Str % Choices(['D', 'P', 'C', 'O', 'F', 'G', 'S']) }, outputs=[ - ("reports", SampleData[Kraken2Reports % Properties("bracken")]), - ("taxonomy", FeatureData[Taxonomy]), - ("table", FeatureTable[Frequency]), + ('reports', SampleData[Kraken2Reports % Properties('bracken')]), + ('taxonomy', FeatureData[Taxonomy]), + ('table', FeatureTable[Frequency]) ], input_descriptions={ "kraken_reports": "Reports produced by Kraken2.", - "bracken_db": "Bracken database.", + "bracken_db": "Bracken database." }, parameter_descriptions={ - "threshold": "Bracken: number of reads required PRIOR to abundance " - "estimation to perform re-estimation.", - "read_len": "Bracken: read length to get all classifications for.", - "level": "Bracken: taxonomic level to estimate abundance at.", + 'threshold': 'Bracken: number of reads required PRIOR to abundance ' + 'estimation to perform re-estimation.', + 'read_len': 'Bracken: read length to get all classifications for.', + 'level': 'Bracken: taxonomic level to estimate abundance at.' 
}, output_descriptions={ - "reports": "Reports modified by Bracken.", + 'reports': 'Reports modified by Bracken.', }, - name="Perform read abundance re-estimation using Bracken.", - description="This method uses Bracken to re-estimate read abundances.", - citations=[citations["wood2019"]], + name='Perform read abundance re-estimation using Bracken.', + description='This method uses Bracken to re-estimate read abundances.', + citations=[citations["wood2019"]] ) plugin.methods.register_function( function=q2_moshpit.kraken2.build_kraken_db, - inputs={"seqs": List[FeatureData[Sequence]]}, + inputs={ + "seqs": List[FeatureData[Sequence]] + }, parameters={ - "collection": Str - % Choices( - [ - "viral", - "minusb", - "standard", - "standard8", - "standard16", - "pluspf", - "pluspf8", - "pluspf16", - "pluspfp", - "pluspfp8", - "pluspfp16", - "eupathdb", - ], + 'collection': Str % Choices( + ['viral', 'minusb', 'standard', 'standard8', + 'standard16', 'pluspf', 'pluspf8', 'pluspf16', + 'pluspfp', 'pluspfp8', 'pluspfp16', 'eupathdb'], ), - "threads": Int % Range(1, None), - "kmer_len": Int % Range(1, None), - "minimizer_len": Int % Range(1, None), - "minimizer_spaces": Int % Range(1, None), - "no_masking": Bool, - "max_db_size": Int % Range(0, None), - "use_ftp": Bool, - "load_factor": Float % Range(0, 1), - "fast_build": Bool, - "read_len": List[Int % Range(1, None)], + 'threads': Int % Range(1, None), + 'kmer_len': Int % Range(1, None), + 'minimizer_len': Int % Range(1, None), + 'minimizer_spaces': Int % Range(1, None), + 'no_masking': Bool, + 'max_db_size': Int % Range(0, None), + 'use_ftp': Bool, + 'load_factor': Float % Range(0, 1), + 'fast_build': Bool, + 'read_len': List[Int % Range(1, None)], }, outputs=[ - ("kraken2_database", Kraken2DB), - ("bracken_database", BrackenDB), + ('kraken2_database', Kraken2DB), + ('bracken_database', BrackenDB), ], input_descriptions={ "seqs": "Sequences to be added to the Kraken 2 database." }, parameter_descriptions={ - "collection": "Name of the database collection to be fetched. " - "Please check https://benlangmead.github.io/aws-" - "indexes/k2 for the description of the available " - "options.", - "threads": "Number of threads. Only applicable when building a " - "custom database.", - "kmer_len": "K-mer length in bp/aa.", - "minimizer_len": "Minimizer length in bp/aa.", - "minimizer_spaces": "Number of characters in minimizer that are " - "ignored in comparisons.", - "no_masking": "Avoid masking low-complexity sequences prior to " - "building; masking requires dustmasker or segmasker " - "to be installed in PATH", - "max_db_size": "Maximum number of bytes for Kraken 2 hash table; " - "if the estimator determines more would normally be " - "needed, the reference library will be downsampled " - "to fit.", - "use_ftp": "Use FTP for downloading instead of RSYNC.", - "load_factor": "Proportion of the hash table to be populated.", - "fast_build": "Do not require database to be deterministically " - "built when using multiple threads. This is faster, " - "but does introduce variability in minimizer/LCA pairs.", - "read_len": "Ideal read lengths to be used while building the Bracken " - "database.", + 'collection': 'Name of the database collection to be fetched. ' + 'Please check https://benlangmead.github.io/aws-' + 'indexes/k2 for the description of the available ' + 'options.', + 'threads': 'Number of threads. 
Only applicable when building a ' + 'custom database.', + 'kmer_len': 'K-mer length in bp/aa.', + 'minimizer_len': 'Minimizer length in bp/aa.', + 'minimizer_spaces': 'Number of characters in minimizer that are ' + 'ignored in comparisons.', + 'no_masking': 'Avoid masking low-complexity sequences prior to ' + 'building; masking requires dustmasker or segmasker ' + 'to be installed in PATH', + 'max_db_size': 'Maximum number of bytes for Kraken 2 hash table; ' + 'if the estimator determines more would normally be ' + 'needed, the reference library will be downsampled ' + 'to fit.', + 'use_ftp': 'Use FTP for downloading instead of RSYNC.', + 'load_factor': 'Proportion of the hash table to be populated.', + 'fast_build': 'Do not require database to be deterministically ' + 'built when using multiple threads. This is faster, ' + 'but does introduce variability in minimizer/LCA pairs.', + 'read_len': 'Ideal read lengths to be used while building the Bracken ' + 'database.' }, output_descriptions={ - "kraken2_database": "Kraken2 database.", - "bracken_database": "Bracken database.", - }, - name="Build Kraken 2 database.", - description="This method builds a Kraken 2/Bracken databases from " - "provided DNA sequences or simply fetches pre-built " - "versions from an online resource.", - citations=[citations["wood2019"], citations["lu2017"]], + 'kraken2_database': 'Kraken2 database.', + 'bracken_database': 'Bracken database.' + }, + name='Build Kraken 2 database.', + description='This method builds a Kraken 2/Bracken databases from ' + 'provided DNA sequences or simply fetches pre-built ' + 'versions from an online resource.', + citations=[citations["wood2019"], citations["lu2017"]] ) plugin.methods.register_function( @@ -322,33 +312,37 @@ plugin.methods.register_function( function=q2_moshpit.kraken2.kraken2_to_features, - inputs={"reports": SampleData[Kraken2Reports]}, + inputs={ + 'reports': SampleData[Kraken2Reports] + }, parameters={ - "coverage_threshold": Float % Range(0, 100, inclusive_end=True) + 'coverage_threshold': Float % Range(0, 100, inclusive_end=True) }, outputs=[ - ("table", FeatureTable[PresenceAbsence]), - ("taxonomy", FeatureData[Taxonomy]), + ('table', FeatureTable[PresenceAbsence]), + ('taxonomy', FeatureData[Taxonomy]) ], - input_descriptions={"reports": "Per-sample Kraken 2 reports."}, + input_descriptions={ + 'reports': 'Per-sample Kraken 2 reports.' + }, parameter_descriptions={ - "coverage_threshold": "The minimum percent coverage required to " - "produce a feature." + 'coverage_threshold': 'The minimum percent coverage required to' + ' produce a feature.' }, output_descriptions={ - "table": "A presence/absence table of selected features. The features " - "are not of even ranks, but will be the most specific rank " - "available.", - "taxonomy": "Infra-clade ranks are ignored " - "unless they are strain-level. Missing internal ranks " - "are annotated by their next most specific rank, " - "with the exception of k__Bacteria and k__Archaea which " - "match their domain's name.", - }, - name="Select downstream features from Kraken 2", - description="Convert a Kraken 2 report, which is an annotated NCBI " - "taxonomy tree into generic artifacts for downstream " - "analyses.", + 'table': 'A presence/absence table of selected features. The features' + ' are not of even ranks, but will be the most specific rank' + ' available.', + 'taxonomy': 'Infra-clade ranks are ignored ' + 'unless they are strain-level. 
Missing internal ranks ' + 'are annotated by their next most specific rank, ' + 'with the exception of k__Bacteria and k__Archaea which ' + 'match their domain\'s name.', + }, + name='Select downstream features from Kraken 2', + description='Convert a Kraken 2 report, which is an annotated NCBI ' + 'taxonomy tree, into generic artifacts for downstream ' + 'analyses.' ) plugin.methods.register_function( @@ -392,8 +386,8 @@ 'diamond_db': ReferenceDB[Diamond], }, parameters={ - "num_cpus": Int, - "db_in_memory": Bool, + 'num_cpus': Int, + 'db_in_memory': Bool, }, input_descriptions={ 'sequences': 'Sequence data of the contigs we want to ' @@ -415,27 +409,27 @@ ], name='Run eggNOG search using diamond aligner', description="This method performs the steps by which we find our " - "possible target sequences to annotate using the diamond " - "search functionality from the eggnog `emapper.py` script", + "possible target sequences to annotate using the diamond " + "search functionality from the eggnog `emapper.py` script", ) plugin.methods.register_function( function=q2_moshpit.eggnog.eggnog_annotate, inputs={ - "eggnog_hits": SampleData[BLAST6], - "eggnog_db": ReferenceDB[Eggnog], + 'eggnog_hits': SampleData[BLAST6], + 'eggnog_db': ReferenceDB[Eggnog], }, parameters={ - "db_in_memory": Bool, + 'db_in_memory': Bool, }, parameter_descriptions={ - "db_in_memory": "Read eggnog database into memory. The " - "eggnog database is very large(>44GB), so this " - "option should only be used on clusters or other " - "machines with enough memory.", + 'db_in_memory': 'Read eggnog database into memory. The ' + 'eggnog database is very large (>44GB), so this ' + 'option should only be used on clusters or other ' + 'machines with enough memory.', }, - outputs=[("ortholog_annotations", FeatureData[NOG])], - name="Annotate orthologs against eggNOG database", + outputs=[('ortholog_annotations', FeatureData[NOG])], + name='Annotate orthologs against eggNOG database', description="Apply eggnog mapper to annotate seed orthologs.", ) @@ -560,3 +554,37 @@ "visualizations summarizing the results.", citations=[citations["manni_busco_2021"]], ) + +plugin.methods.register_function( + function=q2_moshpit.kaiju.fetch_kaiju_db, + inputs={}, + parameters={ + "database_type": Str + % Choices( + [ + "nr", + "nr_euk", + "refseq", + "fungi", + "viruses", + "plasmids", + "progenomes", + "rvdb", + ] + ), + }, + outputs=[ + ("database", KaijuDB), + ], + input_descriptions={}, + parameter_descriptions={ + "database_type": "Type of database to be downloaded. For more " + "information on available types please see the list on " + "Kaiju's web server: https://kaiju.binf.ku.dk/server", + }, + output_descriptions={"database": "Kaiju database."}, + name="Fetch Kaiju database.", + description="This method fetches the latest Kaiju database from " + "https://kaiju.binf.ku.dk/server.", + citations=[citations["menzel2016"]], +)
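
For reviewers who want to exercise the new action end to end, here is a minimal usage sketch (not part of the diff). It assumes q2-moshpit with this change is installed in an activated QIIME 2 environment; the action and parameter names mirror the registration above, while the chosen database type ("nr_euk") and the output filename are arbitrary examples.

# Illustrative sketch only: invoke the new fetch_kaiju_db action through the
# QIIME 2 Artifact API. Fetching "nr_euk" downloads tens of GB from
# https://kaiju.binf.ku.dk/server, so expect a long-running call.
from qiime2.plugins import moshpit

result = moshpit.actions.fetch_kaiju_db(database_type="nr_euk")

# The single output is an artifact of semantic type KaijuDB; save it for reuse.
result.database.save("kaiju-db-nr_euk.qza")

The same action should also be reachable from q2cli, roughly as `qiime moshpit fetch-kaiju-db --p-database-type nr_euk --o-database kaiju-db.qza`, once the plugin cache has been refreshed.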