diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml
index 0b6e2ec3..0a3d61a4 100644
--- a/ci/recipe/meta.yaml
+++ b/ci/recipe/meta.yaml
@@ -17,19 +17,20 @@ requirements:
    - setuptools
  run:
+    - altair
+    - beautifulsoup4
    - bracken
+    - busco >=5.0.0
+    - diamond
+    - eggnog-mapper >=2.1.10
    - kraken2
    - metabat2
-    - samtools
    - qiime2 {{ qiime2_epoch }}.*
    - q2-types-genomics {{ qiime2_epoch }}.*
    - q2templates {{ qiime2_epoch }}.*
-    - eggnog-mapper >=2.1.10
-    - diamond
+    - samtools
    - tqdm
    - xmltodict
-    - altair
-    - busco >=5.0.0
test:
  requires:
diff --git a/q2_moshpit/__init__.py b/q2_moshpit/__init__.py
index ee9d3e0d..cf919382 100644
--- a/q2_moshpit/__init__.py
+++ b/q2_moshpit/__init__.py
@@ -7,7 +7,10 @@
# ----------------------------------------------------------------------------
from .dereplication import dereplicate_mags
-from .kraken2 import bracken, classification, database
+from .kaiju import classification as kaiju_class, database as kaiju_db
+from .kraken2 import (
+ classification as kraken_class, database as kraken_db, bracken
+)
from .metabat2 import metabat2
from . import eggnog
from . import busco
@@ -18,6 +21,6 @@
del get_versions
__all__ = [
- 'metabat2', 'bracken', 'classification', 'database',
- 'dereplicate_mags', 'eggnog', 'busco',
+ 'metabat2', 'bracken', 'kraken_class', 'kraken_db',
+ 'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog', 'busco'
]
diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib
index 56d4110e..15a5f9b3 100644
--- a/q2_moshpit/citations.bib
+++ b/q2_moshpit/citations.bib
@@ -51,6 +51,21 @@ @article{kang2019
keywords = {Clustering,Metagenome binning,Metagenomics}
}
+@article{menzel2016,
+ title = {Fast and Sensitive Taxonomic Classification for Metagenomics with {{Kaiju}}},
+ author = {Menzel, Peter and Ng, Kim Lee and Krogh, Anders},
+ year = {2016},
+ month = apr,
+ journal = {Nature Communications},
+ volume = {7},
+ number = {1},
+ pages = {11257},
+ publisher = {{Nature Publishing Group}},
+ issn = {2041-1723},
+ doi = {10.1038/ncomms11257},
+ keywords = {Classification and taxonomy,Metagenomics}
+}
+
@article{manni_busco_2021,
title = {{BUSCO} {Update}: {Novel} and {Streamlined} {Workflows} along with {Broader} and {Deeper} {Phylogenetic} {Coverage} for {Scoring} of {Eukaryotic}, {Prokaryotic}, and {Viral} {Genomes}},
volume = {38},
@@ -68,5 +83,4 @@ @article{manni_busco_2021
month = sep,
year = {2021},
pages = {4647--4654},
- file = {Manni et al. - 2021 - BUSCO Update Novel and Streamlined Workflows alon.pdf:/Users/santiago/Zotero/storage/SQ2VFGPF/Manni et al. - 2021 - BUSCO Update Novel and Streamlined Workflows alon.pdf:application/pdf},
}
diff --git a/q2_moshpit/kaiju/__init__.py b/q2_moshpit/kaiju/__init__.py
new file mode 100644
index 00000000..61212bf6
--- /dev/null
+++ b/q2_moshpit/kaiju/__init__.py
@@ -0,0 +1,12 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+from .database import fetch_kaiju_db
+# from .classification import classify_kaiju
+
+__all__ = ["fetch_kaiju_db", ]
diff --git a/q2_moshpit/kaiju/classification.py b/q2_moshpit/kaiju/classification.py
new file mode 100644
index 00000000..16cef8fc
--- /dev/null
+++ b/q2_moshpit/kaiju/classification.py
@@ -0,0 +1,7 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
diff --git a/q2_moshpit/kaiju/database.py b/q2_moshpit/kaiju/database.py
new file mode 100644
index 00000000..6b2bcaa2
--- /dev/null
+++ b/q2_moshpit/kaiju/database.py
@@ -0,0 +1,120 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import os
+import tarfile
+from urllib.parse import urljoin
+
+from tqdm import tqdm
+
+from q2_types_genomics.kaiju import KaijuDBDirectoryFormat
+
+
+from bs4 import BeautifulSoup
+import requests
+import pandas as pd
+
+CHUNK_SIZE = 8192
+KAIJU_SERVER_URL = "https://kaiju.binf.ku.dk/server"
+ERR_MSG = (
+ "Unable to connect to the Kaiju server. Please try again later. "
+ "The error was: {}"
+)
+
+
+def _fetch_and_extract_db(db_uri: str, db_dir: str):
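+    """Download the archive at db_uri into db_dir, extract it and
+    remove the downloaded tarball afterwards."""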
+ latest_db = os.path.basename(db_uri)
+ db_path = os.path.join(db_dir, latest_db)
+ try:
+ response = requests.get(db_uri, stream=True)
+ response.raise_for_status()
+ total_size = int(response.headers.get("content-length", 0))
+ if total_size > 0:
+ progress_bar = tqdm(
+ desc=f'Downloading the "{latest_db}" database',
+ total=total_size,
+ unit="B",
+ unit_scale=True,
+ unit_divisor=1024,
+ )
+
+ with open(db_path, "wb") as file:
+ for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
+                if chunk:
+                    file.write(chunk)
+                if total_size > 0:
+                    progress_bar.update(len(chunk))
+        if total_size > 0:
+            progress_bar.close()
+ except requests.exceptions.ConnectionError as e:
+ raise Exception(ERR_MSG.format(e))
+
+ msg = "Download finished. Extracting database files..."
+ print(f"{msg}", end="", flush=True)
+ with tarfile.open(db_path, "r:gz") as tar:
+ tar.extractall(path=db_dir)
+ print(f"\r{msg} Done.", flush=True)
+
+ os.remove(db_path)
+
+
+def _find_latest_db_url(database_type, sidebox_element, url):
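+    """Return the download URL of the newest database whose name
+    contains database_type, resolved against the server URL."""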
+ # Extract the databases and dates
+ df = _find_all_dbs(sidebox_element)
+
+ # Filter databases based on target_database type
+ filtered_df = df[df.index.str.contains(database_type)]
+
+ # Find the latest database
+ latest_database = filtered_df["Date"].idxmax()
+ # latest_database = filtered_df.loc[latest_index, "Database"]
+ download_link = sidebox_element.find("a", string=latest_database)["href"]
+ download_link = urljoin(url, download_link)
+
+ return download_link
+
+
+def _find_all_dbs(sidebox_element):
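+    """Collect all database links into a DataFrame indexed by link
+    text, with the release date parsed into the "Date" column."""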
+ databases, dates = [], []
+ for link in sidebox_element.find_all("a"):
+ database = link.get_text()
+        date = database.split()[-2]  # second-to-last token is the date
+ databases.append(database)
+ dates.append(date)
+ df = pd.DataFrame({"Database": databases, "Date": dates})
+ df.set_index("Database", inplace=True)
+ df.loc[:, "Date"] = pd.to_datetime(df.loc[:, "Date"])
+ return df
+
+
+def fetch_kaiju_db(
+ database_type: str,
+) -> KaijuDBDirectoryFormat:
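+    """Fetch the latest Kaiju database of the requested type from
+    the Kaiju web server and return it in a KaijuDBDirectoryFormat."""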
+
+ try:
+ response = requests.get(KAIJU_SERVER_URL)
+ except requests.exceptions.RequestException as e:
+ raise Exception(ERR_MSG.format(e))
+ soup = BeautifulSoup(response.content, "html.parser")
+ sidebox_db = soup.find("div", id="sidebox_db")
+
+ download_link = _find_latest_db_url(
+ database_type, sidebox_db, KAIJU_SERVER_URL
+ )
+
+ db = KaijuDBDirectoryFormat()
+ _fetch_and_extract_db(download_link, str(db.path))
+
+ return db
diff --git a/q2_moshpit/kaiju/tests/__init__.py b/q2_moshpit/kaiju/tests/__init__.py
new file mode 100644
index 00000000..16cef8fc
--- /dev/null
+++ b/q2_moshpit/kaiju/tests/__init__.py
@@ -0,0 +1,7 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
diff --git a/q2_moshpit/kaiju/tests/test_database.py b/q2_moshpit/kaiju/tests/test_database.py
new file mode 100644
index 00000000..7de7d125
--- /dev/null
+++ b/q2_moshpit/kaiju/tests/test_database.py
@@ -0,0 +1,151 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import os
+import tempfile
+import unittest
+from unittest.mock import patch, Mock
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from qiime2.plugin.testing import TestPluginBase
+
+from q2_moshpit.kaiju.database import (
+ _fetch_and_extract_db, _find_latest_db_url, _find_all_dbs,
+ fetch_kaiju_db, CHUNK_SIZE, ERR_MSG, KAIJU_SERVER_URL
+)
+from requests.exceptions import ConnectionError, RequestException
+
+from q2_types_genomics.kaiju import KaijuDBDirectoryFormat
+
+
+class TestDatabaseFunctions(TestPluginBase):
+ package = 'q2_moshpit.kaiju.tests'
+
+ @patch("requests.get")
+ @patch("q2_moshpit.kaiju.database.tqdm")
+ @patch("tarfile.open")
+ @patch("os.remove")
+ def test_fetch_and_extract_db(
+ self, mock_remove, mock_tarfile_open,
+ mock_progress, mock_requests
+ ):
+ response = mock_requests.return_value
+ response.headers = {"content-length": 1024}
+ response.iter_content.return_value = [b"test"] * 1024
+ mock_tar = Mock()
+ mock_tarfile_open.return_value.__enter__.return_value = mock_tar
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ _fetch_and_extract_db("http://a/b/db.tar.gz", tmpdir)
+ db_path = os.path.join(tmpdir, "db.tar.gz")
+
+ mock_progress.assert_called_with(
+ desc='Downloading the "db.tar.gz" database',
+ total=1024,
+ unit="B",
+ unit_scale=True,
+ unit_divisor=1024
+ )
+ response.iter_content.assert_called_with(chunk_size=CHUNK_SIZE)
+ mock_tarfile_open.assert_called_with(db_path, "r:gz")
+ mock_tar.extractall.assert_called_with(path=tmpdir)
+ mock_remove.assert_called_with(db_path)
+ mock_requests.assert_called_with(
+ "http://a/b/db.tar.gz", stream=True
+ )
+
+ @patch("requests.get", side_effect=ConnectionError("some error"))
+ def test_fetch_and_extract_db_exception(
+ self, mock_requests
+ ):
+ exp_error = ERR_MSG.format("some error")
+ with self.assertRaisesRegex(Exception, exp_error):
+ with tempfile.TemporaryDirectory() as tmpdir:
+ _fetch_and_extract_db("http://a/b/db.tar.gz", tmpdir)
+
+ mock_requests.assert_called_with(
+ "http://a/b/db.tar.gz", stream=True
+ )
+
+ def test_find_latest_db_url(self):
+ databases = [
+ ('nr_euk 2021-02-24 (61GB)',
+ 'https://hello.com/nr_euk_2021-02-24.tar.gz'),
+ ('nr 2021-02-26 (52GB)',
+ 'https://hello.com/nr_2021-02-26.tar.gz'),
+ ('nr_euk 2022-01-11 (60GB)',
+ 'https://hello.com/nr_euk_2022-01-11.tar.gz')
+ ]
+ sidebox_element = BeautifulSoup(
+            '<div id="sidebox_db">{}</div>'.format(
+                ''.join('<a href="{}">{}</a>'.format(d[1], d[0])
+                        for d in databases)
+            ), 'html.parser')
+ url = _find_latest_db_url(
+ database_type='nr_euk',
+ sidebox_element=sidebox_element,
+ url='https://test.com'
+ )
+ self.assertEqual(url, 'https://hello.com/nr_euk_2022-01-11.tar.gz')
+
+ def test_find_all_dbs(self):
+ databases = ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
+ sidebox_element = BeautifulSoup(
+            '<div id="sidebox_db">{}</div>'.format(
+                ''.join('<a href="#">{}</a>'.format(d) for d in databases)
+ ), 'html.parser')
+ df = _find_all_dbs(sidebox_element)
+ self.assertIsInstance(df, pd.DataFrame)
+ self.assertListEqual(
+ df.index.tolist(),
+ ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
+ )
+ self.assertListEqual(
+ df['Date'].tolist(),
+ [pd.to_datetime('2021-02-24'), pd.to_datetime('2021-02-26')]
+ )
+
+ @patch("requests.get")
+ @patch("q2_moshpit.kaiju.database._fetch_and_extract_db")
+ def test_fetch_kaiju_db(self, mock_fetch, mock_requests):
+ databases = [
+ ('nr_euk 2021-02-24 (61GB)',
+ 'https://hello.com/nr_euk_2021-02-24.tar.gz'),
+ ('nr 2021-02-26 (52GB)',
+ 'https://hello.com/nr_2021-02-26.tar.gz'),
+ ('nr_euk 2022-01-11 (60GB)',
+ 'https://hello.com/nr_euk_2022-01-11.tar.gz')
+ ]
+ mock_requests.return_value = Mock(
+            content='<html><div id="sidebox_db">{}</div></html>'
+            .format(
+                ''.join('<a href="{}">{}</a>'.format(d[1], d[0])
+                        for d in databases)
+            )
+ )
+
+ obs_db = fetch_kaiju_db('nr_euk')
+ self.assertIsInstance(obs_db, KaijuDBDirectoryFormat)
+ mock_requests.assert_called_with(KAIJU_SERVER_URL)
+ mock_fetch.assert_called_with(
+ 'https://hello.com/nr_euk_2022-01-11.tar.gz',
+ str(obs_db.path)
+ )
+
+ @patch("requests.get", side_effect=RequestException("some error"))
+ def test_fetch_kaiju_db_exception(self, mock_requests):
+ with self.assertRaisesRegex(
+ Exception, ERR_MSG.format("some error")
+ ):
+ fetch_kaiju_db('nr_euk')
+
+ mock_requests.assert_called_with(KAIJU_SERVER_URL)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index 37335e67..d5b88d79 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -11,8 +11,7 @@
from q2_types.feature_data import FeatureData, Sequence, Taxonomy
from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence
from q2_types.per_sample_sequences import (
- SequencesWithQuality,
- PairedEndSequencesWithQuality,
+ SequencesWithQuality, PairedEndSequencesWithQuality
)
from q2_types.sample_data import SampleData
from q2_types.feature_map import FeatureMap, MAGtoContigs
@@ -23,6 +22,7 @@
import q2_moshpit
from q2_types_genomics.feature_data import NOG, MAG
from q2_types_genomics.genome_data import BLAST6
+from q2_types_genomics.kaiju import KaijuDB
from q2_types_genomics.kraken2 import (
Kraken2Reports, Kraken2Outputs, Kraken2DB
)
@@ -31,107 +31,107 @@
from q2_types_genomics.per_sample_data._type import AlignmentMap
from q2_types_genomics.reference_db import ReferenceDB, Diamond, Eggnog
-citations = Citations.load("citations.bib", package="q2_moshpit")
+citations = Citations.load('citations.bib', package='q2_moshpit')
kraken2_params = {
- "threads": Int % Range(1, None),
- "confidence": Float % Range(0, 1, inclusive_end=True),
- "minimum_base_quality": Int % Range(0, None),
- "memory_mapping": Bool,
- "minimum_hit_groups": Int % Range(1, None),
- "quick": Bool,
- "report_minimizer_data": Bool,
+ 'threads': Int % Range(1, None),
+ 'confidence': Float % Range(0, 1, inclusive_end=True),
+ 'minimum_base_quality': Int % Range(0, None),
+ 'memory_mapping': Bool,
+ 'minimum_hit_groups': Int % Range(1, None),
+ 'quick': Bool,
+ 'report_minimizer_data': Bool
}
kraken2_param_descriptions = {
- "threads": "Number of threads.",
- "confidence": "Confidence score threshold.",
- "minimum_base_quality": "Minimum base quality used in classification. "
- "Only applies when reads are used as input.",
- "memory_mapping": "Avoids loading the database into RAM.",
- "minimum_hit_groups": "Minimum number of hit groups (overlapping "
- "k-mers sharing the same minimizer).",
- "quick": "Quick operation (use first hit or hits).",
- "report_minimizer_data": "Include number of read-minimizers per-taxon and "
- "unique read-minimizers per-taxon in the repot.",
+ 'threads': 'Number of threads.',
+ 'confidence': 'Confidence score threshold.',
+ 'minimum_base_quality': 'Minimum base quality used in classification.'
+ ' Only applies when reads are used as input.',
+ 'memory_mapping': 'Avoids loading the database into RAM.',
+ 'minimum_hit_groups': 'Minimum number of hit groups (overlapping '
+ 'k-mers sharing the same minimizer).',
+ 'quick': 'Quick operation (use first hit or hits).',
+ 'report_minimizer_data': 'Include number of read-minimizers per-taxon and'
+                              ' unique read-minimizers per-taxon in the report.'
}
plugin = Plugin(
- name="moshpit",
+ name='moshpit',
version=q2_moshpit.__version__,
website="https://github.com/bokulich-lab/q2-moshpit",
- package="q2_moshpit",
+ package='q2_moshpit',
description=(
- "MOdular SHotgun metagenome Pipelines with Integrated "
- "provenance Tracking: QIIME 2 plugin gor metagenome analysis with"
- "tools for genome binning and functional annotation."
- ),
- short_description="QIIME 2 plugin for metagenome analysis.",
+ 'MOdular SHotgun metagenome Pipelines with Integrated '
+        'provenance Tracking: QIIME 2 plugin for metagenome analysis with '
+ 'tools for genome binning and functional annotation.'),
+ short_description='QIIME 2 plugin for metagenome analysis.',
)
-importlib.import_module("q2_moshpit.eggnog")
-importlib.import_module("q2_moshpit.metabat2")
+importlib.import_module('q2_moshpit.eggnog')
+importlib.import_module('q2_moshpit.metabat2')
plugin.methods.register_function(
function=q2_moshpit.metabat2.bin_contigs_metabat,
inputs={
- "contigs": SampleData[Contigs],
- "alignment_maps": SampleData[AlignmentMap]
+ 'contigs': SampleData[Contigs],
+ 'alignment_maps': SampleData[AlignmentMap]
},
parameters={
- "min_contig": Int % Range(1500, None),
- "max_p": Int % Range(1, 100),
- "min_s": Int % Range(1, 100),
- "max_edges": Int % Range(1, None),
- "p_tnf": Int % Range(0, 100),
- "no_add": Bool,
- "min_cv": Int % Range(1, None),
- "min_cv_sum": Int % Range(1, None),
- "min_cls_size": Int % Range(1, None),
- "num_threads": Int % Range(0, None),
- "seed": Int % Range(0, None),
- "debug": Bool,
- "verbose": Bool,
+ 'min_contig': Int % Range(1500, None),
+ 'max_p': Int % Range(1, 100),
+ 'min_s': Int % Range(1, 100),
+ 'max_edges': Int % Range(1, None),
+ 'p_tnf': Int % Range(0, 100),
+ 'no_add': Bool,
+ 'min_cv': Int % Range(1, None),
+ 'min_cv_sum': Int % Range(1, None),
+ 'min_cls_size': Int % Range(1, None),
+ 'num_threads': Int % Range(0, None),
+ 'seed': Int % Range(0, None),
+ 'debug': Bool,
+ 'verbose': Bool
},
outputs=[
- ("mags", SampleData[MAGs]),
- ("contig_map", FeatureMap[MAGtoContigs]),
- ("unbinned_contigs", SampleData[Contigs % Properties("unbinned")]),
+ ('mags', SampleData[MAGs]),
+ ('contig_map', FeatureMap[MAGtoContigs]),
+ ('unbinned_contigs', SampleData[Contigs % Properties('unbinned')])
],
input_descriptions={
- "contigs": "Placeholder.", "alignment_maps": "Placeholder."
+ 'contigs': 'Placeholder.',
+ 'alignment_maps': 'Placeholder.'
},
parameter_descriptions={
- "min_contig": "Minimum size of a contig for binning.",
- "max_p": 'Percentage of "good" contigs considered for binning '
- "decided by connection among contigs. The greater, the "
- "more sensitive.",
- "min_s": "Minimum score of a edge for binning. The greater, the "
- "more specific.",
- "max_edges": "Maximum number of edges per node. The greater, the "
- "more sensitive.",
- "p_tnf": "TNF probability cutoff for building TNF graph. Use it to "
- "skip the preparation step. (0: auto)",
- "no_add": "Turning off additional binning for lost or small contigs.",
- "min_cv": "Minimum mean coverage of a contig in each library for "
- "binning.",
- "min_cv_sum": "Minimum total effective mean coverage of a contig "
- "(sum of depth over minCV) for binning.",
- "min_cls_size": "Minimum size of a bin as the output.",
- "num_threads": "Number of threads to use (0: use all cores).",
- "seed": "For exact reproducibility. (0: use random seed)",
- "debug": "Debug output.",
- "verbose": "Verbose output.",
+ 'min_contig': 'Minimum size of a contig for binning.',
+ 'max_p': 'Percentage of "good" contigs considered for binning '
+ 'decided by connection among contigs. The greater, the '
+ 'more sensitive.',
+        'min_s': 'Minimum score of an edge for binning. The greater, the '
+ 'more specific.',
+ 'max_edges': 'Maximum number of edges per node. The greater, the '
+ 'more sensitive.',
+ 'p_tnf': 'TNF probability cutoff for building TNF graph. Use it to '
+ 'skip the preparation step. (0: auto)',
+ 'no_add': 'Turning off additional binning for lost or small contigs.',
+ 'min_cv': 'Minimum mean coverage of a contig in each library '
+ 'for binning.',
+ 'min_cv_sum': 'Minimum total effective mean coverage of a contig '
+ '(sum of depth over minCV) for binning.',
+ 'min_cls_size': 'Minimum size of a bin as the output.',
+ 'num_threads': 'Number of threads to use (0: use all cores).',
+ 'seed': 'For exact reproducibility. (0: use random seed)',
+ 'debug': 'Debug output.',
+ 'verbose': 'Verbose output.'
},
output_descriptions={
- "mags": "The resulting MAGs.",
- "contig_map": "Mapping of MAG identifiers to the contig identifiers "
- "contained in each MAG.",
- "unbinned_contigs": "Contigs that were not binned into any MAG.",
- },
- name="Bin contigs into MAGs using MetaBAT 2.",
- description="This method uses MetaBAT 2 to bin provided contigs "
- "into MAGs.",
- citations=[citations["kang2019"]],
+ 'mags': 'The resulting MAGs.',
+ 'contig_map': 'Mapping of MAG identifiers to the contig identifiers '
+ 'contained in each MAG.',
+ 'unbinned_contigs': 'Contigs that were not binned into any MAG.'
+ },
+ name='Bin contigs into MAGs using MetaBAT 2.',
+ description='This method uses MetaBAT 2 to bin provided contigs '
+ 'into MAGs.',
+ citations=[citations["kang2019"]]
)
T_kraken_in, T_kraken_out_rep, T_kraken_out_hits = TypeMap({
@@ -158,8 +158,8 @@
},
parameters=kraken2_params,
outputs=[
- ("reports", T_kraken_out_rep),
- ("hits", T_kraken_out_hits),
+ ('reports', T_kraken_out_rep),
+ ('hits', T_kraken_out_hits),
],
input_descriptions={
"seqs": "The sequences to be classified. Single-end or paired-end "
@@ -168,123 +168,113 @@
},
parameter_descriptions=kraken2_param_descriptions,
output_descriptions={
- "reports": "Reports produced by Kraken2.",
- "hits": "Output files produced by Kraken2.",
+ 'reports': 'Reports produced by Kraken2.',
+ 'hits': 'Output files produced by Kraken2.',
},
- name="Perform taxonomic classification of reads or MAGs using Kraken 2.",
- description="This method uses Kraken 2 to classify provided NGS reads "
- "or MAGs into taxonomic groups.",
- citations=[citations["wood2019"]],
+ name='Perform taxonomic classification of reads or MAGs using Kraken 2.',
+ description='This method uses Kraken 2 to classify provided NGS reads '
+ 'or MAGs into taxonomic groups.',
+ citations=[citations["wood2019"]]
)
plugin.methods.register_function(
function=q2_moshpit.kraken2.bracken.estimate_bracken,
inputs={
- "kraken_reports": SampleData[Kraken2Reports % Properties("reads")],
- "bracken_db": BrackenDB,
+ "kraken_reports": SampleData[Kraken2Reports % Properties('reads')],
+ "bracken_db": BrackenDB
},
parameters={
- "threshold": Int % Range(0, None),
- "read_len": Int % Range(0, None),
- "level": Str % Choices(["D", "P", "C", "O", "F", "G", "S"]),
+ 'threshold': Int % Range(0, None),
+ 'read_len': Int % Range(0, None),
+ 'level': Str % Choices(['D', 'P', 'C', 'O', 'F', 'G', 'S'])
},
outputs=[
- ("reports", SampleData[Kraken2Reports % Properties("bracken")]),
- ("taxonomy", FeatureData[Taxonomy]),
- ("table", FeatureTable[Frequency]),
+ ('reports', SampleData[Kraken2Reports % Properties('bracken')]),
+ ('taxonomy', FeatureData[Taxonomy]),
+ ('table', FeatureTable[Frequency])
],
input_descriptions={
"kraken_reports": "Reports produced by Kraken2.",
- "bracken_db": "Bracken database.",
+ "bracken_db": "Bracken database."
},
parameter_descriptions={
- "threshold": "Bracken: number of reads required PRIOR to abundance "
- "estimation to perform re-estimation.",
- "read_len": "Bracken: read length to get all classifications for.",
- "level": "Bracken: taxonomic level to estimate abundance at.",
+ 'threshold': 'Bracken: number of reads required PRIOR to abundance '
+ 'estimation to perform re-estimation.',
+ 'read_len': 'Bracken: read length to get all classifications for.',
+ 'level': 'Bracken: taxonomic level to estimate abundance at.'
},
output_descriptions={
- "reports": "Reports modified by Bracken.",
+ 'reports': 'Reports modified by Bracken.',
},
- name="Perform read abundance re-estimation using Bracken.",
- description="This method uses Bracken to re-estimate read abundances.",
- citations=[citations["wood2019"]],
+ name='Perform read abundance re-estimation using Bracken.',
+ description='This method uses Bracken to re-estimate read abundances.',
+ citations=[citations["wood2019"]]
)
plugin.methods.register_function(
function=q2_moshpit.kraken2.build_kraken_db,
- inputs={"seqs": List[FeatureData[Sequence]]},
+ inputs={
+ "seqs": List[FeatureData[Sequence]]
+ },
parameters={
- "collection": Str
- % Choices(
- [
- "viral",
- "minusb",
- "standard",
- "standard8",
- "standard16",
- "pluspf",
- "pluspf8",
- "pluspf16",
- "pluspfp",
- "pluspfp8",
- "pluspfp16",
- "eupathdb",
- ],
+ 'collection': Str % Choices(
+ ['viral', 'minusb', 'standard', 'standard8',
+ 'standard16', 'pluspf', 'pluspf8', 'pluspf16',
+ 'pluspfp', 'pluspfp8', 'pluspfp16', 'eupathdb'],
),
- "threads": Int % Range(1, None),
- "kmer_len": Int % Range(1, None),
- "minimizer_len": Int % Range(1, None),
- "minimizer_spaces": Int % Range(1, None),
- "no_masking": Bool,
- "max_db_size": Int % Range(0, None),
- "use_ftp": Bool,
- "load_factor": Float % Range(0, 1),
- "fast_build": Bool,
- "read_len": List[Int % Range(1, None)],
+ 'threads': Int % Range(1, None),
+ 'kmer_len': Int % Range(1, None),
+ 'minimizer_len': Int % Range(1, None),
+ 'minimizer_spaces': Int % Range(1, None),
+ 'no_masking': Bool,
+ 'max_db_size': Int % Range(0, None),
+ 'use_ftp': Bool,
+ 'load_factor': Float % Range(0, 1),
+ 'fast_build': Bool,
+ 'read_len': List[Int % Range(1, None)],
},
outputs=[
- ("kraken2_database", Kraken2DB),
- ("bracken_database", BrackenDB),
+ ('kraken2_database', Kraken2DB),
+ ('bracken_database', BrackenDB),
],
input_descriptions={
"seqs": "Sequences to be added to the Kraken 2 database."
},
parameter_descriptions={
- "collection": "Name of the database collection to be fetched. "
- "Please check https://benlangmead.github.io/aws-"
- "indexes/k2 for the description of the available "
- "options.",
- "threads": "Number of threads. Only applicable when building a "
- "custom database.",
- "kmer_len": "K-mer length in bp/aa.",
- "minimizer_len": "Minimizer length in bp/aa.",
- "minimizer_spaces": "Number of characters in minimizer that are "
- "ignored in comparisons.",
- "no_masking": "Avoid masking low-complexity sequences prior to "
- "building; masking requires dustmasker or segmasker "
- "to be installed in PATH",
- "max_db_size": "Maximum number of bytes for Kraken 2 hash table; "
- "if the estimator determines more would normally be "
- "needed, the reference library will be downsampled "
- "to fit.",
- "use_ftp": "Use FTP for downloading instead of RSYNC.",
- "load_factor": "Proportion of the hash table to be populated.",
- "fast_build": "Do not require database to be deterministically "
- "built when using multiple threads. This is faster, "
- "but does introduce variability in minimizer/LCA pairs.",
- "read_len": "Ideal read lengths to be used while building the Bracken "
- "database.",
+ 'collection': 'Name of the database collection to be fetched. '
+ 'Please check https://benlangmead.github.io/aws-'
+ 'indexes/k2 for the description of the available '
+ 'options.',
+ 'threads': 'Number of threads. Only applicable when building a '
+ 'custom database.',
+ 'kmer_len': 'K-mer length in bp/aa.',
+ 'minimizer_len': 'Minimizer length in bp/aa.',
+ 'minimizer_spaces': 'Number of characters in minimizer that are '
+ 'ignored in comparisons.',
+ 'no_masking': 'Avoid masking low-complexity sequences prior to '
+ 'building; masking requires dustmasker or segmasker '
+ 'to be installed in PATH',
+ 'max_db_size': 'Maximum number of bytes for Kraken 2 hash table; '
+ 'if the estimator determines more would normally be '
+ 'needed, the reference library will be downsampled '
+ 'to fit.',
+ 'use_ftp': 'Use FTP for downloading instead of RSYNC.',
+ 'load_factor': 'Proportion of the hash table to be populated.',
+ 'fast_build': 'Do not require database to be deterministically '
+ 'built when using multiple threads. This is faster, '
+ 'but does introduce variability in minimizer/LCA pairs.',
+ 'read_len': 'Ideal read lengths to be used while building the Bracken '
+ 'database.'
},
output_descriptions={
- "kraken2_database": "Kraken2 database.",
- "bracken_database": "Bracken database.",
- },
- name="Build Kraken 2 database.",
- description="This method builds a Kraken 2/Bracken databases from "
- "provided DNA sequences or simply fetches pre-built "
- "versions from an online resource.",
- citations=[citations["wood2019"], citations["lu2017"]],
+ 'kraken2_database': 'Kraken2 database.',
+ 'bracken_database': 'Bracken database.'
+ },
+ name='Build Kraken 2 database.',
+    description='This method builds Kraken 2/Bracken databases from '
+ 'provided DNA sequences or simply fetches pre-built '
+ 'versions from an online resource.',
+ citations=[citations["wood2019"], citations["lu2017"]]
)
plugin.methods.register_function(
@@ -322,33 +312,37 @@
plugin.methods.register_function(
function=q2_moshpit.kraken2.kraken2_to_features,
- inputs={"reports": SampleData[Kraken2Reports]},
+ inputs={
+ 'reports': SampleData[Kraken2Reports]
+ },
parameters={
- "coverage_threshold": Float % Range(0, 100, inclusive_end=True)
+ 'coverage_threshold': Float % Range(0, 100, inclusive_end=True)
},
outputs=[
- ("table", FeatureTable[PresenceAbsence]),
- ("taxonomy", FeatureData[Taxonomy]),
+ ('table', FeatureTable[PresenceAbsence]),
+ ('taxonomy', FeatureData[Taxonomy])
],
- input_descriptions={"reports": "Per-sample Kraken 2 reports."},
+ input_descriptions={
+ 'reports': 'Per-sample Kraken 2 reports.'
+ },
parameter_descriptions={
- "coverage_threshold": "The minimum percent coverage required to "
- "produce a feature."
+ 'coverage_threshold': 'The minimum percent coverage required to'
+ ' produce a feature.'
},
output_descriptions={
- "table": "A presence/absence table of selected features. The features "
- "are not of even ranks, but will be the most specific rank "
- "available.",
- "taxonomy": "Infra-clade ranks are ignored "
- "unless they are strain-level. Missing internal ranks "
- "are annotated by their next most specific rank, "
- "with the exception of k__Bacteria and k__Archaea which "
- "match their domain's name.",
- },
- name="Select downstream features from Kraken 2",
- description="Convert a Kraken 2 report, which is an annotated NCBI "
- "taxonomy tree into generic artifacts for downstream "
- "analyses.",
+ 'table': 'A presence/absence table of selected features. The features'
+ ' are not of even ranks, but will be the most specific rank'
+ ' available.',
+ 'taxonomy': 'Infra-clade ranks are ignored '
+ 'unless they are strain-level. Missing internal ranks '
+ 'are annotated by their next most specific rank, '
+ 'with the exception of k__Bacteria and k__Archaea which '
+ 'match their domain\'s name.',
+ },
+    name='Select downstream features from Kraken 2.',
+    description='Convert a Kraken 2 report, which is an annotated NCBI '
+                'taxonomy tree, into generic artifacts for downstream '
+                'analyses.'
)
plugin.methods.register_function(
@@ -392,8 +386,8 @@
'diamond_db': ReferenceDB[Diamond],
},
parameters={
- "num_cpus": Int,
- "db_in_memory": Bool,
+ 'num_cpus': Int,
+ 'db_in_memory': Bool,
},
input_descriptions={
'sequences': 'Sequence data of the contigs we want to '
@@ -415,27 +409,27 @@
],
name='Run eggNOG search using diamond aligner',
description="This method performs the steps by which we find our "
- "possible target sequences to annotate using the diamond "
- "search functionality from the eggnog `emapper.py` script",
+ "possible target sequences to annotate using the diamond "
+ "search functionality from the eggnog `emapper.py` script",
)
plugin.methods.register_function(
function=q2_moshpit.eggnog.eggnog_annotate,
inputs={
- "eggnog_hits": SampleData[BLAST6],
- "eggnog_db": ReferenceDB[Eggnog],
+ 'eggnog_hits': SampleData[BLAST6],
+ 'eggnog_db': ReferenceDB[Eggnog],
},
parameters={
- "db_in_memory": Bool,
+ 'db_in_memory': Bool,
},
parameter_descriptions={
- "db_in_memory": "Read eggnog database into memory. The "
- "eggnog database is very large(>44GB), so this "
- "option should only be used on clusters or other "
- "machines with enough memory.",
+ 'db_in_memory': 'Read eggnog database into memory. The '
+                        'eggnog database is very large (>44GB), so this '
+ 'option should only be used on clusters or other '
+ 'machines with enough memory.',
},
- outputs=[("ortholog_annotations", FeatureData[NOG])],
- name="Annotate orthologs against eggNOG database",
+ outputs=[('ortholog_annotations', FeatureData[NOG])],
+ name='Annotate orthologs against eggNOG database',
description="Apply eggnog mapper to annotate seed orthologs.",
)
@@ -560,3 +554,37 @@
"visualizations summarizing the results.",
citations=[citations["manni_busco_2021"]],
)
+
+plugin.methods.register_function(
+ function=q2_moshpit.kaiju.fetch_kaiju_db,
+ inputs={},
+ parameters={
+ "database_type": Str
+ % Choices(
+ [
+ "nr",
+ "nr_euk",
+ "refseq",
+ "fungi",
+ "viruses",
+ "plasmids",
+ "progenomes",
+ "rvdb",
+ ]
+ ),
+ },
+ outputs=[
+ ("database", KaijuDB),
+ ],
+ input_descriptions={},
+ parameter_descriptions={
+ "database_type": "Type of database to be downloaded. For more "
+ "information on available types please see the list on "
+ "Kaiju's web server: https://kaiju.binf.ku.dk/server",
+ },
+ output_descriptions={"database": "Kaiju database."},
+ name="Fetch Kaiju database.",
+ description="This method fetches the latest Kaiju database from "
+ "https://kaiju.binf.ku.dk/server.",
+ citations=[citations["menzel2016"]],
+)
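
A minimal usage sketch for the new action through the QIIME 2 Artifact API, assuming q2-moshpit is installed in the active environment; the plugin and action names follow from the registration above, and the output filename is only illustrative:

    from qiime2.plugins import moshpit

    # Fetches the newest "nr_euk" database from the Kaiju server and wraps
    # it in a KaijuDB artifact exposed as the "database" output.
    results = moshpit.actions.fetch_kaiju_db(database_type="nr_euk")
    results.database.save("kaiju-nr_euk.qza")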