From 8b4d715d6c166c4ba9e3a456b6ad26153c8cd545 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Thu, 15 Jun 2023 21:18:16 +0200 Subject: [PATCH] ENH: replace building standard DBs by fetching the pre-built ones (#34) * ENH: replace building standard DBs by fetching the pre-built ones * use response streaming * ENH: build Bracken database for custom Kraken 2 DBs * small refactor * add bracken conda pkg * add tests * add xmltodict to conda meta * add missing test files * add Bracken citation * add download progress bar * Lint * Add tqdm to the list of dependencies --- ci/recipe/meta.yaml | 3 + q2_moshpit/citations.bib | 14 + q2_moshpit/kraken2/database.py | 239 +++++++--- .../bracken-db/database100mers.kmer_distrib | 0 .../bracken-db/database150mers.kmer_distrib | 0 q2_moshpit/kraken2/tests/test_database.py | 442 +++++++++++------- q2_moshpit/plugin_setup.py | 50 +- setup.py | 3 +- 8 files changed, 487 insertions(+), 264 deletions(-) create mode 100644 q2_moshpit/kraken2/tests/data/bracken-db/database100mers.kmer_distrib create mode 100644 q2_moshpit/kraken2/tests/data/bracken-db/database150mers.kmer_distrib diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml index b44fb91b..a89a715d 100644 --- a/ci/recipe/meta.yaml +++ b/ci/recipe/meta.yaml @@ -17,11 +17,14 @@ requirements: - setuptools run: + - bracken - kraken2 - metabat2 - samtools - qiime2 {{ qiime2_epoch }}.* - q2-types-genomics {{ qiime2_epoch }}.* + - tqdm + - xmltodict test: requires: diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib index 4a5bfa0d..37009f8b 100644 --- a/q2_moshpit/citations.bib +++ b/q2_moshpit/citations.bib @@ -24,6 +24,20 @@ @article{wood2019 pages = {257}, } +@article{lu2017, + title = {Bracken: Estimating Species Abundance in Metagenomics Data}, + shorttitle = {Bracken}, + author = {Lu, Jennifer and Breitwieser, Florian P. and Thielen, Peter and Salzberg, Steven L.}, + year = {2017}, + month = jan, + journal = {PeerJ Computer Science}, + volume = {3}, + pages = {e104}, + publisher = {{PeerJ Inc.}}, + issn = {2376-5992}, + doi = {10.7717/peerj-cs.104}, +} + @article{kang2019, title = {{{MetaBAT}} 2: {{An}} Adaptive Binning Algorithm for Robust and Efficient Genome Reconstruction from Metagenome Assemblies}, author = {Kang, Dongwan D. and Li, Feng and Kirton, Edward and Thomas, Ashleigh and Egan, Rob and An, Hong and Wang, Zhong}, diff --git a/q2_moshpit/kraken2/database.py b/q2_moshpit/kraken2/database.py index e1d565e7..551f420f 100644 --- a/q2_moshpit/kraken2/database.py +++ b/q2_moshpit/kraken2/database.py @@ -7,46 +7,43 @@ # ---------------------------------------------------------------------------- import glob import os +import re import shutil import subprocess +import tarfile import tempfile -from contextlib import ExitStack from copy import deepcopy from typing import List +import requests +import xmltodict from q2_types.feature_data import DNAFASTAFormat +from tqdm import tqdm from q2_moshpit._utils import _process_common_input_params, run_command from q2_types_genomics.kraken2 import ( - Kraken2DBDirectoryFormat, + Kraken2DBDirectoryFormat, BrackenDBDirectoryFormat, ) from q2_moshpit.kraken2.utils import _process_kraken2_arg -def _build_standard_db(db_dir: str, all_kwargs: dict): - kwargs = { - k: v for k, v in all_kwargs.items() - if k in ["threads", "no_masking", "use_ftp", "fast_build"] - } - common_args = _process_common_input_params( - processing_func=_process_kraken2_arg, params=kwargs - ) - if all_kwargs["max_db_size"] > 0: - common_args.extend( - ["--max-db-size", str(all_kwargs["max_db_size"])] - ) - cmd = [ - "kraken2-build", "--standard", "--db", db_dir, *common_args - ] - try: - run_command(cmd=cmd, verbose=True) - except subprocess.CalledProcessError as e: - raise Exception( - "An error was encountered while building the standard " - f"library, (return code {e.returncode}), please inspect " - "stdout and stderr to learn more." - ) +COLLECTIONS = { + "standard": "standard", + "viral": "viral", + "minusb": "minusb", + "standard8": "standard_08gb", + "standard16": "standard_16gb", + "pluspf": "pluspf", + "pluspf8": "pluspf_08gb", + "pluspf16": "pluspf_16gb", + "pluspfp": "pluspfp", + "pluspfp8": "pluspfp_08gb", + "pluspfp16": "pluspfp_16gb", + "eupathdb": "eupathdb48", +} +S3_COLLECTIONS_URL = 'https://genome-idx.s3.amazonaws.com' +CHUNK_SIZE = 8192 def _fetch_taxonomy(db_dir: str, threads: int, use_ftp: bool): @@ -115,7 +112,7 @@ def _add_seqs_to_library(db_dir: str, seqs: DNAFASTAFormat, no_masking: bool): ) -def _build_database(db_dir: str, all_kwargs: dict): +def _build_kraken2_database(db_dir: str, all_kwargs: dict): kwargs = { k: v for k, v in all_kwargs.items() if k in ["threads", "minimizer_len", "minimizer_spaces", @@ -141,19 +138,129 @@ def _build_database(db_dir: str, all_kwargs: dict): ) -def _move_db_files(source: str, destination: str): - files = glob.glob(f"{source}/*.k2d") +def _build_bracken_database( + kraken2_db_dir: str, threads: int, kmer_len: int, read_len: int +): + cmd = [ + "bracken-build", "-d", kraken2_db_dir, "-t", str(threads), + "-k", str(kmer_len), "-l", str(read_len) + ] + try: + run_command(cmd=cmd, verbose=True) + except subprocess.CalledProcessError as e: + raise Exception( + "An error was encountered while building the Bracken " + f"database, (return code {e.returncode}), please inspect " + "stdout and stderr to learn more." + ) + + +def _find_latest_db(collection: str, response: requests.Response) -> str: + collection_id = COLLECTIONS[collection] + pattern = fr'kraken\/k2_{collection_id}_\d{{8}}.tar.gz' + + s3_objects = xmltodict.parse(response.content) + s3_objects = s3_objects.get('ListBucketResult') + if not s3_objects: + raise ValueError( + 'No databases were found in the response returned by S3. ' + 'Please try again.' + ) + s3_objects = [obj for obj in s3_objects['Contents'] + if re.match(pattern, obj['Key'])] + s3_objects = sorted( + s3_objects, key=lambda x: x['LastModified'], reverse=True + ) + latest_db = s3_objects[0]['Key'] + return latest_db + + +def _fetch_db_collection(collection: str, tmp_dir: str): + err_msg = 'Could not connect to the server. Please check your internet ' \ + 'connection and try again. The error was: {}.' + try: + response = requests.get(S3_COLLECTIONS_URL) + except requests.exceptions.ConnectionError as e: + raise ValueError(err_msg.format(e)) + + if response.status_code == 200: + latest_db = _find_latest_db(collection, response) + print(f'Found the latest "{collection}" database: {latest_db}.') + else: + raise ValueError( + 'Could not fetch the list of available databases. ' + f'Status code was: {response.status_code}. ' + 'Please try again later.' + ) + + db_uri = f'{S3_COLLECTIONS_URL}/{latest_db}' + try: + response = requests.get(db_uri, stream=True) + total_size = int(response.headers.get('content-length', 0)) + if total_size > 0: + progress_bar = tqdm( + desc=f'Downloading the "{latest_db}" database', + total=total_size, unit='B', + unit_scale=True, unit_divisor=1024, + ) + db_path = os.path.join(tmp_dir, os.path.split(db_uri)[-1]) + with open(db_path, "wb") as f: + for chunk in response.iter_content(chunk_size=CHUNK_SIZE): + f.write(chunk) if chunk else False + if total_size > 0: + progress_bar.update(len(chunk)) + progress_bar.close() if total_size > 0 else False + except requests.exceptions.ConnectionError as e: + raise ValueError(err_msg.format(e)) + + msg = "Download finished. Extracting database files..." + print(f"{msg}", end="", flush=True) + with tarfile.open(db_path, "r:gz") as tar: + tar.extractall(path=tmp_dir) + print(f"\r{msg} Done.", flush=True) + + +def _move_db_files(source: str, destination: str, extension: str = "k2d"): + files = glob.glob(f"{source}/*.{extension}") for file in files: new_file = os.path.join(destination, os.path.split(file)[-1]) shutil.move(file, new_file) +def _fetch_prebuilt_dbs(bracken_db, kraken2_db, collection, tmp): + # Find files with the latest version + _fetch_db_collection(collection=collection, tmp_dir=tmp) + # Move the Kraken2/Bracken database files to the final location + _move_db_files(tmp, str(kraken2_db.path), extension="k2d") + _move_db_files(tmp, str(bracken_db.path), extension="kmer_distrib") + + +def _build_dbs_from_seqs(bracken_db, kraken2_db, seqs, tmp_dir, common_args): + # Fetch taxonomy (also needed for custom databases) + _fetch_taxonomy( + db_dir=tmp_dir, threads=common_args["threads"], + use_ftp=common_args["use_ftp"] + ) + for seq in seqs: + _add_seqs_to_library( + db_dir=tmp_dir, seqs=seq, no_masking=common_args["no_masking"] + ) + # Build the Kraken2 database + _build_kraken2_database(db_dir=tmp_dir, all_kwargs=common_args) + # Build the Bracken database + for rl in common_args["read_len"]: + _build_bracken_database( + kraken2_db_dir=tmp_dir, threads=common_args["threads"], + kmer_len=common_args["kmer_len"], read_len=rl + ) + # Move the Kraken2/Bracken database files to the final location + _move_db_files(tmp_dir, str(kraken2_db.path), extension="k2d") + _move_db_files(tmp_dir, str(bracken_db.path), extension="kmer_distrib") + + def build_kraken_db( seqs: DNAFASTAFormat = None, - standard: bool = False, - library_path: str = None, - libraries: List[str] = None, - library_exists: str = 'skip', + collection: str = None, threads: int = 1, kmer_len: int = 35, minimizer_len: int = 31, @@ -163,52 +270,32 @@ def build_kraken_db( use_ftp: bool = False, load_factor: float = 0.7, fast_build: bool = False, -) -> Kraken2DBDirectoryFormat: - db = Kraken2DBDirectoryFormat() - db_dir = str(db.path) + read_len: int = None, +) -> (Kraken2DBDirectoryFormat, BrackenDBDirectoryFormat): + kraken2_db = Kraken2DBDirectoryFormat() + bracken_db = BrackenDBDirectoryFormat() - if standard and libraries: - raise ValueError( - 'Standard Kraken2 database was requested but some libraries ' - 'were also provided. Please provide either only the "standard" ' - 'option or a list of "libraries" to be fetched for the database.' - ) + if not read_len: + # use the same values as in the pre-built databases + read_len = [50, 75, 100, 150, 200, 250, 300] - with ExitStack() as stack: - if not library_path: - temp = tempfile.TemporaryDirectory() - temp_dir = temp.name - stack.enter_context(temp) - else: - os.makedirs(library_path, exist_ok=True) - temp_dir = library_path - - # If requested, build the standard Kraken2 database - if standard: - _build_standard_db(db_dir=temp_dir, all_kwargs=locals()) - _move_db_files(source=temp_dir, destination=db_dir) - return db - - # Fetch taxonomy (required regardless of the build source) - _fetch_taxonomy(db_dir=temp_dir, threads=threads, use_ftp=use_ftp) - - # If requested, download all the libraries - if libraries: - _fetch_libraries( - db_dir=temp_dir, libraries=libraries, all_kwargs=locals() - ) - - # If provided, add the additional sequences to the database + with tempfile.TemporaryDirectory() as tmp: if seqs: - for seq in seqs: - _add_seqs_to_library( - db_dir=temp_dir, seqs=seq, no_masking=no_masking - ) - - # Finally, build the actual database - _build_database(db_dir=temp_dir, all_kwargs=locals()) + # Construct the custom-made database + common_args = {k: v for k, v in locals().items() + if k not in ["seqs", "collection"]} - # Move the database files (*.k2d) to the final location - _move_db_files(source=temp_dir, destination=db_dir) + # Fetch taxonomy (also needed for custom databases) + _build_dbs_from_seqs( + bracken_db, kraken2_db, seqs, tmp, common_args + ) + elif collection: + _fetch_prebuilt_dbs(bracken_db, kraken2_db, collection, tmp) + else: + raise ValueError( + 'You need to either provide a list of sequences to build the ' + 'database from or a valid collection name to be fetched from ' + '"Kraken 2/Bracken Refseq indexes" resource.' + ) - return db + return kraken2_db, bracken_db diff --git a/q2_moshpit/kraken2/tests/data/bracken-db/database100mers.kmer_distrib b/q2_moshpit/kraken2/tests/data/bracken-db/database100mers.kmer_distrib new file mode 100644 index 00000000..e69de29b diff --git a/q2_moshpit/kraken2/tests/data/bracken-db/database150mers.kmer_distrib b/q2_moshpit/kraken2/tests/data/bracken-db/database150mers.kmer_distrib new file mode 100644 index 00000000..e69de29b diff --git a/q2_moshpit/kraken2/tests/test_database.py b/q2_moshpit/kraken2/tests/test_database.py index 10d72c71..b9a47fbf 100644 --- a/q2_moshpit/kraken2/tests/test_database.py +++ b/q2_moshpit/kraken2/tests/test_database.py @@ -5,13 +5,17 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +import io import os +import shutil +import tarfile import tempfile import unittest from copy import deepcopy +from requests.exceptions import ConnectionError from subprocess import CalledProcessError from tempfile import TemporaryDirectory -from unittest.mock import patch, ANY, call +from unittest.mock import patch, ANY, call, Mock, MagicMock from q2_types.feature_data import DNAFASTAFormat from qiime2 import Artifact @@ -19,10 +23,14 @@ from qiime2.plugins import moshpit from q2_moshpit.kraken2.database import ( - _build_standard_db, _fetch_taxonomy, _fetch_libraries, - _add_seqs_to_library, _build_database, _move_db_files + _fetch_taxonomy, _fetch_libraries, _add_seqs_to_library, + _build_kraken2_database, _move_db_files, _build_bracken_database, + _find_latest_db, _fetch_db_collection, S3_COLLECTIONS_URL, + _build_dbs_from_seqs, _fetch_prebuilt_dbs +) +from q2_types_genomics.kraken2 import ( + Kraken2DBDirectoryFormat, BrackenDBDirectoryFormat ) -from q2_types_genomics.kraken2 import Kraken2DBDirectoryFormat class MockTempDir(tempfile.TemporaryDirectory): @@ -34,42 +42,49 @@ class TestKraken2Database(TestPluginBase): def setUp(self): super().setUp() - self.db_dir = 'fake/db/dir' + self.kraken2_db_dir = 'fake/db/dir' self.kwargs = { 'threads': 2, 'fast_build': True, 'kmer_len': 31, 'use_ftp': False, 'max_db_size': 1000, 'load_factor': 0.5 } - - @patch('q2_moshpit.kraken2.database.run_command') - def test_build_standard_db(self, p1): - _build_standard_db(self.db_dir, self.kwargs) - - exp_cmd = [ - "kraken2-build", "--standard", "--db", self.db_dir, - "--threads", "2", "--fast-build", "--max-db-size", "1000" - ] - p1.assert_called_once_with(cmd=exp_cmd, verbose=True) - - @patch( - 'q2_moshpit.kraken2.database.run_command', - side_effect=CalledProcessError(123, 'cmd') - ) - def test_build_standard_db_exception(self, p1): - with self.assertRaisesRegex( - Exception, - "An error was encountered .* standard library, " - r"\(return code 123\), please inspect .*" - ): - _build_standard_db(self.db_dir, self.kwargs) + self.s3_response = b''' + + + kraken/k2_viral_20201202.tar.gz + 2020-12-09T01:38:22.000Z + + + kraken/k2_viral_20230314.tar.gz + 2023-03-22T01:29:11.000Z + + + ''' + + self.temp_dir = tempfile.mkdtemp() + self.temp_tar = os.path.join(self.temp_dir, 'temp.tar.gz') + + with tarfile.open(self.temp_tar, "w:gz") as tar: + data = io.BytesIO(b"sample data") + tarinfo = tarfile.TarInfo(name="sample.txt") + tarinfo.size = len(data.getbuffer()) + tar.addfile(tarinfo, data) + + with open(self.temp_tar, "rb") as f: + self.tar_chunks = [ + chunk for chunk in iter(lambda: f.read(8192), b"") + ] + + def tearDown(self): + shutil.rmtree(self.temp_dir) @patch('q2_moshpit.kraken2.database.run_command') def test_fetch_taxonomy(self, p1): - _fetch_taxonomy(self.db_dir, threads=3, use_ftp=True) + _fetch_taxonomy(self.kraken2_db_dir, threads=3, use_ftp=True) exp_cmd = [ "kraken2-build", "--download-taxonomy", - "--threads", "3", "--db", self.db_dir, "--use-ftp" + "--threads", "3", "--db", self.kraken2_db_dir, "--use-ftp" ] p1.assert_called_once_with(cmd=exp_cmd, verbose=True) @@ -83,7 +98,7 @@ def test_fetch_taxonomy_exception(self, p1): "An error was encountered .* downloading taxonomy, " r"\(return code 123\), please inspect .*" ): - _fetch_taxonomy(self.db_dir, threads=3, use_ftp=True) + _fetch_taxonomy(self.kraken2_db_dir, threads=3, use_ftp=True) @patch('q2_moshpit.kraken2.database.run_command') def test_fetch_libraries_skip(self, p1): @@ -117,11 +132,11 @@ def test_fetch_libraries_refetch(self, p1): libraries = ['plasmid', 'human'] _fetch_libraries( - self.db_dir, libraries=libraries, all_kwargs=all_kwargs + self.kraken2_db_dir, libraries=libraries, all_kwargs=all_kwargs ) base_cmd = ["kraken2-build", "--download-library"] - exp_common_args = ["--threads", "2", "--db", self.db_dir] + exp_common_args = ["--threads", "2", "--db", self.kraken2_db_dir] exp_cmds = [ [*base_cmd, libraries[0], *exp_common_args], [*base_cmd, libraries[1], *exp_common_args] @@ -143,18 +158,19 @@ def test_fetch_libraries_exception(self, p1): ): _fetch_libraries( - self.db_dir, libraries=['human'], all_kwargs=self.kwargs + self.kraken2_db_dir, libraries=['human'], + all_kwargs=self.kwargs ) @patch('q2_moshpit.kraken2.database.run_command') def test_add_seqs_to_library(self, p1): seqs = DNAFASTAFormat(self.get_data_path('mags/samp1/bin1.fa'), 'r') - _add_seqs_to_library(self.db_dir, seqs=seqs, no_masking=True) + _add_seqs_to_library(self.kraken2_db_dir, seqs=seqs, no_masking=True) exp_cmd = [ "kraken2-build", "--add-to-library", str(seqs.path), - "--db", self.db_dir, "--no-mask" + "--db", self.kraken2_db_dir, "--no-mask" ] p1.assert_called_once_with(cmd=exp_cmd, verbose=True) @@ -170,28 +186,30 @@ def test_add_seqs_to_library_exception(self, p1): "An error was encountered .* adding sequences to the " r"library, \(return code 123\), please inspect .*" ): - _add_seqs_to_library(self.db_dir, seqs=seqs, no_masking=True) + _add_seqs_to_library( + self.kraken2_db_dir, seqs=seqs, no_masking=True + ) @patch('q2_moshpit.kraken2.database.run_command') - def test_build_database(self, p1): - _build_database(self.db_dir, all_kwargs=self.kwargs) + def test_build_kraken2_database(self, p1): + _build_kraken2_database(self.kraken2_db_dir, all_kwargs=self.kwargs) exp_cmd = [ - "kraken2-build", "--build", "--db", self.db_dir, + "kraken2-build", "--build", "--db", self.kraken2_db_dir, "--threads", "2", "--fast-build", "--kmer-len", "31", "--load-factor", "0.5", "--max-db-size", "1000" ] p1.assert_called_once_with(cmd=exp_cmd, verbose=True) @patch('q2_moshpit.kraken2.database.run_command') - def test_build_database_no_max_db(self, p1): + def test_build_kraken2_database_no_max_db(self, p1): all_kwargs = deepcopy(self.kwargs) all_kwargs['max_db_size'] = 0 - _build_database(self.db_dir, all_kwargs=all_kwargs) + _build_kraken2_database(self.kraken2_db_dir, all_kwargs=all_kwargs) exp_cmd = [ - "kraken2-build", "--build", "--db", self.db_dir, + "kraken2-build", "--build", "--db", self.kraken2_db_dir, "--threads", "2", "--fast-build", "--kmer-len", "31", "--load-factor", "0.5" ] @@ -201,13 +219,142 @@ def test_build_database_no_max_db(self, p1): 'q2_moshpit.kraken2.database.run_command', side_effect=CalledProcessError(123, 'cmd') ) - def test_build_database_exception(self, p1): + def test_build_kraken2_database_exception(self, p1): with self.assertRaisesRegex( Exception, "An error was encountered .* building the database, " r"\(return code 123\), please inspect .*" ): - _build_database(self.db_dir, all_kwargs=self.kwargs) + _build_kraken2_database( + self.kraken2_db_dir, all_kwargs=self.kwargs + ) + + @patch('q2_moshpit.kraken2.database.run_command') + def test_build_bracken_database(self, p1): + _build_bracken_database( + kraken2_db_dir=self.kraken2_db_dir, threads=2, + kmer_len=31, read_len=150 + ) + + exp_cmd = [ + "bracken-build", "-d", self.kraken2_db_dir, + "-t", "2", "-k", "31", "-l", "150" + ] + p1.assert_called_once_with(cmd=exp_cmd, verbose=True) + + @patch( + 'q2_moshpit.kraken2.database.run_command', + side_effect=CalledProcessError(123, 'cmd') + ) + def test_build_bracken_database_exception(self, p1): + with self.assertRaisesRegex( + Exception, + "An error was encountered while building the Bracken " + r"database, \(return code 123\), please inspect .+" + ): + _build_bracken_database( + kraken2_db_dir=self.kraken2_db_dir, threads=2, + kmer_len=31, read_len=150 + ) + + def test_find_latest_db(self): + response = Mock(content=self.s3_response) + + obs_db = _find_latest_db('viral', response) + exp_db = 'kraken/k2_viral_20230314.tar.gz' + self.assertEqual(obs_db, exp_db) + + def test_find_latest_db_empty(self): + response = Mock(content=b'''''') + + with self.assertRaisesRegex( + ValueError, r'No databases were found.+' + ): + _find_latest_db('viral', response) + + @patch("requests.get") + @patch("tarfile.open") + @patch( + "q2_moshpit.kraken2.database._find_latest_db", + return_value="kraken/k2_viral.tar.gz" + ) + @patch("q2_moshpit.kraken2.database.tqdm") + def test_fetch_db_collection_success( + self, mock_tqdm, mock_find, mock_tarfile_open, mock_requests_get + ): + mock_requests_get.side_effect = [ + MagicMock(status_code=200), + MagicMock( + status_code=200, + iter_content=lambda chunk_size: self.tar_chunks, + headers={} + ) + ] + mock_tarfile_open.return_value.__enter__.return_value = MagicMock() + + _fetch_db_collection("viral", "/tmp") + + mock_requests_get.has_calls([ + call(S3_COLLECTIONS_URL), + call(f"{S3_COLLECTIONS_URL}/kraken/k2_viral.tar.gz", stream=True) + ]) + mock_tarfile_open.assert_called_once_with( + "/tmp/k2_viral.tar.gz", "r:gz" + ) + mock_find.assert_called_once_with("viral", ANY) + mock_tqdm.assert_not_called() + + @patch("requests.get") + @patch("tarfile.open") + @patch( + "q2_moshpit.kraken2.database._find_latest_db", + return_value="kraken/k2_viral.tar.gz" + ) + @patch("q2_moshpit.kraken2.database.tqdm") + def test_fetch_db_collection_success_with_tqdm( + self, mock_tqdm, mock_find, mock_tarfile_open, mock_requests_get + ): + mock_requests_get.side_effect = [ + MagicMock(status_code=200), + MagicMock( + status_code=200, + iter_content=lambda chunk_size: self.tar_chunks, + headers={"content-length": '1000'} + ) + ] + mock_tarfile_open.return_value.__enter__.return_value = MagicMock() + + _fetch_db_collection("viral", "/tmp") + + mock_requests_get.has_calls([ + call(S3_COLLECTIONS_URL), + call(f"{S3_COLLECTIONS_URL}/kraken/k2_viral.tar.gz", stream=True) + ]) + mock_tarfile_open.assert_called_once_with( + "/tmp/k2_viral.tar.gz", "r:gz" + ) + mock_find.assert_called_once_with("viral", ANY) + mock_tqdm.assert_called_once_with( + desc='Downloading the "kraken/k2_viral.tar.gz" database', + total=1000, unit='B', + unit_scale=True, unit_divisor=1024, + ) + + @patch('requests.get') + def test_fetch_db_collection_connection_error(self, mock_get): + mock_get.side_effect = ConnectionError("Some error.") + with self.assertRaisesRegex( + ValueError, r".+The error was\: Some error\." + ): + _fetch_db_collection('my_collection', '/tmp') + + @patch('requests.get') + def test_fetch_db_collection_status_non200(self, mock_get): + mock_get.return_value = Mock(status_code=404) + with self.assertRaisesRegex( + ValueError, r".+Status code was\: 404" + ): + _fetch_db_collection('my_collection', '/tmp') def test_move_db_files(self): with TemporaryDirectory() as tmp_dir: @@ -226,158 +373,131 @@ def test_move_db_files(self): for f in fake_files[:2]: self.assertTrue(os.path.exists(os.path.join(fake_dest, f))) - @patch("q2_moshpit.kraken2.database.run_command") - def test_build_kraken_db_action_standard_plus_library(self, p1): - with self.assertRaisesRegex( - ValueError, - "Standard Kraken2 database was requested but some libraries" - ): - moshpit.actions.build_kraken_db( - standard=True, libraries=['human'], threads=2 - ) - - @patch("q2_moshpit.kraken2.database.Kraken2DBDirectoryFormat") - @patch("q2_moshpit.kraken2.database._build_database") - @patch("q2_moshpit.kraken2.database._add_seqs_to_library") - @patch("q2_moshpit.kraken2.database._fetch_libraries") @patch("q2_moshpit.kraken2.database._fetch_taxonomy") + @patch("q2_moshpit.kraken2.database._add_seqs_to_library") + @patch("q2_moshpit.kraken2.database._build_kraken2_database") + @patch("q2_moshpit.kraken2.database._build_bracken_database") @patch("q2_moshpit.kraken2.database._move_db_files") - @patch("q2_moshpit.kraken2.database._build_standard_db") - def test_build_kraken_db_action_with_existing_library( - self, p1, p2, p3, p4, p5, p6, p7 + def test_build_dbs_from_seqs( + self, mock_move, mock_bracken, mock_kraken, + mock_add_seqs, mock_fetch_tax ): - lib_path = 'path/to/lib' - fake_dir_fmt = Kraken2DBDirectoryFormat( - self.get_data_path('db'), 'r' - ) - p7.return_value = fake_dir_fmt + bracken_db, kraken2_db = MagicMock(), MagicMock() + seqs, tmp_dir = ["seq1", "seq2"], "/tmp" + common_args = { + "threads": 1, "use_ftp": False, "no_masking": False, + "read_len": [100, 150], "kmer_len": 35 + } - moshpit.actions.build_kraken_db( - libraries=['human'], library_path=lib_path, - threads=2, fast_build=True + _build_dbs_from_seqs( + bracken_db, kraken2_db, seqs, tmp_dir, common_args ) - p1.assert_not_called() - p3.assert_called_once_with( - db_dir=lib_path, threads=2, use_ftp=False - ) - p4.assert_called_once_with( - db_dir=lib_path, libraries=['human'], all_kwargs=ANY + mock_fetch_tax.assert_called_once_with( + db_dir=tmp_dir, threads=1, use_ftp=False ) - p5.assert_not_called() - p6.assert_called_once_with( - db_dir=lib_path, all_kwargs=ANY - ) - p2.assert_called_once_with( - source=lib_path, destination=str(fake_dir_fmt.path) + mock_add_seqs.assert_has_calls([ + call(db_dir=tmp_dir, seqs="seq1", no_masking=False), + call(db_dir=tmp_dir, seqs="seq2", no_masking=False) + ]) + mock_kraken.assert_called_once_with( + db_dir=tmp_dir, all_kwargs=common_args ) + mock_bracken.assert_has_calls([ + call(kraken2_db_dir=tmp_dir, threads=1, + kmer_len=35, read_len=100), + call(kraken2_db_dir=tmp_dir, threads=1, + kmer_len=35, read_len=150) + ]) + mock_move.has_calls([ + call(tmp_dir, str(kraken2_db.path), extension="k2d"), + call(tmp_dir, str(bracken_db.path), extension="kmer_distrib") + ]) - @patch("q2_moshpit.kraken2.database.Kraken2DBDirectoryFormat") - @patch("q2_moshpit.kraken2.database._build_database") - @patch("q2_moshpit.kraken2.database._add_seqs_to_library") - @patch("q2_moshpit.kraken2.database._fetch_libraries") - @patch("q2_moshpit.kraken2.database._fetch_taxonomy") + @patch("q2_moshpit.kraken2.database._fetch_db_collection") @patch("q2_moshpit.kraken2.database._move_db_files") - @patch("q2_moshpit.kraken2.database._build_standard_db") - def test_build_kraken_db_action_with_standard_library( - self, p1, p2, p3, p4, p5, p6, p7 - ): - lib_path = 'path/to/lib' - fake_dir_fmt = Kraken2DBDirectoryFormat( - self.get_data_path('db'), 'r' - ) - p7.return_value = fake_dir_fmt + def test_fetch_prebuilt_dbs(self, mock_move, mock_fetch): + bracken_db = MagicMock(path="/path/to/bracken_db") + kraken2_db = MagicMock(path="/path/to/kraken2_db") - moshpit.actions.build_kraken_db( - standard=True, library_path=lib_path, - threads=2, fast_build=True - ) + _fetch_prebuilt_dbs(bracken_db, kraken2_db, "some_collection", "/tmp") - p1.assert_called_once_with( - db_dir=lib_path, all_kwargs=ANY - ) - p2.assert_called_once_with( - source=lib_path, destination=str(fake_dir_fmt.path) + mock_fetch.assert_called_once_with( + collection="some_collection", tmp_dir="/tmp" ) - p3.assert_not_called() - p4.assert_not_called() - p5.assert_not_called() - p6.assert_not_called() + mock_move.assert_has_calls([ + call("/tmp", str(kraken2_db.path), extension="k2d"), + call("/tmp", str(bracken_db.path), extension="kmer_distrib") + ]) - @patch("q2_moshpit.kraken2.database.tempfile.TemporaryDirectory") + @patch("tempfile.TemporaryDirectory", return_value=MockTempDir()) @patch("q2_moshpit.kraken2.database.Kraken2DBDirectoryFormat") - @patch("q2_moshpit.kraken2.database._build_database") - @patch("q2_moshpit.kraken2.database._add_seqs_to_library") - @patch("q2_moshpit.kraken2.database._fetch_libraries") - @patch("q2_moshpit.kraken2.database._fetch_taxonomy") - @patch("q2_moshpit.kraken2.database._move_db_files") - @patch("q2_moshpit.kraken2.database._build_standard_db") - def test_build_kraken_db_action_with_standard_library_temp( - self, p1, p2, p3, p4, p5, p6, p7, p8 + @patch("q2_moshpit.kraken2.database.BrackenDBDirectoryFormat") + @patch("q2_moshpit.kraken2.database._fetch_prebuilt_dbs") + def test_build_kraken_db_action_with_prebuilt( + self, mock_fetch, mock_bracken, mock_kraken, mock_tmp ): - fake_dir_fmt = Kraken2DBDirectoryFormat( + fake_kraken_dir_fmt = Kraken2DBDirectoryFormat( self.get_data_path('db'), 'r' ) - p7.return_value = fake_dir_fmt - fake_tmp_dir = MockTempDir() - p8.return_value = fake_tmp_dir - - moshpit.actions.build_kraken_db( - standard=True, threads=2, fast_build=True + mock_kraken.return_value = fake_kraken_dir_fmt + fake_bracken_dir_fmt = BrackenDBDirectoryFormat( + self.get_data_path('bracken-db'), 'r' ) + mock_bracken.return_value = fake_bracken_dir_fmt - p1.assert_called_once_with( - db_dir=str(fake_tmp_dir.name), all_kwargs=ANY - ) - p2.assert_called_once_with( - source=str(fake_tmp_dir.name), destination=str(fake_dir_fmt.path) + moshpit.actions.build_kraken_db(collection="viral") + + mock_fetch.assert_called_once_with( + fake_bracken_dir_fmt, fake_kraken_dir_fmt, + "viral", str(mock_tmp.return_value.name) ) - p3.assert_not_called() - p4.assert_not_called() - p5.assert_not_called() - p6.assert_not_called() + @patch("tempfile.TemporaryDirectory", return_value=MockTempDir()) @patch("q2_moshpit.kraken2.database.Kraken2DBDirectoryFormat") - @patch("q2_moshpit.kraken2.database._build_database") - @patch("q2_moshpit.kraken2.database._add_seqs_to_library") - @patch("q2_moshpit.kraken2.database._fetch_libraries") - @patch("q2_moshpit.kraken2.database._fetch_taxonomy") - @patch("q2_moshpit.kraken2.database._move_db_files") - @patch("q2_moshpit.kraken2.database._build_standard_db") - def test_build_kraken_db_action_with_more_seqs( - self, p1, p2, p3, p4, p5, p6, p7 + @patch("q2_moshpit.kraken2.database.BrackenDBDirectoryFormat") + @patch("q2_moshpit.kraken2.database._build_dbs_from_seqs") + def test_build_kraken_db_action_with_seqs( + self, mock_build, mock_bracken, mock_kraken, mock_tmp ): - lib_path = 'path/to/lib' seqs = Artifact.import_data( 'FeatureData[Sequence]', self.get_data_path("seqs") ) - fake_dir_fmt = Kraken2DBDirectoryFormat( + fake_kraken_dir_fmt = Kraken2DBDirectoryFormat( self.get_data_path('db'), 'r' ) - p7.return_value = fake_dir_fmt + mock_kraken.return_value = fake_kraken_dir_fmt + fake_bracken_dir_fmt = BrackenDBDirectoryFormat( + self.get_data_path('bracken-db'), 'r' + ) + mock_bracken.return_value = fake_bracken_dir_fmt moshpit.actions.build_kraken_db( - seqs=[seqs], libraries=['human'], library_path=lib_path, - threads=2, fast_build=True + seqs=[seqs], threads=2, fast_build=True ) - p1.assert_not_called() - p3.assert_called_once_with( - db_dir=lib_path, threads=2, use_ftp=False - ) - p4.assert_called_once_with( - db_dir=lib_path, libraries=['human'], all_kwargs=ANY - ) - p5.assert_called_once_with( - db_dir=lib_path, seqs=ANY, no_masking=False - ) - p6.assert_called_once_with( - db_dir=lib_path, all_kwargs=ANY - ) - p2.assert_called_once_with( - source=lib_path, destination=str(fake_dir_fmt.path) + exp_common_args = { + 'threads': 2, 'kmer_len': 35, 'minimizer_len': 31, + 'minimizer_spaces': 7, 'no_masking': False, 'max_db_size': 0, + 'use_ftp': False, 'load_factor': 0.7, 'fast_build': True, + 'read_len': [50, 75, 100, 150, 200, 250, 300], + 'kraken2_db': fake_kraken_dir_fmt, + 'bracken_db': fake_bracken_dir_fmt, + 'tmp': str(mock_tmp.return_value.name) + } + mock_build.assert_called_once_with( + fake_bracken_dir_fmt, fake_kraken_dir_fmt, + [ANY], str(mock_tmp.return_value.name), + exp_common_args ) + @patch("tempfile.TemporaryDirectory", return_value=MockTempDir()) + def test_build_kraken_db_action_with_error(self, mock_tmp): + with self.assertRaisesRegex( + ValueError, r"You need to either provide a list .+" + ): + moshpit.actions.build_kraken_db() + if __name__ == "__main__": unittest.main() diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 1c9e7030..38ae402f 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -15,6 +15,7 @@ from q2_types_genomics.kraken2 import ( Kraken2Reports, Kraken2Outputs, Kraken2DB ) +from q2_types_genomics.kraken2._type import BrackenDB from q2_types_genomics.per_sample_data import MAGs, Contigs from q2_types_genomics.per_sample_data._type import AlignmentMap from qiime2.core.type import Bool, Range, Int, Str, Float, List, Choices @@ -143,13 +144,11 @@ "seqs": List[FeatureData[Sequence]] }, parameters={ - 'standard': Bool, - 'library_path': Str, - 'libraries': List[Str % Choices( - ['archaea', 'bacteria', 'plasmid', 'viral', 'human', 'fungi', - 'plant', 'protozoa', 'nr', 'nt', 'UniVec', 'UniVec_Core', ], - )], - 'library_exists': Str % Choices(['skip', 'refetch']), + 'collection': Str % Choices( + ['viral', 'minusb', 'standard', 'standard8', + 'standard16', 'pluspf', 'pluspf8', 'pluspf16', + 'pluspfp', 'pluspfp8', 'pluspfp16', 'eupathdb'], + ), 'threads': Int % Range(1, None), 'kmer_len': Int % Range(1, None), 'minimizer_len': Int % Range(1, None), @@ -159,26 +158,22 @@ 'use_ftp': Bool, 'load_factor': Float % Range(0, 1), 'fast_build': Bool, + 'read_len': List[Int % Range(1, None)], }, outputs=[ - ('database', Kraken2DB), + ('kraken2_database', Kraken2DB), + ('bracken_database', BrackenDB), ], input_descriptions={ "seqs": "Sequences to be added to the Kraken 2 database." }, parameter_descriptions={ - 'standard': 'Use standard Kraken 2 database. Incompatible with the ' - '"libraries" parameter.', - 'library_path': 'Path to the directory containing the library files. ' - 'This is where all the required files will be ' - 'downloaded - if not provided, a temporary directory ' - 'will be created.', - 'libraries': 'List of Kraken 2 reference libraries to be ' - 'included in the database. Incompatible with ' - 'the "standard" parameter.', - 'library_exists': 'Desired behaviour to follow when the library ' - 'already exists in the "library_path" directory.', - 'threads': 'Number of threads.', + 'collection': 'Name of the database collection to be fetched. ' + 'Please check https://benlangmead.github.io/aws-' + 'indexes/k2 for the description of the available ' + 'options.', + 'threads': 'Number of threads. Only applicable when building a ' + 'custom database.', 'kmer_len': 'K-mer length in bp/aa.', 'minimizer_len': 'Minimizer length in bp/aa.', 'minimizer_spaces': 'Number of characters in minimizer that are ' @@ -194,14 +189,17 @@ 'load_factor': 'Proportion of the hash table to be populated.', 'fast_build': 'Do not require database to be deterministically ' 'built when using multiple threads. This is faster, ' - 'but does introduce variability in minimizer/LCA pairs.' + 'but does introduce variability in minimizer/LCA pairs.', + 'read_len': 'Ideal read lengths to be used while building the Bracken ' + 'database.' }, output_descriptions={ - 'database': 'Kraken2 database.' + 'kraken2_database': 'Kraken2 database.', + 'bracken_database': 'Bracken database.' }, name='Build Kraken 2 database.', - description='This method builds a Kraken 2 database from provided ' - 'DNA sequences or simply fetches the sequences based on ' - 'user inputs and uses those to construct a database.', - citations=[citations["wood2019"]] + description='This method builds a Kraken 2/Bracken databases from ' + 'provided DNA sequences or simply fetches pre-built ' + 'versions from an online resource.', + citations=[citations["wood2019"], citations["lu2017"]] ) diff --git a/setup.py b/setup.py index 36262a75..1c07d974 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,8 @@ 'data/single-end/*', 'data/paired-end/*', 'data/db/*', 'data/reports-mags/*', 'data/reports-mags/*/*', 'data/outputs-mags/*', - 'data/outputs-mags/*/*', 'data/seqs/*' + 'data/outputs-mags/*/*', 'data/seqs/*', + 'data/bracken-db/*' ] }, zip_safe=False,