ENH: add an action to build a Kraken 2 database (#24)
* ENH: add an action to build a Kraken 2 database
* warn when skipping a library
* max_db_size update
* refactor into smaller submodules; update copyright year
* rebase fixups
* test fixups
* add tests
* add citations
* fixup! add citations
misialq authored Jan 17, 2023
1 parent 231845b commit 3502d02
Showing 10 changed files with 1,399 additions and 87 deletions.
4 changes: 2 additions & 2 deletions q2_moshpit/__init__.py
@@ -6,11 +6,11 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

-from .kraken2 import kraken2
+from .kraken2 import classification, database
from .metabat2 import metabat2

from ._version import get_versions
__version__ = get_versions()['version']
del get_versions

-__all__ = ['kraken2', 'metabat2']
+__all__ = ['metabat2', 'classification', 'database']
78 changes: 25 additions & 53 deletions q2_moshpit/citations.bib
@@ -7,60 +7,32 @@ @InProceedings{ mckinney-proc-scipy-2010
editor = { St{\'e}fan van der Walt and Jarrod Millman }
}

-@article{matsen2010,
-  title = {pplacer: linear time maximum-likelihood and {Bayesian} phylogenetic placement of sequences onto a fixed reference tree},
-  volume = {11},
-  issn = {1471-2105},
-  shorttitle = {pplacer},
-  url = {https://doi.org/10.1186/1471-2105-11-538},
-  doi = {10.1186/1471-2105-11-538},
-  abstract = {Likelihood-based phylogenetic inference is generally considered to be the most reliable classification method for unknown sequences. However, traditional likelihood-based phylogenetic methods cannot be applied to large volumes of short reads from next-generation sequencing due to computational complexity issues and lack of phylogenetic signal. "Phylogenetic placement," where a reference tree is fixed and the unknown query sequences are placed onto the tree via a reference alignment, is a way to bring the inferential power offered by likelihood-based approaches to large data sets.},
-  number = {1},
-  urldate = {2022-04-27},
-  journal = {BMC Bioinformatics},
-  author = {Matsen, Frederick A. and Kodner, Robin B. and Armbrust, E Virginia},
-  month = {oct},
-  year = {2010},
-  keywords = {Branch Length, Confidence Score, Posterior Probability, Query Sequence, Reference Tree},
-  pages = {538},
-}
+@article{wood2019,
+  title = {Improved metagenomic analysis with {Kraken} 2},
+  volume = {20},
+  issn = {1474-760X},
+  url = {https://doi.org/10.1186/s13059-019-1891-0},
+  doi = {10.1186/s13059-019-1891-0},
+  abstract = {Although Kraken’s k-mer-based approach provides a fast taxonomic classification of metagenomic sequence data, its large memory requirements can be limiting for some applications. Kraken 2 improves upon Kraken 1 by reducing memory usage by 85\%, allowing greater amounts of reference genomic data to be used, while maintaining high accuracy and increasing speed fivefold. Kraken 2 also introduces a translated search mode, providing increased sensitivity in viral metagenomics analysis.},
+  number = {1},
+  urldate = {2023-01-17},
+  journal = {Genome Biology},
+  author = {Wood, Derrick E. and Lu, Jennifer and Langmead, Ben},
+  month = {nov},
+  year = {2019},
+  keywords = {Alignment-free methods, Metagenomics, Metagenomics classification, Microbiome, Minimizers, Probabilistic data structures},
+  pages = {257},
+}

-@article{hyatt2012,
-  title = {Gene and translation initiation site prediction in metagenomic sequences},
-  volume = {28},
-  issn = {1367-4803},
-  url = {https://doi.org/10.1093/bioinformatics/bts429},
-  doi = {10.1093/bioinformatics/bts429},
-  abstract = {Motivation: Gene prediction in metagenomic sequences remains a difficult problem. Current sequencing technologies do not achieve sufficient coverage to assemble the individual genomes in a typical sample; consequently, sequencing runs produce a large number of short sequences whose exact origin is unknown. Since these sequences are usually smaller than the average length of a gene, algorithms must make predictions based on very little data.Results: We present MetaProdigal, a metagenomic version of the gene prediction program Prodigal, that can identify genes in short, anonymous coding sequences with a high degree of accuracy. The novel value of the method consists of enhanced translation initiation site identification, ability to identify sequences that use alternate genetic codes and confidence values for each gene call. We compare the results of MetaProdigal with other methods and conclude with a discussion of future improvements.Availability: The Prodigal software is freely available under the General Public License from http://code.google.com/p/prodigal/.Contact:[email protected] Information: Supplementary data are available at Bioinformatics online.},
-  number = {17},
-  journal = {Bioinformatics},
-  author = {Hyatt, Doug and LoCascio, Philip F. and Hauser, Loren J. and Uberbacher, Edward C.},
-  month = {sep},
-  year = {2012},
-  pages = {2223--2230},
-}

-@article{parks2015b,
-  title = {{CheckM}: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes},
-  volume = {25},
-  issn = {1088-9051, 1549-5469},
-  shorttitle = {{CheckM}},
-  url = {https://genome.cshlp.org/content/25/7/1043},
-  doi = {10.1101/gr.186072.114},
-  abstract = {Large-scale recovery of genomes from isolates, single cells, and metagenomic data has been made possible by advances in computational methods and substantial reductions in sequencing costs. Although this increasing breadth of draft genomes is providing key information regarding the evolutionary and functional diversity of microbial life, it has become impractical to finish all available reference genomes. Making robust biological inferences from draft genomes requires accurate estimates of their completeness and contamination. Current methods for assessing genome quality are ad hoc and generally make use of a limited number of “marker” genes conserved across all bacterial or archaeal genomes. Here we introduce CheckM, an automated method for assessing the quality of a genome using a broader set of marker genes specific to the position of a genome within a reference genome tree and information about the collocation of these genes. We demonstrate the effectiveness of CheckM using synthetic data and a wide range of isolate-, single-cell-, and metagenome-derived genomes. CheckM is shown to provide accurate estimates of genome completeness and contamination and to outperform existing approaches. Using CheckM, we identify a diverse range of errors currently impacting publicly available isolate genomes and demonstrate that genomes obtained from single cells and metagenomic data vary substantially in quality. In order to facilitate the use of draft genomes, we propose an objective measure of genome quality that can be used to select genomes suitable for specific gene- and genome-centric analyses of microbial communities.},
-  language = {en},
-  number = {7},
-  urldate = {2022-04-27},
-  journal = {Genome Research},
-  author = {Parks, Donovan H. and Imelfort, Michael and Skennerton, Connor T. and Hugenholtz, Philip and Tyson, Gene W.},
-  month = {jul},
-  year = {2015},
-  pmid = {25977477},
-  pages = {1043--1055},
-}

-@misc{hmmer2022,
-  title = {{HMMER}},
-  url = {http://hmmer.org/},
-  urldate = {2022-04-27},
-}
+@article{kang2019,
+  title = {{{MetaBAT}} 2: {{An}} Adaptive Binning Algorithm for Robust and Efficient Genome Reconstruction from Metagenome Assemblies},
+  author = {Kang, Dongwan D. and Li, Feng and Kirton, Edward and Thomas, Ashleigh and Egan, Rob and An, Hong and Wang, Zhong},
+  year = {2019},
+  journal = {PeerJ},
+  volume = {2019},
+  number = {7},
+  publisher = {{PeerJ Inc.}},
+  issn = {21678359},
+  doi = {10.7717/peerj.7359},
+  keywords = {Clustering,Metagenome binning,Metagenomics}
+}
5 changes: 3 additions & 2 deletions q2_moshpit/kraken2/__init__.py
@@ -6,6 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

-from .kraken2 import classify_kraken
+from .database import build_kraken_db
+from .classification import classify_kraken

-__all__ = ['classify_kraken']
+__all__ = ['build_kraken_db', 'classify_kraken']
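
With this refactor, both actions are re-exported from the kraken2 subpackage, so (as a minimal sketch, following the __all__ above) downstream code can import them directly:

from q2_moshpit.kraken2 import build_kraken_db, classify_kraken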
q2_moshpit/kraken2/{kraken2.py → classification.py}
@@ -12,19 +12,19 @@

import pandas as pd
from q2_types.per_sample_sequences import (
-    SingleLanePerSampleSingleEndFastqDirFmt,
    SingleLanePerSamplePairedEndFastqDirFmt,
+    SingleLanePerSampleSingleEndFastqDirFmt
)

-from q2_moshpit._utils import _process_common_input_params, run_command
+from q2_moshpit._utils import run_command, _process_common_input_params
+from q2_moshpit.kraken2.utils import _process_kraken2_arg
from q2_types_genomics.kraken2 import (
    Kraken2ReportDirectoryFormat,
-    Kraken2OutputDirectoryFormat, Kraken2DBDirectoryFormat,
+    Kraken2OutputDirectoryFormat,
+    Kraken2DBDirectoryFormat
)
from q2_types_genomics.per_sample_data import MultiMAGSequencesDirFmt

-from q2_moshpit.kraken2.utils import _process_kraken2_arg


def _get_seq_paths(df_index, df_row, df_columns):
    if "filename" in df_columns:
@@ -37,7 +37,7 @@ def _get_seq_paths(df_index, df_row, df_columns):


def _construct_output_paths(
-    _sample, _bin, kraken2_outputs_dir, kraken2_reports_dir
+        _sample, _bin, kraken2_outputs_dir, kraken2_reports_dir
):
    sample_dir_report = os.path.join(kraken2_reports_dir.path, _sample)
    sample_dir_output = os.path.join(kraken2_outputs_dir.path, _sample)
214 changes: 214 additions & 0 deletions q2_moshpit/kraken2/database.py
@@ -0,0 +1,214 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import glob
import os
import shutil
import subprocess
import tempfile
from contextlib import ExitStack
from copy import deepcopy
from typing import List

from q2_types.feature_data import DNAFASTAFormat

from q2_moshpit._utils import _process_common_input_params, run_command
from q2_types_genomics.kraken2 import (
    Kraken2DBDirectoryFormat,
)

from q2_moshpit.kraken2.utils import _process_kraken2_arg


def _build_standard_db(db_dir: str, all_kwargs: dict):
    kwargs = {
        k: v for k, v in all_kwargs.items()
        if k in ["threads", "no_masking", "use_ftp", "fast_build"]
    }
    common_args = _process_common_input_params(
        processing_func=_process_kraken2_arg, params=kwargs
    )
    if all_kwargs["max_db_size"] > 0:
        common_args.extend(
            ["--max-db-size", str(all_kwargs["max_db_size"])]
        )
    cmd = [
        "kraken2-build", "--standard", "--db", db_dir, *common_args
    ]
    try:
        run_command(cmd=cmd, verbose=True)
    except subprocess.CalledProcessError as e:
        raise Exception(
            "An error was encountered while building the standard "
            f"library, (return code {e.returncode}), please inspect "
            "stdout and stderr to learn more."
        )


def _fetch_taxonomy(db_dir: str, threads: int, use_ftp: bool):
    cmd = [
        "kraken2-build", "--download-taxonomy",
        "--threads", str(threads), "--db", str(db_dir),
    ]
    if use_ftp:
        cmd.append("--use-ftp")
    try:
        run_command(cmd=cmd, verbose=True)
    except subprocess.CalledProcessError as e:
        raise Exception(
            "An error was encountered while downloading taxonomy, "
            f"(return code {e.returncode}), please inspect "
            "stdout and stderr to learn more."
        )


def _fetch_libraries(db_dir: str, libraries: List[str], all_kwargs: dict):
    kwargs = {
        k: v for k, v in all_kwargs.items()
        if k in ["threads", "no_masking", "use_ftp"]
    }
    common_args = _process_common_input_params(
        processing_func=_process_kraken2_arg, params=kwargs
    )
    common_args.extend(["--db", db_dir])
    base_cmd = [
        "kraken2-build", "--download-library",
    ]
    # "skip" leaves an already-downloaded library untouched;
    # any other value refetches it
    fetch_behaviour = all_kwargs.get("library_exists", "refetch")
    for library in libraries:
        if fetch_behaviour == "skip":
            lib_path = os.path.join(db_dir, "library", library)
            if os.path.exists(lib_path) and len(os.listdir(lib_path)) > 0:
                print(
                    f"Skipping download of the '{library}' library: "
                    f"it already exists."
                )
                continue
        try:
            cmd = deepcopy(base_cmd)
            cmd.extend([library, *common_args])
            run_command(cmd=cmd, verbose=True)
        except subprocess.CalledProcessError as e:
            raise Exception(
                f"An error was encountered while downloading the "
                f"'{library}' library, (return code {e.returncode}), "
                "please inspect stdout and stderr to learn more."
            )


def _add_seqs_to_library(db_dir: str, seqs: DNAFASTAFormat, no_masking: bool):
    cmd = [
        "kraken2-build", "--add-to-library",
        str(seqs.path), "--db", db_dir
    ]
    if no_masking:
        cmd.append("--no-mask")
    try:
        run_command(cmd=cmd, verbose=True)
    except subprocess.CalledProcessError as e:
        raise Exception(
            "An error was encountered while adding sequences to the "
            f"library, (return code {e.returncode}), please inspect "
            "stdout and stderr to learn more."
        )


def _build_database(db_dir: str, all_kwargs: dict):
    kwargs = {
        k: v for k, v in all_kwargs.items()
        if k in ["threads", "minimizer_len", "minimizer_spaces",
                 "load_factor", "fast_build", "kmer_len"]
    }
    common_args = _process_common_input_params(
        processing_func=_process_kraken2_arg, params=kwargs
    )
    if all_kwargs["max_db_size"] > 0:
        common_args.extend(
            ["--max-db-size", str(all_kwargs["max_db_size"])]
        )
    cmd = [
        "kraken2-build", "--build", "--db", db_dir, *common_args
    ]
    try:
        run_command(cmd=cmd, verbose=True)
    except subprocess.CalledProcessError as e:
        raise Exception(
            "An error was encountered while building the database, "
            f"(return code {e.returncode}), please inspect "
            "stdout and stderr to learn more."
        )


def _move_db_files(source: str, destination: str):
    # Only the binary *.k2d files constitute the finished database;
    # intermediate library and taxonomy data stays in the build directory.
    files = glob.glob(f"{source}/*.k2d")
    for file in files:
        new_file = os.path.join(destination, os.path.split(file)[-1])
        shutil.move(file, new_file)


def build_kraken_db(
    seqs: DNAFASTAFormat = None,
    standard: bool = False,
    library_path: str = None,
    libraries: List[str] = None,
    library_exists: str = 'skip',
    threads: int = 1,
    kmer_len: int = 35,
    minimizer_len: int = 31,
    minimizer_spaces: int = 7,
    no_masking: bool = False,
    max_db_size: int = 0,
    use_ftp: bool = False,
    load_factor: float = 0.7,
    fast_build: bool = False,
) -> Kraken2DBDirectoryFormat:
    db = Kraken2DBDirectoryFormat()
    db_dir = str(db.path)

    if standard and libraries:
        raise ValueError(
            'Standard Kraken2 database was requested but some libraries '
            'were also provided. Please provide either only the "standard" '
            'option or a list of "libraries" to be fetched for the database.'
        )

    with ExitStack() as stack:
        if not library_path:
            temp = tempfile.TemporaryDirectory()
            temp_dir = temp.name
            stack.enter_context(temp)
        else:
            os.makedirs(library_path, exist_ok=True)
            temp_dir = library_path

        # If requested, build the standard Kraken2 database
        # (locals() hands every parameter of this action to the helper)
        if standard:
            _build_standard_db(db_dir=temp_dir, all_kwargs=locals())
            _move_db_files(source=temp_dir, destination=db_dir)
            return db

        # Fetch taxonomy (required regardless of the build source)
        _fetch_taxonomy(db_dir=temp_dir, threads=threads, use_ftp=use_ftp)

        # If requested, download all the libraries
        if libraries:
            _fetch_libraries(
                db_dir=temp_dir, libraries=libraries, all_kwargs=locals()
            )

        # If provided, add the additional sequences to the database
        if seqs:
            for seq in seqs:
                _add_seqs_to_library(
                    db_dir=temp_dir, seqs=seq, no_masking=no_masking
                )

        # Finally, build the actual database
        _build_database(db_dir=temp_dir, all_kwargs=locals())

        # Move the database files (*.k2d) to the final location
        _move_db_files(source=temp_dir, destination=db_dir)

    return db
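
For orientation, a minimal sketch of invoking the new action directly from Python. This is an illustration, not part of the commit: the parameter values are assumptions (library names follow kraken2-build's conventions), and the QIIME 2 plugin registration for this action is not shown in this diff:

from q2_moshpit.kraken2 import build_kraken_db

# Download the taxonomy plus the RefSeq "viral" library, then build
# the database with 4 threads; the returned Kraken2DBDirectoryFormat
# holds the final *.k2d files moved in by _move_db_files().
db = build_kraken_db(
    libraries=["viral"],
    threads=4,
)
print(db.path)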
