generated from bokulich-lab/q2-plugin-template
-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ENH: add an action to build a Kraken 2 database (#24)
* ENH: add an action to build a Kraken 2 database * warn when skipping a library * max_db_size update * refactor into smaller submodules; update copyright year * rebase fixups * test fixups * add tests * add citations * fixup! add citations
- Loading branch information
Showing
10 changed files
with
1,399 additions
and
87 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,60 +7,32 @@ @InProceedings{ mckinney-proc-scipy-2010 | |
editor = { St{\'e}fan van der Walt and Jarrod Millman } | ||
} | ||
|
||
@article{matsen2010, | ||
title = {pplacer: linear time maximum-likelihood and {Bayesian} phylogenetic placement of sequences onto a fixed reference tree}, | ||
volume = {11}, | ||
issn = {1471-2105}, | ||
shorttitle = {pplacer}, | ||
url = {https://doi.org/10.1186/1471-2105-11-538}, | ||
doi = {10.1186/1471-2105-11-538}, | ||
abstract = {Likelihood-based phylogenetic inference is generally considered to be the most reliable classification method for unknown sequences. However, traditional likelihood-based phylogenetic methods cannot be applied to large volumes of short reads from next-generation sequencing due to computational complexity issues and lack of phylogenetic signal. "Phylogenetic placement," where a reference tree is fixed and the unknown query sequences are placed onto the tree via a reference alignment, is a way to bring the inferential power offered by likelihood-based approaches to large data sets.}, | ||
@article{wood2019, | ||
title = {Improved metagenomic analysis with {Kraken} 2}, | ||
volume = {20}, | ||
issn = {1474-760X}, | ||
url = {https://doi.org/10.1186/s13059-019-1891-0}, | ||
doi = {10.1186/s13059-019-1891-0}, | ||
abstract = {Although Kraken’s k-mer-based approach provides a fast taxonomic classification of metagenomic sequence data, its large memory requirements can be limiting for some applications. Kraken 2 improves upon Kraken 1 by reducing memory usage by 85\%, allowing greater amounts of reference genomic data to be used, while maintaining high accuracy and increasing speed fivefold. Kraken 2 also introduces a translated search mode, providing increased sensitivity in viral metagenomics analysis.}, | ||
number = {1}, | ||
urldate = {2022-04-27}, | ||
journal = {BMC Bioinformatics}, | ||
author = {Matsen, Frederick A. and Kodner, Robin B. and Armbrust, E Virginia}, | ||
month = {oct}, | ||
year = {2010}, | ||
keywords = {Branch Length, Confidence Score, Posterior Probability, Query Sequence, Reference Tree}, | ||
pages = {538}, | ||
urldate = {2023-01-17}, | ||
journal = {Genome Biology}, | ||
author = {Wood, Derrick E. and Lu, Jennifer and Langmead, Ben}, | ||
month = {nov}, | ||
year = {2019}, | ||
keywords = {Alignment-free methods, Metagenomics, Metagenomics classification, Microbiome, Minimizers, Probabilistic data structures}, | ||
pages = {257}, | ||
} | ||
|
||
@article{hyatt2012, | ||
title = {Gene and translation initiation site prediction in metagenomic sequences}, | ||
volume = {28}, | ||
issn = {1367-4803}, | ||
url = {https://doi.org/10.1093/bioinformatics/bts429}, | ||
doi = {10.1093/bioinformatics/bts429}, | ||
abstract = {Motivation: Gene prediction in metagenomic sequences remains a difficult problem. Current sequencing technologies do not achieve sufficient coverage to assemble the individual genomes in a typical sample; consequently, sequencing runs produce a large number of short sequences whose exact origin is unknown. Since these sequences are usually smaller than the average length of a gene, algorithms must make predictions based on very little data.Results: We present MetaProdigal, a metagenomic version of the gene prediction program Prodigal, that can identify genes in short, anonymous coding sequences with a high degree of accuracy. The novel value of the method consists of enhanced translation initiation site identification, ability to identify sequences that use alternate genetic codes and confidence values for each gene call. We compare the results of MetaProdigal with other methods and conclude with a discussion of future improvements.Availability: The Prodigal software is freely available under the General Public License from http://code.google.com/p/prodigal/.Contact:[email protected] Information: Supplementary data are available at Bioinformatics online.}, | ||
number = {17}, | ||
journal = {Bioinformatics}, | ||
author = {Hyatt, Doug and LoCascio, Philip F. and Hauser, Loren J. and Uberbacher, Edward C.}, | ||
month = {sep}, | ||
year = {2012}, | ||
pages = {2223--2230}, | ||
} | ||
|
||
@article{parks2015b, | ||
title = {{CheckM}: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes}, | ||
volume = {25}, | ||
issn = {1088-9051, 1549-5469}, | ||
shorttitle = {{CheckM}}, | ||
url = {https://genome.cshlp.org/content/25/7/1043}, | ||
doi = {10.1101/gr.186072.114}, | ||
abstract = {Large-scale recovery of genomes from isolates, single cells, and metagenomic data has been made possible by advances in computational methods and substantial reductions in sequencing costs. Although this increasing breadth of draft genomes is providing key information regarding the evolutionary and functional diversity of microbial life, it has become impractical to finish all available reference genomes. Making robust biological inferences from draft genomes requires accurate estimates of their completeness and contamination. Current methods for assessing genome quality are ad hoc and generally make use of a limited number of “marker” genes conserved across all bacterial or archaeal genomes. Here we introduce CheckM, an automated method for assessing the quality of a genome using a broader set of marker genes specific to the position of a genome within a reference genome tree and information about the collocation of these genes. We demonstrate the effectiveness of CheckM using synthetic data and a wide range of isolate-, single-cell-, and metagenome-derived genomes. CheckM is shown to provide accurate estimates of genome completeness and contamination and to outperform existing approaches. Using CheckM, we identify a diverse range of errors currently impacting publicly available isolate genomes and demonstrate that genomes obtained from single cells and metagenomic data vary substantially in quality. In order to facilitate the use of draft genomes, we propose an objective measure of genome quality that can be used to select genomes suitable for specific gene- and genome-centric analyses of microbial communities.}, | ||
language = {en}, | ||
number = {7}, | ||
urldate = {2022-04-27}, | ||
journal = {Genome Research}, | ||
author = {Parks, Donovan H. and Imelfort, Michael and Skennerton, Connor T. and Hugenholtz, Philip and Tyson, Gene W.}, | ||
month = {jul}, | ||
year = {2015}, | ||
pmid = {25977477}, | ||
pages = {1043--1055}, | ||
} | ||
|
||
@misc{hmmer2022, | ||
title = {{HMMER}}, | ||
url = {http://hmmer.org/}, | ||
urldate = {2022-04-27}, | ||
@article{kang2019, | ||
title = {{{MetaBAT}} 2: {{An}} Adaptive Binning Algorithm for Robust and Efficient Genome Reconstruction from Metagenome Assemblies}, | ||
author = {Kang, Dongwan D. and Li, Feng and Kirton, Edward and Thomas, Ashleigh and Egan, Rob and An, Hong and Wang, Zhong}, | ||
year = {2019}, | ||
journal = {PeerJ}, | ||
volume = {2019}, | ||
number = {7}, | ||
publisher = {{PeerJ Inc.}}, | ||
issn = {21678359}, | ||
doi = {10.7717/peerj.7359}, | ||
keywords = {Clustering,Metagenome binning,Metagenomics} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,214 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2022-2023, QIIME 2 development team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file LICENSE, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
import glob | ||
import os | ||
import shutil | ||
import subprocess | ||
import tempfile | ||
from contextlib import ExitStack | ||
from copy import deepcopy | ||
from typing import List | ||
|
||
from q2_types.feature_data import DNAFASTAFormat | ||
|
||
from q2_moshpit._utils import _process_common_input_params, run_command | ||
from q2_types_genomics.kraken2 import ( | ||
Kraken2DBDirectoryFormat, | ||
) | ||
|
||
from q2_moshpit.kraken2.utils import _process_kraken2_arg | ||
|
||
|
||
def _build_standard_db(db_dir: str, all_kwargs: dict): | ||
kwargs = { | ||
k: v for k, v in all_kwargs.items() | ||
if k in ["threads", "no_masking", "use_ftp", "fast_build"] | ||
} | ||
common_args = _process_common_input_params( | ||
processing_func=_process_kraken2_arg, params=kwargs | ||
) | ||
if all_kwargs["max_db_size"] > 0: | ||
common_args.extend( | ||
["--max-db-size", str(all_kwargs["max_db_size"])] | ||
) | ||
cmd = [ | ||
"kraken2-build", "--standard", "--db", db_dir, *common_args | ||
] | ||
try: | ||
run_command(cmd=cmd, verbose=True) | ||
except subprocess.CalledProcessError as e: | ||
raise Exception( | ||
"An error was encountered while building the standard " | ||
f"library, (return code {e.returncode}), please inspect " | ||
"stdout and stderr to learn more." | ||
) | ||
|
||
|
||
def _fetch_taxonomy(db_dir: str, threads: int, use_ftp: bool): | ||
cmd = [ | ||
"kraken2-build", "--download-taxonomy", | ||
"--threads", str(threads), "--db", str(db_dir), | ||
] | ||
cmd.append("--use-ftp") if use_ftp else False | ||
try: | ||
run_command(cmd=cmd, verbose=True) | ||
except subprocess.CalledProcessError as e: | ||
raise Exception( | ||
"An error was encountered while downloading taxonomy, " | ||
f"(return code {e.returncode}), please inspect " | ||
"stdout and stderr to learn more." | ||
) | ||
|
||
|
||
def _fetch_libraries(db_dir: str, libraries: List[str], all_kwargs: dict): | ||
kwargs = { | ||
k: v for k, v in all_kwargs.items() | ||
if k in ["threads", "no_masking", "use_ftp"] | ||
} | ||
common_args = _process_common_input_params( | ||
processing_func=_process_kraken2_arg, params=kwargs | ||
) | ||
common_args.extend(["--db", db_dir]) | ||
base_cmd = [ | ||
"kraken2-build", "--download-library", | ||
] | ||
fetch_behaviour = all_kwargs.get("library_exists", "refetch") | ||
for library in libraries: | ||
if fetch_behaviour == "skip": | ||
lib_path = os.path.join(db_dir, "library", library) | ||
if os.path.exists(lib_path) and len(os.listdir(lib_path)) > 0: | ||
print( | ||
f"Skipping download of the '{library}' library, " | ||
f"already exists." | ||
) | ||
continue | ||
try: | ||
cmd = deepcopy(base_cmd) | ||
cmd.extend([library, *common_args]) | ||
run_command(cmd=cmd, verbose=True) | ||
except subprocess.CalledProcessError as e: | ||
raise Exception( | ||
f"An error was encountered while downloading the " | ||
f"'{library}' library, (return code {e.returncode}), " | ||
"please inspect stdout and stderr to learn more." | ||
) | ||
|
||
|
||
def _add_seqs_to_library(db_dir: str, seqs: DNAFASTAFormat, no_masking: bool): | ||
cmd = [ | ||
"kraken2-build", "--add-to-library", | ||
str(seqs.path), "--db", db_dir | ||
] | ||
cmd.append("--no-mask") if no_masking else False | ||
try: | ||
run_command(cmd=cmd, verbose=True) | ||
except subprocess.CalledProcessError as e: | ||
raise Exception( | ||
"An error was encountered while adding sequences to the " | ||
f"library, (return code {e.returncode}), please inspect " | ||
"stdout and stderr to learn more." | ||
) | ||
|
||
|
||
def _build_database(db_dir: str, all_kwargs: dict): | ||
kwargs = { | ||
k: v for k, v in all_kwargs.items() | ||
if k in ["threads", "minimizer_len", "minimizer_spaces", | ||
"load_factor", "fast_build", "kmer_len"] | ||
} | ||
common_args = _process_common_input_params( | ||
processing_func=_process_kraken2_arg, params=kwargs | ||
) | ||
if all_kwargs["max_db_size"] > 0: | ||
common_args.extend( | ||
["--max-db-size", str(all_kwargs["max_db_size"])] | ||
) | ||
cmd = [ | ||
"kraken2-build", "--build", "--db", db_dir, *common_args | ||
] | ||
try: | ||
run_command(cmd=cmd, verbose=True) | ||
except subprocess.CalledProcessError as e: | ||
raise Exception( | ||
"An error was encountered while building the database, " | ||
f"(return code {e.returncode}), please inspect " | ||
"stdout and stderr to learn more." | ||
) | ||
|
||
|
||
def _move_db_files(source: str, destination: str): | ||
files = glob.glob(f"{source}/*.k2d") | ||
for file in files: | ||
new_file = os.path.join(destination, os.path.split(file)[-1]) | ||
shutil.move(file, new_file) | ||
|
||
|
||
def build_kraken_db( | ||
seqs: DNAFASTAFormat = None, | ||
standard: bool = False, | ||
library_path: str = None, | ||
libraries: List[str] = None, | ||
library_exists: str = 'skip', | ||
threads: int = 1, | ||
kmer_len: int = 35, | ||
minimizer_len: int = 31, | ||
minimizer_spaces: int = 7, | ||
no_masking: bool = False, | ||
max_db_size: int = 0, | ||
use_ftp: bool = False, | ||
load_factor: float = 0.7, | ||
fast_build: bool = False, | ||
) -> Kraken2DBDirectoryFormat: | ||
db = Kraken2DBDirectoryFormat() | ||
db_dir = str(db.path) | ||
|
||
if standard and libraries: | ||
raise ValueError( | ||
'Standard Kraken2 database was requested but some libraries ' | ||
'were also provided. Please provide either only the "standard" ' | ||
'option or a list of "libraries" to be fetched for the database.' | ||
) | ||
|
||
with ExitStack() as stack: | ||
if not library_path: | ||
temp = tempfile.TemporaryDirectory() | ||
temp_dir = temp.name | ||
stack.enter_context(temp) | ||
else: | ||
os.makedirs(library_path, exist_ok=True) | ||
temp_dir = library_path | ||
|
||
# If requested, build the standard Kraken2 database | ||
if standard: | ||
_build_standard_db(db_dir=temp_dir, all_kwargs=locals()) | ||
_move_db_files(source=temp_dir, destination=db_dir) | ||
return db | ||
|
||
# Fetch taxonomy (required regardless of the build source) | ||
_fetch_taxonomy(db_dir=temp_dir, threads=threads, use_ftp=use_ftp) | ||
|
||
# If requested, download all the libraries | ||
if libraries: | ||
_fetch_libraries( | ||
db_dir=temp_dir, libraries=libraries, all_kwargs=locals() | ||
) | ||
|
||
# If provided, add the additional sequences to the database | ||
if seqs: | ||
for seq in seqs: | ||
_add_seqs_to_library( | ||
db_dir=temp_dir, seqs=seq, no_masking=no_masking | ||
) | ||
|
||
# Finally, build the actual database | ||
_build_database(db_dir=temp_dir, all_kwargs=locals()) | ||
|
||
# Move the database files (*.k2d) to the final location | ||
_move_db_files(source=temp_dir, destination=db_dir) | ||
|
||
return db |
Oops, something went wrong.