ENH: add an action to build a Kraken 2 database (#24)
* ENH: add an action to build a Kraken 2 database
* warn when skipping a library
* max_db_size update
* refactor into smaller submodules; update copyright year
* rebase fixups
* test fixups
* add tests
* add citations
* fixup! add citations
misialq authored Jan 17, 2023
1 parent 231845b commit 3502d02
Showing 10 changed files with 1,399 additions and 87 deletions.
4 changes: 2 additions & 2 deletions q2_moshpit/__init__.py
@@ -6,11 +6,11 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

-from .kraken2 import kraken2
+from .kraken2 import classification, database
from .metabat2 import metabat2

from ._version import get_versions
__version__ = get_versions()['version']
del get_versions

-__all__ = ['kraken2', 'metabat2']
+__all__ = ['metabat2', 'classification', 'database']
78 changes: 25 additions & 53 deletions q2_moshpit/citations.bib
@@ -7,60 +7,32 @@ @InProceedings{ mckinney-proc-scipy-2010
editor = { St{\'e}fan van der Walt and Jarrod Millman }
}

-@article{matsen2010,
-  title = {pplacer: linear time maximum-likelihood and {Bayesian} phylogenetic placement of sequences onto a fixed reference tree},
-  volume = {11},
-  issn = {1471-2105},
-  shorttitle = {pplacer},
-  url = {https://doi.org/10.1186/1471-2105-11-538},
-  doi = {10.1186/1471-2105-11-538},
-  abstract = {Likelihood-based phylogenetic inference is generally considered to be the most reliable classification method for unknown sequences. However, traditional likelihood-based phylogenetic methods cannot be applied to large volumes of short reads from next-generation sequencing due to computational complexity issues and lack of phylogenetic signal. "Phylogenetic placement," where a reference tree is fixed and the unknown query sequences are placed onto the tree via a reference alignment, is a way to bring the inferential power offered by likelihood-based approaches to large data sets.},
-  number = {1},
-  urldate = {2022-04-27},
-  journal = {BMC Bioinformatics},
-  author = {Matsen, Frederick A. and Kodner, Robin B. and Armbrust, E Virginia},
-  month = {oct},
-  year = {2010},
-  keywords = {Branch Length, Confidence Score, Posterior Probability, Query Sequence, Reference Tree},
-  pages = {538},
-}
+@article{wood2019,
+  title = {Improved metagenomic analysis with {Kraken} 2},
+  volume = {20},
+  issn = {1474-760X},
+  url = {https://doi.org/10.1186/s13059-019-1891-0},
+  doi = {10.1186/s13059-019-1891-0},
+  abstract = {Although Kraken’s k-mer-based approach provides a fast taxonomic classification of metagenomic sequence data, its large memory requirements can be limiting for some applications. Kraken 2 improves upon Kraken 1 by reducing memory usage by 85\%, allowing greater amounts of reference genomic data to be used, while maintaining high accuracy and increasing speed fivefold. Kraken 2 also introduces a translated search mode, providing increased sensitivity in viral metagenomics analysis.},
+  number = {1},
+  urldate = {2023-01-17},
+  journal = {Genome Biology},
+  author = {Wood, Derrick E. and Lu, Jennifer and Langmead, Ben},
+  month = {nov},
+  year = {2019},
+  keywords = {Alignment-free methods, Metagenomics, Metagenomics classification, Microbiome, Minimizers, Probabilistic data structures},
+  pages = {257},
+}

-@article{hyatt2012,
-  title = {Gene and translation initiation site prediction in metagenomic sequences},
-  volume = {28},
-  issn = {1367-4803},
-  url = {https://doi.org/10.1093/bioinformatics/bts429},
-  doi = {10.1093/bioinformatics/bts429},
-  abstract = {Motivation: Gene prediction in metagenomic sequences remains a difficult problem. Current sequencing technologies do not achieve sufficient coverage to assemble the individual genomes in a typical sample; consequently, sequencing runs produce a large number of short sequences whose exact origin is unknown. Since these sequences are usually smaller than the average length of a gene, algorithms must make predictions based on very little data.Results: We present MetaProdigal, a metagenomic version of the gene prediction program Prodigal, that can identify genes in short, anonymous coding sequences with a high degree of accuracy. The novel value of the method consists of enhanced translation initiation site identification, ability to identify sequences that use alternate genetic codes and confidence values for each gene call. We compare the results of MetaProdigal with other methods and conclude with a discussion of future improvements.Availability: The Prodigal software is freely available under the General Public License from http://code.google.com/p/prodigal/.Contact:[email protected] Information: Supplementary data are available at Bioinformatics online.},
-  number = {17},
-  journal = {Bioinformatics},
-  author = {Hyatt, Doug and LoCascio, Philip F. and Hauser, Loren J. and Uberbacher, Edward C.},
-  month = {sep},
-  year = {2012},
-  pages = {2223--2230},
-}

-@article{parks2015b,
-  title = {{CheckM}: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes},
-  volume = {25},
-  issn = {1088-9051, 1549-5469},
-  shorttitle = {{CheckM}},
-  url = {https://genome.cshlp.org/content/25/7/1043},
-  doi = {10.1101/gr.186072.114},
-  abstract = {Large-scale recovery of genomes from isolates, single cells, and metagenomic data has been made possible by advances in computational methods and substantial reductions in sequencing costs. Although this increasing breadth of draft genomes is providing key information regarding the evolutionary and functional diversity of microbial life, it has become impractical to finish all available reference genomes. Making robust biological inferences from draft genomes requires accurate estimates of their completeness and contamination. Current methods for assessing genome quality are ad hoc and generally make use of a limited number of “marker” genes conserved across all bacterial or archaeal genomes. Here we introduce CheckM, an automated method for assessing the quality of a genome using a broader set of marker genes specific to the position of a genome within a reference genome tree and information about the collocation of these genes. We demonstrate the effectiveness of CheckM using synthetic data and a wide range of isolate-, single-cell-, and metagenome-derived genomes. CheckM is shown to provide accurate estimates of genome completeness and contamination and to outperform existing approaches. Using CheckM, we identify a diverse range of errors currently impacting publicly available isolate genomes and demonstrate that genomes obtained from single cells and metagenomic data vary substantially in quality. In order to facilitate the use of draft genomes, we propose an objective measure of genome quality that can be used to select genomes suitable for specific gene- and genome-centric analyses of microbial communities.},
-  language = {en},
-  number = {7},
-  urldate = {2022-04-27},
-  journal = {Genome Research},
-  author = {Parks, Donovan H. and Imelfort, Michael and Skennerton, Connor T. and Hugenholtz, Philip and Tyson, Gene W.},
-  month = {jul},
-  year = {2015},
-  pmid = {25977477},
-  pages = {1043--1055},
-}

-@misc{hmmer2022,
-  title = {{HMMER}},
-  url = {http://hmmer.org/},
-  urldate = {2022-04-27},
-}
+@article{kang2019,
+  title = {{{MetaBAT}} 2: {{An}} Adaptive Binning Algorithm for Robust and Efficient Genome Reconstruction from Metagenome Assemblies},
+  author = {Kang, Dongwan D. and Li, Feng and Kirton, Edward and Thomas, Ashleigh and Egan, Rob and An, Hong and Wang, Zhong},
+  year = {2019},
+  journal = {PeerJ},
+  volume = {2019},
+  number = {7},
+  publisher = {{PeerJ Inc.}},
+  issn = {21678359},
+  doi = {10.7717/peerj.7359},
+  keywords = {Clustering,Metagenome binning,Metagenomics}
+}
5 changes: 3 additions & 2 deletions q2_moshpit/kraken2/__init__.py
@@ -6,6 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

-from .kraken2 import classify_kraken
+from .database import build_kraken_db
+from .classification import classify_kraken

-__all__ = ['classify_kraken']
+__all__ = ['build_kraken_db', 'classify_kraken']
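
With this refactor, both actions are re-exported from the kraken2 subpackage, so (as a minimal sketch, following the __all__ above) downstream code can import them directly:

from q2_moshpit.kraken2 import build_kraken_db, classify_kraken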
q2_moshpit/kraken2/{kraken2.py → classification.py}
@@ -12,19 +12,19 @@

import pandas as pd
from q2_types.per_sample_sequences import (
-    SingleLanePerSampleSingleEndFastqDirFmt,
    SingleLanePerSamplePairedEndFastqDirFmt,
+    SingleLanePerSampleSingleEndFastqDirFmt
)

-from q2_moshpit._utils import _process_common_input_params, run_command
+from q2_moshpit._utils import run_command, _process_common_input_params
+from q2_moshpit.kraken2.utils import _process_kraken2_arg
from q2_types_genomics.kraken2 import (
    Kraken2ReportDirectoryFormat,
-    Kraken2OutputDirectoryFormat, Kraken2DBDirectoryFormat,
+    Kraken2OutputDirectoryFormat,
+    Kraken2DBDirectoryFormat
)
from q2_types_genomics.per_sample_data import MultiMAGSequencesDirFmt

-from q2_moshpit.kraken2.utils import _process_kraken2_arg


def _get_seq_paths(df_index, df_row, df_columns):
    if "filename" in df_columns:
@@ -37,7 +37,7 @@ def _get_seq_paths(df_index, df_row, df_columns):


def _construct_output_paths(
-    _sample, _bin, kraken2_outputs_dir, kraken2_reports_dir
+        _sample, _bin, kraken2_outputs_dir, kraken2_reports_dir
):
    sample_dir_report = os.path.join(kraken2_reports_dir.path, _sample)
    sample_dir_output = os.path.join(kraken2_outputs_dir.path, _sample)
214 changes: 214 additions & 0 deletions q2_moshpit/kraken2/database.py
@@ -0,0 +1,214 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import glob
import os
import shutil
import subprocess
import tempfile
from contextlib import ExitStack
from copy import deepcopy
from typing import List

from q2_types.feature_data import DNAFASTAFormat

from q2_moshpit._utils import _process_common_input_params, run_command
from q2_types_genomics.kraken2 import (
    Kraken2DBDirectoryFormat,
)

from q2_moshpit.kraken2.utils import _process_kraken2_arg


def _build_standard_db(db_dir: str, all_kwargs: dict):
    kwargs = {
        k: v for k, v in all_kwargs.items()
        if k in ["threads", "no_masking", "use_ftp", "fast_build"]
    }
    common_args = _process_common_input_params(
        processing_func=_process_kraken2_arg, params=kwargs
    )
    if all_kwargs["max_db_size"] > 0:
        common_args.extend(
            ["--max-db-size", str(all_kwargs["max_db_size"])]
        )
    cmd = [
        "kraken2-build", "--standard", "--db", db_dir, *common_args
    ]
    try:
        run_command(cmd=cmd, verbose=True)
    except subprocess.CalledProcessError as e:
        raise Exception(
            "An error was encountered while building the standard "
            f"library, (return code {e.returncode}), please inspect "
            "stdout and stderr to learn more."
        )


def _fetch_taxonomy(db_dir: str, threads: int, use_ftp: bool):
    cmd = [
        "kraken2-build", "--download-taxonomy",
        "--threads", str(threads), "--db", str(db_dir),
    ]
    if use_ftp:
        cmd.append("--use-ftp")
    try:
        run_command(cmd=cmd, verbose=True)
    except subprocess.CalledProcessError as e:
        raise Exception(
            "An error was encountered while downloading taxonomy, "
            f"(return code {e.returncode}), please inspect "
            "stdout and stderr to learn more."
        )


def _fetch_libraries(db_dir: str, libraries: List[str], all_kwargs: dict):
    kwargs = {
        k: v for k, v in all_kwargs.items()
        if k in ["threads", "no_masking", "use_ftp"]
    }
    common_args = _process_common_input_params(
        processing_func=_process_kraken2_arg, params=kwargs
    )
    common_args.extend(["--db", db_dir])
    base_cmd = [
        "kraken2-build", "--download-library",
    ]
    # "skip" leaves an already-downloaded library untouched;
    # any other value refetches it
    fetch_behaviour = all_kwargs.get("library_exists", "refetch")
    for library in libraries:
        if fetch_behaviour == "skip":
            lib_path = os.path.join(db_dir, "library", library)
            if os.path.exists(lib_path) and len(os.listdir(lib_path)) > 0:
                print(
                    f"Skipping download of the '{library}' library: "
                    f"it already exists."
                )
                continue
        try:
            cmd = deepcopy(base_cmd)
            cmd.extend([library, *common_args])
            run_command(cmd=cmd, verbose=True)
        except subprocess.CalledProcessError as e:
            raise Exception(
                f"An error was encountered while downloading the "
                f"'{library}' library, (return code {e.returncode}), "
                "please inspect stdout and stderr to learn more."
            )


def _add_seqs_to_library(db_dir: str, seqs: DNAFASTAFormat, no_masking: bool):
    cmd = [
        "kraken2-build", "--add-to-library",
        str(seqs.path), "--db", db_dir
    ]
    if no_masking:
        cmd.append("--no-mask")
    try:
        run_command(cmd=cmd, verbose=True)
    except subprocess.CalledProcessError as e:
        raise Exception(
            "An error was encountered while adding sequences to the "
            f"library, (return code {e.returncode}), please inspect "
            "stdout and stderr to learn more."
        )


def _build_database(db_dir: str, all_kwargs: dict):
    kwargs = {
        k: v for k, v in all_kwargs.items()
        if k in ["threads", "minimizer_len", "minimizer_spaces",
                 "load_factor", "fast_build", "kmer_len"]
    }
    common_args = _process_common_input_params(
        processing_func=_process_kraken2_arg, params=kwargs
    )
    if all_kwargs["max_db_size"] > 0:
        common_args.extend(
            ["--max-db-size", str(all_kwargs["max_db_size"])]
        )
    cmd = [
        "kraken2-build", "--build", "--db", db_dir, *common_args
    ]
    try:
        run_command(cmd=cmd, verbose=True)
    except subprocess.CalledProcessError as e:
        raise Exception(
            "An error was encountered while building the database, "
            f"(return code {e.returncode}), please inspect "
            "stdout and stderr to learn more."
        )


def _move_db_files(source: str, destination: str):
    # Only the binary *.k2d files constitute the finished database;
    # intermediate library and taxonomy data stays in the build directory.
    files = glob.glob(f"{source}/*.k2d")
    for file in files:
        new_file = os.path.join(destination, os.path.split(file)[-1])
        shutil.move(file, new_file)


def build_kraken_db(
    seqs: DNAFASTAFormat = None,
    standard: bool = False,
    library_path: str = None,
    libraries: List[str] = None,
    library_exists: str = 'skip',
    threads: int = 1,
    kmer_len: int = 35,
    minimizer_len: int = 31,
    minimizer_spaces: int = 7,
    no_masking: bool = False,
    max_db_size: int = 0,
    use_ftp: bool = False,
    load_factor: float = 0.7,
    fast_build: bool = False,
) -> Kraken2DBDirectoryFormat:
    db = Kraken2DBDirectoryFormat()
    db_dir = str(db.path)

    if standard and libraries:
        raise ValueError(
            'Standard Kraken2 database was requested but some libraries '
            'were also provided. Please provide either only the "standard" '
            'option or a list of "libraries" to be fetched for the database.'
        )

    with ExitStack() as stack:
        if not library_path:
            temp = tempfile.TemporaryDirectory()
            temp_dir = temp.name
            stack.enter_context(temp)
        else:
            os.makedirs(library_path, exist_ok=True)
            temp_dir = library_path

        # If requested, build the standard Kraken2 database
        # (locals() hands every parameter of this action to the helper)
        if standard:
            _build_standard_db(db_dir=temp_dir, all_kwargs=locals())
            _move_db_files(source=temp_dir, destination=db_dir)
            return db

        # Fetch taxonomy (required regardless of the build source)
        _fetch_taxonomy(db_dir=temp_dir, threads=threads, use_ftp=use_ftp)

        # If requested, download all the libraries
        if libraries:
            _fetch_libraries(
                db_dir=temp_dir, libraries=libraries, all_kwargs=locals()
            )

        # If provided, add the additional sequences to the database
        if seqs:
            for seq in seqs:
                _add_seqs_to_library(
                    db_dir=temp_dir, seqs=seq, no_masking=no_masking
                )

        # Finally, build the actual database
        _build_database(db_dir=temp_dir, all_kwargs=locals())

        # Move the database files (*.k2d) to the final location
        _move_db_files(source=temp_dir, destination=db_dir)

    return db
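
For orientation, a minimal sketch of invoking the new action directly from Python. This is an illustration, not part of the commit: the parameter values are assumptions (library names follow kraken2-build's conventions), and the QIIME 2 plugin registration for this action is not shown in this diff:

from q2_moshpit.kraken2 import build_kraken_db

# Download the taxonomy plus the RefSeq "viral" library, then build
# the database with 4 threads; the returned Kraken2DBDirectoryFormat
# holds the final *.k2d files moved in by _move_db_files().
db = build_kraken_db(
    libraries=["viral"],
    threads=4,
)
print(db.path)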
