Skip to content

Commit

Permalink
ENH: Added option to compare reads to WildCARD database (#11)
Browse files Browse the repository at this point in the history
-fetch-card-db fetches CARD and WildCARD and produces two artifacts, one for CARD and WildCARD data and one for the Kmer database
-added option to compare reads to WildCARD database with annotate-reads-card
-added option to compare reads to three additional detection models with annotate-reads-card
-added new type and formats for Kmer database
  • Loading branch information
VinzentRisch authored Jan 3, 2024
1 parent 414355d commit 4e6f0e3
Show file tree
Hide file tree
Showing 29 changed files with 830 additions and 270 deletions.
1 change: 1 addition & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[run]
source=q2_amr
branch = True
omit =
*/tests*
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ htmlcov/
.tox/
.nox/
.coverage
.coveragerc
.coverage.*
.cache
nosetests.xml
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ test: all
py.test

test-cov: all
py.test --cov=q2_amr
coverage run -m pytest
coverage xml

install: all
$(PYTHON) setup.py install
Expand Down
8 changes: 7 additions & 1 deletion ci/recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ requirements:
run:
- python {{ python }}
- qiime2 {{ qiime2_epoch }}.*
- q2-types-genomics {{ qiime2_epoch }}.*
- q2-types {{ qiime2_epoch }}.*
- q2templates {{ qiime2_epoch }}.*
- q2cli {{ qiime2_epoch }}.*
- rgi
- altair

test:
requires:
Expand All @@ -28,7 +34,7 @@ test:
- q2_amr
- qiime2.plugins.amr
commands:
- pytest --cov q2_amr --pyargs q2_amr
- pytest --cov q2_amr --cov-report xml:coverage.xml --pyargs q2_amr

about:
home: https://github.com/bokulich-lab/q2-amr
Expand Down
134 changes: 124 additions & 10 deletions q2_amr/card/database.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,142 @@
import glob
import gzip
import os
import shutil
import subprocess
import tarfile
import tempfile

import requests

from q2_amr.types import CARDDatabaseDirectoryFormat
from q2_amr.card.utils import run_command
from q2_amr.types._format import (
CARDDatabaseDirectoryFormat,
CARDKmerDatabaseDirectoryFormat,
)

CARD_URL = "https://card.mcmaster.ca/latest/data"


def fetch_card_db() -> CARDDatabaseDirectoryFormat:
def fetch_card_db() -> (CARDDatabaseDirectoryFormat, CARDKmerDatabaseDirectoryFormat):
# Fetch CARD and WildCARD data from CARD website
try:
response = requests.get(CARD_URL, stream=True)
response_card = requests.get(
"https://card.mcmaster.ca/latest/data", stream=True
)
response_wildcard = requests.get(
"https://card.mcmaster.ca/latest/variants", stream=True
)
except requests.ConnectionError as e:
raise requests.ConnectionError("Network connectivity problems.") from e

# Create temporary directory for WildCARD data
with tempfile.TemporaryDirectory() as tmp_dir:
os.mkdir(os.path.join(tmp_dir, "wildcard"))

# Extract tar.bz2 archives and store files in dirs "card" and "wildcard_zip"
try:
with tarfile.open(fileobj=response.raw, mode="r|bz2") as tar:
tar.extractall(path=tmp_dir)
with tarfile.open(
fileobj=response_card.raw, mode="r|bz2"
) as c_tar, tarfile.open(
fileobj=response_wildcard.raw, mode="r|bz2"
) as wc_tar:
c_tar.extractall(path=os.path.join(tmp_dir, "card"))
wc_tar.extractall(path=os.path.join(tmp_dir, "wildcard_zip"))
except tarfile.ReadError as a:
raise tarfile.ReadError("Tarfile is invalid.") from a

# List of files to be unzipped
files = (
"index-for-model-sequences.txt.gz",
"nucleotide_fasta_protein_homolog_model_variants.fasta.gz",
"nucleotide_fasta_protein_overexpression_model_variants.fasta.gz",
"nucleotide_fasta_protein_variant_model_variants.fasta.gz",
"nucleotide_fasta_rRNA_gene_variant_model_variants.fasta.gz",
"61_kmer_db.json.gz",
"all_amr_61mers.txt.gz",
)

# Unzip gzip files and save them in "wildcard" dir
for file in files:
with gzip.open(
os.path.join(tmp_dir, "wildcard_zip", file), "rb"
) as f_in, open(
os.path.join(tmp_dir, "wildcard", file[:-3]), "wb"
) as f_out:
f_out.write(f_in.read())

# Preprocess data for CARD and WildCARD
# This creates additional fasta files in the temp directory
preprocess(dir=tmp_dir, operation="card")
preprocess(dir=tmp_dir, operation="wildcard")

# Create CARD and Kmer database objects
card_db = CARDDatabaseDirectoryFormat()
shutil.move(
os.path.join(tmp_dir, "card.json"), os.path.join(str(card_db), "card.json")
kmer_db = CARDKmerDatabaseDirectoryFormat()

# Find names of CARD database files created by preprocess function
card_db_files = [
os.path.basename(file)
for file in glob.glob(os.path.join(tmp_dir, "card_database_v*.fasta"))
]

# Lists of filenames to be moved to CARD and Kmer database objects
wildcard_to_card_db = [
"index-for-model-sequences.txt",
"nucleotide_fasta_protein_homolog_model_variants.fasta",
"nucleotide_fasta_protein_overexpression_model_variants.fasta",
"nucleotide_fasta_protein_variant_model_variants.fasta",
"nucleotide_fasta_rRNA_gene_variant_model_variants.fasta",
]
tmp_to_card_db = [
"wildcard_database_v0.fasta",
"wildcard_database_v0_all.fasta",
card_db_files[0],
card_db_files[1],
]
wildcard_to_kmer_db = ["all_amr_61mers.txt", "61_kmer_db.json"]

# List of source and destination paths for files
src_des_list = [
(os.path.join(tmp_dir, "card"), str(card_db)),
(os.path.join(tmp_dir, "wildcard"), str(card_db)),
(tmp_dir, str(card_db)),
(os.path.join(tmp_dir, "wildcard"), str(kmer_db)),
]

# Move all files from source path to destination path
for file_list, src_des in zip(
[["card.json"], wildcard_to_card_db, tmp_to_card_db, wildcard_to_kmer_db],
src_des_list,
):
for file in file_list:
shutil.move(
os.path.join(src_des[0], file), os.path.join(src_des[1], file)
)

return card_db, kmer_db


def preprocess(dir, operation):
if operation == "card":
# Run RGI command for CARD data
cmd = ["rgi", "card_annotation", "-i", "card/card.json"]
elif operation == "wildcard":
# Run RGI command for WildCARD data
cmd = [
"rgi",
"wildcard_annotation",
"-i",
"wildcard",
"--card_json",
"card/card.json",
"-v",
"0",
]

try:
run_command(cmd, dir, verbose=True)
except subprocess.CalledProcessError as e:
raise Exception(
f"An error was encountered while running rgi, "
f"(return code {e.returncode}), please inspect "
"stdout and stderr to learn more."
)
return card_db
13 changes: 4 additions & 9 deletions q2_amr/card/mags.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,13 @@
import pandas as pd
from q2_types_genomics.per_sample_data import MultiMAGSequencesDirFmt

from q2_amr.card.utils import (
create_count_table,
load_preprocess_card_db,
read_in_txt,
run_command,
)
from q2_amr.types import CARDAnnotationDirectoryFormat, CARDDatabaseFormat
from q2_amr.card.utils import create_count_table, load_card_db, read_in_txt, run_command
from q2_amr.types import CARDAnnotationDirectoryFormat, CARDDatabaseDirectoryFormat


def annotate_mags_card(
mag: MultiMAGSequencesDirFmt,
card_db: CARDDatabaseFormat,
card_db: CARDDatabaseDirectoryFormat,
alignment_tool: str = "BLAST",
split_prodigal_jobs: bool = False,
include_loose: bool = False,
Expand All @@ -29,7 +24,7 @@ def annotate_mags_card(
amr_annotations = CARDAnnotationDirectoryFormat()
frequency_list = []
with tempfile.TemporaryDirectory() as tmp:
load_preprocess_card_db(tmp, card_db, "load")
load_card_db(tmp=tmp, card_db=card_db)
for samp_bin in list(manifest.index):
bin_dir = os.path.join(str(amr_annotations), samp_bin[0], samp_bin[1])
os.makedirs(bin_dir, exist_ok=True)
Expand Down
31 changes: 20 additions & 11 deletions q2_amr/card/reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,10 @@
SingleLanePerSampleSingleEndFastqDirFmt,
)

from q2_amr.card.utils import (
create_count_table,
load_preprocess_card_db,
read_in_txt,
run_command,
)
from q2_amr.card.utils import create_count_table, load_card_db, read_in_txt, run_command
from q2_amr.types import (
CARDAlleleAnnotationDirectoryFormat,
CARDDatabaseFormat,
CARDDatabaseDirectoryFormat,
CARDGeneAnnotationDirectoryFormat,
)

Expand All @@ -31,9 +26,11 @@ def annotate_reads_card(
reads: Union[
SingleLanePerSamplePairedEndFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt
],
card_db: CARDDatabaseFormat,
card_db: CARDDatabaseDirectoryFormat,
aligner: str = "kma",
threads: int = 1,
include_wildcard: bool = False,
include_other_models: bool = False,
) -> (
CARDAlleleAnnotationDirectoryFormat,
CARDGeneAnnotationDirectoryFormat,
Expand All @@ -46,9 +43,13 @@ def annotate_reads_card(
amr_allele_annotation = CARDAlleleAnnotationDirectoryFormat()
amr_gene_annotation = CARDGeneAnnotationDirectoryFormat()
with tempfile.TemporaryDirectory() as tmp:
load_preprocess_card_db(tmp, card_db, "load")
load_preprocess_card_db(tmp, card_db, "preprocess")
load_preprocess_card_db(tmp, card_db, "load_fasta")
load_card_db(
tmp=tmp,
card_db=card_db,
fasta=True,
include_other_models=include_other_models,
include_wildcard=include_wildcard,
)
for samp in list(manifest.index):
fwd = manifest.loc[samp, "forward"]
rev = manifest.loc[samp, "reverse"] if paired else None
Expand All @@ -65,6 +66,8 @@ def annotate_reads_card(
rev=rev,
aligner=aligner,
threads=threads,
include_wildcard=include_wildcard,
include_other_models=include_other_models,
)
path_allele = os.path.join(samp_input_dir, "output.allele_mapping_data.txt")
allele_frequency = read_in_txt(
Expand Down Expand Up @@ -109,6 +112,8 @@ def run_rgi_bwt(
rev: str,
aligner: str,
threads: int,
include_wildcard: bool,
include_other_models: bool,
):
cmd = [
"rgi",
Expand All @@ -126,6 +131,10 @@ def run_rgi_bwt(
]
if rev:
cmd.extend(["--read_two", rev])
if include_wildcard:
cmd.append("--include_wildcard")
if include_other_models:
cmd.append("--include_other_models")
try:
run_command(cmd, cwd, verbose=True)
except subprocess.CalledProcessError as e:
Expand Down
74 changes: 58 additions & 16 deletions q2_amr/card/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import glob
import json
import os
import subprocess
from functools import reduce

Expand All @@ -21,24 +23,64 @@ def run_command(cmd, cwd, verbose=True):
subprocess.run(cmd, check=True, cwd=cwd)


def load_preprocess_card_db(tmp, card_db, operation):
if operation == "load":
cmd = ["rgi", "load", "--card_json", str(card_db), "--local"]
elif operation == "preprocess":
cmd = ["rgi", "card_annotation", "-i", str(card_db)]
elif operation == "load_fasta":
with open(str(card_db)) as f:
def load_card_db(
tmp,
card_db,
kmer_db=None,
kmer: bool = False,
fasta: bool = False,
include_other_models: bool = False,
include_wildcard: bool = False,
):
# Get path to card.json
path_card_json = os.path.join(str(card_db), "card.json")

# Base command that only loads card.json into the local database
cmd = ["rgi", "load", "--card_json", path_card_json, "--local"]

# Define suffixes for card fasta file
models = ("_all", "_all_models") if include_other_models is True else ("", "")

# Extend base command with flag to load card fasta file
if fasta:
# Retrieve the database version number from card.jason file
with open(path_card_json) as f:
card_data = json.load(f)
version = card_data["_version"]
cmd = [
"rgi",
"load",
"-i",
str(card_db),
"--card_annotation",
f"card_database_v{version}.fasta",
"--local",
]

# Define path to card fasta file
path_card_fasta = os.path.join(
str(card_db), f"card_database_v{version}{models[0]}.fasta"
)

# Extend base command
cmd.extend([f"--card_annotation{models[1]}", path_card_fasta])

# Extend base command with flag to load wildcard fasta file and index
if include_wildcard:
cmd.extend(
[
f"--wildcard_annotation{models[1]}",
os.path.join(str(card_db), f"wildcard_database_v0{models[0]}.fasta"),
"--wildcard_index",
os.path.join(str(card_db), "index-for-model-sequences.txt"),
]
)
# Extend base command with flag to load kmer json and txt database files
if kmer:
path_kmer_json = glob.glob(os.path.join(str(kmer_db), "*_kmer_db.json"))[0]
cmd.extend(
[
"--kmer_database",
path_kmer_json,
"--amr_kmers",
glob.glob(os.path.join(str(kmer_db), "all_amr_*mers.txt"))[0],
"--kmer_size",
os.path.basename(path_kmer_json).split("_")[0],
]
)

# Run command
try:
run_command(cmd, tmp, verbose=True)
except subprocess.CalledProcessError as e:
Expand Down
Loading

0 comments on commit 4e6f0e3

Please sign in to comment.