ENH: Added option to compare reads to WildCARD database (#11)

-fetch-card-db fetches CARD and WildCARD and produces two artifacts, one for CARD and WildCARD data and one for the Kmer database -added option to compare reads to WildCARD database with annotate-reads-card -added option to compare reads to three additional detection models with annotate-reads-card -added new type and formats for Kmer database
bokulich-lab · Jan 3, 2024 · 4e6f0e3 · 4e6f0e3
1 parent 414355d
commit 4e6f0e3
Show file tree

Hide file tree

Showing 29 changed files with 830 additions and 270 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -1,4 +1,5 @@
 [run]
+source=q2_amr
 branch = True
 omit =
     */tests*

diff --git a/.gitignore b/.gitignore
@@ -42,6 +42,7 @@ htmlcov/
 .tox/
 .nox/
 .coverage
+.coveragerc
 .coverage.*
 .cache
 nosetests.xml

diff --git a/Makefile b/Makefile
@@ -12,7 +12,8 @@ test: all
 	py.test
 
 test-cov: all
-	py.test --cov=q2_amr
+	coverage run -m pytest
+	coverage xml
 
 install: all
 	$(PYTHON) setup.py install

diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml
@@ -19,6 +19,12 @@ requirements:
   run:
     - python {{ python }}
     - qiime2 {{ qiime2_epoch }}.*
+    - q2-types-genomics {{ qiime2_epoch }}.*
+    - q2-types {{ qiime2_epoch }}.*
+    - q2templates {{ qiime2_epoch }}.*
+    - q2cli {{ qiime2_epoch }}.*
+    - rgi
+    - altair
 
 test:
   requires:
@@ -28,7 +34,7 @@ test:
     - q2_amr
     - qiime2.plugins.amr
   commands:
-    - pytest --cov q2_amr --pyargs q2_amr
+    - pytest --cov q2_amr --cov-report xml:coverage.xml --pyargs q2_amr
 
 about:
   home: https://github.com/bokulich-lab/q2-amr

diff --git a/q2_amr/card/database.py b/q2_amr/card/database.py
@@ -1,28 +1,142 @@
+import glob
+import gzip
 import os
 import shutil
+import subprocess
 import tarfile
 import tempfile
 
 import requests
 
-from q2_amr.types import CARDDatabaseDirectoryFormat
+from q2_amr.card.utils import run_command
+from q2_amr.types._format import (
+    CARDDatabaseDirectoryFormat,
+    CARDKmerDatabaseDirectoryFormat,
+)
 
-CARD_URL = "https://card.mcmaster.ca/latest/data"
 
-
-def fetch_card_db() -> CARDDatabaseDirectoryFormat:
+def fetch_card_db() -> (CARDDatabaseDirectoryFormat, CARDKmerDatabaseDirectoryFormat):
+    # Fetch CARD and WildCARD data from CARD website
     try:
-        response = requests.get(CARD_URL, stream=True)
+        response_card = requests.get(
+            "https://card.mcmaster.ca/latest/data", stream=True
+        )
+        response_wildcard = requests.get(
+            "https://card.mcmaster.ca/latest/variants", stream=True
+        )
     except requests.ConnectionError as e:
         raise requests.ConnectionError("Network connectivity problems.") from e
+
+    # Create temporary directory for WildCARD data
     with tempfile.TemporaryDirectory() as tmp_dir:
+        os.mkdir(os.path.join(tmp_dir, "wildcard"))
+
+        # Extract tar.bz2 archives and store files in dirs "card" and "wildcard_zip"
         try:
-            with tarfile.open(fileobj=response.raw, mode="r|bz2") as tar:
-                tar.extractall(path=tmp_dir)
+            with tarfile.open(
+                fileobj=response_card.raw, mode="r|bz2"
+            ) as c_tar, tarfile.open(
+                fileobj=response_wildcard.raw, mode="r|bz2"
+            ) as wc_tar:
+                c_tar.extractall(path=os.path.join(tmp_dir, "card"))
+                wc_tar.extractall(path=os.path.join(tmp_dir, "wildcard_zip"))
         except tarfile.ReadError as a:
             raise tarfile.ReadError("Tarfile is invalid.") from a
+
+        # List of files to be unzipped
+        files = (
+            "index-for-model-sequences.txt.gz",
+            "nucleotide_fasta_protein_homolog_model_variants.fasta.gz",
+            "nucleotide_fasta_protein_overexpression_model_variants.fasta.gz",
+            "nucleotide_fasta_protein_variant_model_variants.fasta.gz",
+            "nucleotide_fasta_rRNA_gene_variant_model_variants.fasta.gz",
+            "61_kmer_db.json.gz",
+            "all_amr_61mers.txt.gz",
+        )
+
+        # Unzip gzip files and save them in "wildcard" dir
+        for file in files:
+            with gzip.open(
+                os.path.join(tmp_dir, "wildcard_zip", file), "rb"
+            ) as f_in, open(
+                os.path.join(tmp_dir, "wildcard", file[:-3]), "wb"
+            ) as f_out:
+                f_out.write(f_in.read())
+
+        # Preprocess data for CARD and WildCARD
+        # This creates additional fasta files in the temp directory
+        preprocess(dir=tmp_dir, operation="card")
+        preprocess(dir=tmp_dir, operation="wildcard")
+
+        # Create CARD and Kmer database objects
         card_db = CARDDatabaseDirectoryFormat()
-        shutil.move(
-            os.path.join(tmp_dir, "card.json"), os.path.join(str(card_db), "card.json")
+        kmer_db = CARDKmerDatabaseDirectoryFormat()
+
+        # Find names of CARD database files created by preprocess function
+        card_db_files = [
+            os.path.basename(file)
+            for file in glob.glob(os.path.join(tmp_dir, "card_database_v*.fasta"))
+        ]
+
+        # Lists of filenames to be moved to CARD and Kmer database objects
+        wildcard_to_card_db = [
+            "index-for-model-sequences.txt",
+            "nucleotide_fasta_protein_homolog_model_variants.fasta",
+            "nucleotide_fasta_protein_overexpression_model_variants.fasta",
+            "nucleotide_fasta_protein_variant_model_variants.fasta",
+            "nucleotide_fasta_rRNA_gene_variant_model_variants.fasta",
+        ]
+        tmp_to_card_db = [
+            "wildcard_database_v0.fasta",
+            "wildcard_database_v0_all.fasta",
+            card_db_files[0],
+            card_db_files[1],
+        ]
+        wildcard_to_kmer_db = ["all_amr_61mers.txt", "61_kmer_db.json"]
+
+        # List of source and destination paths for files
+        src_des_list = [
+            (os.path.join(tmp_dir, "card"), str(card_db)),
+            (os.path.join(tmp_dir, "wildcard"), str(card_db)),
+            (tmp_dir, str(card_db)),
+            (os.path.join(tmp_dir, "wildcard"), str(kmer_db)),
+        ]
+
+        # Move all files from source path to destination path
+        for file_list, src_des in zip(
+            [["card.json"], wildcard_to_card_db, tmp_to_card_db, wildcard_to_kmer_db],
+            src_des_list,
+        ):
+            for file in file_list:
+                shutil.move(
+                    os.path.join(src_des[0], file), os.path.join(src_des[1], file)
+                )
+
+        return card_db, kmer_db
+
+
+def preprocess(dir, operation):
+    if operation == "card":
+        # Run RGI command for CARD data
+        cmd = ["rgi", "card_annotation", "-i", "card/card.json"]
+    elif operation == "wildcard":
+        # Run RGI command for WildCARD data
+        cmd = [
+            "rgi",
+            "wildcard_annotation",
+            "-i",
+            "wildcard",
+            "--card_json",
+            "card/card.json",
+            "-v",
+            "0",
+        ]
+
+    try:
+        run_command(cmd, dir, verbose=True)
+    except subprocess.CalledProcessError as e:
+        raise Exception(
+            f"An error was encountered while running rgi, "
+            f"(return code {e.returncode}), please inspect "
+            "stdout and stderr to learn more."
         )
-        return card_db
diff --git a/q2_amr/card/mags.py b/q2_amr/card/mags.py
@@ -6,18 +6,13 @@
 import pandas as pd
 from q2_types_genomics.per_sample_data import MultiMAGSequencesDirFmt
 
-from q2_amr.card.utils import (
-    create_count_table,
-    load_preprocess_card_db,
-    read_in_txt,
-    run_command,
-)
-from q2_amr.types import CARDAnnotationDirectoryFormat, CARDDatabaseFormat
+from q2_amr.card.utils import create_count_table, load_card_db, read_in_txt, run_command
+from q2_amr.types import CARDAnnotationDirectoryFormat, CARDDatabaseDirectoryFormat
 
 
 def annotate_mags_card(
     mag: MultiMAGSequencesDirFmt,
-    card_db: CARDDatabaseFormat,
+    card_db: CARDDatabaseDirectoryFormat,
     alignment_tool: str = "BLAST",
     split_prodigal_jobs: bool = False,
     include_loose: bool = False,
@@ -29,7 +24,7 @@ def annotate_mags_card(
     amr_annotations = CARDAnnotationDirectoryFormat()
     frequency_list = []
     with tempfile.TemporaryDirectory() as tmp:
-        load_preprocess_card_db(tmp, card_db, "load")
+        load_card_db(tmp=tmp, card_db=card_db)
         for samp_bin in list(manifest.index):
             bin_dir = os.path.join(str(amr_annotations), samp_bin[0], samp_bin[1])
             os.makedirs(bin_dir, exist_ok=True)

diff --git a/q2_amr/card/reads.py b/q2_amr/card/reads.py
@@ -14,15 +14,10 @@
     SingleLanePerSampleSingleEndFastqDirFmt,
 )
 
-from q2_amr.card.utils import (
-    create_count_table,
-    load_preprocess_card_db,
-    read_in_txt,
-    run_command,
-)
+from q2_amr.card.utils import create_count_table, load_card_db, read_in_txt, run_command
 from q2_amr.types import (
     CARDAlleleAnnotationDirectoryFormat,
-    CARDDatabaseFormat,
+    CARDDatabaseDirectoryFormat,
     CARDGeneAnnotationDirectoryFormat,
 )
 
@@ -31,9 +26,11 @@ def annotate_reads_card(
     reads: Union[
         SingleLanePerSamplePairedEndFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt
     ],
-    card_db: CARDDatabaseFormat,
+    card_db: CARDDatabaseDirectoryFormat,
     aligner: str = "kma",
     threads: int = 1,
+    include_wildcard: bool = False,
+    include_other_models: bool = False,
 ) -> (
     CARDAlleleAnnotationDirectoryFormat,
     CARDGeneAnnotationDirectoryFormat,
@@ -46,9 +43,13 @@ def annotate_reads_card(
     amr_allele_annotation = CARDAlleleAnnotationDirectoryFormat()
     amr_gene_annotation = CARDGeneAnnotationDirectoryFormat()
     with tempfile.TemporaryDirectory() as tmp:
-        load_preprocess_card_db(tmp, card_db, "load")
-        load_preprocess_card_db(tmp, card_db, "preprocess")
-        load_preprocess_card_db(tmp, card_db, "load_fasta")
+        load_card_db(
+            tmp=tmp,
+            card_db=card_db,
+            fasta=True,
+            include_other_models=include_other_models,
+            include_wildcard=include_wildcard,
+        )
         for samp in list(manifest.index):
             fwd = manifest.loc[samp, "forward"]
             rev = manifest.loc[samp, "reverse"] if paired else None
@@ -65,6 +66,8 @@ def annotate_reads_card(
                 rev=rev,
                 aligner=aligner,
                 threads=threads,
+                include_wildcard=include_wildcard,
+                include_other_models=include_other_models,
             )
             path_allele = os.path.join(samp_input_dir, "output.allele_mapping_data.txt")
             allele_frequency = read_in_txt(
@@ -109,6 +112,8 @@ def run_rgi_bwt(
     rev: str,
     aligner: str,
     threads: int,
+    include_wildcard: bool,
+    include_other_models: bool,
 ):
     cmd = [
         "rgi",
@@ -126,6 +131,10 @@ def run_rgi_bwt(
     ]
     if rev:
         cmd.extend(["--read_two", rev])
+    if include_wildcard:
+        cmd.append("--include_wildcard")
+    if include_other_models:
+        cmd.append("--include_other_models")
     try:
         run_command(cmd, cwd, verbose=True)
     except subprocess.CalledProcessError as e:

diff --git a/q2_amr/card/utils.py b/q2_amr/card/utils.py
@@ -1,4 +1,6 @@
+import glob
 import json
+import os
 import subprocess
 from functools import reduce
 
@@ -21,24 +23,64 @@ def run_command(cmd, cwd, verbose=True):
     subprocess.run(cmd, check=True, cwd=cwd)
 
 
-def load_preprocess_card_db(tmp, card_db, operation):
-    if operation == "load":
-        cmd = ["rgi", "load", "--card_json", str(card_db), "--local"]
-    elif operation == "preprocess":
-        cmd = ["rgi", "card_annotation", "-i", str(card_db)]
-    elif operation == "load_fasta":
-        with open(str(card_db)) as f:
+def load_card_db(
+    tmp,
+    card_db,
+    kmer_db=None,
+    kmer: bool = False,
+    fasta: bool = False,
+    include_other_models: bool = False,
+    include_wildcard: bool = False,
+):
+    # Get path to card.json
+    path_card_json = os.path.join(str(card_db), "card.json")
+
+    # Base command that only loads card.json into the local database
+    cmd = ["rgi", "load", "--card_json", path_card_json, "--local"]
+
+    # Define suffixes for card fasta file
+    models = ("_all", "_all_models") if include_other_models is True else ("", "")
+
+    # Extend base command with flag to load card fasta file
+    if fasta:
+        # Retrieve the database version number from card.jason file
+        with open(path_card_json) as f:
             card_data = json.load(f)
             version = card_data["_version"]
-        cmd = [
-            "rgi",
-            "load",
-            "-i",
-            str(card_db),
-            "--card_annotation",
-            f"card_database_v{version}.fasta",
-            "--local",
-        ]
+
+        # Define path to card fasta file
+        path_card_fasta = os.path.join(
+            str(card_db), f"card_database_v{version}{models[0]}.fasta"
+        )
+
+        # Extend base command
+        cmd.extend([f"--card_annotation{models[1]}", path_card_fasta])
+
+    # Extend base command with flag to load wildcard fasta file and index
+    if include_wildcard:
+        cmd.extend(
+            [
+                f"--wildcard_annotation{models[1]}",
+                os.path.join(str(card_db), f"wildcard_database_v0{models[0]}.fasta"),
+                "--wildcard_index",
+                os.path.join(str(card_db), "index-for-model-sequences.txt"),
+            ]
+        )
+    # Extend base command with flag to load kmer json and txt database files
+    if kmer:
+        path_kmer_json = glob.glob(os.path.join(str(kmer_db), "*_kmer_db.json"))[0]
+        cmd.extend(
+            [
+                "--kmer_database",
+                path_kmer_json,
+                "--amr_kmers",
+                glob.glob(os.path.join(str(kmer_db), "all_amr_*mers.txt"))[0],
+                "--kmer_size",
+                os.path.basename(path_kmer_json).split("_")[0],
+            ]
+        )
+
+    # Run command
     try:
         run_command(cmd, tmp, verbose=True)
     except subprocess.CalledProcessError as e: