Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Added FeatureTable[Frequency] output to annotate-reads-card function #12

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions q2_amr/card/mags.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,9 @@ def annotate_mags_card(
shutil.move(f"{tmp}/output.json", json_path)
samp_bin_name = os.path.join(samp_bin[0], samp_bin[1])
frequency_df = read_in_txt(
path=txt_path, col_name="ARO", samp_bin_name=samp_bin_name
path=txt_path, samp_bin_name=samp_bin_name, data_type="mags"
)
if frequency_df is not None:
frequency_list.append(frequency_df)
frequency_list.append(frequency_df)
feature_table = create_count_table(df_list=frequency_list)
return (
amr_annotations,
Expand Down
10 changes: 4 additions & 6 deletions q2_amr/card/reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,14 @@ def annotate_reads_card(
)
path_allele = os.path.join(samp_input_dir, "output.allele_mapping_data.txt")
allele_frequency = read_in_txt(
path=path_allele, col_name="ARO Accession", samp_bin_name=samp
path=path_allele, samp_bin_name=samp, data_type="reads"
)
if allele_frequency is not None:
allele_frequency_list.append(allele_frequency)
allele_frequency_list.append(allele_frequency)
path_gene = os.path.join(samp_input_dir, "output.gene_mapping_data.txt")
gene_frequency = read_in_txt(
path=path_gene, col_name="ARO Accession", samp_bin_name=samp
path=path_gene, samp_bin_name=samp, data_type="reads"
)
if gene_frequency is not None:
gene_frequency_list.append(gene_frequency)
gene_frequency_list.append(gene_frequency)
move_files(samp_input_dir, samp_allele_dir, "allele")
move_files(samp_input_dir, samp_gene_dir, "gene")

Expand Down
25 changes: 19 additions & 6 deletions q2_amr/card/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,26 +91,39 @@ def load_card_db(
)


def read_in_txt(path: str, col_name: str, samp_bin_name: str):
def read_in_txt(path: str, samp_bin_name: str, data_type):
# Read in txt file to pd.DataFrame
df = pd.read_csv(path, sep="\t")
if df.empty:
return None
df = df[[col_name]]

# Process the df depending on the data type (from reads or mags)
if data_type == "reads":
df = df[["ARO Term", "All Mapped Reads"]]
df.rename(columns={"All Mapped Reads": samp_bin_name}, inplace=True)
else:
df = df[["Best_Hit_ARO"]]
df[samp_bin_name] = 1

df = df.astype(str)
df[samp_bin_name] = df.groupby(col_name)[col_name].transform("count")
df = df.drop_duplicates(subset=[col_name])
return df


def create_count_table(df_list: list) -> pd.DataFrame:
# Remove all empty DataFrames from df_list
df_list = [df for df in df_list if not df.empty]

# Raise ValueError if df_list is empty. This happens when no ARGs were detected
if not df_list:
raise ValueError(
"RGI did not identify any AMR genes. No output can be created."
)

# Merge all dfs contained in df_list
df = reduce(
lambda left, right: pd.merge(left, right, on=left.columns[0], how="outer"),
df_list,
)

# Process the df to meet all requirements for a FeatureTable
df = df.transpose()
df = df.fillna(0)
df.columns = df.iloc[0]
Expand Down
14 changes: 7 additions & 7 deletions q2_amr/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# ----------------------------------------------------------------------------
import importlib

from q2_types.feature_table import FeatureTable, PresenceAbsence
from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence
from q2_types.per_sample_sequences import (
PairedEndSequencesWithQuality,
SequencesWithQuality,
Expand Down Expand Up @@ -133,8 +133,8 @@
outputs=[
("amr_allele_annotation", SampleData[CARDAlleleAnnotation]),
("amr_gene_annotation", SampleData[CARDGeneAnnotation]),
("allele_feature_table", FeatureTable[PresenceAbsence]),
("gene_feature_table", FeatureTable[PresenceAbsence]),
("allele_feature_table", FeatureTable[Frequency]),
("gene_feature_table", FeatureTable[Frequency]),
],
input_descriptions={
"reads": "Paired or single end reads.",
Expand Down Expand Up @@ -162,10 +162,10 @@
output_descriptions={
"amr_allele_annotation": "AMR annotation mapped on alleles.",
"amr_gene_annotation": "AMR annotation mapped on genes.",
"allele_feature_table": "Presence and absence table of ARGs in all samples for"
" allele mapping.",
"gene_feature_table": "Presence and absence table of ARGs in all samples for "
"gene mapping.",
"allele_feature_table": "Frequency table of ARGs in all samples for allele "
"mapping.",
"gene_feature_table": "Frequency table of ARGs in all samples for gene "
"mapping.",
},
name="Annotate reads with antimicrobial resistance genes from CARD.",
description="Annotate reads with antimicrobial resistance genes from CARD.",
Expand Down
28 changes: 3 additions & 25 deletions q2_amr/tests/card/test_mags.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,18 @@
import os
import shutil
import subprocess
from copy import deepcopy
from unittest.mock import MagicMock, patch

import pandas as pd
from q2_types_genomics.per_sample_data import MultiMAGSequencesDirFmt
from qiime2.plugin.testing import TestPluginBase

from q2_amr.card.mags import annotate_mags_card, run_rgi_main
from q2_amr.types import CARDAnnotationDirectoryFormat, CARDDatabaseFormat
from q2_amr.types import CARDAnnotationDirectoryFormat, CARDDatabaseDirectoryFormat


class TestAnnotateMagsCard(TestPluginBase):
package = "q2_amr.tests"

table = pd.DataFrame(
{
"sample_id": ["sample1", "sample2"],
3000796: [1, 0],
3000815: [1, 1],
3000805: [1, 1],
3000026: [1, 2],
3000797: [0, 1],
}
)

def mock_run_rgi_main(
self,
tmp,
Expand All @@ -42,21 +29,13 @@ def mock_run_rgi_main(
shutil.copy(output_txt, f"{tmp}/output.txt")
shutil.copy(output_json, f"{tmp}/output.json")

def return_count_table(self, df_list):
count_table = deepcopy(self.table)
count_table.set_index("sample_id", inplace=True)
count_table = count_table.astype(float)
count_table.columns = count_table.columns.astype(float)
return count_table

def test_annotate_mags_card(self):

manifest = self.get_data_path("MANIFEST_mags")
mag = MultiMAGSequencesDirFmt()
card_db = CARDDatabaseFormat()
card_db = CARDDatabaseDirectoryFormat()
shutil.copy(manifest, os.path.join(str(mag), "MANIFEST"))

mock_create_count_table = MagicMock(side_effect=self.return_count_table)
mock_create_count_table = MagicMock()
mock_read_in_txt = MagicMock()
with patch(
"q2_amr.card.mags.run_rgi_main", side_effect=self.mock_run_rgi_main
Expand All @@ -67,7 +46,6 @@ def test_annotate_mags_card(self):
):
result = annotate_mags_card(mag, card_db)
self.assertIsInstance(result[0], CARDAnnotationDirectoryFormat)
self.assertIsInstance(result[1], pd.DataFrame)
self.assertTrue(
os.path.exists(
os.path.join(
Expand Down
10 changes: 2 additions & 8 deletions q2_amr/tests/card/test_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,11 @@
import tempfile
from unittest.mock import ANY, MagicMock, call, patch

import pandas as pd
from q2_types.per_sample_sequences import (
SingleLanePerSamplePairedEndFastqDirFmt,
SingleLanePerSampleSingleEndFastqDirFmt,
)
from qiime2.plugin.testing import TestPluginBase
from test_mags import TestAnnotateMagsCard

from q2_amr.card.reads import (
annotate_reads_card,
Expand Down Expand Up @@ -77,9 +75,7 @@ def annotate_reads_card_test_body(self, read_type):
mock_run_rgi_bwt = MagicMock(side_effect=self.copy_needed_files)
mock_run_rgi_load = MagicMock()
mock_read_in_txt = MagicMock()
mock_create_count_table = MagicMock(
side_effect=TestAnnotateMagsCard().return_count_table
)
mock_create_count_table = MagicMock()

# Patch run_rgi_bwt, run_rgi_load, read_in_txt and create_count_table functions
# and assign MagicMock objects
Expand Down Expand Up @@ -128,8 +124,8 @@ def annotate_reads_card_test_body(self, read_type):
exp_calls_mock_read = [
call(
path=f"{tmp_dir}/{samp}/output.{model}_mapping_data.txt",
col_name="ARO Accession",
samp_bin_name=samp,
data_type="reads",
)
for samp in ["sample1", "sample2"]
for model in ["allele", "gene"]
Expand All @@ -147,8 +143,6 @@ def annotate_reads_card_test_body(self, read_type):
# Assert if all output files are the expected format
self.assertIsInstance(result[0], CARDAlleleAnnotationDirectoryFormat)
self.assertIsInstance(result[1], CARDGeneAnnotationDirectoryFormat)
self.assertIsInstance(result[2], pd.DataFrame)
self.assertIsInstance(result[3], pd.DataFrame)

# Assert if the expected files are in every sample directory and in both
# resulting CARD annotation objects
Expand Down
82 changes: 42 additions & 40 deletions q2_amr/tests/card/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import pandas as pd
from qiime2.plugin.testing import TestPluginBase
from test_mags import TestAnnotateMagsCard

from q2_amr.card.utils import create_count_table, load_card_db, read_in_txt
from q2_amr.types import CARDDatabaseDirectoryFormat, CARDKmerDatabaseDirectoryFormat
Expand All @@ -16,26 +15,31 @@ class TestAnnotateReadsCARD(TestPluginBase):

@classmethod
def setUpClass(cls):
cls.mapping_data_sample1 = pd.DataFrame(
cls.count_df_list = []
for colname, ARG, sample in zip(
["ARO Term", "ARO Term", "Best_Hit_ARO"],
["mdtF", "mdtE", "mdtF"],
["sample1", "sample2", "sample1"],
):
df = pd.DataFrame(
{
colname: [ARG, "mgrA", "OprN", "mepA"],
sample: ["1", "1", "1", "1"],
}
)
cls.count_df_list.append(df)

cls.frequency_table = pd.DataFrame(
{
"ARO Accession": [3000796, 3000815, 3000805, 3000026],
"sample1": [1, 1, 1, 1],
}
)

cls.mapping_data_sample2 = pd.DataFrame(
{
"ARO Accession": [3000797, 3000815, 3000805, 3000026],
"sample2": [1, 1, 1, 2],
}
)

cls.mags_mapping_data_sample1 = pd.DataFrame(
{
"ARO": [3000796, 3000815, 3000805, 3000026],
"sample1": [1, 1, 1, 1],
"sample_id": ["sample1", "sample2"],
"mdtF": ["1", "0"],
"mgrA": ["1", "1"],
"OprN": ["1", "1"],
"mepA": ["1", "1"],
"mdtE": ["0", "1"],
}
)
cls.frequency_table.set_index("sample_id", inplace=True)

def test_load_card_db_fasta(self):
# Create CARD and Kmer database objects
Expand Down Expand Up @@ -119,37 +123,35 @@ def test_exception_raised(self):
self.assertEqual(str(cm.exception), expected_message)

def test_read_in_txt_mags(self):
path = self.get_data_path("output.mags.txt")
self.read_in_txt_test_body(
path, "ARO", "sample1", self.mags_mapping_data_sample1
)

def test_read_in_txt_allele(self):
path = self.get_data_path("output.allele_mapping_data.txt")
# Test read_in_txt with output data from annotate_mags_card
self.read_in_txt_test_body(
path, "ARO Accession", "sample1", self.mapping_data_sample1
"output.mags.txt", "sample1", self.count_df_list[2], "mags"
)

def test_read_in_txt_gene(self):
path = self.get_data_path("output.gene_mapping_data.txt")
def test_read_in_txt_reads(self):
# Test read_in_txt with output data from annotate_reads_card
self.read_in_txt_test_body(
path, "ARO Accession", "sample1", self.mapping_data_sample1
"output.allele_mapping_data.txt", "sample1", self.count_df_list[0], "reads"
)

def read_in_txt_test_body(self, path, col_name, samp_bin_name, mapping_data):
def read_in_txt_test_body(self, txt_file, samp_bin_name, mapping_data, data_type):
# Create expected and observed count DataFrames and compare them
exp = mapping_data
obs = read_in_txt(path, col_name, samp_bin_name)
obs[col_name] = obs[col_name].astype(int)
obs = read_in_txt(self.get_data_path(txt_file), samp_bin_name, data_type)
pd.testing.assert_frame_equal(exp, obs)

def test_create_count_table(self):
df_list = [self.mapping_data_sample1, self.mapping_data_sample2]
# Create observed count table with create_count_table function
df_list = [self.count_df_list[0], self.count_df_list[1]]
obs = create_count_table(df_list)
mag_test_class = TestAnnotateMagsCard()
exp = mag_test_class.table
exp.set_index("sample_id", inplace=True)
exp = exp.astype(float)
exp.columns = exp.columns.astype(float)
obs = obs.astype(str)

# Define expected count table
exp = self.frequency_table

# Compare expected and observed count table
pd.testing.assert_frame_equal(exp, obs)
df_list_empty = []
self.assertRaises(ValueError, create_count_table, df_list_empty)

def test_create_count_table_value_error(self):
# Assert that ValueError is raised when an empty list is passed
self.assertRaises(ValueError, create_count_table, [])
6 changes: 3 additions & 3 deletions q2_amr/tests/data/output.allele_mapping_data.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Reference Sequence ARO Term ARO Accession Reference Model Type Reference DB Reference Allele Source Resistomes & Variants: Observed in Genome(s) Resistomes & Variants: Observed in Plasmid(s) Resistomes & Variants: Observed Pathogen(s) Completely Mapped Reads Mapped Reads with Flanking Sequence All Mapped Reads Percent Coverage Length Coverage (bp) Average MAPQ (Completely Mapped Reads) Mate Pair Linkage Reference Length AMR Gene Family Drug Class Resistance Mechanism
ARO:3000796|ID:121|Name:mdtF|NCBI:U00096.1 mdtF 3000796 protein homolog model CARD CARD curation no data no data Escherichia coli 2 0 2 8.09 252 193.00 * 3114 resistance-nodulation-cell division (RND) antibiotic efflux pump macrolide antibiotic; fluoroquinolone antibiotic; penam antibiotic efflux
ARO:3000796|ID:121|Name:mdtF|NCBI:U00096.1 mdtF 3000796 protein homolog model CARD CARD curation no data no data Escherichia coli 1 0 1 8.09 252 193.00 * 3114 resistance-nodulation-cell division (RND) antibiotic efflux pump macrolide antibiotic; fluoroquinolone antibiotic; penam antibiotic efflux
ARO:3000815|ID:154|Name:mgrA|NCBI:BA000018.3 mgrA 3000815 protein homolog model CARD CARD curation no data no data Staphylococcus aureus 1 0 1 19.59 87 172.00 * 444 ATP-binding cassette (ABC) antibiotic efflux pump; major facilitator superfamily (MFS) antibiotic efflux pump fluoroquinolone antibiotic; cephalosporin; penam; tetracycline antibiotic; peptide antibiotic; disinfecting agents and antiseptics antibiotic efflux
ARO:3000805|ID:172|Name:OprN|NCBI:AE004091.2 OprN 3000805 protein homolog model CARD CARD curation no data no data Pseudomonas aeruginosa 2 0 2 17.76 252 193.00 * 1419 resistance-nodulation-cell division (RND) antibiotic efflux pump fluoroquinolone antibiotic; diaminopyrimidine antibiotic; phenicol antibiotic antibiotic efflux
ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 mepA 3000026 protein homolog model CARD CARD curation no data no data Staphylococcus aureus 2 0 2 17.70 240 190.50 * 1356 multidrug and toxic compound extrusion (MATE) transporter glycylcycline; tetracycline antibiotic antibiotic efflux
ARO:3000805|ID:172|Name:OprN|NCBI:AE004091.2 OprN 3000805 protein homolog model CARD CARD curation no data no data Pseudomonas aeruginosa 1 0 1 17.76 252 193.00 * 1419 resistance-nodulation-cell division (RND) antibiotic efflux pump fluoroquinolone antibiotic; diaminopyrimidine antibiotic; phenicol antibiotic antibiotic efflux
ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 mepA 3000026 protein homolog model CARD CARD curation no data no data Staphylococcus aureus 1 0 1 17.70 240 190.50 * 1356 multidrug and toxic compound extrusion (MATE) transporter glycylcycline; tetracycline antibiotic antibiotic efflux
Loading