diff --git a/q2_amr/amrfinderplus/sample_data.py b/q2_amr/amrfinderplus/sample_data.py new file mode 100644 index 0000000..6a5265e --- /dev/null +++ b/q2_amr/amrfinderplus/sample_data.py @@ -0,0 +1,123 @@ +import os +import shutil +import tempfile +from typing import Union + +import pandas as pd +from q2_types.genome_data import GenesDirectoryFormat +from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt + +from q2_amr.amrfinderplus.types import ( + AMRFinderPlusAnnotationsDirFmt, + AMRFinderPlusDatabaseDirFmt, +) +from q2_amr.amrfinderplus.utils import run_amrfinderplus_n +from q2_amr.card.utils import create_count_table, read_in_txt + + +def annotate_sample_data_amrfinderplus( + sequences: Union[MultiMAGSequencesDirFmt, ContigSequencesDirFmt], + amrfinderplus_db: AMRFinderPlusDatabaseDirFmt, + organism: str = None, + plus: bool = False, + report_all_equal: bool = False, + ident_min: float = None, + curated_ident: bool = False, + coverage_min: float = 0.5, + translation_table: str = "11", + threads: int = None, +) -> ( + AMRFinderPlusAnnotationsDirFmt, + AMRFinderPlusAnnotationsDirFmt, + GenesDirectoryFormat, + pd.DataFrame, +): + annotations = AMRFinderPlusAnnotationsDirFmt() + mutations = AMRFinderPlusAnnotationsDirFmt() + genes = GenesDirectoryFormat() + frequency_list = [] + + # Create list of paths to all mags or contigs + if isinstance(sequences, MultiMAGSequencesDirFmt): + manifest = sequences.manifest.view(pd.DataFrame) + files = manifest["filename"] + else: + files = [ + os.path.join(str(sequences), file) for file in os.listdir(str(sequences)) + ] + + with tempfile.TemporaryDirectory() as tmp: + # Iterate over paths of MAGs or contigs + for file in files: + # Set sample and MAG IDs + if isinstance(sequences, MultiMAGSequencesDirFmt): + index_value = manifest.query("filename == @file").index[0] + sample_id = index_value[0] + mag_id = index_value[1] + else: + sample_id = os.path.splitext(os.path.basename(file))[0][:-8] + mag_id = "" + + # Run amrfinderplus + run_amrfinderplus_n( + working_dir=tmp, + amrfinderplus_db=amrfinderplus_db, + dna_sequences=file, + protein_sequences=None, + gff=None, + organism=organism, + plus=plus, + report_all_equal=report_all_equal, + ident_min=ident_min, + curated_ident=curated_ident, + coverage_min=coverage_min, + translation_table=translation_table, + threads=threads, + ) + + # Create frequency dataframe and append it to list + frequency_df = read_in_txt( + path=os.path.join(tmp, "amr_annotations.tsv"), + samp_bin_name=str(os.path.join(sample_id, mag_id)), + data_type="mags", + colname="Gene symbol", + ) + frequency_list.append(frequency_df) + + # Move mutations file. If it is not created, create an empty mutations file + des_path_mutations = os.path.join( + str(mutations), + sample_id, + f"{mag_id + '_' if mag_id else ''}amr_mutations.tsv", + ) + os.makedirs(os.path.dirname(des_path_mutations), exist_ok=True) + if organism: + shutil.move(os.path.join(tmp, "amr_mutations.tsv"), des_path_mutations) + else: + with open(des_path_mutations, "w"): + pass + + # Move annotations file + des_path_annotations = os.path.join( + str(annotations), + sample_id, + f"{mag_id + '_' if mag_id else ''}amr_annotations.tsv", + ) + os.makedirs(os.path.dirname(des_path_annotations), exist_ok=True) + shutil.move(os.path.join(tmp, "amr_annotations.tsv"), des_path_annotations) + + # Move genes file + shutil.move( + os.path.join(tmp, "amr_genes.fasta"), + os.path.join( + str(genes), f"{mag_id if mag_id else sample_id}_amr_genes.fasta" + ), + ) + + feature_table = create_count_table(df_list=frequency_list) + return ( + annotations, + mutations, + genes, + feature_table, + ) diff --git a/q2_amr/amrfinderplus/tests/test_sample_data.py b/q2_amr/amrfinderplus/tests/test_sample_data.py new file mode 100644 index 0000000..9f705af --- /dev/null +++ b/q2_amr/amrfinderplus/tests/test_sample_data.py @@ -0,0 +1,101 @@ +import os +from unittest.mock import MagicMock, patch + +from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt +from qiime2.plugin.testing import TestPluginBase + +from q2_amr.amrfinderplus.sample_data import annotate_sample_data_amrfinderplus +from q2_amr.amrfinderplus.types import AMRFinderPlusDatabaseDirFmt + + +class TestAnnotateSampleDataAMRFinderPlus(TestPluginBase): + package = "q2_amr.amrfinderplus.tests" + + def mock_run_amrfinderplus_n( + self, + working_dir, + amrfinderplus_db, + dna_sequences, + protein_sequences, + gff, + organism, + plus, + report_all_equal, + ident_min, + curated_ident, + coverage_min, + translation_table, + threads, + ): + with open(os.path.join(working_dir, "amr_annotations.tsv"), "w"): + pass + if organism: + with open(os.path.join(working_dir, "amr_mutations.tsv"), "w"): + pass + if dna_sequences: + with open(os.path.join(working_dir, "amr_genes.fasta"), "w"): + pass + + files_contigs = [ + "amr_annotations.tsv", + "amr_mutations.tsv", + "sample1_amr_genes.fasta", + ] + + files_mags = [ + "mag1_amr_annotations.tsv", + "mag1_amr_mutations.tsv", + "mag1_amr_genes.fasta", + ] + + def test_annotate_sample_data_amrfinderplus_mags(self): + sequences = MultiMAGSequencesDirFmt() + with open(os.path.join(str(sequences), "MANIFEST"), "w") as file: + file.write("sample-id,mag-id,filename\nsample1,mag1,sample1/mag1.fasta\n") + self._helper(sequences=sequences, organism=None, files=self.files_mags) + + def test_annotate_sample_data_amrfinderplus_mags_organism(self): + sequences = MultiMAGSequencesDirFmt() + with open(os.path.join(str(sequences), "MANIFEST"), "w") as file: + file.write("sample-id,mag-id,filename\nsample1,mag1,sample1/mag1.fasta\n") + self._helper(sequences, "Escherichia", files=self.files_mags) + + def test_annotate_sample_data_amrfinderplus_contigs(self): + sequences = ContigSequencesDirFmt() + with open(os.path.join(str(sequences), "sample1_contigs.fasta"), "w"): + pass + self._helper(sequences=sequences, organism=None, files=self.files_contigs) + + def test_annotate_sample_data_amrfinderplus_contigs_organism(self): + sequences = ContigSequencesDirFmt() + with open(os.path.join(str(sequences), "sample1_contigs.fasta"), "w"): + pass + self._helper( + sequences=sequences, organism="Escherichia", files=self.files_contigs + ) + + def _helper(self, sequences, organism, files): + amrfinderplus_db = AMRFinderPlusDatabaseDirFmt() + mock_create_count_table = MagicMock() + mock_read_in_txt = MagicMock() + with patch( + "q2_amr.amrfinderplus.sample_data.run_amrfinderplus_n", + side_effect=self.mock_run_amrfinderplus_n, + ), patch( + "q2_amr.amrfinderplus.sample_data.read_in_txt", mock_read_in_txt + ), patch( + "q2_amr.amrfinderplus.sample_data.create_count_table", + mock_create_count_table, + ): + result = annotate_sample_data_amrfinderplus( + sequences=sequences, + amrfinderplus_db=amrfinderplus_db, + organism=organism, + ) + self.assertTrue( + os.path.exists(os.path.join(str(result[0]), "sample1", files[0])) + ) + self.assertTrue( + os.path.exists(os.path.join(str(result[1]), "sample1", files[1])) + ) + self.assertTrue(os.path.exists(os.path.join(str(result[2]), files[2]))) diff --git a/q2_amr/amrfinderplus/tests/test_utils.py b/q2_amr/amrfinderplus/tests/test_utils.py new file mode 100644 index 0000000..1e7ae59 --- /dev/null +++ b/q2_amr/amrfinderplus/tests/test_utils.py @@ -0,0 +1,95 @@ +from unittest.mock import patch + +from qiime2.plugin.testing import TestPluginBase + +from q2_amr.amrfinderplus.utils import run_amrfinderplus_n + + +class TestAnnotateMagsCard(TestPluginBase): + package = "q2_amr.amrfinderplus.tests" + + @patch("q2_amr.amrfinderplus.utils.run_command") + def test_run_amrfinderplus_n(self, mock_run_command): + run_amrfinderplus_n( + working_dir="path_dir", + amrfinderplus_db="amrfinderplus_db", + dna_sequences="dna_sequences", + protein_sequences="protein_sequences", + gff="gff", + organism="Escherichia", + plus=True, + report_all_equal=True, + ident_min=1, + curated_ident=False, + coverage_min=1, + translation_table="11", + threads=4, + ) + mock_run_command.assert_called_once_with( + [ + "amrfinder", + "--database", + "amrfinderplus_db", + "-o", + "path_dir/amr_annotations.tsv", + "--print_node", + "-n", + "dna_sequences", + "--nucleotide_output", + "path_dir/amr_genes.fasta", + "-p", + "protein_sequences", + "--protein_output", + "path_dir/amr_proteins.fasta", + "-g", + "gff", + "--threads", + "4", + "--organism", + "Escherichia", + "--mutation_all", + "path_dir/amr_mutations.tsv", + "--plus", + "--report_all_equal", + "--ident_min", + "1", + "--coverage_min", + "1", + "--translation_table", + "11", + ], + "path_dir", + verbose=True, + ) + + @patch("q2_amr.amrfinderplus.utils.run_command") + def test_run_amrfinderplus_n_minimal(self, mock_run_command): + run_amrfinderplus_n( + working_dir="path_dir", + amrfinderplus_db="amrfinderplus_db", + dna_sequences=None, + protein_sequences=None, + gff=None, + organism=None, + plus=False, + report_all_equal=False, + ident_min=None, + curated_ident=True, + coverage_min=None, + translation_table=None, + threads=None, + ) + mock_run_command.assert_called_once_with( + [ + "amrfinder", + "--database", + "amrfinderplus_db", + "-o", + "path_dir/amr_annotations.tsv", + "--print_node", + "--ident_min", + "-1", + ], + "path_dir", + verbose=True, + ) diff --git a/q2_amr/amrfinderplus/utils.py b/q2_amr/amrfinderplus/utils.py new file mode 100644 index 0000000..199957c --- /dev/null +++ b/q2_amr/amrfinderplus/utils.py @@ -0,0 +1,83 @@ +import subprocess + +from q2_amr.card.utils import run_command + + +def run_amrfinderplus_n( + working_dir, + amrfinderplus_db, + dna_sequences, + protein_sequences, + gff, + organism, + plus, + report_all_equal, + ident_min, + curated_ident, + coverage_min, + translation_table, + threads, +): + cmd = [ + "amrfinder", + "--database", + str(amrfinderplus_db), + "-o", + f"{working_dir}/amr_annotations.tsv", + "--print_node", + ] + # Creates nucleotide fasta output if DNA sequences are given as input + if dna_sequences: + cmd.extend( + [ + "-n", + dna_sequences, + "--nucleotide_output", + f"{working_dir}/amr_genes.fasta", + ] + ) + # Creates protein fasta output if protein sequences are given as input + if protein_sequences: + cmd.extend( + [ + "-p", + protein_sequences, + "--protein_output", + f"{working_dir}/amr_proteins.fasta", + ] + ) + if gff: + cmd.extend(["-g", gff]) + if threads: + cmd.extend(["--threads", str(threads)]) + # Creates all mutations output if an organism is specified + if organism: + cmd.extend( + [ + "--organism", + organism, + "--mutation_all", + f"{working_dir}/amr_mutations.tsv", + ] + ) + if plus: + cmd.append("--plus") + if report_all_equal: + cmd.append("--report_all_equal") + # If curated_ident is True, it will overwrite the value specified with ident_min + if ident_min and not curated_ident: + cmd.extend(["--ident_min", str(ident_min)]) + if curated_ident: + cmd.extend(["--ident_min", "-1"]) + if coverage_min: + cmd.extend(["--coverage_min", str(coverage_min)]) + if translation_table: + cmd.extend(["--translation_table", str(translation_table)]) + try: + run_command(cmd, working_dir, verbose=True) + except subprocess.CalledProcessError as e: + raise Exception( + "An error was encountered while running AMRFinderPlus, " + f"(return code {e.returncode}), please inspect " + "stdout and stderr to learn more." + ) diff --git a/q2_amr/card/mags.py b/q2_amr/card/mags.py index 4a5c72f..8ba020c 100644 --- a/q2_amr/card/mags.py +++ b/q2_amr/card/mags.py @@ -46,7 +46,10 @@ def annotate_mags_card( shutil.move(f"{tmp}/output.json", json_path) samp_bin_name = os.path.join(samp_bin[0], samp_bin[1]) frequency_df = read_in_txt( - path=txt_path, samp_bin_name=samp_bin_name, data_type="mags" + path=txt_path, + samp_bin_name=samp_bin_name, + data_type="mags", + colname="Best_Hit_ARO", ) frequency_list.append(frequency_df) feature_table = create_count_table(df_list=frequency_list) diff --git a/q2_amr/card/reads.py b/q2_amr/card/reads.py index 05debc1..0d85d69 100644 --- a/q2_amr/card/reads.py +++ b/q2_amr/card/reads.py @@ -144,11 +144,12 @@ def _annotate_reads_card( path_txt = os.path.join( samp_tmp_dir, f"output.{map_type}_mapping_data.txt" ) + colname = "Reference Sequence" if map_type == "allele" else "ARO Term" frequency_table = read_in_txt( path=path_txt, samp_bin_name=samp, data_type="reads", - map_type=map_type, + colname=colname, ) table_list.append(frequency_table) diff --git a/q2_amr/card/tests/test_reads.py b/q2_amr/card/tests/test_reads.py index c3abaef..88a5de2 100644 --- a/q2_amr/card/tests/test_reads.py +++ b/q2_amr/card/tests/test_reads.py @@ -119,10 +119,12 @@ def annotate_reads_card_test_body(self, read_type): path=f"{tmp_dir}/{samp}/output.{map_type}_mapping_data.txt", samp_bin_name=samp, data_type="reads", - map_type=map_type, + colname=colname, ) for samp in ["sample1", "sample2"] - for map_type in ["allele", "gene"] + for map_type, colname in zip( + ["allele", "gene"], ["Reference Sequence", "ARO Term"] + ) ] # Expected call objects for mock_create_count_table diff --git a/q2_amr/card/tests/test_utils.py b/q2_amr/card/tests/test_utils.py index a8eb7ca..dc64094 100644 --- a/q2_amr/card/tests/test_utils.py +++ b/q2_amr/card/tests/test_utils.py @@ -151,6 +151,7 @@ def test_read_in_txt_mags(self): samp_bin_name="sample1/bin1", exp=self.mag_count_df, data_type="mags", + colname="Best_Hit_ARO", ) def test_read_in_txt_reads_allele(self): @@ -160,7 +161,7 @@ def test_read_in_txt_reads_allele(self): samp_bin_name="sample1", exp=self.allele_count_df, data_type="reads", - map_type="allele", + colname="Reference Sequence", ) def test_read_in_txt_reads_gene(self): @@ -170,15 +171,13 @@ def test_read_in_txt_reads_gene(self): samp_bin_name="sample1", exp=self.gene_count_df, data_type="reads", - map_type="gene", + colname="ARO Term", ) - def read_in_txt_test_body( - self, filename, samp_bin_name, exp, data_type, map_type=None - ): + def read_in_txt_test_body(self, filename, samp_bin_name, exp, data_type, colname): # Create expected and observed count dataframes and compare them obs = read_in_txt( - self.get_data_path(filename), samp_bin_name, data_type, map_type + self.get_data_path(filename), samp_bin_name, data_type, colname ) pd.testing.assert_frame_equal(exp, obs) diff --git a/q2_amr/card/utils.py b/q2_amr/card/utils.py index 367a9ff..129bb7a 100644 --- a/q2_amr/card/utils.py +++ b/q2_amr/card/utils.py @@ -95,20 +95,17 @@ def load_card_db( return kmer_size -def read_in_txt(path: str, samp_bin_name: str, data_type: str, map_type=None): +def read_in_txt(path: str, samp_bin_name: str, data_type: str, colname: str): # Read in txt file to pd.Dataframe df = pd.read_csv(path, sep="\t") - # Process the df depending on the data type and mapping type + # Process the df depending on the data type if data_type == "reads": - colname = "Reference Sequence" if map_type == "allele" else "ARO Term" df = df[[colname, "All Mapped Reads"]] df.rename(columns={"All Mapped Reads": samp_bin_name}, inplace=True) - else: - df = df["Best_Hit_ARO"].value_counts().reset_index() - - # Rename the columns - df.columns = ["Best_Hit_ARO", samp_bin_name] + elif data_type == "mags": + df = df[colname].value_counts().reset_index() + df.columns = [colname, samp_bin_name] df = df.astype(str) return df diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 80c7f9a..ea20a55 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -9,7 +9,9 @@ from q2_types.feature_data import FeatureData from q2_types.feature_table import FeatureTable, Frequency +from q2_types.genome_data import Genes, GenomeData from q2_types.per_sample_sequences import ( + Contigs, MAGs, PairedEndSequencesWithQuality, SequencesWithQuality, @@ -19,6 +21,7 @@ Bool, Choices, Collection, + Float, Int, List, Properties, @@ -30,6 +33,7 @@ from q2_amr import __version__ from q2_amr.amrfinderplus.database import fetch_amrfinderplus_db +from q2_amr.amrfinderplus.sample_data import annotate_sample_data_amrfinderplus from q2_amr.amrfinderplus.types._format import ( AMRFinderPlusAnnotationDirFmt, AMRFinderPlusAnnotationFormat, @@ -1096,6 +1100,143 @@ citations=[citations["feldgarden2021amrfinderplus"]], ) +organisms = [ + "Acinetobacter_baumannii", + "Burkholderia_cepacia", + "Burkholderia_pseudomallei", + "Campylobacter", + "Citrobacter_freundii", + "Clostridioides_difficile", + "Enterobacter_asburiae", + "Enterobacter_cloacae", + "Enterococcus_faecalis", + "Enterococcus_faecium", + "Escherichia", + "Klebsiella_oxytoca", + "Klebsiella_pneumoniae", + "Neisseria_gonorrhoeae", + "Neisseria_meningitidis", + "Pseudomonas_aeruginosa", + "Salmonella", + "Serratia_marcescens", + "Staphylococcus_aureus", + "Staphylococcus_pseudintermedius", + "Streptococcus_agalactiae", + "Streptococcus_pneumoniae", + "Streptococcus_pyogenes", + "Vibrio_cholerae", + "Vibrio_parahaemolyticus", + "Vibrio_vulnificus", +] + +translation_tables = [ + "1", + "2", + "3", + "4", + "5", + "6", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "33", +] + +plugin.methods.register_function( + function=annotate_sample_data_amrfinderplus, + inputs={ + "sequences": SampleData[MAGs | Contigs], + "amrfinderplus_db": AMRFinderPlusDatabase, + }, + parameters={ + "organism": Str % Choices(organisms), + "plus": Bool, + "report_all_equal": Bool, + "ident_min": Float % Range(0, 1, inclusive_start=True, inclusive_end=True), + "curated_ident": Bool, + "coverage_min": Float % Range(0, 1, inclusive_start=True, inclusive_end=True), + "translation_table": Str % Choices(translation_tables), + "threads": Int % Range(0, None, inclusive_start=False), + }, + outputs=[ + ("annotations", SampleData[AMRFinderPlusAnnotations]), + ("mutations", SampleData[AMRFinderPlusAnnotations]), + ("genes", GenomeData[Genes]), + ("feature_table", FeatureTable[Frequency]), + ], + input_descriptions={ + "sequences": "MAGs or contigs to be annotated with AMRFinderPlus.", + "amrfinderplus_db": "AMRFinderPlus Database.", + }, + parameter_descriptions={ + "organism": "Taxon used for screening known resistance causing point mutations " + "and blacklisting of common, non-informative genes.", + "plus": "Provide results from 'Plus' genes such as virulence factors, " + "stress-response genes, etc.", + "report_all_equal": "Report all equally scoring BLAST and HMM matches. This " + "will report multiple lines for a single element if there " + "are multiple reference proteins that have the same score. " + "On those lines the fields Accession of closest sequence " + "and Name of closest sequence will be different showing " + "each of the database proteins that are equally close to " + "the query sequence.", + "ident_min": "Minimum identity for a blast-based hit (Methods BLAST or " + "PARTIAL). Setting this value to something other than -1 " + "will override curated similarity cutoffs. We only recommend " + "using this option if you have a specific reason.", + "curated_ident": "Use the curated threshold for a blast-based hit, if it " + "exists and 0.9 otherwise. This will overwrite the value specified with the " + "'ident_min' parameter", + "coverage_min": "Minimum proportion of reference gene covered for a " + "BLAST-based hit (Methods BLAST or PARTIAL).", + "translation_table": "Translation table used for BLASTX.", + "threads": "The number of threads to use for processing. AMRFinderPlus " + "defaults to 4 on hosts with >= 4 cores. Setting this number higher" + " than the number of cores on the running host may cause blastp to " + "fail. Using more than 4 threads may speed up searches.", + }, + output_descriptions={ + "annotations": "Annotated AMR genes and mutations.", + "mutations": "Report of genotypes at all locations screened for point " + "mutations. These files allow you to distinguish between called " + "point mutations that were the sensitive variant and the point " + "mutations that could not be called because the sequence was not " + "found. This file will contain all detected variants from the " + "reference sequence, so it could be used as an initial screen for " + "novel variants. Note 'Gene symbols' for mutations not in the " + "database (identifiable by [UNKNOWN] in the Sequence name field) " + "have offsets that are relative to the start of the sequence " + "indicated in the field 'Accession of closest sequence' while " + "'Gene symbols' from known point-mutation sites have gene symbols " + "that match the Pathogen Detection Reference Gene Catalog " + "standardized nomenclature for point mutations.", + "genes": "Sequences that were identified by AMRFinderPlus as AMR genes. This " + "will include the entire region that aligns to the references for " + "point mutations.", + "feature_table": "Presence/Absence table of ARGs in all samples.", + }, + name="Annotate MAGs or contigs with AMRFinderPlus.", + description="Annotate sample data MAGs or contigs with antimicrobial resistance " + "genes with AMRFinderPlus.", + citations=[], +) + # Registrations plugin.register_semantic_types( CARDDatabase, @@ -1143,7 +1284,6 @@ AMRFinderPlusDatabase, artifact_format=AMRFinderPlusDatabaseDirFmt, ) - plugin.register_semantic_type_to_format( SampleData[AMRFinderPlusAnnotations], artifact_format=AMRFinderPlusAnnotationsDirFmt,