merge 3

bokulich-lab · Sep 26, 2024 · 4f76e2e · 4f76e2e
2 parents 1026eec + ee66471
commit 4f76e2e
Show file tree

Hide file tree

Showing 7 changed files with 1,153 additions and 11 deletions.
diff --git a/q2_amrfinderplus/annotate.py b/q2_amrfinderplus/annotate.py
@@ -0,0 +1,129 @@
+from typing import Union
+
+from q2_types.feature_data_mag import MAGSequencesDirFmt
+from q2_types.genome_data import (
+    GenesDirectoryFormat,
+    LociDirectoryFormat,
+    ProteinsDirectoryFormat,
+)
+from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt
+
+from q2_amrfinderplus.types import (
+    AMRFinderPlusAnnotationsDirFmt,
+    AMRFinderPlusDatabaseDirFmt,
+)
+from q2_amrfinderplus.utils import (
+    _create_empty_files,
+    _create_sample_dict,
+    _create_sample_dirs,
+    _get_file_paths,
+    _run_amrfinderplus_analyse,
+    _validate_inputs,
+)
+
+
+def annotate(
+    amrfinderplus_db: AMRFinderPlusDatabaseDirFmt,
+    sequences: Union[
+        MultiMAGSequencesDirFmt, ContigSequencesDirFmt, MAGSequencesDirFmt
+    ] = None,
+    proteins: ProteinsDirectoryFormat = None,
+    loci: LociDirectoryFormat = None,
+    organism: str = None,
+    plus: bool = False,
+    report_all_equal: bool = False,
+    ident_min: float = None,
+    curated_ident: bool = False,
+    coverage_min: float = 0.5,
+    translation_table: str = "11",
+    annotation_format: str = "prodigal",
+    report_common: bool = False,
+    threads: int = None,
+) -> (
+    AMRFinderPlusAnnotationsDirFmt,
+    AMRFinderPlusAnnotationsDirFmt,
+    GenesDirectoryFormat,
+    ProteinsDirectoryFormat,
+):
+    """Annotate sequences and/or proteins with AMRFinderPlus.
+
+    Calls ``_run_amrfinderplus_analyse`` once for every file listed by
+    ``_create_sample_dict`` and collects the results in four output
+    directory formats.
+
+    Parameters
+    ----------
+    amrfinderplus_db
+        AMRFinderPlus database, forwarded to the analysis helper.
+    sequences, proteins, loci
+        Optional inputs. Which combinations are accepted is decided by
+        ``_validate_inputs``.
+    organism, plus, report_all_equal, ident_min, curated_ident,
+    coverage_min, translation_table, annotation_format, report_common,
+    threads
+        Forwarded unchanged to ``_run_amrfinderplus_analyse``.
+
+    Returns
+    -------
+    tuple
+        ``(amr_annotations, amr_all_mutations, amr_genes, amr_proteins)``.
+    """
+    # Validate input and parameter combinations
+    _validate_inputs(
+        sequences,
+        loci,
+        proteins,
+        ident_min,
+        curated_ident,
+        report_common,
+        plus,
+        organism,
+    )
+
+    # Parameters forwarded verbatim to every _run_amrfinderplus_analyse call.
+    # They are listed explicitly (instead of snapshotting locals()) so that a
+    # local variable introduced later can never leak into that call.
+    common_params = {
+        "amrfinderplus_db": amrfinderplus_db,
+        "organism": organism,
+        "plus": plus,
+        "report_all_equal": report_all_equal,
+        "ident_min": ident_min,
+        "curated_ident": curated_ident,
+        "coverage_min": coverage_min,
+        "translation_table": translation_table,
+        "annotation_format": annotation_format,
+        "report_common": report_common,
+        "threads": threads,
+    }
+
+    # Init output formats
+    amr_annotations = AMRFinderPlusAnnotationsDirFmt()
+    amr_all_mutations = AMRFinderPlusAnnotationsDirFmt()
+    amr_genes = GenesDirectoryFormat()
+    amr_proteins = ProteinsDirectoryFormat()
+
+    # Create sample_dict to iterate over input files
+    sample_dict = _create_sample_dict(proteins, sequences)
+
+    # Iterate over sample_dict
+    for sample_id, files_dict in sample_dict.items():
+        # Create sample directories in output directories
+        _create_sample_dirs(
+            sequences,
+            proteins,
+            organism,
+            amr_annotations,
+            amr_genes,
+            amr_proteins,
+            amr_all_mutations,
+            sample_id,
+        )
+
+        # "file_id" rather than "id" so the builtin id() is not shadowed
+        for file_id, file_fp in files_dict.items():
+            # Construct and validate file input paths for amrfinderplus
+            dna_path, protein_path, gff_path = _get_file_paths(
+                sequences,
+                proteins,
+                loci,
+                file_id,
+                file_fp,
+                sample_id,
+            )
+
+            # Define paths for output files
+            amr_annotations_path = (
+                amr_annotations.path / sample_id / f"{file_id}_amr_annotations.tsv"
+            )
+            amr_genes_path = (
+                amr_genes.path / sample_id / f"{file_id}_amr_genes.fasta"
+            )
+            amr_proteins_path = (
+                amr_proteins.path / sample_id / f"{file_id}_amr_proteins.fasta"
+            )
+            amr_all_mutations_path = (
+                amr_all_mutations.path
+                / sample_id
+                / f"{file_id}_amr_all_mutations.tsv"
+            )
+
+            # Run amrfinderplus
+            _run_amrfinderplus_analyse(
+                dna_path=dna_path,
+                protein_path=protein_path,
+                gff_path=gff_path,
+                amr_annotations_path=amr_annotations_path,
+                amr_genes_path=amr_genes_path,
+                amr_proteins_path=amr_proteins_path,
+                amr_all_mutations_path=amr_all_mutations_path,
+                **common_params,
+            )
+
+    # Create empty files for empty output artifacts if needed
+    _create_empty_files(
+        sequences, proteins, organism, amr_genes, amr_proteins, amr_all_mutations
+    )
+
+    return amr_annotations, amr_all_mutations, amr_genes, amr_proteins
diff --git a/q2_amrfinderplus/plugin_setup.py b/q2_amrfinderplus/plugin_setup.py
@@ -7,10 +7,16 @@
 # ----------------------------------------------------------------------------
 import importlib
 
+from q2_types.feature_data import FeatureData
+from q2_types.feature_data_mag import MAG
+from q2_types.genome_data import Genes, GenomeData, Loci, Proteins
+from q2_types.per_sample_sequences import Contigs, MAGs
 from q2_types.sample_data import SampleData
+from qiime2.core.type import Bool, Choices, Float, Int, Range, Str
 from qiime2.plugin import Citations, Plugin
 
 from q2_amrfinderplus import __version__
+from q2_amrfinderplus.annotate import annotate
 from q2_amrfinderplus.database import fetch_amrfinderplus_db
 from q2_amrfinderplus.types._format import (
     AMRFinderPlusAnnotationFormat,
@@ -50,12 +56,191 @@
     citations=[citations["feldgarden2021amrfinderplus"]],
 )
 
+# Values accepted for the "organism" parameter (used as Choices(organisms) in
+# amrfinderplus_parameters).
+# NOTE(review): the last five entries look like the Pathogen Detection
+# taxgroup names mentioned in the parameter description - confirm.
+organisms = [
+    "Acinetobacter_baumannii",
+    "Burkholderia_cepacia",
+    "Burkholderia_pseudomallei",
+    "Campylobacter",
+    "Citrobacter_freundii",
+    "Clostridioides_difficile",
+    "Enterobacter_asburiae",
+    "Enterobacter_cloacae",
+    "Enterococcus_faecalis",
+    "Enterococcus_faecium",
+    "Escherichia",
+    "Klebsiella_oxytoca",
+    "Klebsiella_pneumoniae",
+    "Neisseria_gonorrhoeae",
+    "Neisseria_meningitidis",
+    "Pseudomonas_aeruginosa",
+    "Salmonella",
+    "Serratia_marcescens",
+    "Staphylococcus_aureus",
+    "Staphylococcus_pseudintermedius",
+    "Streptococcus_agalactiae",
+    "Streptococcus_pneumoniae",
+    "Streptococcus_pyogenes",
+    "Vibrio_cholerae",
+    "Vibrio_parahaemolyticus",
+    "Vibrio_vulnificus",
+    "Acinetobacter",
+    "Burkholderia_cepacia_complex",
+    "Escherichia_coli_Shigella",
+    "Klebsiella",
+    "Serratia",
+]
+
+
+# Values accepted for the "translation_table" parameter. They are kept as
+# strings because the parameter is registered as Str % Choices(...).
+# NOTE(review): presumably NCBI genetic code table numbers - confirm.
+translation_tables = [
+    "1",
+    "2",
+    "3",
+    "4",
+    "5",
+    "6",
+    "9",
+    "10",
+    "11",
+    "12",
+    "13",
+    "14",
+    "15",
+    "16",
+    "21",
+    "22",
+    "23",
+    "24",
+    "25",
+    "26",
+    "27",
+    "28",
+    "29",
+    "30",
+    "31",
+    "33",
+]
+
+
+# QIIME 2 parameter types passed as "parameters" when registering annotate();
+# the keys match annotate()'s keyword argument names.
+amrfinderplus_parameters = {
+    "organism": Str % Choices(organisms),
+    "plus": Bool,
+    "report_all_equal": Bool,
+    # Both thresholds are proportions restricted to the closed interval [0, 1]
+    "ident_min": Float % Range(0, 1, inclusive_start=True, inclusive_end=True),
+    "curated_ident": Bool,
+    "coverage_min": Float % Range(0, 1, inclusive_start=True, inclusive_end=True),
+    "translation_table": Str % Choices(translation_tables),
+    "annotation_format": Str
+    % Choices(
+        "bakta",
+        "genbank",
+        "microscope",
+        "patric",
+        "pgap",
+        "prodigal",
+        "prokka",
+        "pseudomonasdb",
+        "rast",
+        "standard",
+    ),
+    "report_common": Bool,
+    # Exclusive lower bound of 0 and no upper bound
+    "threads": Int % Range(0, None, inclusive_start=False),
+}
+
+# User-facing help text for every entry of amrfinderplus_parameters.
+amrfinderplus_parameter_descriptions = {
+    "organism": "Taxon used for screening known resistance causing point mutations "
+    "and blacklisting of common, non-informative genes. Pathogen Detection "
+    "taxgroup names can also be used.",
+    "plus": "Provide results from 'Plus' genes such as virulence factors, "
+    "stress-response genes, etc.",
+    "report_all_equal": "Report all equally scoring BLAST and HMM matches. This "
+    "will report multiple lines for a single element if there "
+    "are multiple reference proteins that have the same score. "
+    "On those lines the fields Accession of closest sequence "
+    "and Name of closest sequence will be different showing "
+    "each of the database proteins that are equally close to "
+    "the query sequence.",
+    # The parameter only accepts values in [0, 1], so the help text must not
+    # refer to the "-1" sentinel of the amrfinder command line.
+    "ident_min": "Minimum identity for a blast-based hit (Methods BLAST or "
+    "PARTIAL). Setting this value will override curated similarity cutoffs. "
+    "We only recommend using this option if you have a specific reason.",
+    "curated_ident": "Use the curated threshold for a blast-based hit, if it "
+    "exists and 0.9 otherwise. This will overwrite the value specified with the "
+    "'ident_min' parameter.",
+    "coverage_min": "Minimum proportion of reference gene covered for a "
+    "BLAST-based hit (Methods BLAST or PARTIAL).",
+    "translation_table": "Translation table used for BLASTX.",
+    "annotation_format": "Format of the GFF files provided with the 'loci' input.",
+    "report_common": "Report proteins common to a taxonomy group.",
+    "threads": "The number of threads to use for processing. AMRFinderPlus "
+    "defaults to 4 on hosts with >= 4 cores. Setting this number higher"
+    " than the number of cores on the running host may cause blastp to "
+    "fail. Using more than 4 threads may speed up searches.",
+}
+
+# User-facing help text for the four outputs of annotate(); the keys match
+# the output names used when the method is registered.
+amrfinderplus_output_descriptions = {
+    "amr_annotations": "Annotated AMR genes and mutations.",
+    "amr_all_mutations": "Report of genotypes at all locations screened for point "
+    "mutations. These files allow you to distinguish between called "
+    "point mutations that were the sensitive variant and the point "
+    "mutations that could not be called because the sequence was not "
+    "found. This file will contain all detected variants from the "
+    "reference sequence, so it could be used as an initial screen for "
+    "novel variants. Note 'Gene symbols' for mutations not in the "
+    "database (identifiable by [UNKNOWN] in the Sequence name field) "
+    "have offsets that are relative to the start of the sequence "
+    "indicated in the field 'Accession of closest sequence' while "
+    "'Gene symbols' from known point-mutation sites have gene symbols "
+    "that match the Pathogen Detection Reference Gene Catalog "
+    "standardized nomenclature for point mutations.",
+    "amr_genes": "Sequences that were identified by AMRFinderPlus as AMR genes. "
+    "This will include the entire region that aligns to the references for "
+    "point mutations.",
+    "amr_proteins": "Protein Sequences that were identified by AMRFinderPlus as "
+    "AMR genes. This will include the entire region that aligns to the references "
+    "for point mutations.",
+}
+
+
+# User-facing help text for the inputs of annotate(); the keys match the
+# input names used when the method is registered.
+amrfinderplus_input_descriptions = {
+    "sequences": "MAGs or contigs to be annotated with AMRFinderPlus.",
+    "proteins": "Protein sequences to be annotated with AMRFinderPlus.",
+    "loci": "GFF files to give sequence coordinates for proteins input. Required "
+    "for combined searches of protein and DNA sequences.",
+    "amrfinderplus_db": "AMRFinderPlus Database.",
+}
+
+
+# Register annotate() as a plugin method. The input, parameter and output
+# names below must match annotate()'s argument names and the order of its
+# returned tuple.
+plugin.methods.register_function(
+    function=annotate,
+    inputs={
+        "sequences": SampleData[MAGs | Contigs] | FeatureData[MAG],
+        "proteins": GenomeData[Proteins],
+        "loci": GenomeData[Loci],
+        "amrfinderplus_db": AMRFinderPlusDatabase,
+    },
+    parameters=amrfinderplus_parameters,
+    outputs=[
+        ("amr_annotations", GenomeData[AMRFinderPlusAnnotations]),
+        ("amr_all_mutations", GenomeData[AMRFinderPlusAnnotations]),
+        ("amr_genes", GenomeData[Genes]),
+        ("amr_proteins", GenomeData[Proteins]),
+    ],
+    input_descriptions=amrfinderplus_input_descriptions,
+    parameter_descriptions=amrfinderplus_parameter_descriptions,
+    output_descriptions=amrfinderplus_output_descriptions,
+    name="Annotate MAGs or contigs with AMRFinderPlus.",
+    description="Annotate sample data MAGs or contigs with antimicrobial resistance "
+    "genes with AMRFinderPlus. Check https://github.com/ncbi/amr/wiki for "
+    "documentation.",
+    citations=[citations["feldgarden2021amrfinderplus"]],
+)
+
+
 plugin.register_semantic_type_to_format(
     AMRFinderPlusDatabase,
     artifact_format=AMRFinderPlusDatabaseDirFmt,
 )
 plugin.register_semantic_type_to_format(
-    SampleData[AMRFinderPlusAnnotations],
+    GenomeData[AMRFinderPlusAnnotations],
     artifact_format=AMRFinderPlusAnnotationsDirFmt,
 )
 

diff --git a/q2_amrfinderplus/tests/test_annotate.py b/q2_amrfinderplus/tests/test_annotate.py
@@ -0,0 +1,47 @@
+from unittest.mock import patch
+
+from q2_types.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat
+from qiime2.plugin.testing import TestPluginBase
+
+from q2_amrfinderplus.annotate import annotate
+from q2_amrfinderplus.types import (
+    AMRFinderPlusAnnotationsDirFmt,
+    AMRFinderPlusDatabaseDirFmt,
+)
+
+
+class TestAnnotate(TestPluginBase):
+    """Unit tests for ``annotate`` with all helper functions mocked out."""
+
+    package = "q2_amrfinderplus.tests"
+
+    @patch("q2_amrfinderplus.annotate._validate_inputs")
+    @patch(
+        "q2_amrfinderplus.annotate._create_sample_dict",
+        return_value={"sample1": {"id1": "file_path"}},
+    )
+    @patch("q2_amrfinderplus.annotate._create_sample_dirs")
+    @patch(
+        "q2_amrfinderplus.annotate._get_file_paths",
+        return_value=("dna_path", "protein_path", "gff_path"),
+    )
+    @patch("q2_amrfinderplus.annotate._run_amrfinderplus_analyse")
+    @patch("q2_amrfinderplus.annotate._create_empty_files")
+    def test_annotate(
+        self,
+        mock_create_empty_files,
+        mock_run_amrfinderplus,
+        mock_get_file_paths,
+        mock_create_sample_dirs,
+        mock_create_sample_dict,
+        mock_validate_inputs,
+    ):
+        """annotate() wires the helpers together and returns the four formats."""
+        # Empty database directory format; its content is never read because
+        # every helper that would use it is mocked
+        amrfinderplus_db = AMRFinderPlusDatabaseDirFmt()
+
+        # Call the function with mostly default inputs
+        result = annotate(amrfinderplus_db)
+
+        # Ensure the output is the correct types
+        self.assertIsInstance(result[0], AMRFinderPlusAnnotationsDirFmt)
+        self.assertIsInstance(result[1], AMRFinderPlusAnnotationsDirFmt)
+        self.assertIsInstance(result[2], GenesDirectoryFormat)
+        self.assertIsInstance(result[3], ProteinsDirectoryFormat)
+
+        # Every helper runs exactly once for the single mocked sample/file
+        mock_validate_inputs.assert_called_once()
+        mock_create_sample_dict.assert_called_once_with(None, None)
+        mock_create_sample_dirs.assert_called_once()
+        mock_get_file_paths.assert_called_once()
+        mock_run_amrfinderplus.assert_called_once()
+        mock_create_empty_files.assert_called_once()
+
+        # Input paths come from _get_file_paths, output paths are built from
+        # the sample and file IDs, and the raw inputs are not forwarded
+        _, run_kwargs = mock_run_amrfinderplus.call_args
+        self.assertEqual(run_kwargs["dna_path"], "dna_path")
+        self.assertEqual(run_kwargs["protein_path"], "protein_path")
+        self.assertEqual(run_kwargs["gff_path"], "gff_path")
+        self.assertEqual(
+            run_kwargs["amr_annotations_path"],
+            result[0].path / "sample1" / "id1_amr_annotations.tsv",
+        )
+        self.assertIs(run_kwargs["amrfinderplus_db"], amrfinderplus_db)
+        for name in ("sequences", "proteins", "loci"):
+            self.assertNotIn(name, run_kwargs)