Skip to content

Commit

Permalink
merge 3
Browse files Browse the repository at this point in the history
  • Loading branch information
VinzentRisch committed Sep 26, 2024
2 parents 1026eec + ee66471 commit 4f76e2e
Show file tree
Hide file tree
Showing 7 changed files with 1,153 additions and 11 deletions.
129 changes: 129 additions & 0 deletions q2_amrfinderplus/annotate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from typing import Union

from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.genome_data import (
GenesDirectoryFormat,
LociDirectoryFormat,
ProteinsDirectoryFormat,
)
from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt

from q2_amrfinderplus.types import (
AMRFinderPlusAnnotationsDirFmt,
AMRFinderPlusDatabaseDirFmt,
)
from q2_amrfinderplus.utils import (
_create_empty_files,
_create_sample_dict,
_create_sample_dirs,
_get_file_paths,
_run_amrfinderplus_analyse,
_validate_inputs,
)


def annotate(
    amrfinderplus_db: AMRFinderPlusDatabaseDirFmt,
    sequences: Union[
        MultiMAGSequencesDirFmt, ContigSequencesDirFmt, MAGSequencesDirFmt
    ] = None,
    proteins: ProteinsDirectoryFormat = None,
    loci: LociDirectoryFormat = None,
    organism: str = None,
    plus: bool = False,
    report_all_equal: bool = False,
    ident_min: float = None,
    curated_ident: bool = False,
    coverage_min: float = 0.5,
    translation_table: str = "11",
    annotation_format: str = "prodigal",
    report_common: bool = False,
    threads: int = None,
) -> (
    AMRFinderPlusAnnotationsDirFmt,
    AMRFinderPlusAnnotationsDirFmt,
    GenesDirectoryFormat,
    ProteinsDirectoryFormat,
):
    """Annotate sequences and/or proteins with AMRFinderPlus.

    The input/parameter combination is validated first. AMRFinderPlus is then
    run once per input file and the results are collected in four output
    directory formats, laid out as ``<sample_id>/<item_id>_<suffix>``.

    Parameters
    ----------
    amrfinderplus_db
        AMRFinderPlus database, forwarded to every AMRFinderPlus run.
    sequences
        MAGs or contigs to annotate (optional).
    proteins
        Protein sequences to annotate (optional).
    loci
        GFF files that give sequence coordinates for ``proteins`` (optional).
    organism, plus, report_all_equal, ident_min, curated_ident, coverage_min,
    translation_table, annotation_format, report_common, threads
        AMRFinderPlus settings, forwarded unchanged as keyword arguments to
        ``_run_amrfinderplus_analyse``.

    Returns
    -------
    tuple
        ``(amr_annotations, amr_all_mutations, amr_genes, amr_proteins)``.
    """
    # Validate input and parameter combinations
    _validate_inputs(
        sequences,
        loci,
        proteins,
        ident_min,
        curated_ident,
        report_common,
        plus,
        organism,
    )

    # Parameters shared by every _run_amrfinderplus_analyse call. They are
    # listed explicitly (rather than snapshotting locals()) so that adding a
    # local variable above this point can never leak into the keyword
    # arguments of the call below.
    common_params = {
        "amrfinderplus_db": amrfinderplus_db,
        "organism": organism,
        "plus": plus,
        "report_all_equal": report_all_equal,
        "ident_min": ident_min,
        "curated_ident": curated_ident,
        "coverage_min": coverage_min,
        "translation_table": translation_table,
        "annotation_format": annotation_format,
        "report_common": report_common,
        "threads": threads,
    }

    # Init output formats
    amr_annotations = AMRFinderPlusAnnotationsDirFmt()
    amr_all_mutations = AMRFinderPlusAnnotationsDirFmt()
    amr_genes = GenesDirectoryFormat()
    amr_proteins = ProteinsDirectoryFormat()

    # Create sample_dict to iterate over input files
    sample_dict = _create_sample_dict(proteins, sequences)

    # Iterate over sample_dict: {sample_id: {item_id: file path}}
    for sample_id, files_dict in sample_dict.items():
        # Create sample directories in output directories
        _create_sample_dirs(
            sequences,
            proteins,
            organism,
            amr_annotations,
            amr_genes,
            amr_proteins,
            amr_all_mutations,
            sample_id,
        )

        for item_id, file_fp in files_dict.items():
            # Construct and validate file input paths for amrfinderplus
            dna_path, protein_path, gff_path = _get_file_paths(
                sequences,
                proteins,
                loci,
                item_id,
                file_fp,
                sample_id,
            )

            # Define paths for output files
            amr_annotations_path = (
                amr_annotations.path / sample_id / f"{item_id}_amr_annotations.tsv"
            )
            amr_genes_path = (
                amr_genes.path / sample_id / f"{item_id}_amr_genes.fasta"
            )
            amr_proteins_path = (
                amr_proteins.path / sample_id / f"{item_id}_amr_proteins.fasta"
            )
            amr_all_mutations_path = (
                amr_all_mutations.path
                / sample_id
                / f"{item_id}_amr_all_mutations.tsv"
            )

            # Run amrfinderplus
            _run_amrfinderplus_analyse(
                dna_path=dna_path,
                protein_path=protein_path,
                gff_path=gff_path,
                amr_annotations_path=amr_annotations_path,
                amr_genes_path=amr_genes_path,
                amr_proteins_path=amr_proteins_path,
                amr_all_mutations_path=amr_all_mutations_path,
                **common_params,
            )

    # Create empty files for empty output artifacts if needed
    _create_empty_files(
        sequences, proteins, organism, amr_genes, amr_proteins, amr_all_mutations
    )

    return amr_annotations, amr_all_mutations, amr_genes, amr_proteins
187 changes: 186 additions & 1 deletion q2_amrfinderplus/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,16 @@
# ----------------------------------------------------------------------------
import importlib

from q2_types.feature_data import FeatureData
from q2_types.feature_data_mag import MAG
from q2_types.genome_data import Genes, GenomeData, Loci, Proteins
from q2_types.per_sample_sequences import Contigs, MAGs
from q2_types.sample_data import SampleData
from qiime2.core.type import Bool, Choices, Float, Int, Range, Str
from qiime2.plugin import Citations, Plugin

from q2_amrfinderplus import __version__
from q2_amrfinderplus.annotate import annotate
from q2_amrfinderplus.database import fetch_amrfinderplus_db
from q2_amrfinderplus.types._format import (
AMRFinderPlusAnnotationFormat,
Expand Down Expand Up @@ -50,12 +56,191 @@
citations=[citations["feldgarden2021amrfinderplus"]],
)

# Values offered as choices for the ``organism`` parameter.
organisms = [
    # Alphabetically ordered entries.
    "Acinetobacter_baumannii",
    "Burkholderia_cepacia",
    "Burkholderia_pseudomallei",
    "Campylobacter",
    "Citrobacter_freundii",
    "Clostridioides_difficile",
    "Enterobacter_asburiae",
    "Enterobacter_cloacae",
    "Enterococcus_faecalis",
    "Enterococcus_faecium",
    "Escherichia",
    "Klebsiella_oxytoca",
    "Klebsiella_pneumoniae",
    "Neisseria_gonorrhoeae",
    "Neisseria_meningitidis",
    "Pseudomonas_aeruginosa",
    "Salmonella",
    "Serratia_marcescens",
    "Staphylococcus_aureus",
    "Staphylococcus_pseudintermedius",
    "Streptococcus_agalactiae",
    "Streptococcus_pneumoniae",
    "Streptococcus_pyogenes",
    "Vibrio_cholerae",
    "Vibrio_parahaemolyticus",
    "Vibrio_vulnificus",
    # Additional entries appended after the alphabetical run.
    # NOTE(review): presumably the Pathogen Detection taxgroup names that the
    # ``organism`` description mentions - confirm against AMRFinderPlus docs.
    "Acinetobacter",
    "Burkholderia_cepacia_complex",
    "Escherichia_coli_Shigella",
    "Klebsiella",
    "Serratia",
]


# Identifiers offered as choices for the ``translation_table`` parameter,
# stored as strings. The numbering is not contiguous: 7, 8, 17-20 and 32 are
# deliberately absent.
translation_tables = [
    str(table_id)
    for table_id in (*range(1, 7), *range(9, 17), *range(21, 32), 33)
]


# QIIME 2 parameter types for the ``annotate`` method, keyed by parameter name.
amrfinderplus_parameters = {
    "organism": Str % Choices(organisms),
    "plus": Bool,
    "report_all_equal": Bool,
    # Identity and coverage thresholds are restricted to the range 0 to 1,
    # with both end points allowed.
    "ident_min": Float % Range(0, 1, inclusive_start=True, inclusive_end=True),
    "curated_ident": Bool,
    "coverage_min": Float % Range(0, 1, inclusive_start=True, inclusive_end=True),
    "translation_table": Str % Choices(translation_tables),
    "annotation_format": Str
    % Choices(
        "bakta",
        "genbank",
        "microscope",
        "patric",
        "pgap",
        "prodigal",
        "prokka",
        "pseudomonasdb",
        "rast",
        "standard",
    ),
    "report_common": Bool,
    # Lower bound 0 is excluded and there is no upper bound.
    "threads": Int % Range(0, None, inclusive_start=False),
}

# User-facing help text for the ``annotate`` parameters, keyed by parameter
# name. Every key of the parameter type mapping has an entry here.
amrfinderplus_parameter_descriptions = {
    "organism": (
        "Taxon used for screening known resistance causing point mutations "
        "and blacklisting of common, non-informative genes. Pathogen Detection "
        "taxgroup names can also be used."
    ),
    "plus": (
        "Provide results from 'Plus' genes such as virulence factors, "
        "stress-response genes, etc."
    ),
    "report_all_equal": (
        "Report all equally scoring BLAST and HMM matches. This "
        "will report multiple lines for a single element if there "
        "are multiple reference proteins that have the same score. "
        "On those lines the fields Accession of closest sequence "
        "and Name of closest sequence will be different showing "
        "each of the database proteins that are equally close to "
        "the query sequence."
    ),
    # The parameter only accepts values in [0, 1], so the text must not refer
    # to the upstream "-1" sentinel, which cannot be passed here.
    "ident_min": (
        "Minimum identity for a blast-based hit (Methods BLAST or "
        "PARTIAL). Setting this value will override curated similarity "
        "cutoffs. We only recommend using this option if you have a "
        "specific reason."
    ),
    "curated_ident": (
        "Use the curated threshold for a blast-based hit, if it "
        "exists and 0.9 otherwise. This will overwrite the value specified with the "
        "'ident_min' parameter."
    ),
    "coverage_min": (
        "Minimum proportion of reference gene covered for a "
        "BLAST-based hit (Methods BLAST or PARTIAL)."
    ),
    "translation_table": "Translation table used for BLASTX.",
    "annotation_format": (
        "Format of the GFF files provided with the 'loci' input."
    ),
    "report_common": "Report proteins common to a taxonomy group.",
    "threads": (
        "The number of threads to use for processing. AMRFinderPlus "
        "defaults to 4 on hosts with >= 4 cores. Setting this number higher"
        " than the number of cores on the running host may cause blastp to "
        "fail. Using more than 4 threads may speed up searches."
    ),
}

# User-facing help text for the four ``annotate`` outputs, keyed by output name.
amrfinderplus_output_descriptions = {
    "amr_annotations": "Annotated AMR genes and mutations.",
    "amr_all_mutations": (
        "Report of genotypes at all locations screened for point "
        "mutations. These files allow you to distinguish between called "
        "point mutations that were the sensitive variant and the point "
        "mutations that could not be called because the sequence was not "
        "found. This file will contain all detected variants from the "
        "reference sequence, so it could be used as an initial screen for "
        "novel variants. Note 'Gene symbols' for mutations not in the "
        "database (identifiable by [UNKNOWN] in the Sequence name field) "
        "have offsets that are relative to the start of the sequence "
        "indicated in the field 'Accession of closest sequence' while "
        "'Gene symbols' from known point-mutation sites have gene symbols "
        "that match the Pathogen Detection Reference Gene Catalog "
        "standardized nomenclature for point mutations."
    ),
    "amr_genes": (
        "Sequences that were identified by AMRFinderPlus as AMR genes. "
        "This will include the entire region that aligns to the references for "
        "point mutations."
    ),
    "amr_proteins": (
        "Protein Sequences that were identified by AMRFinderPlus as "
        "AMR genes. This will include the entire region that aligns to the references "
        "for point mutations."
    ),
}


# User-facing help text for the ``annotate`` inputs, keyed by input name.
amrfinderplus_input_descriptions = {
    "sequences": "MAGs or contigs to be annotated with AMRFinderPlus.",
    "proteins": "Protein sequences to be annotated with AMRFinderPlus.",
    "loci": (
        "GFF files to give sequence coordinates for proteins input. Required "
        "for combined searches of protein and DNA sequences."
    ),
    "amrfinderplus_db": "AMRFinderPlus Database.",
}


# Register ``annotate`` as a plugin method: semantic types for its inputs and
# outputs, the parameter types, and the help text shown to users.
plugin.methods.register_function(
    function=annotate,
    inputs={
        "sequences": SampleData[MAGs | Contigs] | FeatureData[MAG],
        "proteins": GenomeData[Proteins],
        "loci": GenomeData[Loci],
        "amrfinderplus_db": AMRFinderPlusDatabase,
    },
    parameters=amrfinderplus_parameters,
    # Listed in the same order as the tuple returned by ``annotate``.
    outputs=[
        ("amr_annotations", GenomeData[AMRFinderPlusAnnotations]),
        ("amr_all_mutations", GenomeData[AMRFinderPlusAnnotations]),
        ("amr_genes", GenomeData[Genes]),
        ("amr_proteins", GenomeData[Proteins]),
    ],
    input_descriptions=amrfinderplus_input_descriptions,
    parameter_descriptions=amrfinderplus_parameter_descriptions,
    output_descriptions=amrfinderplus_output_descriptions,
    name="Annotate MAGs or contigs with AMRFinderPlus.",
    description="Annotate sample data MAGs or contigs with antimicrobial resistance "
    "genes with AMRFinderPlus. Check https://github.com/ncbi/amr/wiki for "
    "documentation.",
    citations=[citations["feldgarden2021amrfinderplus"]],
)


# Bind the database semantic type to its on-disk directory format.
plugin.register_semantic_type_to_format(
    AMRFinderPlusDatabase,
    artifact_format=AMRFinderPlusDatabaseDirFmt,
)
# Bind the annotation semantic type to its on-disk directory format. Exactly
# one semantic type is passed positionally; the format is given by keyword.
plugin.register_semantic_type_to_format(
    GenomeData[AMRFinderPlusAnnotations],
    artifact_format=AMRFinderPlusAnnotationsDirFmt,
)

Expand Down
47 changes: 47 additions & 0 deletions q2_amrfinderplus/tests/test_annotate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from unittest.mock import patch

from q2_types.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat
from qiime2.plugin.testing import TestPluginBase

from q2_amrfinderplus.annotate import annotate
from q2_amrfinderplus.types import (
AMRFinderPlusAnnotationsDirFmt,
AMRFinderPlusDatabaseDirFmt,
)


class TestAnnotate(TestPluginBase):
    """Unit tests for ``annotate`` with every helper function patched out."""

    package = "q2_amrfinderplus.tests"

    @patch("q2_amrfinderplus.annotate._validate_inputs")
    @patch(
        "q2_amrfinderplus.annotate._create_sample_dict",
        return_value={"sample1": {"id1": "file_path"}},
    )
    @patch("q2_amrfinderplus.annotate._create_sample_dirs")
    @patch(
        "q2_amrfinderplus.annotate._get_file_paths",
        return_value=("dna_path", "protein_path", "gff_path"),
    )
    @patch("q2_amrfinderplus.annotate._run_amrfinderplus_analyse")
    @patch("q2_amrfinderplus.annotate._create_empty_files")
    def test_annotate(
        self,
        mock_create_empty_files,
        mock_run_amrfinderplus,
        mock_get_file_paths,
        mock_create_sample_dirs,
        mock_create_sample_dict,
        mock_validate_inputs,
    ):
        """Check the output types and that each helper is driven as expected."""
        # Empty database directory format; its content is never read because
        # every helper that would use it is patched.
        amrfinderplus_db = AMRFinderPlusDatabaseDirFmt()

        # Call the function with mostly default inputs
        result = annotate(amrfinderplus_db)

        # Ensure the output is the correct types
        self.assertEqual(len(result), 4)
        self.assertIsInstance(result[0], AMRFinderPlusAnnotationsDirFmt)
        self.assertIsInstance(result[1], AMRFinderPlusAnnotationsDirFmt)
        self.assertIsInstance(result[2], GenesDirectoryFormat)
        self.assertIsInstance(result[3], ProteinsDirectoryFormat)

        # Ensure every helper ran exactly once for the single patched entry;
        # without these checks a no-op implementation would also pass.
        mock_validate_inputs.assert_called_once()
        mock_create_sample_dict.assert_called_once_with(None, None)
        mock_create_sample_dirs.assert_called_once()
        mock_get_file_paths.assert_called_once_with(
            None, None, None, "id1", "file_path", "sample1"
        )
        mock_run_amrfinderplus.assert_called_once()
        mock_create_empty_files.assert_called_once()

        # Ensure the resolved input paths, the database and the output path
        # are forwarded to the AMRFinderPlus run.
        _, run_kwargs = mock_run_amrfinderplus.call_args
        self.assertEqual(run_kwargs["dna_path"], "dna_path")
        self.assertEqual(run_kwargs["protein_path"], "protein_path")
        self.assertEqual(run_kwargs["gff_path"], "gff_path")
        self.assertIs(run_kwargs["amrfinderplus_db"], amrfinderplus_db)
        self.assertEqual(
            str(run_kwargs["amr_annotations_path"]),
            str(result[0].path / "sample1" / "id1_amr_annotations.tsv"),
        )
Loading

0 comments on commit 4f76e2e

Please sign in to comment.