def annotate(
    amrfinderplus_db: AMRFinderPlusDatabaseDirFmt,
    sequences: Union[
        MultiMAGSequencesDirFmt, ContigSequencesDirFmt, MAGSequencesDirFmt
    ] = None,
    proteins: ProteinsDirectoryFormat = None,
    loci: LociDirectoryFormat = None,
    organism: str = None,
    plus: bool = False,
    report_all_equal: bool = False,
    ident_min: float = None,
    curated_ident: bool = False,
    coverage_min: float = 0.5,
    translation_table: str = "11",
    annotation_format: str = "prodigal",
    report_common: bool = False,
    threads: int = None,
) -> (
    AMRFinderPlusAnnotationsDirFmt,
    AMRFinderPlusAnnotationsDirFmt,
    GenesDirectoryFormat,
    ProteinsDirectoryFormat,
):
    """Run AMRFinderPlus on every input file and collect the results.

    The input/parameter combination is validated first. Then, for each
    sample and each file returned by ``_create_sample_dict``, the input
    paths are resolved with ``_get_file_paths`` and
    ``_run_amrfinderplus_analyse`` is called once, writing into per-sample
    sub-directories of the four output formats.

    Returns
    -------
    tuple
        ``(amr_annotations, amr_all_mutations, amr_genes, amr_proteins)``.
        ``_create_empty_files`` is called at the end so that outputs which
        received no data still contain a placeholder file.

    Raises
    ------
    ValueError
        Raised by ``_validate_inputs`` for invalid input/parameter
        combinations.
    """
    # Validate input and parameter combinations
    _validate_inputs(
        sequences,
        loci,
        proteins,
        ident_min,
        curated_ident,
        report_common,
        plus,
        organism,
    )

    # Parameters that are identical for every AMRFinderPlus invocation.
    # Listed explicitly (rather than copied from locals()) so that adding a
    # local variable above this point can never leak into the call below.
    common_params = {
        "amrfinderplus_db": amrfinderplus_db,
        "organism": organism,
        "plus": plus,
        "report_all_equal": report_all_equal,
        "ident_min": ident_min,
        "curated_ident": curated_ident,
        "coverage_min": coverage_min,
        "translation_table": translation_table,
        "annotation_format": annotation_format,
        "report_common": report_common,
        "threads": threads,
    }

    # Init output formats
    amr_annotations = AMRFinderPlusAnnotationsDirFmt()
    amr_all_mutations = AMRFinderPlusAnnotationsDirFmt()
    amr_genes = GenesDirectoryFormat()
    amr_proteins = ProteinsDirectoryFormat()

    # Create sample_dict to iterate over input files
    sample_dict = _create_sample_dict(proteins, sequences)

    for sample_id, files_dict in sample_dict.items():
        # Create sample directories in output directories
        _create_sample_dirs(
            sequences,
            proteins,
            organism,
            amr_annotations,
            amr_genes,
            amr_proteins,
            amr_all_mutations,
            sample_id,
        )

        # "file_id" instead of "id" to avoid shadowing the builtin
        for file_id, file_fp in files_dict.items():
            # Construct and validate file input paths for amrfinderplus
            dna_path, protein_path, gff_path = _get_file_paths(
                sequences,
                proteins,
                loci,
                file_id,
                file_fp,
                sample_id,
            )

            # Define paths for output files
            amr_annotations_path = (
                amr_annotations.path / sample_id / f"{file_id}_amr_annotations.tsv"
            )
            amr_genes_path = amr_genes.path / sample_id / f"{file_id}_amr_genes.fasta"
            amr_proteins_path = (
                amr_proteins.path / sample_id / f"{file_id}_amr_proteins.fasta"
            )
            amr_all_mutations_path = (
                amr_all_mutations.path
                / sample_id
                / f"{file_id}_amr_all_mutations.tsv"
            )

            # Run amrfinderplus
            _run_amrfinderplus_analyse(
                dna_path=dna_path,
                protein_path=protein_path,
                gff_path=gff_path,
                amr_annotations_path=amr_annotations_path,
                amr_genes_path=amr_genes_path,
                amr_proteins_path=amr_proteins_path,
                amr_all_mutations_path=amr_all_mutations_path,
                **common_params,
            )

    # Create empty files for empty output artifacts if needed
    _create_empty_files(
        sequences, proteins, organism, amr_genes, amr_proteins, amr_all_mutations
    )

    return amr_annotations, amr_all_mutations, amr_genes, amr_proteins
# Allowed values for the "organism" parameter. The last five entries
# ("Acinetobacter" ... "Serratia") are the names for which
# _run_amrfinderplus_analyse additionally passes "--gpipe_org"
# (presumably the Pathogen Detection taxgroup names mentioned in the
# parameter description).
organisms = [
    "Acinetobacter_baumannii",
    "Burkholderia_cepacia",
    "Burkholderia_pseudomallei",
    "Campylobacter",
    "Citrobacter_freundii",
    "Clostridioides_difficile",
    "Enterobacter_asburiae",
    "Enterobacter_cloacae",
    "Enterococcus_faecalis",
    "Enterococcus_faecium",
    "Escherichia",
    "Klebsiella_oxytoca",
    "Klebsiella_pneumoniae",
    "Neisseria_gonorrhoeae",
    "Neisseria_meningitidis",
    "Pseudomonas_aeruginosa",
    "Salmonella",
    "Serratia_marcescens",
    "Staphylococcus_aureus",
    "Staphylococcus_pseudintermedius",
    "Streptococcus_agalactiae",
    "Streptococcus_pneumoniae",
    "Streptococcus_pyogenes",
    "Vibrio_cholerae",
    "Vibrio_parahaemolyticus",
    "Vibrio_vulnificus",
    "Acinetobacter",
    "Burkholderia_cepacia_complex",
    "Escherichia_coli_Shigella",
    "Klebsiella",
    "Serratia",
]


# Allowed values for the "translation_table" parameter; kept as strings
# because the parameter is registered as Str % Choices(...).
translation_tables = [
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "9",
    "10",
    "11",
    "12",
    "13",
    "14",
    "15",
    "16",
    "21",
    "22",
    "23",
    "24",
    "25",
    "26",
    "27",
    "28",
    "29",
    "30",
    "31",
    "33",
]


# QIIME 2 parameter types for the "annotate" action.
amrfinderplus_parameters = {
    "organism": Str % Choices(organisms),
    "plus": Bool,
    "report_all_equal": Bool,
    "ident_min": Float % Range(0, 1, inclusive_start=True, inclusive_end=True),
    "curated_ident": Bool,
    "coverage_min": Float % Range(0, 1, inclusive_start=True, inclusive_end=True),
    "translation_table": Str % Choices(translation_tables),
    "annotation_format": Str
    % Choices(
        "bakta",
        "genbank",
        "microscope",
        "patric",
        "pgap",
        "prodigal",
        "prokka",
        "pseudomonasdb",
        "rast",
        "standard",
    ),
    "report_common": Bool,
    "threads": Int % Range(0, None, inclusive_start=False),
}

# User-facing help text; one entry per key of amrfinderplus_parameters.
amrfinderplus_parameter_descriptions = {
    "organism": "Taxon used for screening known resistance causing point mutations "
    "and blacklisting of common, non-informative genes. Pathogen Detection "
    "taxgroup names can also be used.",
    "plus": "Provide results from 'Plus' genes such as virulence factors, "
    "stress-response genes, etc.",
    "report_all_equal": "Report all equally scoring BLAST and HMM matches. This "
    "will report multiple lines for a single element if there "
    "are multiple reference proteins that have the same score. "
    "On those lines the fields Accession of closest sequence "
    "and Name of closest sequence will be different showing "
    "each of the database proteins that are equally close to "
    "the query sequence.",
    # The registered range is [0, 1], so the upstream "-1" sentinel cannot be
    # passed here; it is only emitted internally for 'curated_ident'.
    "ident_min": "Minimum identity for a blast-based hit (Methods BLAST or "
    "PARTIAL). Setting this value will override curated similarity "
    "cutoffs. We only recommend using this option if you have a specific "
    "reason. Cannot be used together with the 'curated_ident' parameter.",
    # _validate_inputs rejects ident_min together with curated_ident, so the
    # text must not promise that one overwrites the other.
    "curated_ident": "Use the curated threshold for a blast-based hit, if it "
    "exists and 0.9 otherwise. Cannot be used together with the "
    "'ident_min' parameter.",
    "coverage_min": "Minimum proportion of reference gene covered for a "
    "BLAST-based hit (Methods BLAST or PARTIAL).",
    "translation_table": "Translation table used for BLASTX.",
    "annotation_format": "Type of GFF file provided with the 'loci' input "
    "(name of the annotation tool or pipeline that produced it).",
    "report_common": "Report proteins common to a taxonomy group.",
    "threads": "The number of threads to use for processing. AMRFinderPlus "
    "defaults to 4 on hosts with >= 4 cores. Setting this number higher"
    " than the number of cores on the running host may cause blastp to "
    "fail. Using more than 4 threads may speed up searches.",
}

# User-facing help text for the four outputs of the "annotate" action.
amrfinderplus_output_descriptions = {
    "amr_annotations": "Annotated AMR genes and mutations.",
    "amr_all_mutations": "Report of genotypes at all locations screened for point "
    "mutations. These files allow you to distinguish between called "
    "point mutations that were the sensitive variant and the point "
    "mutations that could not be called because the sequence was not "
    "found. This file will contain all detected variants from the "
    "reference sequence, so it could be used as an initial screen for "
    "novel variants. Note 'Gene symbols' for mutations not in the "
    "database (identifiable by [UNKNOWN] in the Sequence name field) "
    "have offsets that are relative to the start of the sequence "
    "indicated in the field 'Accession of closest sequence' while "
    "'Gene symbols' from known point-mutation sites have gene symbols "
    "that match the Pathogen Detection Reference Gene Catalog "
    "standardized nomenclature for point mutations.",
    "amr_genes": "Sequences that were identified by AMRFinderPlus as AMR genes. "
    "This will include the entire region that aligns to the references for "
    "point mutations.",
    "amr_proteins": "Protein Sequences that were identified by AMRFinderPlus as "
    "AMR genes. This will include the entire region that aligns to the references "
    "for point mutations.",
}
class TestAnnotate(TestPluginBase):
    """Tests the control flow of ``annotate`` with every helper mocked out."""

    package = "q2_amrfinderplus.tests"

    @patch("q2_amrfinderplus.annotate._validate_inputs")
    @patch(
        "q2_amrfinderplus.annotate._create_sample_dict",
        return_value={"sample1": {"id1": "file_path"}},
    )
    @patch("q2_amrfinderplus.annotate._create_sample_dirs")
    @patch(
        "q2_amrfinderplus.annotate._get_file_paths",
        return_value=("dna_path", "protein_path", "gff_path"),
    )
    @patch("q2_amrfinderplus.annotate._run_amrfinderplus_analyse")
    @patch("q2_amrfinderplus.annotate._create_empty_files")
    def test_annotate(
        self,
        mock_create_empty_files,
        mock_run_amrfinderplus,
        mock_get_file_paths,
        mock_create_sample_dirs,
        mock_create_sample_dict,
        mock_validate_inputs,
    ):
        # Create the AMRFinderPlusDatabaseDirFmt input
        amrfinderplus_db = AMRFinderPlusDatabaseDirFmt()

        # Call the function with mostly default inputs
        result = annotate(amrfinderplus_db)

        # Ensure the output is the correct types
        self.assertIsInstance(result[0], AMRFinderPlusAnnotationsDirFmt)
        self.assertIsInstance(result[1], AMRFinderPlusAnnotationsDirFmt)
        self.assertIsInstance(result[2], GenesDirectoryFormat)
        self.assertIsInstance(result[3], ProteinsDirectoryFormat)

        # The mocked sample dict holds one sample with one file, so every
        # helper has to be invoked exactly once.
        mock_validate_inputs.assert_called_once()
        mock_create_sample_dict.assert_called_once()
        mock_create_sample_dirs.assert_called_once()
        mock_get_file_paths.assert_called_once()
        mock_run_amrfinderplus.assert_called_once()
        mock_create_empty_files.assert_called_once()

        # The resolved input paths and the per-sample output path must be
        # forwarded to the AMRFinderPlus runner as keyword arguments.
        _, run_kwargs = mock_run_amrfinderplus.call_args
        self.assertIs(run_kwargs["amrfinderplus_db"], amrfinderplus_db)
        self.assertEqual(run_kwargs["dna_path"], "dna_path")
        self.assertEqual(run_kwargs["protein_path"], "protein_path")
        self.assertEqual(run_kwargs["gff_path"], "gff_path")
        self.assertEqual(
            run_kwargs["amr_annotations_path"],
            result[0].path / "sample1" / "id1_amr_annotations.tsv",
        )
_create_sample_dirs, + _get_file_paths, + _run_amrfinderplus_analyse, + _validate_inputs, + colorify, + run_command, +) class TestRunCommand(TestPluginBase): @@ -45,3 +62,499 @@ def test_run_command_non_verbose(self, mock_print, mock_subprocess_run): # Ensure no print statements were made mock_print.assert_not_called() + + +class TestRunAMRFinderPlusAnalyse(TestPluginBase): + package = "q2_amrfinderplus.tests" + + @patch("q2_amrfinderplus.utils.run_command") + def test_run_amrfinderplus_analyse(self, mock_run_command): + _run_amrfinderplus_analyse( + amrfinderplus_db="amrfinderplus_db", + dna_path="dna_sequences", + protein_path="protein_sequences", + gff_path="gff", + organism="Acinetobacter", + plus=True, + report_all_equal=True, + ident_min=1, + curated_ident=False, + coverage_min=1, + translation_table="11", + annotation_format="prodigal", + report_common=True, + threads=4, + amr_annotations_path="amr_annotations_path", + amr_genes_path="amr_genes_path", + amr_proteins_path="amr_proteins_path", + amr_all_mutations_path="amr_all_mutations_path", + ) + mock_run_command.assert_called_once_with( + cmd=[ + "amrfinder", + "--database", + "amrfinderplus_db", + "-o", + "amr_annotations_path", + "--print_node", + "-n", + "dna_sequences", + "--nucleotide_output", + "amr_genes_path", + "-p", + "protein_sequences", + "--protein_output", + "amr_proteins_path", + "-g", + "gff", + "--threads", + "4", + "--organism", + "Acinetobacter", + "--mutation_all", + "amr_all_mutations_path", + "--plus", + "--report_all_equal", + "--ident_min", + "1", + "--coverage_min", + "1", + "--translation_table", + "11", + "--annotation_format", + "prodigal", + "--report_common", + "--gpipe_org", + ], + ) + + @patch("q2_amrfinderplus.utils.run_command") + def test_run_amrfinderplus_analyse_minimal(self, mock_run_command): + _run_amrfinderplus_analyse( + amrfinderplus_db="amrfinderplus_db", + dna_path=None, + protein_path=None, + gff_path=None, + organism=None, + plus=False, + 
report_all_equal=False, + ident_min=None, + curated_ident=True, + coverage_min=None, + translation_table=None, + annotation_format=None, + report_common=False, + threads=None, + amr_annotations_path="amr_annotations_path", + ) + mock_run_command.assert_called_once_with( + cmd=[ + "amrfinder", + "--database", + "amrfinderplus_db", + "-o", + "amr_annotations_path", + "--print_node", + "--ident_min", + "-1", + ], + ) + + @patch("q2_amrfinderplus.utils.run_command") + def test_run_amrfinderplus_analyse_exception_message(self, mock_run_command): + # Simulate subprocess.CalledProcessError + mock_run_command.side_effect = subprocess.CalledProcessError( + returncode=1, cmd="amrfinder" + ) + + # Call the function and assert the exception message + with self.assertRaises(Exception) as context: + _run_amrfinderplus_analyse( + amrfinderplus_db="mock_db", + dna_path=None, + protein_path=None, + gff_path=None, + organism=None, + plus=False, + report_all_equal=False, + ident_min=None, + curated_ident=False, + coverage_min=0.5, + translation_table="11", + annotation_format="prodigal", + report_common=False, + threads=None, + amr_annotations_path="mock_annotations_path", + ) + + # Assert the correct exception message is raised + self.assertIn( + "An error was encountered while running AMRFinderPlus", + str(context.exception), + ) + self.assertIn("(return code 1)", str(context.exception)) + + +class TestValidateInputs(TestPluginBase): + package = "q2_amrfinderplus.tests" + + # Test when --i-loci is given without --i-proteins + def test_loci_without_proteins(self): + with self.assertRaisesRegex( + ValueError, "can only be given in combination " 'with "--i-proteins"' + ): + _validate_inputs( + sequences=True, + loci=True, + proteins=False, + ident_min=None, + curated_ident=None, + report_common=None, + plus=None, + organism=None, + ) + + # Test when --i-mags and --i-proteins are given without --i-loci + def test_mags_and_proteins_without_loci(self): + with self.assertRaisesRegex( + 
class TestGetFilePaths(TestPluginBase):
    """Tests for ``_get_file_paths`` with ``os.path.exists`` mocked where needed."""

    package = "q2_amrfinderplus.tests"

    @patch("os.path.exists")
    def test_mags_with_proteins_and_loci(self, mock_exists):
        # Mock the os.path.exists to simulate files existing
        mock_exists.side_effect = [True, True]  # First for protein, second for GFF

        # Call the function with sequences, proteins, and loci
        dna_path, protein_path, gff_path = _get_file_paths(
            sequences=MagicMock(),
            proteins=MagicMock(path=Path("proteins")),
            loci=MagicMock(path=Path("loci")),
            id="id",
            sample_id="sample1",
            file_fp="dna_file.fasta",
        )

        # The DNA path is the given file path; protein and GFF paths are
        # built from the format path, the sample ID and the file ID
        self.assertEqual(dna_path, "dna_file.fasta")
        self.assertEqual(str(protein_path), "proteins/sample1/id.fasta")
        self.assertEqual(str(gff_path), "loci/sample1/id.gff")

    def test_mags_without_proteins_and_loci(self):
        # Call the function with sequences only (no proteins, no loci);
        # sample_id is left at its default
        dna_path, protein_path, gff_path = _get_file_paths(
            sequences=MagicMock(),
            proteins=None,
            loci=None,
            id="sample123",
            file_fp="dna_file.fasta",
        )

        # Only the DNA path is set
        self.assertEqual(dna_path, "dna_file.fasta")
        self.assertEqual(protein_path, None)
        self.assertEqual(gff_path, None)

    @patch("os.path.exists")
    def test_mags_with_missing_protein(self, mock_exists):
        # Mock os.path.exists to simulate the missing protein file
        mock_exists.side_effect = [False]  # Protein file does not exist

        # Call the function with sequences and proteins, but no loci
        with self.assertRaises(ValueError) as context:
            _get_file_paths(
                sequences=MagicMock(),
                proteins=MagicMock(),
                loci=None,
                id="sample123",
                sample_id="sample1",
                file_fp="dna_file.fasta",
            )

        # Check that the exception message contains the correct text
        self.assertIn(
            "Proteins file for ID 'sample123' is missing", str(context.exception)
        )

    @patch("os.path.exists")
    def test_loci_with_missing_gff(self, mock_exists):
        # Only a single os.path.exists result is queued and it is False.
        # NOTE(review): no proteins object is passed, so this is presumably
        # consumed by the GFF existence check - confirm against
        # _get_file_paths.
        mock_exists.side_effect = [False]  # GFF file does not exist

        # Call the function with loci only (sequences and proteins are None)
        with self.assertRaises(ValueError) as context:
            _get_file_paths(
                sequences=None,
                proteins=None,
                loci=MagicMock(path=Path("/mock/loci/path")),
                id="sample123",
                sample_id="sample1",
                file_fp="protein_file.fasta",
            )

        # Check that the exception message contains the correct text
        self.assertIn("GFF file for ID 'sample123' is missing", str(context.exception))
package = "q2_amrfinderplus.tests" + + @patch.object( + MultiMAGSequencesDirFmt, "sample_dict", return_value={"sample1": "some_value"} + ) + def test_create_sample_dict_sequences_multimags(self, mock_sample_dict): + # Mock the sequences input as MultiMAGSequencesDirFmt + sequences = MultiMAGSequencesDirFmt() + + # Call the function + result = _create_sample_dict(proteins=None, sequences=sequences) + + # Check that sample_dict is called correctly + mock_sample_dict.assert_called_once() + + # Ensure the result is the mocked return value of sample_dict + self.assertEqual(result, {"sample1": "some_value"}) + + @patch.object( + ContigSequencesDirFmt, "sample_dict", return_value={"contig_file": "file_path"} + ) + def test_create_sample_dict_sequences_contigs(self, mock_sample_dict): + # Mock the sequences input as ContigSequencesDirFmt + sequences = ContigSequencesDirFmt() + + # Call the function + result = _create_sample_dict(proteins=None, sequences=sequences) + + # Check that sample_dict is called correctly + mock_sample_dict.assert_called_once() + + # Ensure the result has a fake sample key with the file_dict + self.assertEqual(result, {"": {"contig_file": "file_path"}}) + + @patch.object( + MAGSequencesDirFmt, "feature_dict", return_value={"feature_file": "file_path"} + ) + def test_create_sample_dict_sequences_mag(self, mock_feature_dict): + # Mock the sequences input as MAGSequencesDirFmt + sequences = MAGSequencesDirFmt() + + # Call the function + result = _create_sample_dict(proteins=None, sequences=sequences) + + # Check that feature_dict is called correctly + mock_feature_dict.assert_called_once() + + # Ensure the result has a fake sample key with the feature_dict + self.assertEqual(result, {"": {"feature_file": "file_path"}}) + + def test_create_sample_dict_proteins_sample_data(self): + proteins = ProteinsDirectoryFormat() + + os.mkdir(proteins.path / "directory") + with open(proteins.path / "directory" / "file.fasta", "w"): + pass + + result = 
_create_sample_dict(proteins=proteins, sequences=None) + + self.assertEqual( + result, + {"directory": {"file": str(proteins.path / "directory" / "file.fasta")}}, + ) + + def test_create_sample_dict_proteins_feature_data(self): + proteins = ProteinsDirectoryFormat() + + with open(proteins.path / "file.fasta", "w"): + pass + + result = _create_sample_dict(proteins=proteins, sequences=None) + + self.assertEqual(result, {"": {"file": str(proteins.path / "file.fasta")}}) + + +class TestCreateEmptyFiles(TestPluginBase): + package = "q2_amrfinderplus.tests" + + @patch("builtins.open", new_callable=mock_open) + @patch("sys.stdout", new_callable=StringIO) + def test_create_empty_files_all_false(self, mock_stdout, mock_open_file): + amr_genes = MagicMock(path=Path("path/amr_genes")) + amr_proteins = MagicMock(path=Path("path/amr_proteins")) + amr_all_mutations = MagicMock(path=Path("path/amr_all_mutations")) + + _create_empty_files( + sequences=False, + proteins=False, + organism=False, + amr_genes=amr_genes, + amr_proteins=amr_proteins, + amr_all_mutations=amr_all_mutations, + ) + + # Assertions for file creation + mock_open_file.assert_any_call(Path("path/amr_genes/empty.fasta"), "w") + mock_open_file.assert_any_call(Path("path/amr_proteins/empty.fasta"), "w") + mock_open_file.assert_any_call( + Path("path/amr_all_mutations/empty_amr_all_mutations.tsv"), "w" + ) + self.assertEqual(mock_open_file.call_count, 3) + + # Capture printed output + printed_output = mock_stdout.getvalue() + + # Assertions for print statements by checking keywords + self.assertIn("amr_genes", printed_output) + self.assertIn("amr_proteins", printed_output) + self.assertIn("amr_all_mutations", printed_output) + + @patch("builtins.open", new_callable=mock_open) + def test_create_empty_files_all_true(self, mock_open_file): + amr_genes = MagicMock(path=Path("path/amr_genes")) + amr_proteins = MagicMock(path=Path("path/amr_proteins")) + amr_all_mutations = MagicMock(path=Path("path/amr_all_mutations")) + 
+ _create_empty_files( + sequences=True, + proteins=True, + organism=True, + amr_genes=amr_genes, + amr_proteins=amr_proteins, + amr_all_mutations=amr_all_mutations, + ) + + # Assertions + mock_open_file.assert_not_called() + + +class TestCreateSampleDirs(TestPluginBase): + package = "q2_amrfinderplus.tests" + + @patch("os.makedirs") + def test_create_sample_dirs_all_exist(self, mock_makedirs): + amr_annotations = MagicMock(path=Path("/fake/path/amr_annotations")) + amr_genes = MagicMock(path=Path("/fake/path/amr_genes")) + amr_proteins = MagicMock(path=Path("/fake/path/amr_proteins")) + amr_all_mutations = MagicMock(path=Path("/fake/path/amr_all_mutations")) + + _create_sample_dirs( + sequences=True, + proteins=True, + organism=True, + amr_annotations=amr_annotations, + amr_genes=amr_genes, + amr_proteins=amr_proteins, + amr_all_mutations=amr_all_mutations, + sample_id="sample1", + ) + + # Assertions + mock_makedirs.assert_any_call( + Path("/fake/path/amr_annotations/sample1"), exist_ok=True + ) + mock_makedirs.assert_any_call( + Path("/fake/path/amr_genes/sample1"), exist_ok=True + ) + mock_makedirs.assert_any_call( + Path("/fake/path/amr_proteins/sample1"), exist_ok=True + ) + mock_makedirs.assert_any_call( + Path("/fake/path/amr_all_mutations/sample1"), exist_ok=True + ) + self.assertEqual(mock_makedirs.call_count, 4) + + @patch("os.makedirs") + def test_create_sample_dirs_nothing(self, mock_makedirs): + amr_annotations = MagicMock(path=Path("/fake/path/amr_annotations")) + + _create_sample_dirs( + sequences=False, + proteins=False, + organism=False, + amr_annotations=amr_annotations, + amr_genes=None, + amr_proteins=None, + amr_all_mutations=None, + sample_id="sample1", + ) + + # Assertions + mock_makedirs.assert_any_call( + Path("/fake/path/amr_annotations/sample1"), exist_ok=True + ) + self.assertEqual(mock_makedirs.call_count, 1) + + +class TestColorify(TestPluginBase): + package = "q2_amrfinderplus.tests" + + def test_colorify(self): + # Test if colorify 
wraps the string with the correct ANSI codes for yellow + result = colorify("Hello") + expected = "\033[1;33mHello\033[0m" + self.assertEqual(result, expected) diff --git a/q2_amrfinderplus/types/_type.py b/q2_amrfinderplus/types/_type.py index 5cc2f5e..e36796c 100644 --- a/q2_amrfinderplus/types/_type.py +++ b/q2_amrfinderplus/types/_type.py @@ -5,12 +5,11 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -from q2_types.feature_data import FeatureData -from q2_types.sample_data import SampleData +from q2_types.genome_data import GenomeData from qiime2.core.type import SemanticType AMRFinderPlusDatabase = SemanticType("AMRFinderPlusDatabase") AMRFinderPlusAnnotations = SemanticType( "AMRFinderPlusAnnotations", - variant_of=[SampleData.field["type"], FeatureData.field["type"]], + variant_of=GenomeData.field["type"], ) diff --git a/q2_amrfinderplus/types/tests/test_types_formats_transformers.py b/q2_amrfinderplus/types/tests/test_types_formats_transformers.py index 901ec63..5120733 100644 --- a/q2_amrfinderplus/types/tests/test_types_formats_transformers.py +++ b/q2_amrfinderplus/types/tests/test_types_formats_transformers.py @@ -45,8 +45,8 @@ def test_amrfinderplus_annotation_format_validate_positive(self): def test_amrfinderplus_annotation_format_validate_positive_coordinates(self): filepath = self.get_data_path( - "annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d" - "_amr_annotations.tsv" + "annotation/coordinates/" + "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv" ) format = AMRFinderPlusAnnotationFormat(filepath, mode="r") format.validate() @@ -101,9 +101,7 @@ def test_amrfinderplus_annotation_format_validation_error(self): self.assertEqual(str(context.exception), expected_message) def test_amrfinderplus_annotations_dir_fmt_feature(self): - dirpath = self.get_data_path( - "annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d" 
- ) + dirpath = self.get_data_path("annotation/coordinates") annotations = AMRFinderPlusAnnotationsDirFmt(dirpath, mode="r") assert isinstance(annotations, AMRFinderPlusAnnotationsDirFmt) diff --git a/q2_amrfinderplus/utils.py b/q2_amrfinderplus/utils.py index 1bad407..c8c0ec9 100644 --- a/q2_amrfinderplus/utils.py +++ b/q2_amrfinderplus/utils.py @@ -1,5 +1,10 @@ +import os import subprocess +from q2_types.feature_data_mag import MAGSequencesDirFmt +from q2_types.genome_data import ProteinsDirectoryFormat +from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt + EXTERNAL_CMD_WARNING = ( "Running external command line application(s). " "This may print messages to stdout and/or stderr.\n" @@ -15,3 +20,269 @@ def run_command(cmd, cwd=None, verbose=True): print("\nCommand:", end=" ") print(" ".join(cmd), end="\n\n") subprocess.run(cmd, check=True, cwd=cwd) + + +def _validate_inputs( + sequences, loci, proteins, ident_min, curated_ident, report_common, plus, organism +): + # Ensure that at least sequences or proteins is provided + if not sequences and not proteins: + raise ValueError('"--i-sequences" or "--i-proteins" input has to be provided.') + + # Check if loci is provided with sequences but without proteins + # (invalid combination) + if sequences and loci and not proteins: + raise ValueError( + '"--i-loci" input can only be given in combination with "--i-proteins" ' + "input." + ) + + # Check if sequences and proteins are provided together but without loci + # (invalid combination) + if sequences and not loci and proteins: + raise ValueError( + '"--i-sequences" and "--i-proteins" inputs together can only ' + 'be given in combination with "--i-loci" input.' + ) + + # Validate that ident_min and curated_ident are not used together + if ident_min and curated_ident: + raise ValueError( + '"--p-ident-min" and "--p-curated-ident" cannot be used simultaneously.' 
def _run_amrfinderplus_analyse(
    amrfinderplus_db,
    dna_path,
    protein_path,
    gff_path,
    organism,
    plus,
    report_all_equal,
    ident_min,
    curated_ident,
    coverage_min,
    translation_table,
    annotation_format,
    report_common,
    threads,
    amr_annotations_path,
    amr_genes_path=None,
    amr_proteins_path=None,
    amr_all_mutations_path=None,
):
    """Assemble the ``amrfinder`` command line and execute it.

    Optional inputs and flags are only appended when they are set:

    * ``dna_path`` adds ``-n`` and ``--nucleotide_output amr_genes_path``.
    * ``protein_path`` adds ``-p`` and ``--protein_output amr_proteins_path``.
    * ``gff_path`` adds ``-g``.
    * ``organism`` adds ``--organism`` and
      ``--mutation_all amr_all_mutations_path``; for the five group names
      listed at the end of the function ``--gpipe_org`` is added as well.
    * ``curated_ident`` is passed on as ``--ident_min -1``.

    ``ident_min`` and ``coverage_min`` are compared against ``None`` so
    that an explicit value of ``0`` is forwarded instead of being dropped.

    Raises
    ------
    Exception
        If ``amrfinder`` exits with a non-zero return code. The original
        ``subprocess.CalledProcessError`` is attached as ``__cause__``.
    """
    cmd = [
        "amrfinder",
        "--database",
        str(amrfinderplus_db),
        "-o",
        str(amr_annotations_path),
        "--print_node",
    ]
    # Creates nucleotide fasta output if DNA sequences are given as input
    if dna_path:
        cmd.extend(
            [
                "-n",
                str(dna_path),
                "--nucleotide_output",
                str(amr_genes_path),
            ]
        )
    # Creates protein fasta output if protein sequences are given as input
    if protein_path:
        cmd.extend(
            [
                "-p",
                str(protein_path),
                "--protein_output",
                str(amr_proteins_path),
            ]
        )
    if gff_path:
        cmd.extend(["-g", str(gff_path)])
    if threads:
        cmd.extend(["--threads", str(threads)])
    # Creates all mutations output if an organism is specified
    if organism:
        cmd.extend(
            [
                "--organism",
                organism,
                "--mutation_all",
                str(amr_all_mutations_path),
            ]
        )
    if plus:
        cmd.append("--plus")
    if report_all_equal:
        cmd.append("--report_all_equal")
    # 0 is a valid threshold, so test for None instead of truthiness
    if ident_min is not None:
        cmd.extend(["--ident_min", str(ident_min)])
    if curated_ident:
        cmd.extend(["--ident_min", "-1"])
    # 0 is a valid threshold, so test for None instead of truthiness
    if coverage_min is not None:
        cmd.extend(["--coverage_min", str(coverage_min)])
    if translation_table:
        cmd.extend(["--translation_table", str(translation_table)])
    if annotation_format:
        cmd.extend(["--annotation_format", str(annotation_format)])
    if report_common:
        cmd.append("--report_common")
    if organism in (
        "Acinetobacter",
        "Burkholderia_cepacia_complex",
        "Escherichia_coli_Shigella",
        "Klebsiella",
        "Serratia",
    ):
        cmd.append("--gpipe_org")

    try:
        run_command(cmd=cmd)
    except subprocess.CalledProcessError as e:
        # Chain the original error so the failing command stays visible
        raise Exception(
            "An error was encountered while running AMRFinderPlus, "
            f"(return code {e.returncode}), please inspect "
            "stdout and stderr to learn more."
        ) from e
+ ) + ) + + +def _create_sample_dirs( + sequences, + proteins, + organism, + amr_annotations, + amr_genes, + amr_proteins, + amr_all_mutations, + sample_id, +): + os.makedirs(amr_annotations.path / sample_id, exist_ok=True) + if sequences: + os.makedirs(amr_genes.path / sample_id, exist_ok=True) + if proteins: + os.makedirs(amr_proteins.path / sample_id, exist_ok=True) + if organism: + os.makedirs(amr_all_mutations.path / sample_id, exist_ok=True) + + +def _create_sample_dict(proteins, sequences): + if sequences: + # For SampleData[MAGs] + if isinstance(sequences, MultiMAGSequencesDirFmt): + sample_dict = sequences.sample_dict() + + # For SampleData[Contigs] + elif isinstance(sequences, ContigSequencesDirFmt): + file_dict = sequences.sample_dict() + # Create fake sample for sample_dict + sample_dict = {"": file_dict} + + # For FeatureData[MAG] + elif isinstance(sequences, MAGSequencesDirFmt): + file_dict = sequences.feature_dict() + # Create fake sample for sample_dict + sample_dict = {"": file_dict} + + else: + proteins.pathspec = r".+\.(fa|faa|fasta)$" + + # Monkey patch the sample_dict instance method of MultiMAGSequencesDirFmt to + # ProteinsDirectoryFormat if it has a sample data dir structure + if any(item.is_dir() for item in proteins.path.iterdir()): + proteins.sample_dict = MultiMAGSequencesDirFmt.sample_dict.__get__( + proteins, ProteinsDirectoryFormat + ) + sample_dict = proteins.sample_dict() + # Monkey patch the feature_dict instance method of MAGSequencesDirFmt to + # ProteinsDirectoryFormat if it has a feature data dir structure + else: + proteins.feature_dict = MAGSequencesDirFmt.feature_dict.__get__( + proteins, ProteinsDirectoryFormat + ) + file_dict = proteins.feature_dict() + # create sample_dict with fake sample + sample_dict = {"": file_dict} + + return sample_dict + + +def _get_file_paths(sequences, proteins, loci, id, file_fp, sample_id=""): + # If mags is provided + if sequences: + dna_path = file_fp + + # If proteins are provided, 
construct the expected protein file path. + if proteins: + protein_path = proteins.path / sample_id / f"{id}.fasta" + + # Raise an error if the expected protein file does not exist. + if not os.path.exists(protein_path): + raise ValueError( + f"Proteins file for ID '{id}' is missing in proteins input." + ) + else: + protein_path = None + + # If only proteins are provided (without mags), determine dna and protein file path. + else: + dna_path = None + protein_path = file_fp + + # If loci are provided, construct the expected GFF file path. + if loci: + gff_path = loci.path / sample_id / f"{id}.gff" + + # Raise an error if the expected GFF file does not exist. + if not os.path.exists(gff_path): + raise ValueError(f"GFF file for ID '{id}' is missing in loci input.") + else: + gff_path = None + + return dna_path, protein_path, gff_path + + +def colorify(string: str): + return "%s%s%s" % ("\033[1;33m", string, "\033[0m")