diff --git a/q2_amrfinderplus/annotate.py b/q2_amrfinderplus/annotate.py index 09eb401..a61d9ec 100644 --- a/q2_amrfinderplus/annotate.py +++ b/q2_amrfinderplus/annotate.py @@ -1,3 +1,4 @@ +import os from typing import Union from q2_types.feature_data_mag import MAGSequencesDirFmt @@ -58,10 +59,9 @@ def annotate( ) # Set up common parameters for _run_amrfinderplus_analyse - common_params = locals().copy() - del common_params["sequences"] - del common_params["proteins"] - del common_params["loci"] + common_params = { + k: v for k, v in locals().items() if k not in ("sequences", "proteins", "loci") + } # Innit output formats amr_annotations = AMRFinderPlusAnnotationsDirFmt() @@ -86,27 +86,30 @@ def annotate( sample_id, ) - for id, file_fp in files_dict.items(): + for _id, file_fp in files_dict.items(): # Construct and validate file input paths for amrfinderplus dna_path, protein_path, gff_path = _get_file_paths( sequences, proteins, loci, - id, + _id, file_fp, sample_id, ) # Define paths for output files - amr_annotations_path = ( - amr_annotations.path / sample_id / f"{id}_amr_annotations.tsv" + amr_annotations_path = os.path.join( + str(amr_annotations), sample_id, f"{_id}_amr_annotations.tsv" ) - amr_genes_path = amr_genes.path / sample_id / f"{id}_amr_genes.fasta" - amr_proteins_path = ( - amr_proteins.path / sample_id / f"{id}_amr_proteins.fasta" + amr_genes_path = os.path.join( + str(amr_genes), sample_id, f"{_id}_amr_genes.fasta" ) - amr_all_mutations_path = ( - amr_all_mutations.path / sample_id / f"{id}_amr_all_mutations.tsv" + + amr_proteins_path = os.path.join( + str(amr_proteins), sample_id, f"{_id}_amr_proteins.fasta" + ) + amr_all_mutations_path = os.path.join( + str(amr_all_mutations), sample_id, f"{_id}_amr_all_mutations.tsv" ) # Run amrfinderplus diff --git a/q2_amrfinderplus/plugin_setup.py b/q2_amrfinderplus/plugin_setup.py index a14aa9c..5cc8e2c 100644 --- a/q2_amrfinderplus/plugin_setup.py +++ b/q2_amrfinderplus/plugin_setup.py @@ -34,9 +34,10 @@ version=__version__, website="https://github.com/bokulich-lab/q2-amrfinderplus", package="q2_amrfinderplus", - description="A plugin to find acquired antimicrobial resistance genes and point " - "mutations in protein and/or assembled nucleotide sequences with " - "NCBI-AMRFinderPlus.", + description=( + "A plugin to find acquired antimicrobial resistance genes and point mutations " + "in protein and/or assembled nucleotide sequences with NCBI-AMRFinderPlus." + ), short_description="AMR annotation.", citations=[], ) @@ -58,7 +59,9 @@ organisms = [ "Acinetobacter_baumannii", + "Acinetobacter", "Burkholderia_cepacia", + "Burkholderia_cepacia_complex", "Burkholderia_pseudomallei", "Campylobacter", "Citrobacter_freundii", @@ -68,12 +71,15 @@ "Enterococcus_faecalis", "Enterococcus_faecium", "Escherichia", + "Escherichia_coli_Shigella", + "Klebsiella", "Klebsiella_oxytoca", "Klebsiella_pneumoniae", "Neisseria_gonorrhoeae", "Neisseria_meningitidis", "Pseudomonas_aeruginosa", "Salmonella", + "Serratia", "Serratia_marcescens", "Staphylococcus_aureus", "Staphylococcus_pseudintermedius", @@ -83,11 +89,6 @@ "Vibrio_cholerae", "Vibrio_parahaemolyticus", "Vibrio_vulnificus", - "Acinetobacter", - "Burkholderia_cepacia_complex", - "Escherichia_coli_Shigella", - "Klebsiella", - "Serratia", ] @@ -147,64 +148,83 @@ } amrfinderplus_parameter_descriptions = { - "organism": "Taxon used for screening known resistance causing point mutations " - "and blacklisting of common, non-informative genes. Pathogen Detection " - "taxgroup names can also be used.", - "plus": "Provide results from 'Plus' genes such as virulence factors, " - "stress-response genes, etc.", - "report_all_equal": "Report all equally scoring BLAST and HMM matches. This " - "will report multiple lines for a single element if there " - "are multiple reference proteins that have the same score. " - "On those lines the fields Accession of closest sequence " - "and Name of closest sequence will be different showing " - "each of the database proteins that are equally close to " - "the query sequence.", - "ident_min": "Minimum identity for a blast-based hit (Methods BLAST or " - "PARTIAL). Setting this value to something other than -1 " - "will override curated similarity cutoffs. We only recommend " - "using this option if you have a specific reason.", - "curated_ident": "Use the curated threshold for a blast-based hit, if it " - "exists and 0.9 otherwise. This will overwrite the value specified with the " - "'ident_min' parameter.", - "coverage_min": "Minimum proportion of reference gene covered for a " - "BLAST-based hit (Methods BLAST or PARTIAL).", + "organism": ( + "Taxon used for screening known resistance causing point mutations and " + "blacklisting of common, non-informative genes. Pathogen Detection taxgroup " + "names can also be used." + ), + "plus": ( + "Provide results from 'Plus' genes such as virulence factors, stress-response " + "genes, etc." + ), + "report_all_equal": ( + "Report all equally scoring BLAST and HMM matches. This will report multiple " + "lines for a single element if there are multiple reference proteins that have " + "the same score. On those lines the fields Accession of closest sequence and " + "Name of closest sequence will be different showing each of the database " + "proteins that are equally close to the query sequence." + ), + "ident_min": ( + "Minimum identity for a blast-based hit (Methods BLAST or PARTIAL). Setting " + "this value to something other than -1 will override curated similarity " + "cutoffs. We only recommend using this option if you have a specific reason." + ), + "curated_ident": ( + "Use the curated threshold for a blast-based hit, if it exists and 0.9 " + "otherwise." + ), + "coverage_min": ( + "Minimum proportion of reference gene covered for a BLAST-based hit (Methods " + "BLAST or PARTIAL)." + ), "translation_table": "Translation table used for BLASTX.", + "annotation_format": ( + "Specify the format of the GFF file in the loci input. 'standart' refers to " + "NCBI resources such as GenBank and RefSeq." + ), "report_common": "Report proteins common to a taxonomy group.", - "threads": "The number of threads to use for processing. AMRFinderPlus " - "defaults to 4 on hosts with >= 4 cores. Setting this number higher" - " than the number of cores on the running host may cause blastp to " - "fail. Using more than 4 threads may speed up searches.", + "threads": ( + "The number of threads to use for processing. AMRFinderPlus defaults to 4 on " + "hosts with >= 4 cores. Setting this number higher than the number of cores on " + "the running host may cause blastp to fail. Using more than 4 threads may " + "speed up searches." + ), } amrfinderplus_output_descriptions = { "amr_annotations": "Annotated AMR genes and mutations.", - "amr_all_mutations": "Report of genotypes at all locations screened for point " - "mutations. These files allow you to distinguish between called " - "point mutations that were the sensitive variant and the point " - "mutations that could not be called because the sequence was not " - "found. This file will contain all detected variants from the " - "reference sequence, so it could be used as an initial screen for " - "novel variants. Note 'Gene symbols' for mutations not in the " - "database (identifiable by [UNKNOWN] in the Sequence name field) " - "have offsets that are relative to the start of the sequence " - "indicated in the field 'Accession of closest sequence' while " - "'Gene symbols' from known point-mutation sites have gene symbols " - "that match the Pathogen Detection Reference Gene Catalog " - "standardized nomenclature for point mutations.", - "amr_genes": "Sequences that were identified by AMRFinderPlus as AMR genes. " - "This will include the entire region that aligns to the references for " - "point mutations.", - "amr_proteins": "Protein Sequences that were identified by AMRFinderPlus as " - "AMR genes. This will include the entire region that aligns to the references " - "for point mutations.", + "amr_all_mutations": ( + "Report of genotypes at all locations screened for point mutations. These " + "files allow you to distinguish between called point mutations that were the " + "sensitive variant and the point mutations that could not be called because " + "the sequence was not found. This file will contain all detected variants from " + "the reference sequence, so it could be used as an initial screen for novel " + "variants. Note 'Gene symbols' for mutations not in the database (identifiable " + "by [UNKNOWN] in the Sequence name field) have offsets that are relative to " + "the start of the sequence indicated in the field 'Accession of closest " + "sequence' while 'Gene symbols' from known point-mutation sites have gene " + "symbols that match the Pathogen Detection Reference Gene Catalog standardized " + "nomenclature for point mutations." + ), + "amr_genes": ( + "Sequences that were identified by AMRFinderPlus as AMR genes. This will " + "include the entire region that aligns to the references for point mutations." + ), + "amr_proteins": ( + "Protein Sequences that were identified by AMRFinderPlus as AMR genes. This " + "will include the entire region that aligns to the references for point " + "mutations" + ), } amrfinderplus_input_descriptions = { "sequences": "MAGs or contigs to be annotated with AMRFinderPlus.", "proteins": "Protein sequences to be annotated with AMRFinderPlus.", - "loci": "GFF files to give sequence coordinates for proteins input. Required " - "for combined searches of protein and DNA sequences.", + "loci": ( + "GFF files to give sequence coordinates for proteins input. Required for " + "combined searches of protein and DNA sequences." + ), "amrfinderplus_db": "AMRFinderPlus Database.", } @@ -228,9 +248,10 @@ parameter_descriptions=amrfinderplus_parameter_descriptions, output_descriptions=amrfinderplus_output_descriptions, name="Annotate MAGs or contigs with AMRFinderPlus.", - description="Annotate sample data MAGs or contigs with antimicrobial resistance " - "genes with AMRFinderPlus. Check https://github.com/ncbi/amr/wiki for " - "documentation.", + description=( + "Annotate MAGs or contigs with antimicrobial resistance genes with " + "AMRFinderPlus. Check https://github.com/ncbi/amr/wiki for documentation." + ), citations=[citations["feldgarden2021amrfinderplus"]], ) diff --git a/q2_amrfinderplus/tests/data/contigs/sample1_contigs.fasta b/q2_amrfinderplus/tests/data/contigs/sample1_contigs.fasta new file mode 100644 index 0000000..e69de29 diff --git a/q2_amrfinderplus/tests/data/feature_data_mag/30ef72ed-84fd-4348-a418-9d68a9b88729.fasta b/q2_amrfinderplus/tests/data/feature_data_mag/30ef72ed-84fd-4348-a418-9d68a9b88729.fasta new file mode 100644 index 0000000..e69de29 diff --git a/q2_amrfinderplus/tests/data/loci_per_sample/sample1/genome1.gff b/q2_amrfinderplus/tests/data/loci_per_sample/sample1/genome1.gff new file mode 100644 index 0000000..e69de29 diff --git a/q2_amrfinderplus/tests/data/proteins/sample1.fasta b/q2_amrfinderplus/tests/data/proteins/sample1.fasta new file mode 100644 index 0000000..e69de29 diff --git a/q2_amrfinderplus/tests/data/proteins_per_sample/sample1/genome1.fasta b/q2_amrfinderplus/tests/data/proteins_per_sample/sample1/genome1.fasta new file mode 100644 index 0000000..e69de29 diff --git a/q2_amrfinderplus/tests/data/sample_data_mags/sample1/mag.fasta b/q2_amrfinderplus/tests/data/sample_data_mags/sample1/mag.fasta new file mode 100644 index 0000000..e69de29 diff --git a/q2_amrfinderplus/tests/test_utils.py b/q2_amrfinderplus/tests/test_utils.py index cfbe144..1e4d5f5 100644 --- a/q2_amrfinderplus/tests/test_utils.py +++ b/q2_amrfinderplus/tests/test_utils.py @@ -1,14 +1,17 @@ import os import subprocess -from io import StringIO -from pathlib import Path -from unittest.mock import MagicMock, call, mock_open, patch +from unittest.mock import call, patch from q2_types.feature_data_mag import MAGSequencesDirFmt -from q2_types.genome_data import ProteinsDirectoryFormat +from q2_types.genome_data import ( + GenesDirectoryFormat, + LociDirectoryFormat, + ProteinsDirectoryFormat, +) from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt from qiime2.plugin.testing import TestPluginBase +from q2_amrfinderplus.types import AMRFinderPlusAnnotationsDirFmt from q2_amrfinderplus.utils import ( EXTERNAL_CMD_WARNING, _create_empty_files, @@ -36,7 +39,7 @@ def test_run_command_verbose(self, mock_print, mock_subprocess_run): run_command(cmd, cwd=cwd, verbose=True) # Check if subprocess.run was called with the correct arguments - mock_subprocess_run.assert_called_once_with(cmd, check=True, cwd=cwd) + mock_subprocess_run.assert_called_once_with(cmd, check=True, cwd=cwd, stderr=-1) # Check if the correct print statements were called mock_print.assert_has_calls( @@ -58,7 +61,7 @@ def test_run_command_non_verbose(self, mock_print, mock_subprocess_run): run_command(cmd, cwd=cwd, verbose=False) # Check if subprocess.run was called with the correct arguments - mock_subprocess_run.assert_called_once_with(cmd, check=True, cwd=cwd) + mock_subprocess_run.assert_called_once_with(cmd, check=True, cwd=cwd, stderr=-1) # Ensure no print statements were made mock_print.assert_not_called() @@ -164,11 +167,15 @@ def test_run_amrfinderplus_analyse_minimal(self, mock_run_command): def test_run_amrfinderplus_analyse_exception_message(self, mock_run_command): # Simulate subprocess.CalledProcessError mock_run_command.side_effect = subprocess.CalledProcessError( - returncode=1, cmd="amrfinder" + returncode=1, + cmd="amrfinder", + stderr=b"Mock stderr message", ) # Call the function and assert the exception message - with self.assertRaises(Exception) as context: + with self.assertRaisesRegex( + Exception, "An error was encountered while running AMRFinderPlus" + ): _run_amrfinderplus_analyse( amrfinderplus_db="mock_db", dna_path=None, @@ -187,12 +194,34 @@ def test_run_amrfinderplus_analyse_exception_message(self, mock_run_command): amr_annotations_path="mock_annotations_path", ) - # Assert the correct exception message is raised - self.assertIn( - "An error was encountered while running AMRFinderPlus", - str(context.exception), + @patch("q2_amrfinderplus.utils.run_command") + def test_run_amrfinderplus_analyse_exception_gff_error(self, mock_run_command): + # Simulate subprocess.CalledProcessError + mock_run_command.side_effect = subprocess.CalledProcessError( + returncode=1, + cmd="amrfinder", + stderr=b"gff_check.cpp", ) - self.assertIn("(return code 1)", str(context.exception)) + + # Call the function and assert the exception message + with self.assertRaisesRegex(Exception, "GFF file error:"): + _run_amrfinderplus_analyse( + amrfinderplus_db="mock_db", + dna_path=None, + protein_path=None, + gff_path=None, + organism=None, + plus=False, + report_all_equal=False, + ident_min=None, + curated_ident=False, + coverage_min=0.5, + translation_table="11", + annotation_format="prodigal", + report_common=False, + threads=None, + amr_annotations_path="mock_annotations_path", + ) class TestValidateInputs(TestPluginBase): @@ -217,7 +246,7 @@ def test_loci_without_proteins(self): # Test when --i-mags and --i-proteins are given without --i-loci def test_mags_and_proteins_without_loci(self): with self.assertRaisesRegex( - ValueError, "can only be given in combination " 'with "--i-loci"' + ValueError, 'can only be given in combination with "--i-loci"' ): _validate_inputs( sequences=True, @@ -250,9 +279,7 @@ def test_missing_mags_and_proteins(self): def test_ident_min_and_curated_ident(self): with self.assertRaisesRegex( ValueError, - '"--p-ident-min" and ' - '"--p-curated-ident" cannot be used ' - "simultaneously", + '"--p-ident-min" and "--p-curated-ident" cannot be used simultaneously', ): _validate_inputs( sequences=True, @@ -268,7 +295,7 @@ def test_ident_min_and_curated_ident(self): # Test when --p-report-common is given but --p-plus or --p-organism is missing def test_report_common_without_plus_or_organism(self): with self.assertRaisesRegex( - ValueError, '"--p-report-common" requires ' '"--p-plus" and "--p-organism"' + ValueError, '"--p-report-common" requires "--p-plus" and "--p-organism"' ): _validate_inputs( sequences=True, @@ -285,168 +312,139 @@ def test_report_common_without_plus_or_organism(self): class TestGetFilePaths(TestPluginBase): package = "q2_amrfinderplus.tests" - @patch("os.path.exists") - def test_mags_with_proteins_and_loci(self, mock_exists): - # Mock the os.path.exists to simulate files existing - mock_exists.side_effect = [True, True] # First for protein, second for GFF + def test_mags_with_proteins_and_loci(self): + sequences = MAGSequencesDirFmt() + proteins = ProteinsDirectoryFormat( + self.get_data_path("proteins_per_sample"), "r" + ) + loci = LociDirectoryFormat(self.get_data_path("loci_per_sample"), "r") # Call the function with mags, proteins, and loci dna_path, protein_path, gff_path = _get_file_paths( - sequences=MagicMock(), - proteins=MagicMock(path=Path("proteins")), - loci=MagicMock(path=Path("loci")), - id="id", + sequences=sequences, + proteins=proteins, + loci=loci, + _id="genome1", sample_id="sample1", file_fp="dna_file.fasta", ) - # Assertions self.assertEqual(dna_path, "dna_file.fasta") - self.assertEqual(str(protein_path), "proteins/sample1/id.fasta") - self.assertEqual(str(gff_path), "loci/sample1/id.gff") + self.assertEqual(protein_path, f"{str(proteins)}/sample1/genome1.fasta") + self.assertEqual(str(gff_path), f"{str(loci)}/sample1/genome1.gff") def test_mags_without_proteins_and_loci(self): - # Call the function with mags, proteins, and loci + # Call the function with mags dna_path, protein_path, gff_path = _get_file_paths( - sequences=MagicMock(), + sequences=MAGSequencesDirFmt(), proteins=None, loci=None, - id="sample123", + _id="sample123", file_fp="dna_file.fasta", ) - # Assertions self.assertEqual(dna_path, "dna_file.fasta") self.assertEqual(protein_path, None) self.assertEqual(gff_path, None) - @patch("os.path.exists") - def test_mags_with_missing_protein(self, mock_exists): - # Mock os.path.exists to simulate the missing protein file - mock_exists.side_effect = [False] # Protein file does not exist - + def test_mags_with_missing_protein(self): # Call the function with mags and proteins, but no loci - with self.assertRaises(ValueError) as context: + with self.assertRaisesRegex( + ValueError, "Proteins file for ID 'sample123' is missing" + ): _get_file_paths( - sequences=MagicMock(), - proteins=MagicMock(), + sequences=MAGSequencesDirFmt(), + proteins=ProteinsDirectoryFormat(), loci=None, - id="sample123", + _id="sample123", sample_id="sample1", file_fp="dna_file.fasta", ) - # Check that the exception message contains the correct text - self.assertIn( - "Proteins file for ID 'sample123' is missing", str(context.exception) - ) - - @patch("os.path.exists") - def test_loci_with_missing_gff(self, mock_exists): - # Mock os.path.exists to simulate the protein file exists but GFF file is - # missing - mock_exists.side_effect = [False] # Protein exists, GFF is missing - + def test_loci_with_missing_gff(self): # Call the function with proteins and loci, but no mags - with self.assertRaises(ValueError) as context: + with self.assertRaisesRegex( + ValueError, "GFF file for ID 'sample123' is missing" + ): _get_file_paths( sequences=None, proteins=None, - loci=MagicMock(path=Path("/mock/loci/path")), - id="sample123", + loci=LociDirectoryFormat(), + _id="sample123", sample_id="sample1", file_fp="protein_file.fasta", ) - # Check that the exception message contains the correct text - self.assertIn("GFF file for ID 'sample123' is missing", str(context.exception)) - class TestCreateSampleDict(TestPluginBase): package = "q2_amrfinderplus.tests" - @patch.object( - MultiMAGSequencesDirFmt, "sample_dict", return_value={"sample1": "some_value"} - ) - def test_create_sample_dict_sequences_multimags(self, mock_sample_dict): - # Mock the sequences input as MultiMAGSequencesDirFmt - sequences = MultiMAGSequencesDirFmt() + def test_create_sample_dict_sequences_multimags(self): + dirpath = self.get_data_path("sample_data_mags") + sequences = MultiMAGSequencesDirFmt(dirpath, "r") - # Call the function result = _create_sample_dict(proteins=None, sequences=sequences) - # Check that sample_dict is called correctly - mock_sample_dict.assert_called_once() - - # Ensure the result is the mocked return value of sample_dict - self.assertEqual(result, {"sample1": "some_value"}) + self.assertEqual( + result, {"sample1": {"mag": os.path.join(dirpath, "sample1", "mag.fasta")}} + ) - @patch.object( - ContigSequencesDirFmt, "sample_dict", return_value={"contig_file": "file_path"} - ) - def test_create_sample_dict_sequences_contigs(self, mock_sample_dict): - # Mock the sequences input as ContigSequencesDirFmt - sequences = ContigSequencesDirFmt() + def test_create_sample_dict_sequences_contigs(self): + dirpath = self.get_data_path("contigs") + sequences = ContigSequencesDirFmt(dirpath, "r") - # Call the function result = _create_sample_dict(proteins=None, sequences=sequences) - # Check that sample_dict is called correctly - mock_sample_dict.assert_called_once() - - # Ensure the result has a fake sample key with the file_dict - self.assertEqual(result, {"": {"contig_file": "file_path"}}) + self.assertEqual( + result, {"": {"sample1": os.path.join(dirpath, "sample1_contigs.fasta")}} + ) - @patch.object( - MAGSequencesDirFmt, "feature_dict", return_value={"feature_file": "file_path"} - ) - def test_create_sample_dict_sequences_mag(self, mock_feature_dict): - # Mock the sequences input as MAGSequencesDirFmt - sequences = MAGSequencesDirFmt() + def test_create_sample_dict_sequences_mag(self): + dirpath = self.get_data_path("feature_data_mag") + sequences = MAGSequencesDirFmt(dirpath, "r") - # Call the function result = _create_sample_dict(proteins=None, sequences=sequences) - # Check that feature_dict is called correctly - mock_feature_dict.assert_called_once() - - # Ensure the result has a fake sample key with the feature_dict - self.assertEqual(result, {"": {"feature_file": "file_path"}}) - - def test_create_sample_dict_proteins_sample_data(self): - proteins = ProteinsDirectoryFormat() + self.assertEqual( + result, + { + "": { + "30ef72ed-84fd-4348-a418-9d68a9b88729": os.path.join( + dirpath, "30ef72ed-84fd-4348-a418-9d68a9b88729.fasta" + ) + } + }, + ) - os.mkdir(proteins.path / "directory") - with open(proteins.path / "directory" / "file.fasta", "w"): - pass + def test_create_sample_dict_proteins(self): + dirpath = self.get_data_path("proteins") + proteins = ProteinsDirectoryFormat(dirpath, "r") result = _create_sample_dict(proteins=proteins, sequences=None) self.assertEqual( - result, - {"directory": {"file": str(proteins.path / "directory" / "file.fasta")}}, + result, {"": {"sample1": os.path.join(dirpath, "sample1.fasta")}} ) - def test_create_sample_dict_proteins_feature_data(self): - proteins = ProteinsDirectoryFormat() - - with open(proteins.path / "file.fasta", "w"): - pass + def test_create_sample_dict_proteins_per_sample(self): + dirpath = self.get_data_path("proteins_per_sample") + proteins = ProteinsDirectoryFormat(dirpath, "r") result = _create_sample_dict(proteins=proteins, sequences=None) - self.assertEqual(result, {"": {"file": str(proteins.path / "file.fasta")}}) + self.assertEqual( + result, + {"sample1": {"genome1": os.path.join(dirpath, "sample1", "genome1.fasta")}}, + ) class TestCreateEmptyFiles(TestPluginBase): package = "q2_amrfinderplus.tests" - @patch("builtins.open", new_callable=mock_open) - @patch("sys.stdout", new_callable=StringIO) - def test_create_empty_files_all_false(self, mock_stdout, mock_open_file): - amr_genes = MagicMock(path=Path("path/amr_genes")) - amr_proteins = MagicMock(path=Path("path/amr_proteins")) - amr_all_mutations = MagicMock(path=Path("path/amr_all_mutations")) + def test_create_empty_files_all_false(self): + amr_genes = GenesDirectoryFormat() + amr_proteins = ProteinsDirectoryFormat() + amr_all_mutations = AMRFinderPlusAnnotationsDirFmt() _create_empty_files( sequences=False, @@ -456,28 +454,19 @@ def test_create_empty_files_all_false(self, mock_stdout, mock_open_file): amr_proteins=amr_proteins, amr_all_mutations=amr_all_mutations, ) - - # Assertions for file creation - mock_open_file.assert_any_call(Path("path/amr_genes/empty.fasta"), "w") - mock_open_file.assert_any_call(Path("path/amr_proteins/empty.fasta"), "w") - mock_open_file.assert_any_call( - Path("path/amr_all_mutations/empty_amr_all_mutations.tsv"), "w" + # Assert that all empty files were created + self.assertTrue(os.path.exists(os.path.join(str(amr_genes), "empty.fasta"))) + self.assertTrue(os.path.exists(os.path.join(str(amr_proteins), "empty.fasta"))) + self.assertTrue( + os.path.exists( + os.path.join(str(amr_all_mutations), "empty_amr_all_mutations.tsv") + ) ) - self.assertEqual(mock_open_file.call_count, 3) - - # Capture printed output - printed_output = mock_stdout.getvalue() - - # Assertions for print statements by checking keywords - self.assertIn("amr_genes", printed_output) - self.assertIn("amr_proteins", printed_output) - self.assertIn("amr_all_mutations", printed_output) - @patch("builtins.open", new_callable=mock_open) - def test_create_empty_files_all_true(self, mock_open_file): - amr_genes = MagicMock(path=Path("path/amr_genes")) - amr_proteins = MagicMock(path=Path("path/amr_proteins")) - amr_all_mutations = MagicMock(path=Path("path/amr_all_mutations")) + def test_create_empty_files_all_true(self): + amr_genes = GenesDirectoryFormat() + amr_proteins = ProteinsDirectoryFormat() + amr_all_mutations = AMRFinderPlusAnnotationsDirFmt() _create_empty_files( sequences=True, @@ -487,20 +476,24 @@ def test_create_empty_files_all_true(self, mock_open_file): amr_proteins=amr_proteins, amr_all_mutations=amr_all_mutations, ) - - # Assertions - mock_open_file.assert_not_called() + # Assert that no empty files were created + self.assertFalse(os.path.exists(os.path.join(str(amr_genes), "empty.fasta"))) + self.assertFalse(os.path.exists(os.path.join(str(amr_proteins), "empty.fasta"))) + self.assertFalse( + os.path.exists( + os.path.join(str(amr_all_mutations), "empty_amr_all_mutations.tsv") + ) + ) class TestCreateSampleDirs(TestPluginBase): package = "q2_amrfinderplus.tests" - @patch("os.makedirs") - def test_create_sample_dirs_all_exist(self, mock_makedirs): - amr_annotations = MagicMock(path=Path("/fake/path/amr_annotations")) - amr_genes = MagicMock(path=Path("/fake/path/amr_genes")) - amr_proteins = MagicMock(path=Path("/fake/path/amr_proteins")) - amr_all_mutations = MagicMock(path=Path("/fake/path/amr_all_mutations")) + def test_create_sample_dirs_all_exist(self): + amr_annotations = AMRFinderPlusAnnotationsDirFmt() + amr_genes = GenesDirectoryFormat() + amr_proteins = ProteinsDirectoryFormat() + amr_all_mutations = AMRFinderPlusAnnotationsDirFmt() _create_sample_dirs( sequences=True, @@ -513,24 +506,13 @@ def test_create_sample_dirs_all_exist(self, mock_makedirs): sample_id="sample1", ) - # Assertions - mock_makedirs.assert_any_call( - Path("/fake/path/amr_annotations/sample1"), exist_ok=True - ) - mock_makedirs.assert_any_call( - Path("/fake/path/amr_genes/sample1"), exist_ok=True - ) - mock_makedirs.assert_any_call( - Path("/fake/path/amr_proteins/sample1"), exist_ok=True - ) - mock_makedirs.assert_any_call( - Path("/fake/path/amr_all_mutations/sample1"), exist_ok=True - ) - self.assertEqual(mock_makedirs.call_count, 4) + self.assertTrue(os.path.exists(os.path.join(str(amr_annotations), "sample1"))) + self.assertTrue(os.path.exists(os.path.join(str(amr_genes), "sample1"))) + self.assertTrue(os.path.exists(os.path.join(str(amr_proteins), "sample1"))) + self.assertTrue(os.path.exists(os.path.join(str(amr_all_mutations), "sample1"))) - @patch("os.makedirs") - def test_create_sample_dirs_nothing(self, mock_makedirs): - amr_annotations = MagicMock(path=Path("/fake/path/amr_annotations")) + def test_create_sample_dirs_nothing(self): + amr_annotations = AMRFinderPlusAnnotationsDirFmt() _create_sample_dirs( sequences=False, @@ -543,11 +525,7 @@ def test_create_sample_dirs_nothing(self, mock_makedirs): sample_id="sample1", ) - # Assertions - mock_makedirs.assert_any_call( - Path("/fake/path/amr_annotations/sample1"), exist_ok=True - ) - self.assertEqual(mock_makedirs.call_count, 1) + self.assertTrue(os.path.exists(os.path.join(str(amr_annotations), "sample1"))) class TestColorify(TestPluginBase): diff --git a/q2_amrfinderplus/utils.py b/q2_amrfinderplus/utils.py index c8c0ec9..1114f87 100644 --- a/q2_amrfinderplus/utils.py +++ b/q2_amrfinderplus/utils.py @@ -2,7 +2,6 @@ import subprocess from q2_types.feature_data_mag import MAGSequencesDirFmt -from q2_types.genome_data import ProteinsDirectoryFormat from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt EXTERNAL_CMD_WARNING = ( @@ -19,7 +18,7 @@ def run_command(cmd, cwd=None, verbose=True): print(EXTERNAL_CMD_WARNING) print("\nCommand:", end=" ") print(" ".join(cmd), end="\n\n") - subprocess.run(cmd, check=True, cwd=cwd) + subprocess.run(cmd, check=True, cwd=cwd, stderr=subprocess.PIPE) def _validate_inputs( @@ -81,7 +80,7 @@ def _run_amrfinderplus_analyse( "--database", str(amrfinderplus_db), "-o", - str(amr_annotations_path), + amr_annotations_path, "--print_node", ] # Creates nucleotide fasta output if DNA sequences are given as input @@ -89,9 +88,9 @@ def _run_amrfinderplus_analyse( cmd.extend( [ "-n", - str(dna_path), + dna_path, "--nucleotide_output", - str(amr_genes_path), + amr_genes_path, ] ) # Creates protein fasta output if protein sequences are given as input @@ -99,13 +98,13 @@ def _run_amrfinderplus_analyse( cmd.extend( [ "-p", - str(protein_path), + protein_path, "--protein_output", - str(amr_proteins_path), + amr_proteins_path, ] ) if gff_path: - cmd.extend(["-g", str(gff_path)]) + cmd.extend(["-g", gff_path]) if threads: cmd.extend(["--threads", str(threads)]) # Creates all mutations output if an organism is specified @@ -115,7 +114,7 @@ def _run_amrfinderplus_analyse( "--organism", organism, "--mutation_all", - str(amr_all_mutations_path), + amr_all_mutations_path, ] ) if plus: @@ -146,11 +145,22 @@ def _run_amrfinderplus_analyse( try: run_command(cmd=cmd) except subprocess.CalledProcessError as e: - raise Exception( - "An error was encountered while running AMRFinderPlus, " - f"(return code {e.returncode}), please inspect " - "stdout and stderr to learn more." - ) + print(e.stderr.decode("utf-8")) + if "gff_check.cpp" in e.stderr.decode("utf-8"): + raise Exception( + "GFF file error: Either there is data missing in one GFF file or an " + "incorrect GFF input format was specified with the parameter " + "'--p-annotation-format'. Please check https://github.com/ncbi/amr/wiki" + "/Running-AMRFinderPlus#input-file-formats for documentation and " + "choose the correct GFF input format. Please inspect stdout and stderr " + "to learn more." + ) + else: + raise Exception( + "An error was encountered while running AMRFinderPlus, " + f"(return code {e.returncode}), please inspect stdout and stderr to " + "learn more." + ) def _create_empty_files( @@ -159,7 +169,7 @@ def _create_empty_files( # Creates empty files in output artifacts amr_genes, amr_proteins and # amr_all_mutations because artifacts can not be empty if not sequences: - with open(amr_genes.path / "empty.fasta", "w"): + with open(os.path.join(str(amr_genes), "empty.fasta"), "w"): pass print( colorify( @@ -169,7 +179,7 @@ def _create_empty_files( ) if not proteins: - with open(amr_proteins.path / "empty.fasta", "w"): + with open(os.path.join(str(amr_proteins), "empty.fasta"), "w"): pass print( colorify( @@ -179,7 +189,9 @@ def _create_empty_files( ) if not organism: - with open(amr_all_mutations.path / "empty_amr_all_mutations.tsv", "w"): + with open( + os.path.join(str(amr_all_mutations), "empty_amr_all_mutations.tsv"), "w" + ): pass print( colorify( @@ -199,13 +211,13 @@ def _create_sample_dirs( amr_all_mutations, sample_id, ): - os.makedirs(amr_annotations.path / sample_id, exist_ok=True) + os.makedirs(os.path.join(str(amr_annotations), sample_id), exist_ok=True) if sequences: - os.makedirs(amr_genes.path / sample_id, exist_ok=True) + os.makedirs(os.path.join(str(amr_genes), sample_id), exist_ok=True) if proteins: - os.makedirs(amr_proteins.path / sample_id, exist_ok=True) + os.makedirs(os.path.join(str(amr_proteins), sample_id), exist_ok=True) if organism: - os.makedirs(amr_all_mutations.path / sample_id, exist_ok=True) + os.makedirs(os.path.join(str(amr_all_mutations), sample_id), exist_ok=True) def _create_sample_dict(proteins, sequences): @@ -227,41 +239,32 @@ def _create_sample_dict(proteins, sequences): sample_dict = {"": file_dict} else: - proteins.pathspec = r".+\.(fa|faa|fasta)$" - - # Monkey patch the sample_dict instance method of MultiMAGSequencesDirFmt to - # ProteinsDirectoryFormat if it has a sample data dir structure + # For GenomeData[Proteins] with per sample directory structure if any(item.is_dir() for item in proteins.path.iterdir()): - proteins.sample_dict = MultiMAGSequencesDirFmt.sample_dict.__get__( - proteins, ProteinsDirectoryFormat - ) - sample_dict = proteins.sample_dict() - # Monkey patch the feature_dict instance method of MAGSequencesDirFmt to - # ProteinsDirectoryFormat if it has a feature data dir structure + sample_dict = proteins.genome_dict() + + # For GenomeData[Proteins] with no per sample directory structure else: - proteins.feature_dict = MAGSequencesDirFmt.feature_dict.__get__( - proteins, ProteinsDirectoryFormat - ) - file_dict = proteins.feature_dict() - # create sample_dict with fake sample + file_dict = proteins.genome_dict() + # Create sample_dict with fake sample sample_dict = {"": file_dict} return sample_dict -def _get_file_paths(sequences, proteins, loci, id, file_fp, sample_id=""): +def _get_file_paths(sequences, proteins, loci, _id, file_fp, sample_id=""): # If mags is provided if sequences: dna_path = file_fp # If proteins are provided, construct the expected protein file path. if proteins: - protein_path = proteins.path / sample_id / f"{id}.fasta" + protein_path = os.path.join(str(proteins), sample_id, f"{_id}.fasta") # Raise an error if the expected protein file does not exist. if not os.path.exists(protein_path): raise ValueError( - f"Proteins file for ID '{id}' is missing in proteins input." + f"Proteins file for ID '{_id}' is missing in proteins input." ) else: protein_path = None @@ -273,11 +276,11 @@ def _get_file_paths(sequences, proteins, loci, id, file_fp, sample_id=""): # If loci are provided, construct the expected GFF file path. if loci: - gff_path = loci.path / sample_id / f"{id}.gff" + gff_path = os.path.join(str(loci), sample_id, f"{_id}.gff") # Raise an error if the expected GFF file does not exist. if not os.path.exists(gff_path): - raise ValueError(f"GFF file for ID '{id}' is missing in loci input.") + raise ValueError(f"GFF file for ID '{_id}' is missing in loci input.") else: gff_path = None diff --git a/setup.py b/setup.py index 58c9b2e..3ca0003 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,7 @@ package_data={ "q2_amrfinderplus": ["citations.bib"], "q2_amrfinderplus.types.tests": ["data/*" "data/*/*" "data/*/*/*"], + "q2_amrfinderplus.tests": ["data/*" "data/*/*" "data/*/*/*"], }, zip_safe=False, )