From d149b14f732d816362d097c2a292438998e53571 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 13 Sep 2024 15:31:25 +0200 Subject: [PATCH 1/8] sample data mags with tests Created from https://github.com/caporaso-lab/cookiecutter-qiime2-plugin. See https://develop.qiime2.org to learn more. From 9e2f584b205a642c5ae3b67bed35aa0b22555b78 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Mon, 23 Sep 2024 13:57:09 +0200 Subject: [PATCH 2/8] added error when proteins are not ordered in samples --- q2_amrfinderplus/sample_data.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/q2_amrfinderplus/sample_data.py b/q2_amrfinderplus/sample_data.py index eb22ab9..84f70a9 100644 --- a/q2_amrfinderplus/sample_data.py +++ b/q2_amrfinderplus/sample_data.py @@ -56,9 +56,9 @@ def annotate_sample_data_amrfinderplus( amr_proteins = ProteinsDirectoryFormat() frequency_list = [] - # Create iterator for samples with sample_dict + # Create sample dict with sample_dict if mags: - sample_iterator = mags.sample_dict().items() + sample_dict = mags.sample_dict() else: # Monkey patch the sample_dict instance method of MultiMAGSequencesDirFmt to # ProteinsDirectoryFormat because it should have the same per sample structure @@ -66,10 +66,18 @@ def annotate_sample_data_amrfinderplus( proteins.sample_dict = MultiMAGSequencesDirFmt.sample_dict.__get__( proteins, ProteinsDirectoryFormat ) - sample_iterator = proteins.sample_dict().items() + sample_dict = proteins.sample_dict() + + # ProteinsDirectoryFormat has no predefined dir structure and can be just files + # without sample directories + if len(sample_dict) == 0: + raise ValueError( + "The files in the GenomeData[Proteins] input should be " + "organised in per-sample directories." 
+ ) # Iterate over paths of MAGs - for sample_id, files_dict in sample_iterator: + for sample_id, files_dict in sample_dict.items(): # Create sample directories in output directories os.mkdir(f"{amr_annotations}/{sample_id}") if mags: @@ -117,6 +125,7 @@ def annotate_sample_data_amrfinderplus( feature_table = create_count_table(df_list=frequency_list) + # Create empty files if needed if not mags: with open(os.path.join(str(amr_genes), "empty.fasta"), "w"): pass From d937e745f4a08a38ff783981b56ddc8adfc461db Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 24 Sep 2024 16:50:50 +0200 Subject: [PATCH 3/8] one action combined with tests --- q2_amrfinderplus/annotate.py | 129 +++++++ q2_amrfinderplus/plugin_setup.py | 21 +- q2_amrfinderplus/sample_data.py | 181 --------- q2_amrfinderplus/tests/test_annotate.py | 47 +++ q2_amrfinderplus/tests/test_database.py | 10 +- q2_amrfinderplus/tests/test_sample_data.py | 232 ------------ q2_amrfinderplus/tests/test_utils.py | 343 +++++++++++++++++- q2_amrfinderplus/types/_type.py | 5 +- .../tests/test_types_formats_transformers.py | 9 +- q2_amrfinderplus/utils.py | 167 +++++++-- 10 files changed, 665 insertions(+), 479 deletions(-) create mode 100644 q2_amrfinderplus/annotate.py delete mode 100644 q2_amrfinderplus/sample_data.py create mode 100644 q2_amrfinderplus/tests/test_annotate.py delete mode 100644 q2_amrfinderplus/tests/test_sample_data.py diff --git a/q2_amrfinderplus/annotate.py b/q2_amrfinderplus/annotate.py new file mode 100644 index 0000000..e5e8fa5 --- /dev/null +++ b/q2_amrfinderplus/annotate.py @@ -0,0 +1,129 @@ +from typing import Union + +from q2_types.feature_data_mag import MAGSequencesDirFmt +from q2_types.genome_data import ( + GenesDirectoryFormat, + LociDirectoryFormat, + ProteinsDirectoryFormat, +) +from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt + +from q2_amrfinderplus.types import ( + AMRFinderPlusAnnotationsDirFmt, + AMRFinderPlusDatabaseDirFmt, +) +from 
q2_amrfinderplus.utils import ( + _create_empty_files, + _create_sample_dict, + _create_sample_dirs, + _get_file_paths, + _run_amrfinderplus_analyse, + _validate_inputs, +) + + +def annotate( + amrfinderplus_db: AMRFinderPlusDatabaseDirFmt, + sequences: Union[ + MultiMAGSequencesDirFmt, ContigSequencesDirFmt, MAGSequencesDirFmt + ] = None, + proteins: ProteinsDirectoryFormat = None, + loci: LociDirectoryFormat = None, + organism: str = None, + plus: bool = False, + report_all_equal: bool = False, + ident_min: float = None, + curated_ident: bool = False, + coverage_min: float = 0.5, + translation_table: str = "11", + annotation_format: str = "prodigal", + report_common: bool = False, + threads: int = None, +) -> ( + AMRFinderPlusAnnotationsDirFmt, + AMRFinderPlusAnnotationsDirFmt, + GenesDirectoryFormat, + ProteinsDirectoryFormat, +): + # Validate input and parameter combinations + _validate_inputs( + sequences, + loci, + proteins, + ident_min, + curated_ident, + report_common, + plus, + organism, + ) + + # Set up common parameters for _run_amrfinderplus_analyse + common_params = locals().copy() + del common_params["sequences"] + del common_params["proteins"] + del common_params["loci"] + + # Innit output formats + amr_annotations = AMRFinderPlusAnnotationsDirFmt() + amr_all_mutations = AMRFinderPlusAnnotationsDirFmt() + amr_genes = GenesDirectoryFormat() + amr_proteins = ProteinsDirectoryFormat() + + # Create sample_dict to iterate over input files + sample_dict = _create_sample_dict(proteins, sequences) + + # Iterate over sample_dict + for sample_id, files_dict in sample_dict.items(): + # Create sample directories in output directories + _create_sample_dirs( + sequences, + proteins, + organism, + amr_annotations, + amr_genes, + amr_proteins, + amr_all_mutations, + sample_id, + ) + + for id, file_fp in files_dict.items(): + # Construct and validate file input paths for amrfinderplus + dna_path, protein_path, gff_path = _get_file_paths( + sequences, + proteins, + 
loci, + id, + file_fp, + sample_id, + ) + + # Define paths for output files + amr_annotations_path = ( + amr_annotations.path / sample_id / f"{id}_amr_annotations.tsv" + ) + amr_genes_path = (amr_genes.path / sample_id / f"{id}_amr_genes.fasta",) + amr_proteins_path = ( + amr_proteins.path / sample_id / f"{id}_amr_proteins.fasta" + ) + amr_all_mutations_path = ( + amr_all_mutations.path / sample_id / f"{id}_amr_all_mutations.tsv" + ) + + # Run amrfinderplus + _run_amrfinderplus_analyse( + dna_path=dna_path, + protein_path=protein_path, + gff_path=gff_path, + amr_annotations_path=amr_annotations_path, + amr_genes_path=amr_genes_path, + amr_proteins_path=amr_proteins_path, + amr_all_mutations_path=amr_all_mutations_path, + **common_params, + ) + + # Create empty files for empty output artifacts if needed + _create_empty_files( + sequences, proteins, organism, amr_genes, amr_proteins, amr_all_mutations + ) + + return amr_annotations, amr_all_mutations, amr_genes, amr_proteins diff --git a/q2_amrfinderplus/plugin_setup.py b/q2_amrfinderplus/plugin_setup.py index 37496f6..bb235a2 100644 --- a/q2_amrfinderplus/plugin_setup.py +++ b/q2_amrfinderplus/plugin_setup.py @@ -5,16 +5,17 @@ # # The full license is in the file LICENSE, distributed with this software. 
# ---------------------------------------------------------------------------- -from q2_types.feature_table import FeatureTable, PresenceAbsence +from q2_types.feature_data import FeatureData +from q2_types.feature_data_mag import MAG from q2_types.genome_data import Genes, GenomeData, Loci, Proteins -from q2_types.per_sample_sequences import MAGs +from q2_types.per_sample_sequences import Contigs, MAGs from q2_types.sample_data import SampleData from qiime2.core.type import Bool, Choices, Float, Int, Range, Str from qiime2.plugin import Citations, Plugin from q2_amrfinderplus import __version__ +from q2_amrfinderplus.annotate import annotate from q2_amrfinderplus.database import fetch_amrfinderplus_db -from q2_amrfinderplus.sample_data import annotate_sample_data_amrfinderplus from q2_amrfinderplus.types._format import ( AMRFinderPlusAnnotationFormat, AMRFinderPlusAnnotationsDirFmt, @@ -194,12 +195,11 @@ "amr_proteins": "Protein Sequences that were identified by AMRFinderPlus as " "AMR genes. This will include the entire region that aligns to the references " "for point mutations.", - "feature_table": "Presence/Absence table of ARGs in all samples.", } amrfinderplus_input_descriptions = { - "mags": "MAGs to be annotated with AMRFinderPlus.", + "sequences": "MAGs or contigs to be annotated with AMRFinderPlus.", "proteins": "Protein sequences to be annotated with AMRFinderPlus.", "loci": "GFF files to give sequence coordinates for proteins input. 
Required " "for combined searches of protein and DNA sequences.", @@ -208,20 +208,19 @@ plugin.methods.register_function( - function=annotate_sample_data_amrfinderplus, + function=annotate, inputs={ - "mags": SampleData[MAGs], + "sequences": SampleData[MAGs | Contigs] | FeatureData[MAG], "proteins": GenomeData[Proteins], "loci": GenomeData[Loci], "amrfinderplus_db": AMRFinderPlusDatabase, }, parameters=amrfinderplus_parameters, outputs=[ - ("amr_annotations", SampleData[AMRFinderPlusAnnotations]), - ("amr_all_mutations", SampleData[AMRFinderPlusAnnotations]), + ("amr_annotations", GenomeData[AMRFinderPlusAnnotations]), + ("amr_all_mutations", GenomeData[AMRFinderPlusAnnotations]), ("amr_genes", GenomeData[Genes]), ("amr_proteins", GenomeData[Proteins]), - ("feature_table", FeatureTable[PresenceAbsence]), ], input_descriptions=amrfinderplus_input_descriptions, parameter_descriptions=amrfinderplus_parameter_descriptions, @@ -239,7 +238,7 @@ artifact_format=AMRFinderPlusDatabaseDirFmt, ) plugin.register_semantic_type_to_format( - SampleData[AMRFinderPlusAnnotations], + GenomeData[AMRFinderPlusAnnotations], artifact_format=AMRFinderPlusAnnotationsDirFmt, ) diff --git a/q2_amrfinderplus/sample_data.py b/q2_amrfinderplus/sample_data.py deleted file mode 100644 index 84f70a9..0000000 --- a/q2_amrfinderplus/sample_data.py +++ /dev/null @@ -1,181 +0,0 @@ -import os -from functools import reduce - -import pandas as pd -from q2_types.genome_data import ( - GenesDirectoryFormat, - LociDirectoryFormat, - ProteinsDirectoryFormat, -) -from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt - -from q2_amrfinderplus.types import ( - AMRFinderPlusAnnotationsDirFmt, - AMRFinderPlusDatabaseDirFmt, -) -from q2_amrfinderplus.utils import _validate_inputs, run_amrfinderplus_analyse - - -def annotate_sample_data_amrfinderplus( - amrfinderplus_db: AMRFinderPlusDatabaseDirFmt, - mags: MultiMAGSequencesDirFmt = None, - proteins: ProteinsDirectoryFormat = None, - loci: 
LociDirectoryFormat = None, - organism: str = None, - plus: bool = False, - report_all_equal: bool = False, - ident_min: float = None, - curated_ident: bool = False, - coverage_min: float = 0.5, - translation_table: str = "11", - annotation_format: str = "prodigal", - report_common: bool = False, - threads: int = None, -) -> ( - AMRFinderPlusAnnotationsDirFmt, - AMRFinderPlusAnnotationsDirFmt, - GenesDirectoryFormat, - ProteinsDirectoryFormat, - pd.DataFrame, -): - # Validate input and parameter combinations - _validate_inputs( - mags, loci, proteins, ident_min, curated_ident, report_common, plus, organism - ) - - # Set up common parameters for run_amrfinderplus_analyse - common_params = locals().copy() - del common_params["mags"] - del common_params["proteins"] - del common_params["loci"] - - # Innit output formats - amr_annotations = AMRFinderPlusAnnotationsDirFmt() - amr_all_mutations = AMRFinderPlusAnnotationsDirFmt() - amr_genes = GenesDirectoryFormat() - amr_proteins = ProteinsDirectoryFormat() - frequency_list = [] - - # Create sample dict with sample_dict - if mags: - sample_dict = mags.sample_dict() - else: - # Monkey patch the sample_dict instance method of MultiMAGSequencesDirFmt to - # ProteinsDirectoryFormat because it should have the same per sample structure - proteins.pathspec = r".+\.(fa|faa|fasta)$" - proteins.sample_dict = MultiMAGSequencesDirFmt.sample_dict.__get__( - proteins, ProteinsDirectoryFormat - ) - sample_dict = proteins.sample_dict() - - # ProteinsDirectoryFormat has no predefined dir structure and can be just files - # without sample directories - if len(sample_dict) == 0: - raise ValueError( - "The files in the GenomeData[Proteins] input should be " - "organised in per-sample directories." 
- ) - - # Iterate over paths of MAGs - for sample_id, files_dict in sample_dict.items(): - # Create sample directories in output directories - os.mkdir(f"{amr_annotations}/{sample_id}") - if mags: - os.mkdir(f"{amr_genes}/{sample_id}") - if proteins: - os.mkdir(f"{amr_proteins}/{sample_id}") - if organism: - os.mkdir(f"{amr_all_mutations}/{sample_id}") - - for mag_id, file_fp in files_dict.items(): - # Run amrfinderplus - run_amrfinderplus_analyse( - # dna_sequences path is the mag full path if mags are specified and - # None if no mags are specifies - dna_sequences=file_fp if mags else None, - # protein_sequences path is constructed if mags and proteins are - # specified, the mag full path is used when only proteins is specified. - # If only mags are specified and not proteins, the path is None. - protein_sequences=proteins.path / sample_id / f"{mag_id}.fasta" - if mags and proteins - else file_fp - if not mags - else None, - gff=loci.path / sample_id / f"{mag_id}.gff" if loci else None, - amr_annotations_path=amr_annotations.path - / sample_id - / f"{mag_id}_amr_annotations.tsv", - amr_genes_path=amr_genes.path / sample_id / f"{mag_id}_amr_genes.fasta", - amr_proteins_path=amr_proteins.path - / sample_id - / f"{mag_id}_amr_proteins.fasta", - amr_all_mutations_path=amr_all_mutations.path - / sample_id - / f"{mag_id}_amr_all_mutations.tsv", - **common_params, - ) - - # Create frequency dataframe and append it to list - frequency_df = read_in_txt( - path=amr_annotations.path / sample_id / f"{mag_id}_amr_annotations.tsv", - sample_mag_id=f"{sample_id}/{mag_id}", - column_name="Gene symbol", - ) - frequency_list.append(frequency_df) - - feature_table = create_count_table(df_list=frequency_list) - - # Create empty files if needed - if not mags: - with open(os.path.join(str(amr_genes), "empty.fasta"), "w"): - pass - - if not proteins: - with open(os.path.join(str(amr_proteins), "empty.fasta"), "w"): - pass - - if not organism: - with open( - 
os.path.join(str(amr_all_mutations), "empty_amr_all_mutations.tsv"), "w" - ): - pass - - return amr_annotations, amr_all_mutations, amr_genes, amr_proteins, feature_table - - -def read_in_txt(path: str, sample_mag_id: str, column_name: str): - # Read in txt file to pd.Dataframe - df = pd.read_csv(path, sep="\t") - - # Process the df, create count table - df = df[column_name].value_counts().reset_index() - df.columns = [column_name, sample_mag_id] - - df = df.astype(str) - return df - - -def create_count_table(df_list: list) -> pd.DataFrame: - # Remove all empty lists from df_list - df_list = [df for df in df_list if not df.empty] - - # Raise ValueError if df_list is empty. This happens when no ARGs were detected - if not df_list: - raise ValueError( - "No AMR genes could be identified and no output can be created." - ) - - # Merge all dfs contained in df_list - df = reduce( - lambda left, right: pd.merge(left, right, on=left.columns[0], how="outer"), - df_list, - ) - - # Process the df to meet all requirements for a FeatureTable - df = df.transpose() - df = df.fillna(0) - df.columns = df.iloc[0] - df = df.drop(df.index[0]) - df.columns.name = None - df.index.name = "sample_id" - return df diff --git a/q2_amrfinderplus/tests/test_annotate.py b/q2_amrfinderplus/tests/test_annotate.py new file mode 100644 index 0000000..ec92ecf --- /dev/null +++ b/q2_amrfinderplus/tests/test_annotate.py @@ -0,0 +1,47 @@ +from unittest.mock import patch + +from q2_types.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat +from qiime2.plugin.testing import TestPluginBase + +from q2_amrfinderplus.annotate import annotate +from q2_amrfinderplus.types import ( + AMRFinderPlusAnnotationsDirFmt, + AMRFinderPlusDatabaseDirFmt, +) + + +class TestAnnotate(TestPluginBase): + package = "q2_amrfinderplus.tests" + + @patch("q2_amrfinderplus.annotate._validate_inputs") + @patch( + "q2_amrfinderplus.annotate._create_sample_dict", + return_value={"sample1": {"id1": "file_path"}}, + ) + 
@patch("q2_amrfinderplus.annotate._create_sample_dirs") + @patch( + "q2_amrfinderplus.annotate._get_file_paths", + return_value=("dna_path", "protein_path", "gff_path"), + ) + @patch("q2_amrfinderplus.annotate._run_amrfinderplus_analyse") + @patch("q2_amrfinderplus.annotate._create_empty_files") + def test_annotate( + self, + mock_create_empty_files, + mock_run_amrfinderplus, + mock_get_file_paths, + mock_create_sample_dirs, + mock_create_sample_dict, + mock_validate_inputs, + ): + # Create mock for the AMRFinderPlusDatabaseDirFmt input + amrfinderplus_db = AMRFinderPlusDatabaseDirFmt() + + # Call the function with mostly default inputs + result = annotate(amrfinderplus_db) + + # Ensure the output is the correct types + self.assertIsInstance(result[0], AMRFinderPlusAnnotationsDirFmt) + self.assertIsInstance(result[1], AMRFinderPlusAnnotationsDirFmt) + self.assertIsInstance(result[2], GenesDirectoryFormat) + self.assertIsInstance(result[3], ProteinsDirectoryFormat) diff --git a/q2_amrfinderplus/tests/test_database.py b/q2_amrfinderplus/tests/test_database.py index 8c926c2..a87dd51 100644 --- a/q2_amrfinderplus/tests/test_database.py +++ b/q2_amrfinderplus/tests/test_database.py @@ -12,14 +12,14 @@ class TestFetchAMRFinderPlusDB(TestPluginBase): - package = "q2_amr.amrfinderplus.tests" + package = "q2_amrfinderplus.tests" - @patch("q2_amr.amrfinderplus.database.run_amrfinder_fetch") - @patch("q2_amr.amrfinderplus.database._copy_all") + @patch("q2_amrfinderplus.database.run_amrfinder_fetch") + @patch("q2_amrfinderplus.database._copy_all") def test_fetch_amrfinderplus_db(self, mock_run_amrfinder_u, mock__copy_all): fetch_amrfinderplus_db() - @patch("q2_amr.amrfinderplus.database.run_command") + @patch("q2_amrfinderplus.database.run_command") def test_run_amrfinder_u(self, mock_run_command): run_amrfinder_fetch() mock_run_command.assert_called_once_with( @@ -27,7 +27,7 @@ def test_run_amrfinder_u(self, mock_run_command): verbose=True, ) - 
@patch("q2_amr.amrfinderplus.database.run_command") + @patch("q2_amrfinderplus.database.run_command") def test_run_amrfinder_u_error(self, mock_run_command): expected_message = ( "An error was encountered while running AMRFinderPlus, " diff --git a/q2_amrfinderplus/tests/test_sample_data.py b/q2_amrfinderplus/tests/test_sample_data.py deleted file mode 100644 index bf2ca93..0000000 --- a/q2_amrfinderplus/tests/test_sample_data.py +++ /dev/null @@ -1,232 +0,0 @@ -import os -from unittest.mock import MagicMock, mock_open, patch - -import pandas as pd -from q2_types.genome_data import LociDirectoryFormat, ProteinsDirectoryFormat -from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt -from qiime2.plugin.testing import TestPluginBase - -from q2_amrfinderplus.sample_data import ( - annotate_sample_data_amrfinderplus, - create_count_table, - read_in_txt, -) -from q2_amrfinderplus.types import AMRFinderPlusDatabaseDirFmt - - -def mock_run_amrfinderplus_n( - amrfinderplus_db, - dna_sequences, - protein_sequences, - gff, - organism, - plus, - report_all_equal, - ident_min, - curated_ident, - coverage_min, - translation_table, - annotation_format, - report_common, - threads, - amr_annotations_path, - amr_genes_path, - amr_proteins_path, - amr_all_mutations_path, -): - with open(amr_annotations_path, "w"): - pass - if organism: - with open(amr_all_mutations_path, "w"): - pass - if dna_sequences: - with open(amr_genes_path, "w"): - pass - if protein_sequences: - with open(amr_proteins_path, "w"): - pass - - -class TestAnnotateSampleDataAMRFinderPlus(TestPluginBase): - package = "q2_amrfinderplus.tests" - - def test_annotate_sample_data_amrfinderplus_mags(self): - mags = MultiMAGSequencesDirFmt() - os.mkdir(mags.path / "sample_1") - with open(mags.path / "sample_1" / "mag_1.fasta", "w"): - pass - self._helper(mags=mags, organism="Escherichia") - - def test_annotate_sample_data_amrfinderplus_mags_proteins_loci(self): - mags = MultiMAGSequencesDirFmt() - proteins = 
ProteinsDirectoryFormat() - loci = LociDirectoryFormat() - os.mkdir(mags.path / "sample_1") - with open(mags.path / "sample_1" / "mag_1.fasta", "w"): - pass - os.mkdir(proteins.path / "sample_1") - with open(proteins.path / "sample_1" / "mag_1.fasta", "w"): - pass - os.mkdir(loci.path / "sample_1") - with open(loci.path / "sample_1" / "mag_1.gff", "w"): - pass - - self._helper(mags=mags, proteins=proteins, loci=loci) - - def test_annotate_sample_data_amrfinderplus_proteins(self): - proteins = ProteinsDirectoryFormat() - os.mkdir(proteins.path / "sample_1") - with open(proteins.path / "sample_1" / "mag_1.fasta", "w"): - pass - self._helper(proteins=proteins) - - def _helper(self, mags=None, organism=None, proteins=None, loci=None): - amrfinderplus_db = AMRFinderPlusDatabaseDirFmt() - mock_create_count_table = MagicMock() - mock_read_in_txt = MagicMock() - with patch( - "q2_amrfinderplus.sample_data.run_amrfinderplus_analyse", - side_effect=mock_run_amrfinderplus_n, - ), patch("q2_amrfinderplus.sample_data.read_in_txt", mock_read_in_txt), patch( - "q2_amrfinderplus.sample_data.create_count_table", - mock_create_count_table, - ): - ( - amr_annotations, - amr_all_mutations, - amr_genes, - amr_proteins, - feature_table, - ) = annotate_sample_data_amrfinderplus( - mags=mags, - proteins=proteins, - loci=loci, - amrfinderplus_db=amrfinderplus_db, - organism=organism, - ) - self.assertTrue( - os.path.exists( - amr_annotations.path / "sample_1" / "mag_1_amr_annotations.tsv" - ) - ) - - if mags: - self.assertTrue( - os.path.exists( - amr_genes.path / "sample_1" / "mag_1_amr_genes.fasta" - ) - ) - else: - self.assertTrue(os.path.exists(amr_genes.path / "empty.fasta")) - if organism: - self.assertTrue( - os.path.exists( - amr_all_mutations.path - / "sample_1" - / "mag_1_amr_all_mutations.tsv" - ) - ) - else: - self.assertTrue( - os.path.exists( - amr_all_mutations.path / "empty_amr_all_mutations.tsv" - ) - ) - if proteins: - self.assertTrue( - os.path.exists( - 
amr_proteins.path / "sample_1" / "mag_1_amr_proteins.fasta" - ) - ) - else: - self.assertTrue(os.path.exists(amr_proteins.path / "empty.fasta")) - - -class TestReadInTxt(TestPluginBase): - package = "q2_amrfinderplus.tests" - - @patch( - "builtins.open", - new_callable=mock_open, - read_data="col1\tcol2\nA\t1\nB\t2\nA\t3", - ) - @patch("pandas.read_csv") - def test_read_in_txt(self, mock_read_csv, mock_file): - # Mock data that would be read by pd.read_csv - mock_df = pd.DataFrame({"col1": ["A", "B", "A"]}) - - # Mock the behavior of pd.read_csv - mock_read_csv.return_value = mock_df - - # Test parameters - path = "dummy_path.txt" - sample_mag_id = "Sample123" - column_name = "col1" - - # Call the function under test - result = read_in_txt(path, sample_mag_id, column_name) - - # Expected DataFrame after processing - expected_df = pd.DataFrame( - { - column_name: ["A", "B"], - sample_mag_id: ["2", "1"], # value_counts result for A is 2, for B is 1 - } - ) - - # Convert to string to match the function's output format - expected_df = expected_df.astype(str) - - # Check that the file was read using pandas - mock_read_csv.assert_called_once_with(path, sep="\t") - - # Assert that the returned DataFrame matches the expected output - pd.testing.assert_frame_equal(result, expected_df) - - -class TestCreateCountTable(TestPluginBase): - package = "q2_amrfinderplus.tests" - - @classmethod - def setUpClass(cls): - cls.gene_count_df = [ - pd.DataFrame( - { - "ARO Term": ["mdtF", "mgrA", "OprN", "mepA"], - "sample1": ["1", "1", "1", "1"], - } - ), - pd.DataFrame( - { - "ARO Term": ["mdtE", "mgrA", "OprN", "mepA"], - "sample2": ["1", "1", "1", "1"], - } - ), - ] - - cls.frequency_table = pd.DataFrame( - { - "sample_id": ["sample1", "sample2"], - "OprN": ["1", "1"], - "mdtE": ["0", "1"], - "mdtF": ["1", "0"], - "mepA": ["1", "1"], - "mgrA": ["1", "1"], - } - ) - cls.frequency_table.set_index("sample_id", inplace=True) - - def test_create_count_table(self): - # Create observed count 
table with create_count_table function - obs = create_count_table(self.gene_count_df) - obs = obs.astype(str) - - # Define expected count table - exp = self.frequency_table - - # Compare expected and observed count table - pd.testing.assert_frame_equal(exp, obs) - - def test_create_count_table_value_error(self): - # Assert if ValueError is called when empy list is passed - self.assertRaises(ValueError, create_count_table, []) diff --git a/q2_amrfinderplus/tests/test_utils.py b/q2_amrfinderplus/tests/test_utils.py index f0c84d0..4c58eb3 100644 --- a/q2_amrfinderplus/tests/test_utils.py +++ b/q2_amrfinderplus/tests/test_utils.py @@ -1,8 +1,21 @@ -from unittest.mock import patch +import os +import subprocess +from pathlib import Path +from unittest.mock import MagicMock, mock_open, patch +from q2_types.feature_data_mag import MAGSequencesDirFmt +from q2_types.genome_data import ProteinsDirectoryFormat +from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt from qiime2.plugin.testing import TestPluginBase -from q2_amrfinderplus.utils import _validate_inputs, run_amrfinderplus_analyse +from q2_amrfinderplus.utils import ( + _create_empty_files, + _create_sample_dict, + _create_sample_dirs, + _get_file_paths, + _run_amrfinderplus_analyse, + _validate_inputs, +) class TestRunAmrfinderplusAnalyse(TestPluginBase): @@ -10,12 +23,12 @@ class TestRunAmrfinderplusAnalyse(TestPluginBase): @patch("q2_amrfinderplus.utils.run_command") def test_run_amrfinderplus_analyse(self, mock_run_command): - run_amrfinderplus_analyse( + _run_amrfinderplus_analyse( amrfinderplus_db="amrfinderplus_db", - dna_sequences="dna_sequences", - protein_sequences="protein_sequences", - gff="gff", - organism="Escherichia", + dna_path="dna_sequences", + protein_path="protein_sequences", + gff_path="gff", + organism="Acinetobacter", plus=True, report_all_equal=True, ident_min=1, @@ -51,7 +64,7 @@ def test_run_amrfinderplus_analyse(self, mock_run_command): "--threads", "4", 
"--organism", - "Escherichia", + "Acinetobacter", "--mutation_all", "amr_all_mutations_path", "--plus", @@ -65,16 +78,17 @@ def test_run_amrfinderplus_analyse(self, mock_run_command): "--annotation_format", "prodigal", "--report_common", + "--gpipe_org", ], ) @patch("q2_amrfinderplus.utils.run_command") def test_run_amrfinderplus_analyse_minimal(self, mock_run_command): - run_amrfinderplus_analyse( + _run_amrfinderplus_analyse( amrfinderplus_db="amrfinderplus_db", - dna_sequences=None, - protein_sequences=None, - gff=None, + dna_path=None, + protein_path=None, + gff_path=None, organism=None, plus=False, report_all_equal=False, @@ -100,6 +114,40 @@ def test_run_amrfinderplus_analyse_minimal(self, mock_run_command): ], ) + @patch("q2_amrfinderplus.utils.run_command") + def test_run_amrfinderplus_analyse_exception_message(self, mock_run_command): + # Simulate subprocess.CalledProcessError + mock_run_command.side_effect = subprocess.CalledProcessError( + returncode=1, cmd="amrfinder" + ) + + # Call the function and assert the exception message + with self.assertRaises(Exception) as context: + _run_amrfinderplus_analyse( + amrfinderplus_db="mock_db", + dna_path=None, + protein_path=None, + gff_path=None, + organism=None, + plus=False, + report_all_equal=False, + ident_min=None, + curated_ident=False, + coverage_min=0.5, + translation_table="11", + annotation_format="prodigal", + report_common=False, + threads=None, + amr_annotations_path="mock_annotations_path", + ) + + # Assert the correct exception message is raised + self.assertIn( + "An error was encountered while running AMRFinderPlus", + str(context.exception), + ) + self.assertIn("(return code 1)", str(context.exception)) + class TestValidateInputs(TestPluginBase): package = "q2_amrfinderplus.tests" @@ -110,7 +158,7 @@ def test_loci_without_proteins(self): ValueError, "can only be given in combination " 'with "--i-proteins"' ): _validate_inputs( - mags=True, + sequences=True, loci=True, proteins=False, 
ident_min=None, @@ -126,7 +174,7 @@ def test_mags_and_proteins_without_loci(self): ValueError, "can only be given in combination " 'with "--i-loci"' ): _validate_inputs( - mags=True, + sequences=True, loci=False, proteins=True, ident_min=None, @@ -139,10 +187,10 @@ def test_mags_and_proteins_without_loci(self): # Test when neither --i-mags nor --i-proteins is provided def test_missing_mags_and_proteins(self): with self.assertRaisesRegex( - ValueError, '"--i-mags" or "--i-proteins" input ' "has to be provided" + ValueError, '"--i-sequences" or "--i-proteins" input has to be provided' ): _validate_inputs( - mags=False, + sequences=False, loci=False, proteins=False, ident_min=None, @@ -161,7 +209,7 @@ def test_ident_min_and_curated_ident(self): "simultaneously", ): _validate_inputs( - mags=True, + sequences=True, loci=None, proteins=None, ident_min=True, @@ -177,7 +225,7 @@ def test_report_common_without_plus_or_organism(self): ValueError, '"--p-report-common" requires ' '"--p-plus" and "--p-organism"' ): _validate_inputs( - mags=True, + sequences=True, loci=None, proteins=None, ident_min=None, @@ -186,3 +234,262 @@ def test_report_common_without_plus_or_organism(self): plus=False, organism=None, ) + + +class TestGetFilePaths(TestPluginBase): + package = "q2_amrfinderplus.tests" + + @patch("os.path.exists") + def test_mags_with_proteins_and_loci(self, mock_exists): + # Mock the os.path.exists to simulate files existing + mock_exists.side_effect = [True, True] # First for protein, second for GFF + + # Call the function with mags, proteins, and loci + dna_path, protein_path, gff_path = _get_file_paths( + sequences=MagicMock(), + proteins=MagicMock(path=Path("proteins")), + loci=MagicMock(path=Path("loci")), + id="id", + sample_id="sample1", + file_fp="dna_file.fasta", + ) + + # Assertions + self.assertEqual(dna_path, "dna_file.fasta") + self.assertEqual(str(protein_path), "proteins/sample1/id.fasta") + self.assertEqual(str(gff_path), "loci/sample1/id.gff") + + def 
test_mags_without_proteins_and_loci(self): + # Call the function with mags, proteins, and loci + dna_path, protein_path, gff_path = _get_file_paths( + sequences=MagicMock(), + proteins=None, + loci=None, + id="sample123", + file_fp="dna_file.fasta", + ) + + # Assertions + self.assertEqual(dna_path, "dna_file.fasta") + self.assertEqual(protein_path, None) + self.assertEqual(gff_path, None) + + @patch("os.path.exists") + def test_mags_with_missing_protein(self, mock_exists): + # Mock os.path.exists to simulate the missing protein file + mock_exists.side_effect = [False] # Protein file does not exist + + # Call the function with mags and proteins, but no loci + with self.assertRaises(ValueError) as context: + _get_file_paths( + sequences=MagicMock(), + proteins=MagicMock(), + loci=None, + id="sample123", + sample_id="sample1", + file_fp="dna_file.fasta", + ) + + # Check that the exception message contains the correct text + self.assertIn( + "Proteins file for ID 'sample123' is missing", str(context.exception) + ) + + @patch("os.path.exists") + def test_loci_with_missing_gff(self, mock_exists): + # Mock os.path.exists to simulate the protein file exists but GFF file is + # missing + mock_exists.side_effect = [False] # Protein exists, GFF is missing + + # Call the function with proteins and loci, but no mags + with self.assertRaises(ValueError) as context: + _get_file_paths( + sequences=None, + proteins=None, + loci=MagicMock(path=Path("/mock/loci/path")), + id="sample123", + sample_id="sample1", + file_fp="protein_file.fasta", + ) + + # Check that the exception message contains the correct text + self.assertIn("GFF file for ID 'sample123' is missing", str(context.exception)) + + +class TestCreateSampleDict(TestPluginBase): + package = "q2_amrfinderplus.tests" + + @patch.object( + MultiMAGSequencesDirFmt, "sample_dict", return_value={"sample1": "some_value"} + ) + def test_create_sample_dict_sequences_multimags(self, mock_sample_dict): + # Mock the sequences input as 
MultiMAGSequencesDirFmt + sequences = MultiMAGSequencesDirFmt() + + # Call the function + result = _create_sample_dict(proteins=None, sequences=sequences) + + # Check that sample_dict is called correctly + mock_sample_dict.assert_called_once() + + # Ensure the result is the mocked return value of sample_dict + self.assertEqual(result, {"sample1": "some_value"}) + + @patch.object( + ContigSequencesDirFmt, "sample_dict", return_value={"contig_file": "file_path"} + ) + def test_create_sample_dict_sequences_contigs(self, mock_sample_dict): + # Mock the sequences input as ContigSequencesDirFmt + sequences = ContigSequencesDirFmt() + + # Call the function + result = _create_sample_dict(proteins=None, sequences=sequences) + + # Check that sample_dict is called correctly + mock_sample_dict.assert_called_once() + + # Ensure the result has a fake sample key with the file_dict + self.assertEqual(result, {"": {"contig_file": "file_path"}}) + + @patch.object( + MAGSequencesDirFmt, "feature_dict", return_value={"feature_file": "file_path"} + ) + def test_create_sample_dict_sequences_mag(self, mock_feature_dict): + # Mock the sequences input as MAGSequencesDirFmt + sequences = MAGSequencesDirFmt() + + # Call the function + result = _create_sample_dict(proteins=None, sequences=sequences) + + # Check that feature_dict is called correctly + mock_feature_dict.assert_called_once() + + # Ensure the result has a fake sample key with the feature_dict + self.assertEqual(result, {"": {"feature_file": "file_path"}}) + + def test_create_sample_dict_proteins_sample_data(self): + proteins = ProteinsDirectoryFormat() + + os.mkdir(proteins.path / "directory") + with open(proteins.path / "directory" / "file.fasta", "w"): + pass + + result = _create_sample_dict(proteins=proteins, sequences=None) + + self.assertEqual( + result, + {"directory": {"file": str(proteins.path / "directory" / "file.fasta")}}, + ) + + def test_create_sample_dict_proteins_feature_data(self): + proteins = 
ProteinsDirectoryFormat() + + with open(proteins.path / "file.fasta", "w"): + pass + + result = _create_sample_dict(proteins=proteins, sequences=None) + + self.assertEqual(result, {"": {"file": str(proteins.path / "file.fasta")}}) + + +class TestCreateEmptyFiles(TestPluginBase): + package = "q2_amrfinderplus.tests" + + @patch("builtins.open", new_callable=mock_open) + def test_create_empty_files_all_false(self, mock_open_file): + amr_genes = MagicMock(path=Path("path/amr_genes")) + amr_proteins = MagicMock(path=Path("path/amr_proteins")) + amr_all_mutations = MagicMock(path=Path("path/amr_all_mutations")) + + _create_empty_files( + sequences=False, + proteins=False, + organism=False, + amr_genes=amr_genes, + amr_proteins=amr_proteins, + amr_all_mutations=amr_all_mutations, + ) + + # Assertions + mock_open_file.assert_any_call(Path("path/amr_genes/empty.fasta"), "w") + mock_open_file.assert_any_call(Path("path/amr_proteins/empty.fasta"), "w") + mock_open_file.assert_any_call( + Path("path/amr_all_mutations/empty_amr_all_mutations.tsv"), "w" + ) + self.assertEqual(mock_open_file.call_count, 3) + + @patch("builtins.open", new_callable=mock_open) + def test_create_empty_files_all_true(self, mock_open_file): + amr_genes = MagicMock(path=Path("path/amr_genes")) + amr_proteins = MagicMock(path=Path("path/amr_proteins")) + amr_all_mutations = MagicMock(path=Path("path/amr_all_mutations")) + + _create_empty_files( + sequences=True, + proteins=True, + organism=True, + amr_genes=amr_genes, + amr_proteins=amr_proteins, + amr_all_mutations=amr_all_mutations, + ) + + # Assertions + mock_open_file.assert_not_called() + + +class TestCreateSampleDirs(TestPluginBase): + package = "q2_amrfinderplus.tests" + + @patch("os.makedirs") + def test_create_sample_dirs_all_exist(self, mock_makedirs): + amr_annotations = MagicMock(path=Path("/fake/path/amr_annotations")) + amr_genes = MagicMock(path=Path("/fake/path/amr_genes")) + amr_proteins = MagicMock(path=Path("/fake/path/amr_proteins")) 
+ amr_all_mutations = MagicMock(path=Path("/fake/path/amr_all_mutations")) + + _create_sample_dirs( + sequences=True, + proteins=True, + organism=True, + amr_annotations=amr_annotations, + amr_genes=amr_genes, + amr_proteins=amr_proteins, + amr_all_mutations=amr_all_mutations, + sample_id="sample1", + ) + + # Assertions + mock_makedirs.assert_any_call( + Path("/fake/path/amr_annotations/sample1"), exist_ok=True + ) + mock_makedirs.assert_any_call( + Path("/fake/path/amr_genes/sample1"), exist_ok=True + ) + mock_makedirs.assert_any_call( + Path("/fake/path/amr_proteins/sample1"), exist_ok=True + ) + mock_makedirs.assert_any_call( + Path("/fake/path/amr_all_mutations/sample1"), exist_ok=True + ) + self.assertEqual(mock_makedirs.call_count, 4) + + @patch("os.makedirs") + def test_create_sample_dirs_nothing(self, mock_makedirs): + amr_annotations = MagicMock(path=Path("/fake/path/amr_annotations")) + + _create_sample_dirs( + sequences=False, + proteins=False, + organism=False, + amr_annotations=amr_annotations, + amr_genes=None, + amr_proteins=None, + amr_all_mutations=None, + sample_id="sample1", + ) + + # Assertions + mock_makedirs.assert_any_call( + Path("/fake/path/amr_annotations/sample1"), exist_ok=True + ) + self.assertEqual(mock_makedirs.call_count, 1) diff --git a/q2_amrfinderplus/types/_type.py b/q2_amrfinderplus/types/_type.py index 5cc2f5e..e36796c 100644 --- a/q2_amrfinderplus/types/_type.py +++ b/q2_amrfinderplus/types/_type.py @@ -5,12 +5,11 @@ # # The full license is in the file LICENSE, distributed with this software. 
# ---------------------------------------------------------------------------- -from q2_types.feature_data import FeatureData -from q2_types.sample_data import SampleData +from q2_types.genome_data import GenomeData from qiime2.core.type import SemanticType AMRFinderPlusDatabase = SemanticType("AMRFinderPlusDatabase") AMRFinderPlusAnnotations = SemanticType( "AMRFinderPlusAnnotations", - variant_of=[SampleData.field["type"], FeatureData.field["type"]], + variant_of=GenomeData.field["type"], ) diff --git a/q2_amrfinderplus/types/tests/test_types_formats_transformers.py b/q2_amrfinderplus/types/tests/test_types_formats_transformers.py index b128525..e23536a 100644 --- a/q2_amrfinderplus/types/tests/test_types_formats_transformers.py +++ b/q2_amrfinderplus/types/tests/test_types_formats_transformers.py @@ -8,17 +8,18 @@ import os import tempfile -from q2_amr.amrfinderplus.types._format import ( +from qiime2.core.exceptions import ValidationError +from qiime2.plugin.testing import TestPluginBase + +from q2_amrfinderplus.types._format import ( AMRFinderPlusAnnotationFormat, AMRFinderPlusAnnotationsDirFmt, AMRFinderPlusDatabaseDirFmt, ) -from qiime2.core.exceptions import ValidationError -from qiime2.plugin.testing import TestPluginBase class TestAMRFinderPlusTypesAndFormats(TestPluginBase): - package = "q2_amr.amrfinderplus.types.tests" + package = "q2_amrfinderplus.types.tests" def test_amrfinderplus_database_directory_format_validate_positive(self): format = AMRFinderPlusDatabaseDirFmt(self.get_data_path("database"), mode="r") diff --git a/q2_amrfinderplus/utils.py b/q2_amrfinderplus/utils.py index fe5cbea..f932c97 100644 --- a/q2_amrfinderplus/utils.py +++ b/q2_amrfinderplus/utils.py @@ -1,5 +1,10 @@ +import os import subprocess +from q2_types.feature_data_mag import MAGSequencesDirFmt +from q2_types.genome_data import ProteinsDirectoryFormat +from q2_types.per_sample_sequences import ContigSequencesDirFmt, MultiMAGSequencesDirFmt + EXTERNAL_CMD_WARNING = ( "Running 
external command line application(s). " "This may print messages to stdout and/or stderr.\n" @@ -18,31 +23,32 @@ def run_command(cmd, cwd=None, verbose=True): def _validate_inputs( - mags, loci, proteins, ident_min, curated_ident, report_common, plus, organism + sequences, loci, proteins, ident_min, curated_ident, report_common, plus, organism ): - # Check if loci is provided with mags but without proteins (invalid combination) - if mags and loci and not proteins: + # Ensure that at least sequences or proteins is provided + if not sequences and not proteins: + raise ValueError('"--i-sequences" or "--i-proteins" input has to be provided.') + + # Check if loci is provided with sequences but without proteins + # (invalid combination) + if sequences and loci and not proteins: raise ValueError( '"--i-loci" input can only be given in combination with "--i-proteins" ' "input." ) - # Check if mags and proteins are provided together but without loci + # Check if sequences and proteins are provided together but without loci # (invalid combination) - if mags and not loci and proteins: + if sequences and not loci and proteins: raise ValueError( - '"--i-mags" and "--i-proteins" inputs together can only ' + '"--i-sequences" and "--i-proteins" inputs together can only ' 'be given in combination with "--i-loci" input.' ) - # Ensure that at least mags or proteins is provided - if not mags and not proteins: - raise ValueError('"--i-mags" or "--i-proteins" input has to be provided.') - # Validate that ident_min and curated_ident are not used together if ident_min and curated_ident: raise ValueError( - '"--p-ident-min" and "--p-curated-ident" cannot be used ' "simultaneously." + '"--p-ident-min" and "--p-curated-ident" cannot be used simultaneously.' 
) # Check that report_common is only used with plus and organism @@ -50,11 +56,11 @@ def _validate_inputs( raise ValueError('"--p-report-common" requires "--p-plus" and "--p-organism".') -def run_amrfinderplus_analyse( +def _run_amrfinderplus_analyse( amrfinderplus_db, - dna_sequences, - protein_sequences, - gff, + dna_path, + protein_path, + gff_path, organism, plus, report_all_equal, @@ -75,31 +81,31 @@ def run_amrfinderplus_analyse( "--database", str(amrfinderplus_db), "-o", - amr_annotations_path, + str(amr_annotations_path), "--print_node", ] # Creates nucleotide fasta output if DNA sequences are given as input - if dna_sequences: + if dna_path: cmd.extend( [ "-n", - dna_sequences, + str(dna_path), "--nucleotide_output", - amr_genes_path, + str(amr_genes_path), ] ) # Creates protein fasta output if protein sequences are given as input - if protein_sequences: + if protein_path: cmd.extend( [ "-p", - protein_sequences, + str(protein_path), "--protein_output", - amr_proteins_path, + str(amr_proteins_path), ] ) - if gff: - cmd.extend(["-g", gff]) + if gff_path: + cmd.extend(["-g", str(gff_path)]) if threads: cmd.extend(["--threads", str(threads)]) # Creates all mutations output if an organism is specified @@ -109,7 +115,7 @@ def run_amrfinderplus_analyse( "--organism", organism, "--mutation_all", - amr_all_mutations_path, + str(amr_all_mutations_path), ] ) if plus: @@ -145,3 +151,114 @@ def run_amrfinderplus_analyse( f"(return code {e.returncode}), please inspect " "stdout and stderr to learn more." 
) + + +def _create_empty_files( + sequences, proteins, organism, amr_genes, amr_proteins, amr_all_mutations +): + if not sequences: + with open(amr_genes.path / "empty.fasta", "w"): + pass + + if not proteins: + with open(amr_proteins.path / "empty.fasta", "w"): + pass + + if not organism: + with open(amr_all_mutations.path / "empty_amr_all_mutations.tsv", "w"): + pass + + +def _create_sample_dirs( + sequences, + proteins, + organism, + amr_annotations, + amr_genes, + amr_proteins, + amr_all_mutations, + sample_id, +): + os.makedirs(amr_annotations.path / sample_id, exist_ok=True) + if sequences: + os.makedirs(amr_genes.path / sample_id, exist_ok=True) + if proteins: + os.makedirs(amr_proteins.path / sample_id, exist_ok=True) + if organism: + os.makedirs(amr_all_mutations.path / sample_id, exist_ok=True) + + +def _create_sample_dict(proteins, sequences): + if sequences: + # For SampleData[MAGs] + if isinstance(sequences, MultiMAGSequencesDirFmt): + sample_dict = sequences.sample_dict() + + # For SampleData[Contigs] + elif isinstance(sequences, ContigSequencesDirFmt): + file_dict = sequences.sample_dict() + # Create fake sample for sample_dict + sample_dict = {"": file_dict} + + # For FeatureData[MAG] + elif isinstance(sequences, MAGSequencesDirFmt): + file_dict = sequences.feature_dict() + # Create fake sample for sample_dict + sample_dict = {"": file_dict} + + else: + proteins.pathspec = r".+\.(fa|faa|fasta)$" + + # Monkey patch the sample_dict instance method of MultiMAGSequencesDirFmt to + # ProteinsDirectoryFormat if it has a sample data dir structure + if any(item.is_dir() for item in proteins.path.iterdir()): + proteins.sample_dict = MultiMAGSequencesDirFmt.sample_dict.__get__( + proteins, ProteinsDirectoryFormat + ) + sample_dict = proteins.sample_dict() + # Monkey patch the feature_dict instance method of MAGSequencesDirFmt to + # ProteinsDirectoryFormat if it has a feature data dir structure + else: + proteins.feature_dict = 
MAGSequencesDirFmt.feature_dict.__get__( + proteins, ProteinsDirectoryFormat + ) + file_dict = proteins.feature_dict() + # create sample_dict with fake sample + sample_dict = {"": file_dict} + + return sample_dict + + +def _get_file_paths(sequences, proteins, loci, id, file_fp, sample_id=""): + # If sequences are provided + if sequences: + dna_path = file_fp + + # If proteins are provided, construct the expected protein file path. + if proteins: + protein_path = proteins.path / sample_id / f"{id}.fasta" + + # Raise an error if the expected protein file does not exist. + if not os.path.exists(protein_path): + raise ValueError( + f"Proteins file for ID '{id}' is missing in proteins input." + ) + else: + protein_path = None + + # If only proteins are provided (no sequences), set dna and protein file path. + else: + dna_path = None + protein_path = file_fp + + # If loci are provided, construct the expected GFF file path. + if loci: + gff_path = loci.path / sample_id / f"{id}.gff" + + # Raise an error if the expected GFF file does not exist. 
+ if not os.path.exists(gff_path): + raise ValueError(f"GFF file for ID '{id}' is missing in loci input.") + else: + gff_path = None + + return dna_path, protein_path, gff_path From 179a4e48cefe24578e04fee5063d2ef0bee73b7c Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 25 Sep 2024 09:41:58 +0200 Subject: [PATCH 4/8] add test data to package data --- setup.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 95ceace..3e93898 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,14 @@ }, package_data={ "q2_amrfinderplus": ["citations.bib"], - "q2_amrfinderplus.tests": ["data/*"], + "q2_amrfinderplus.types.tests": [ + "data/*" + "data/annotation/*" + "data/database/*" + "data/annotation_wrong/*" + "data/annotation/coordinates/*" + "data/annotation/no_coordinates/*" + ], }, zip_safe=False, ) From 15356af783c67cf8b52c9ddc8f3ce126d713d111 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 25 Sep 2024 10:06:35 +0200 Subject: [PATCH 5/8] change in types tests --- .../types/tests/test_types_formats_transformers.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/q2_amrfinderplus/types/tests/test_types_formats_transformers.py b/q2_amrfinderplus/types/tests/test_types_formats_transformers.py index e23536a..81d1709 100644 --- a/q2_amrfinderplus/types/tests/test_types_formats_transformers.py +++ b/q2_amrfinderplus/types/tests/test_types_formats_transformers.py @@ -36,8 +36,8 @@ def test_amrfinderplus_annotation_format_validate_positive(self): def test_amrfinderplus_annotation_format_validate_positive_coordinates(self): filepath = self.get_data_path( - "annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d" - "_amr_annotations.tsv" + "annotation/coordinates/" + "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv" ) format = AMRFinderPlusAnnotationFormat(filepath, mode="r") format.validate() @@ -92,9 +92,7 @@ def test_amrfinderplus_annotation_format_validation_error(self): 
self.assertEqual(str(context.exception), expected_message) def test_amrfinderplus_annotations_dir_fmt_feature(self): - dirpath = self.get_data_path( - "annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d" - ) + dirpath = self.get_data_path("annotation/coordinates") annotations = AMRFinderPlusAnnotationsDirFmt(dirpath, mode="r") assert isinstance(annotations, AMRFinderPlusAnnotationsDirFmt) From 0ad86afe2f466cb28ce5555b9c6b816823d2a4b3 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 25 Sep 2024 10:35:55 +0200 Subject: [PATCH 6/8] removed () and tuple representation in genes path bug --- q2_amrfinderplus/annotate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_amrfinderplus/annotate.py b/q2_amrfinderplus/annotate.py index e5e8fa5..09eb401 100644 --- a/q2_amrfinderplus/annotate.py +++ b/q2_amrfinderplus/annotate.py @@ -101,7 +101,7 @@ def annotate( amr_annotations_path = ( amr_annotations.path / sample_id / f"{id}_amr_annotations.tsv" ) - amr_genes_path = (amr_genes.path / sample_id / f"{id}_amr_genes.fasta",) + amr_genes_path = amr_genes.path / sample_id / f"{id}_amr_genes.fasta" amr_proteins_path = ( amr_proteins.path / sample_id / f"{id}_amr_proteins.fasta" ) From 965d37a7f19eea98d7f9a56487ac1a475dd57d76 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 25 Sep 2024 12:05:21 +0200 Subject: [PATCH 7/8] adds colorify with test and warning message for empty artifacts --- q2_amrfinderplus/tests/test_utils.py | 25 +++++++++++++++++++++++-- q2_amrfinderplus/utils.py | 24 ++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/q2_amrfinderplus/tests/test_utils.py b/q2_amrfinderplus/tests/test_utils.py index 2547537..cfbe144 100644 --- a/q2_amrfinderplus/tests/test_utils.py +++ b/q2_amrfinderplus/tests/test_utils.py @@ -1,5 +1,6 @@ import os import subprocess +from io import StringIO from pathlib import Path from unittest.mock import MagicMock, call, mock_open, patch @@ -16,6 +17,7 @@ 
_get_file_paths, _run_amrfinderplus_analyse, _validate_inputs, + colorify, run_command, ) @@ -440,7 +442,8 @@ class TestCreateEmptyFiles(TestPluginBase): package = "q2_amrfinderplus.tests" @patch("builtins.open", new_callable=mock_open) - def test_create_empty_files_all_false(self, mock_open_file): + @patch("sys.stdout", new_callable=StringIO) + def test_create_empty_files_all_false(self, mock_stdout, mock_open_file): amr_genes = MagicMock(path=Path("path/amr_genes")) amr_proteins = MagicMock(path=Path("path/amr_proteins")) amr_all_mutations = MagicMock(path=Path("path/amr_all_mutations")) @@ -454,7 +457,7 @@ def test_create_empty_files_all_false(self, mock_open_file): amr_all_mutations=amr_all_mutations, ) - # Assertions + # Assertions for file creation mock_open_file.assert_any_call(Path("path/amr_genes/empty.fasta"), "w") mock_open_file.assert_any_call(Path("path/amr_proteins/empty.fasta"), "w") mock_open_file.assert_any_call( @@ -462,6 +465,14 @@ def test_create_empty_files_all_false(self, mock_open_file): ) self.assertEqual(mock_open_file.call_count, 3) + # Capture printed output + printed_output = mock_stdout.getvalue() + + # Assertions for print statements by checking keywords + self.assertIn("amr_genes", printed_output) + self.assertIn("amr_proteins", printed_output) + self.assertIn("amr_all_mutations", printed_output) + @patch("builtins.open", new_callable=mock_open) def test_create_empty_files_all_true(self, mock_open_file): amr_genes = MagicMock(path=Path("path/amr_genes")) @@ -537,3 +548,13 @@ def test_create_sample_dirs_nothing(self, mock_makedirs): Path("/fake/path/amr_annotations/sample1"), exist_ok=True ) self.assertEqual(mock_makedirs.call_count, 1) + + +class TestColorify(TestPluginBase): + package = "q2_amrfinderplus.tests" + + def test_colorify(self): + # Test if colorify wraps the string with the correct ANSI codes for yellow + result = colorify("Hello") + expected = "\033[1;33mHello\033[0m" + self.assertEqual(result, expected) diff --git 
a/q2_amrfinderplus/utils.py b/q2_amrfinderplus/utils.py index f932c97..c8c0ec9 100644 --- a/q2_amrfinderplus/utils.py +++ b/q2_amrfinderplus/utils.py @@ -156,17 +156,37 @@ def _run_amrfinderplus_analyse( def _create_empty_files( sequences, proteins, organism, amr_genes, amr_proteins, amr_all_mutations ): + # Creates empty files in output artifacts amr_genes, amr_proteins and + # amr_all_mutations because artifacts can not be empty if not sequences: with open(amr_genes.path / "empty.fasta", "w"): pass + print( + colorify( + '"amr_genes" output is empty because no "--i-sequences" input ' + "was given." + ) + ) if not proteins: with open(amr_proteins.path / "empty.fasta", "w"): pass + print( + colorify( + '"amr_proteins" output is empty because no "--i-proteins" input ' + "was given." + ) + ) if not organism: with open(amr_all_mutations.path / "empty_amr_all_mutations.tsv", "w"): pass + print( + colorify( + '"amr_all_mutations" output is empty because no "--p-organism" ' + "parameter was given." + ) + ) def _create_sample_dirs( @@ -262,3 +282,7 @@ def _get_file_paths(sequences, proteins, loci, id, file_fp, sample_id=""): gff_path = None return dna_path, protein_path, gff_path + + +def colorify(string: str): + return "%s%s%s" % ("\033[1;33m", string, "\033[0m") From ee66471cc4bee2e37c681a3c687b6e96debab885 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 25 Sep 2024 13:06:22 +0200 Subject: [PATCH 8/8] typo --- q2_amrfinderplus/plugin_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_amrfinderplus/plugin_setup.py b/q2_amrfinderplus/plugin_setup.py index bb235a2..a82b5f0 100644 --- a/q2_amrfinderplus/plugin_setup.py +++ b/q2_amrfinderplus/plugin_setup.py @@ -163,7 +163,7 @@ "using this option if you have a specific reason.", "curated_ident": "Use the curated threshold for a blast-based hit, if it " "exists and 0.9 otherwise. 
This will overwrite the value specified with the " - "'ident_min' parameter", + "'ident_min' parameter.", "coverage_min": "Minimum proportion of reference gene covered for a " "BLAST-based hit (Methods BLAST or PARTIAL).", "translation_table": "Translation table used for BLASTX.",