# Public names of the AMRFinderPlus types package. Every entry is imported
# from q2_amr.amrfinderplus.types._format at the top of this module.
__all__ = [
    "AMRFinderPlusDatabaseDirFmt",
    # Annotation formats: the single-file format and its two directory formats.
    "AMRFinderPlusAnnotationFormat",
    "AMRFinderPlusAnnotationsDirFmt",
    "AMRFinderPlusAnnotationDirFmt",
    "TextFormat",
    "BinaryFormat",
]
class AMRFinderPlusAnnotationFormat(model.TextFileFormat):
    """Tab-separated AMRFinderPlus annotation table.

    Only the header row is inspected. Two column layouts are accepted: the
    full layout, and the same layout without the four coordinate columns
    ("Contig id", "Start", "Stop", "Strand"). A file with no content at all
    is accepted as well.
    """

    def _validate(self):
        """Compare the header row with the two accepted column layouts.

        Raises
        ------
        ValidationError
            If a header row is present and matches neither layout.
        """
        id_column = ["Protein identifier"]
        coordinate_columns = ["Contig id", "Start", "Stop", "Strand"]
        annotation_columns = [
            "Gene symbol",
            "Sequence name",
            "Scope",
            "Element type",
            "Element subtype",
            "Class",
            "Subclass",
            "Method",
            "Target length",
            "Reference sequence length",
            "% Coverage of reference sequence",
            "% Identity to reference sequence",
            "Alignment length",
            "Accession of closest sequence",
            "Name of closest sequence",
            "HMM id",
            "HMM description",
            "Hierarchy node",
        ]
        with_coordinates = id_column + coordinate_columns + annotation_columns
        without_coordinates = id_column + annotation_columns

        try:
            # nrows=0 parses the header only; no data rows are read.
            observed = pd.read_csv(str(self), sep="\t", nrows=0).columns.tolist()
        except pd.errors.EmptyDataError:
            # A file without any content has no header to check.
            return

        if observed == without_coordinates or observed == with_coordinates:
            return

        expected_text = ", ".join(with_coordinates)
        observed_text = ", ".join(observed)
        raise ValidationError(
            "Header line does not match AMRFinderPlusAnnotationFormat. Must "
            f"consist of the following values: {expected_text}"
            ".\n\nWhile Contig id, Start, Stop and Strand are optional."
            f"\n\nFound instead: {observed_text}"
        )

    def _validate_(self, level):
        """Framework validation hook; ``level`` does not alter the check."""
        self._validate()


class AMRFinderPlusAnnotationsDirFmt(MultiDirValidationMixin, model.DirectoryFormat):
    """Directory format holding a collection of annotation/mutation tables."""

    annotation = model.FileCollection(
        r".*amr_(annotations|mutations)\.tsv$", format=AMRFinderPlusAnnotationFormat
    )

    @annotation.set_path_maker
    def annotation_path_maker(self, sample_id, mag_id):
        """Return the relative path of an annotation table.

        The file lives in a ``sample_id`` sub-directory; its name is prefixed
        with ``mag_id`` and an underscore when ``mag_id`` is truthy.
        """
        if mag_id:
            return f"{sample_id}/{mag_id}_amr_annotations.tsv"
        return f"{sample_id}/amr_annotations.tsv"


# Single-file directory format wrapping one annotation or mutation table.
AMRFinderPlusAnnotationDirFmt = model.SingleFileDirectoryFormat(
    "AMRFinderPlusAnnotationDirFmt",
    r"amr_(annotations|mutations)\.tsv$",
    AMRFinderPlusAnnotationFormat,
)
# ---------------------------------------------------------------------------- +from q2_types.feature_data import FeatureData +from q2_types.sample_data import SampleData from qiime2.core.type import SemanticType AMRFinderPlusDatabase = SemanticType("AMRFinderPlusDatabase") +AMRFinderPlusAnnotations = SemanticType( + "AMRFinderPlusAnnotations", variant_of=SampleData.field["type"] +) +AMRFinderPlusAnnotation = SemanticType( + "AMRFinderPlusAnnotation", variant_of=FeatureData.field["type"] +) diff --git a/q2_amr/amrfinderplus/types/tests/data/annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv b/q2_amr/amrfinderplus/types/tests/data/annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv new file mode 100644 index 0000000..20e52d1 --- /dev/null +++ b/q2_amr/amrfinderplus/types/tests/data/annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv @@ -0,0 +1,3 @@ +Protein identifier Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description Hierarchy node +aph3pp-Ib_partial_5p_neg aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIALP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib +blaOXA-436_partial blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam diff --git a/q2_amr/amrfinderplus/types/tests/data/annotation/no_coordinates/aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv 
b/q2_amr/amrfinderplus/types/tests/data/annotation/no_coordinates/aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv new file mode 100644 index 0000000..20e52d1 --- /dev/null +++ b/q2_amr/amrfinderplus/types/tests/data/annotation/no_coordinates/aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv @@ -0,0 +1,3 @@ +Protein identifier Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description Hierarchy node +aph3pp-Ib_partial_5p_neg aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIALP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib +blaOXA-436_partial blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam diff --git a/q2_amr/amrfinderplus/types/tests/data/annotation_wrong/amr_annotation.tsv b/q2_amr/amrfinderplus/types/tests/data/annotation_wrong/amr_annotation.tsv new file mode 100644 index 0000000..1f1fa8b --- /dev/null +++ b/q2_amr/amrfinderplus/types/tests/data/annotation_wrong/amr_annotation.tsv @@ -0,0 +1 @@ +Incorrect Header 1 Incorrect Header 2 Incorrect Header 3 diff --git a/q2_amr/amrfinderplus/types/tests/test_types_formats_transformers.py b/q2_amr/amrfinderplus/types/tests/test_types_formats_transformers.py index f413052..2b2ea6f 100644 --- a/q2_amr/amrfinderplus/types/tests/test_types_formats_transformers.py +++ b/q2_amr/amrfinderplus/types/tests/test_types_formats_transformers.py @@ -5,14 +5,101 @@ # # The full license is in the file LICENSE, distributed with this 
class TestAMRFinderPlusTypesAndFormats(TestPluginBase):
    """Tests for the AMRFinderPlus database and annotation formats."""

    package = "q2_amr.amrfinderplus.types.tests"

    def test_amrfinderplus_database_directory_format_validate_positive(self):
        """The bundled database directory validates without error."""
        fmt = AMRFinderPlusDatabaseDirFmt(self.get_data_path("database"), mode="r")
        fmt.validate()

    def test_amrfinderplus_annotation_format_validate_positive(self):
        """The fixture stored under annotation/no_coordinates validates."""
        filepath = self.get_data_path(
            "annotation/no_coordinates/"
            "aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv"
        )
        fmt = AMRFinderPlusAnnotationFormat(filepath, mode="r")
        fmt.validate()

    def test_amrfinderplus_annotation_format_validate_positive_coordinates(self):
        """The fixture stored under annotation/coordinates validates."""
        filepath = self.get_data_path(
            "annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d"
            "_amr_annotations.tsv"
        )
        fmt = AMRFinderPlusAnnotationFormat(filepath, mode="r")
        fmt.validate()

    def test_amrfinderplus_annotation_format_validate_positive_empty(self):
        """A zero-byte annotation file is accepted."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_file_path = os.path.join(temp_dir, "amr_annotations.tsv")
            with open(temp_file_path, "w"):
                pass
            fmt = AMRFinderPlusAnnotationFormat(temp_file_path, mode="r")
            fmt.validate()

    def test_amrfinderplus_annotation_format_validation_error(self):
        """A wrong header raises ValidationError carrying the full message."""
        path = self.get_data_path("annotation_wrong/amr_annotation.tsv")
        fmt = AMRFinderPlusAnnotationFormat(path, mode="r")

        # Only the call expected to raise lives inside the context manager;
        # statements placed after a raising call in the same block never run.
        with self.assertRaises(ValidationError) as context:
            fmt.validate()

        # Must mirror the column list used by AMRFinderPlusAnnotationFormat,
        # including the final "Hierarchy node" column.
        header_coordinates = [
            "Protein identifier",
            "Contig id",
            "Start",
            "Stop",
            "Strand",
            "Gene symbol",
            "Sequence name",
            "Scope",
            "Element type",
            "Element subtype",
            "Class",
            "Subclass",
            "Method",
            "Target length",
            "Reference sequence length",
            "% Coverage of reference sequence",
            "% Identity to reference sequence",
            "Alignment length",
            "Accession of closest sequence",
            "Name of closest sequence",
            "HMM id",
            "HMM description",
            "Hierarchy node",
        ]
        expected_message = (
            "Header line does not match AMRFinderPlusAnnotationFormat. Must "
            "consist of the following values: "
            + ", ".join(header_coordinates)
            + ".\n\nWhile Contig id, Start, Stop and Strand are optional."
            + "\n\nFound instead: "
            + "Incorrect Header 1, Incorrect Header 2, Incorrect Header 3"
        )

        # assertIn rather than assertEqual: the check stays valid if the
        # framework prepends context (e.g. the file path) when re-raising.
        self.assertIn(expected_message, str(context.exception))

    def test_amrfinderplus_annotation_directory_format(self):
        """The single-file directory format can be instantiated."""
        dirpath = self.get_data_path(
            "annotation/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d"
        )
        annotations = AMRFinderPlusAnnotationDirFmt(dirpath, mode="r")
        self.assertIsInstance(annotations, AMRFinderPlusAnnotationDirFmt)

    def test_amrfinderplus_annotations_directory_format(self):
        """The multi-file directory format can be instantiated."""
        dirpath = self.get_data_path("annotation")
        annotations = AMRFinderPlusAnnotationsDirFmt(dirpath, mode="r")
        self.assertIsInstance(annotations, AMRFinderPlusAnnotationsDirFmt)
AMRFinderPlusDatabaseDirFmt, BinaryFormat, TextFormat, ) -from q2_amr.amrfinderplus.types._type import AMRFinderPlusDatabase +from q2_amr.amrfinderplus.types._type import ( + AMRFinderPlusAnnotation, + AMRFinderPlusAnnotations, + AMRFinderPlusDatabase, +) from q2_amr.card.database import fetch_card_db from q2_amr.card.heatmap import heatmap from q2_amr.card.kmer import ( @@ -1084,6 +1092,8 @@ CARDReadsAlleleKmerAnalysis, CARDMAGsKmerAnalysis, AMRFinderPlusDatabase, + AMRFinderPlusAnnotations, + AMRFinderPlusAnnotation, ) plugin.register_semantic_type_to_format( @@ -1118,6 +1128,15 @@ AMRFinderPlusDatabase, artifact_format=AMRFinderPlusDatabaseDirFmt, ) + +plugin.register_semantic_type_to_format( + SampleData[AMRFinderPlusAnnotations], + artifact_format=AMRFinderPlusAnnotationsDirFmt, +) +plugin.register_semantic_type_to_format( + FeatureData[AMRFinderPlusAnnotation], + artifact_format=AMRFinderPlusAnnotationDirFmt, +) plugin.register_formats( CARDKmerDatabaseDirectoryFormat, CARDKmerJSONFormat, @@ -1145,6 +1164,9 @@ AMRFinderPlusDatabaseDirFmt, TextFormat, BinaryFormat, + AMRFinderPlusAnnotationFormat, + AMRFinderPlusAnnotationsDirFmt, + AMRFinderPlusAnnotationDirFmt, ) importlib.import_module("q2_amr.card.types._transformer")