Skip to content

Commit

Permalink
added new annotation format
Browse files Browse the repository at this point in the history
  • Loading branch information
VinzentRisch committed Jul 3, 2024
1 parent c445800 commit 0670d7e
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 2 deletions.
2 changes: 2 additions & 0 deletions q2_amr/amrfinderplus/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
# ----------------------------------------------------------------------------
from q2_amr.amrfinderplus.types._format import (
AMRFinderPlusDatabaseDirectoryFormat,
ARMFinderPlusAnnotationFormat,
BinaryFormat,
TextFormat,
)

# Names exported by ``from q2_amr.amrfinderplus.types import *``; each entry
# is one of the format classes imported from ``_format`` above.
__all__ = [
"AMRFinderPlusDatabaseDirectoryFormat",
"ARMFinderPlusAnnotationFormat",
"TextFormat",
"BinaryFormat",
]
44 changes: 44 additions & 0 deletions q2_amr/amrfinderplus/types/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import pandas as pd
from q2_types.feature_data import MixedCaseDNAFASTAFormat, ProteinFASTAFormat
from qiime2.core.exceptions import ValidationError
from qiime2.plugin import model


Expand Down Expand Up @@ -345,3 +347,45 @@ class AMRFinderPlusDatabaseDirectoryFormat(model.DirectoryFormat):
"AMR_DNA-Streptococcus_pneumoniae.tab", format=TextFormat
)
AMR_DNA_Escherichia_tab = model.File("AMR_DNA-Escherichia.tab", format=TextFormat)


class ARMFinderPlusAnnotationFormat(model.TextFileFormat):
    """Tab-separated annotation table identified by its header row.

    Only the header line is inspected. Two column layouts are accepted:

    * the full list in ``header_coordinates`` below, or
    * the same list without ``Contig id``, ``Start``, ``Stop`` and
      ``Strand``.

    Column order matters: the observed header must equal one of the two
    lists exactly.
    """

    def _validate(self, n_records=None):
        """Check the header row against the two accepted layouts.

        Parameters
        ----------
        n_records : optional
            Accepted for interface compatibility; not used, because no data
            rows are read.

        Raises
        ------
        ValidationError
            If the header matches neither accepted layout.
        """
        header_coordinates = [
            "Protein identifier",
            "Contig id",
            "Start",
            "Stop",
            "Strand",
            "Gene symbol",
            "Sequence name",
            "Scope",
            "Element type",
            "Element subtype",
            "Class",
            "Subclass",
            "Method",
            "Target length",
            "Reference sequence length",
            "% Coverage of reference sequence",
            "% Identity to reference sequence",
            "Alignment length",
            "Accession of closest sequence",
            "Name of closest sequence",
            "HMM id",
            "HMM description",
        ]
        # Layout without coordinates: keep "Protein identifier" and drop the
        # four columns at indices 1-4 (Contig id, Start, Stop, Strand).
        header = header_coordinates[:1] + header_coordinates[5:]

        # nrows=0 parses just the header line, so large tables stay cheap.
        # NOTE(review): for a completely empty file pandas raises
        # EmptyDataError here, which is not converted to ValidationError --
        # confirm whether empty files should be accepted or rejected.
        header_obs = pd.read_csv(str(self), sep="\t", nrows=0).columns.tolist()

        if header != header_obs and header_coordinates != header_obs:
            # The "optional." sentence already ends with a period; the next
            # fragment must not start with another one.
            raise ValidationError(
                "Header line does not match ARMFinderPlusAnnotation format. Must "
                "consist of the following values: "
                + ", ".join(header_coordinates)
                + ".\nWhile Contig id, Start, Stop and Strand are optional."
                + "\n\nFound instead: "
                + ", ".join(header_obs)
            )

    def _validate_(self, level):
        """Validation hook; ``level`` is ignored since only the header is read."""
        self._validate()
1 change: 1 addition & 0 deletions q2_amr/amrfinderplus/types/_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
from qiime2.core.type import SemanticType

# Semantic type declarations for the amrfinderplus subpackage.
AMRFinderPlusDatabase = SemanticType("AMRFinderPlusDatabase")
# NOTE(review): "ARM" looks like a transposition of "AMR" (compare the name on
# the line above) -- confirm before other code starts depending on this name.
ARMFinderPlusAnnotation = SemanticType("ARMFinderPlusAnnotation")
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Protein identifier Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description
aph3pp-Ib_partial_5p_neg aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIALP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase
blaOXA-436_partial blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description
NA contig01 101 958 + blaTEM-156 class A beta-lactamase TEM-156 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEX 286 286 100.00 100.00 286 WP_061158039.1 class A beta-lactamase TEM-156 NA NA
NA contig02 1 1191 + blaPDC PDC family class C beta-lactamase core AMR AMR BETA-LACTAM CEPHALOSPORIN BLASTX 397 397 100.00 99.75 397 WP_061189306.1 class C beta-lactamase PDC-114 NA NA
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Incorrect Header 1 Incorrect Header 2 Incorrect Header 3
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,71 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from qiime2.core.exceptions import ValidationError
from qiime2.plugin.testing import TestPluginBase

from q2_amr.amrfinderplus.types._format import AMRFinderPlusDatabaseDirectoryFormat
from q2_amr.amrfinderplus.types._format import (
AMRFinderPlusDatabaseDirectoryFormat,
ARMFinderPlusAnnotationFormat,
)


class TestAMRFinderPlusDatabaseTypesAndFormats(TestPluginBase):
class TestAMRFinderPlusTypesAndFormats(TestPluginBase):
    """Validation tests for the AMRFinderPlus database and annotation formats."""

    package = "q2_amr.amrfinderplus.types.tests"

    def test_amrfinderplus_database_directory_format_validate_positive(self):
        # A well-formed database directory must validate without raising.
        db_format = AMRFinderPlusDatabaseDirectoryFormat(
            self.get_data_path("database"), mode="r"
        )
        db_format.validate()

    def test_amrfinderplus_annotation_format_validate_positive(self):
        # Header without the Contig id / Start / Stop / Strand columns.
        filepath = self.get_data_path("annotation/amr_annotation.tsv")
        annotation_format = ARMFinderPlusAnnotationFormat(filepath, mode="r")
        annotation_format.validate()

    def test_amrfinderplus_annotation_format_validate_positive_coordinates(self):
        # Header including the coordinate columns.
        # NOTE(review): "coordiantes" is kept as-is because it has to match
        # the fixture's file name on disk -- confirm the actual spelling.
        filepath = self.get_data_path("annotation/amr_annotation_coordiantes.tsv")
        annotation_format = ARMFinderPlusAnnotationFormat(filepath, mode="r")
        annotation_format.validate()

    def test_amrfinderplus_annotation_format_validation_error(self):
        # Build the object outside the assertRaises block so that only the
        # validate() call itself is expected to raise.
        path = self.get_data_path("annotation/amr_annotation_wrong.tsv")
        annotation_format = ARMFinderPlusAnnotationFormat(path, mode="r")
        with self.assertRaises(ValidationError) as context:
            annotation_format.validate()

        header_coordinates = [
            "Protein identifier",
            "Contig id",
            "Start",
            "Stop",
            "Strand",
            "Gene symbol",
            "Sequence name",
            "Scope",
            "Element type",
            "Element subtype",
            "Class",
            "Subclass",
            "Method",
            "Target length",
            "Reference sequence length",
            "% Coverage of reference sequence",
            "% Identity to reference sequence",
            "Alignment length",
            "Accession of closest sequence",
            "Name of closest sequence",
            "HMM id",
            "HMM description",
        ]
        expected_message = (
            "Header line does not match ARMFinderPlusAnnotation format. Must "
            "consist of the following values: "
            + ", ".join(header_coordinates)
            + ".\nWhile Contig id, Start, Stop and Strand are optional."
            + "\n\nFound instead: "
            + "Incorrect Header 1, Incorrect Header 2, Incorrect Header 3"
        )

        # Substring check rather than equality: the public validate() may
        # wrap the format's message with extra context (e.g. path and format
        # name), and assertIn still passes when the two strings are equal.
        self.assertIn(expected_message, str(context.exception))
2 changes: 2 additions & 0 deletions q2_amr/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from q2_amr import __version__
from q2_amr.amrfinderplus.types._format import (
AMRFinderPlusDatabaseDirectoryFormat,
ARMFinderPlusAnnotationFormat,
BinaryFormat,
TextFormat,
)
Expand Down Expand Up @@ -1145,6 +1146,7 @@
AMRFinderPlusDatabaseDirectoryFormat,
TextFormat,
BinaryFormat,
ARMFinderPlusAnnotationFormat,
)

importlib.import_module("q2_amr.card.types._transformer")

0 comments on commit 0670d7e

Please sign in to comment.