bokulich-lab · VinzentRisch · Oct 11, 2024 · Oct 11, 2024 · Oct 11, 2024 · Oct 11, 2024
diff --git a/q2_amrfinderplus/types/_format.py b/q2_amrfinderplus/types/_format.py
@@ -6,6 +6,7 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
+from collections import defaultdict
 
 import pandas as pd
 from q2_types.feature_data import MixedCaseDNAFASTAFormat, ProteinFASTAFormat
@@ -115,6 +116,66 @@ class AMRFinderPlusAnnotationsDirFmt(model.DirectoryFormat):
         r".*amr_(annotations|all_mutations)\.tsv$", format=AMRFinderPlusAnnotationFormat
     )
 
+    def annotation_dict(self, relative=False):
+        """
+        Collect the annotation files stored in this directory format.
+
+        Every entry directly inside the directory is inspected. A
+        subdirectory is treated as one sample: its name becomes the outer
+        key and its files are gathered in a nested dictionary. A plain
+        file is added to the top level of the result instead.
+        Keys are the file names with the "_amr_annotations" or
+        "_amr_all_mutations" suffix stripped by ``_create_path``.
+
+        Parameters
+        ----------
+        relative : bool
+            If True, filepaths are given relative to the directory's
+            location. Absolute filepaths are returned by default.
+
+        Returns
+        -------
+        dict
+            Either filename -> filepath, or
+            sample id -> {filename: filepath}, depending on the layout.
+            Both levels are sorted alphabetically by key.
+        """
+        mapping = defaultdict(dict)
+        for item in self.path.iterdir():
+            if not item.is_dir():
+                location, key = _create_path(
+                    path=item, relative=relative, dir_format=self
+                )
+                mapping[key] = str(location)
+                continue
+
+            sample_id = item.name
+            for child in item.iterdir():
+                location, key = _create_path(
+                    path=child, relative=relative, dir_format=self
+                )
+                mapping[sample_id][key] = str(location)
+            mapping[sample_id] = dict(sorted(mapping[sample_id].items()))
+
+        return dict(sorted(mapping.items()))
+
     @annotations.set_path_maker
     def annotations_path_maker(self, name, id, dir_name=""):
         return os.path.join(dir_name, f"{id}_amr_{name}.tsv")
+
+
+def _create_path(path, relative, dir_format):
+    """Return ``(filepath, id)`` for one annotation file of ``dir_format``.
+
+    The id is the stem minus "_amr_annotations"/"_amr_all_mutations" (kept
+    whole if neither matches); filepath is a str, relative if ``relative``.
+    """
+    _id = path.stem
+    for suffix in ("_amr_annotations", "_amr_all_mutations"):
+        if _id.endswith(suffix):
+            _id = _id[: -len(suffix)]
+            break
+    file_path = path.absolute()
+    if relative:
+        file_path = file_path.relative_to(dir_format.path.absolute())
+    return str(file_path), _id
diff --git a/...data/all_mutations/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d_amr_all_mutations.tsv b/...data/all_mutations/coordinates/e026af61-d911-4de3-a957-7e8bf837f30d_amr_all_mutations.tsv
@@ -0,0 +1,3 @@
+Protein identifier	Gene symbol	Sequence name	Scope	Element type	Element subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference sequence	% Identity to reference sequence	Alignment length	Accession of closest sequence	Name of closest sequence	HMM id	HMM description	Hierarchy node
+aph3pp-Ib_partial_5p_neg	aph(3'')-Ib	aminoglycoside O-phosphotransferase APH(3'')-Ib	core	AMR	AMR	AMINOGLYCOSIDE	STREPTOMYCIN	PARTIALP	225	267	81.27	100.00	217	WP_001082319.1	aminoglycoside O-phosphotransferase APH(3'')-Ib	NF032896.1	APH(3'') family aminoglycoside O-phosphotransferase	aph(3'')-Ib
+blaOXA-436_partial	blaOXA	OXA-48 family class D beta-lactamase	core	AMR	AMR	BETA-LACTAM	BETA-LACTAM	PARTIALP	233	265	87.92	100.00	233	WP_058842180.1	OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436	NF012161.0	class D beta-lactamase	blaOXA-48_fam
diff --git a/...a/all_mutations/no_coordinates/aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_all_mutations.tsv b/...a/all_mutations/no_coordinates/aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_all_mutations.tsv
@@ -0,0 +1,3 @@
+Protein identifier	Gene symbol	Sequence name	Scope	Element type	Element subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference sequence	% Identity to reference sequence	Alignment length	Accession of closest sequence	Name of closest sequence	HMM id	HMM description	Hierarchy node
+aph3pp-Ib_partial_5p_neg	aph(3'')-Ib	aminoglycoside O-phosphotransferase APH(3'')-Ib	core	AMR	AMR	AMINOGLYCOSIDE	STREPTOMYCIN	PARTIALP	225	267	81.27	100.00	217	WP_001082319.1	aminoglycoside O-phosphotransferase APH(3'')-Ib	NF032896.1	APH(3'') family aminoglycoside O-phosphotransferase	aph(3'')-Ib
+blaOXA-436_partial	blaOXA	OXA-48 family class D beta-lactamase	core	AMR	AMR	BETA-LACTAM	BETA-LACTAM	PARTIALP	233	265	87.92	100.00	233	WP_058842180.1	OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436	NF012161.0	class D beta-lactamase	blaOXA-48_fam
diff --git a/q2_amrfinderplus/types/tests/test_types_formats_transformers.py b/q2_amrfinderplus/types/tests/test_types_formats_transformers.py
@@ -15,6 +15,7 @@
     AMRFinderPlusAnnotationFormat,
     AMRFinderPlusAnnotationsDirFmt,
     AMRFinderPlusDatabaseDirFmt,
+    _create_path,
 )
 
 
@@ -114,3 +115,69 @@ def test_amrfinderplus_annotations_dir_fmt_path_maker(self):
         fmt = AMRFinderPlusAnnotationsDirFmt()
         path = fmt.annotations_path_maker(name="annotations", id="id")
         self.assertEqual(str(path), os.path.join(str(fmt), "id_amr_annotations.tsv"))
+
+    def test__create_path_annotations_absolute(self):
+        """Absolute mode: full filepath as str plus the suffix-free id."""
+        fmt = AMRFinderPlusAnnotationsDirFmt(
+            self.get_data_path("annotation/coordinates"), "r"
+        )
+        tsv = fmt.path.joinpath(
+            "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv"
+        )
+        obs_path, obs_id = _create_path(path=tsv, relative=False, dir_format=fmt)
+        self.assertEqual(obs_id, "e026af61-d911-4de3-a957-7e8bf837f30d")
+        self.assertEqual(obs_path, str(tsv))
+
+    def test__create_path_all_mutations_relative(self):
+        """Relative mode: filepath relative to the format dir plus the id."""
+        fmt = AMRFinderPlusAnnotationsDirFmt(
+            self.get_data_path("all_mutations/coordinates"), "r"
+        )
+        tsv = fmt.path.joinpath(
+            "e026af61-d911-4de3-a957-7e8bf837f30d_amr_all_mutations.tsv"
+        )
+        obs_path, obs_id = _create_path(path=tsv, relative=True, dir_format=fmt)
+
+        self.assertEqual(obs_id, "e026af61-d911-4de3-a957-7e8bf837f30d")
+        self.assertEqual(
+            obs_path, "e026af61-d911-4de3-a957-7e8bf837f30d_amr_all_mutations.tsv"
+        )
+
+    def test_amrfinderplus_annotation_dirfmt_samples_annotation_dict(self):
+        """Per-sample layout maps sample id -> {file id: path in format dir}."""
+        fmt = AMRFinderPlusAnnotationsDirFmt(
+            self.get_data_path("annotation"), mode="r"
+        )
+        root = str(fmt)
+        coordinates_path = os.path.join(
+            root,
+            "coordinates/"
+            "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv",
+        )
+        no_coordinates_path = os.path.join(
+            root,
+            "no_coordinates/"
+            "aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv",
+        )
+        exp = {
+            "coordinates": {"e026af61-d911-4de3-a957-7e8bf837f30d": coordinates_path},
+            "no_coordinates": {
+                "aa447c99-ecd9-4c4a-a53b-4df6999815dd": no_coordinates_path
+            },
+        }
+        self.assertDictEqual(fmt.annotation_dict(), exp)
+
+    def test_amrfinderplus_annotation_dirfmt_annotation_dict(self):
+        """Flat layout maps each file id directly to its path."""
+        fmt = AMRFinderPlusAnnotationsDirFmt(
+            self.get_data_path("annotation/coordinates"), mode="r"
+        )
+        tsv_path = os.path.join(
+            str(fmt),
+            "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv",
+        )
+
+        self.assertDictEqual(
+            fmt.annotation_dict(),
+            {"e026af61-d911-4de3-a957-7e8bf837f30d": tsv_path},
+        )