def annotation_dict(self, relative=False):
    """
    Map annotation IDs to the files stored in this directory format.

    Every entry directly below ``self.path`` is visited. A regular file is
    recorded as ``{id: filepath}``. A subdirectory (per sample layout) is
    recorded as ``{directory name: {id: filepath}}`` for the files it
    contains. IDs come from ``_create_path``, which removes the suffixes
    "_amr_annotations" and "_amr_all_mutations" from the file names.

    Parameters
    ----------
    relative : bool
        Whether to return filepaths relative to the directory's location.
        Returns absolute filepaths by default.

    Returns
    -------
    dict
        Mapping of id -> filepath for files, or mapping of
        sample id -> dict {id: filepath} for per sample directories.
        Both levels of the dictionary are sorted alphabetically by key.
    """
    mapping = {}
    for entry in self.path.iterdir():
        if not entry.is_dir():
            # Flat layout: the file is registered directly under its ID.
            location, file_id = _create_path(
                path=entry, relative=relative, dir_format=self
            )
            mapping[file_id] = str(location)
            continue

        # Per sample layout: the directory name is the outer key. The entry
        # is created even when the directory turns out to be empty.
        per_sample = mapping.setdefault(entry.name, {})
        for child in entry.iterdir():
            location, file_id = _create_path(
                path=child, relative=relative, dir_format=self
            )
            per_sample[file_id] = str(location)
        mapping[entry.name] = dict(sorted(per_sample.items()))

    return dict(sorted(mapping.items()))
def _create_path(path, relative, dir_format):
    """
    Build the path string and the sample or MAG ID for one annotation file.

    The ID is the file name without its extension and without a trailing
    "_amr_annotations" or "_amr_all_mutations" suffix. A file name that
    carries neither suffix is returned as the ID unchanged instead of being
    truncated by a fixed number of characters. The created path and ID are
    used to build the annotation_dict that maps IDs to filepaths.

    Parameters
    ----------
    path : Path
        A Path object representing the file path to process.
    relative : bool
        Whether the returned path should be relative to the directory
        format's path (True) or absolute (False).
    dir_format : AMRFinderPlusAnnotationsDirFmt
        The directory format object; only its ``path`` attribute is read.

    Returns
    -------
    file_path : str
        The relative or absolute path to the file.
    _id : str
        The sample or MAG ID derived from the file name.

    Raises
    ------
    ValueError
        If ``relative`` is True and ``path`` is not located inside
        ``dir_format.path`` (raised by ``Path.relative_to``).
    """
    _id = path.stem

    # Strip only a suffix that is really present; the slice length is derived
    # from the suffix itself so the two can never get out of sync.
    for suffix in ("_amr_annotations", "_amr_all_mutations"):
        if _id.endswith(suffix):
            _id = _id[: -len(suffix)]
            break

    file_path = path.absolute()
    if relative:
        file_path = file_path.relative_to(dir_format.path.absolute())

    return str(file_path), _id
def test__create_path_annotations_absolute(self):
    # Absolute mode: the returned path is the full file path as a string.
    fmt = AMRFinderPlusAnnotationsDirFmt(
        self.get_data_path("annotation/coordinates"), "r"
    )
    target = fmt.path / "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv"

    obs_path, obs_id = _create_path(path=target, relative=False, dir_format=fmt)

    self.assertEqual(obs_path, str(target))
    self.assertEqual(obs_id, "e026af61-d911-4de3-a957-7e8bf837f30d")


def test__create_path_all_mutations_relative(self):
    # Relative mode: the returned path is relative to the directory format.
    fmt = AMRFinderPlusAnnotationsDirFmt(
        self.get_data_path("all_mutations/coordinates"), "r"
    )
    target = fmt.path / "e026af61-d911-4de3-a957-7e8bf837f30d_amr_all_mutations.tsv"

    obs_path, obs_id = _create_path(path=target, relative=True, dir_format=fmt)

    self.assertEqual(
        obs_path, "e026af61-d911-4de3-a957-7e8bf837f30d_amr_all_mutations.tsv"
    )
    self.assertEqual(obs_id, "e026af61-d911-4de3-a957-7e8bf837f30d")


def test_amrfinderplus_annotation_dirfmt_samples_annotation_dict(self):
    # Per sample layout: the outer keys are the subdirectory names.
    annotations = AMRFinderPlusAnnotationsDirFmt(
        self.get_data_path("annotation"), mode="r"
    )
    root = str(annotations)

    coordinates_file = os.path.join(
        root,
        "coordinates/"
        "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv",
    )
    no_coordinates_file = os.path.join(
        root,
        "no_coordinates/"
        "aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv",
    )
    exp = {
        "coordinates": {
            "e026af61-d911-4de3-a957-7e8bf837f30d": coordinates_file,
        },
        "no_coordinates": {
            "aa447c99-ecd9-4c4a-a53b-4df6999815dd": no_coordinates_file,
        },
    }

    obs = annotations.annotation_dict()

    self.assertDictEqual(obs, exp)


def test_amrfinderplus_annotation_dirfmt_annotation_dict(self):
    # Flat layout: the IDs map directly to the file paths.
    annotations = AMRFinderPlusAnnotationsDirFmt(
        self.get_data_path("annotation/coordinates"), mode="r"
    )
    exp = {
        "e026af61-d911-4de3-a957-7e8bf837f30d": os.path.join(
            str(annotations),
            "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv",
        )
    }

    obs = annotations.annotation_dict()

    self.assertDictEqual(obs, exp)