Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Added annotation_dict to AMRFinderPlusAnnotationsDirFmt #7

Merged
merged 4 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions q2_amrfinderplus/types/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
from collections import defaultdict

import pandas as pd
from q2_types.feature_data import MixedCaseDNAFASTAFormat, ProteinFASTAFormat
Expand Down Expand Up @@ -115,6 +116,66 @@ class AMRFinderPlusAnnotationsDirFmt(model.DirectoryFormat):
r".*amr_(annotations|all_mutations)\.tsv$", format=AMRFinderPlusAnnotationFormat
)

def annotation_dict(self, relative=False):
    """
    Collect the annotation files of this directory format in a dictionary.

    Entries directly inside the directory that are files are mapped as
    id -> filepath. Entries that are directories are mapped as
    directory name -> {id -> filepath} for every file they contain.
    The ids are produced by ``_create_path`` from the file names.

    Parameters
    ----------
    relative : bool
        Passed on to ``_create_path``; whether filepaths should be relative
        to the directory's location. Absolute filepaths by default.

    Returns
    -------
    dict
        Mapping of id -> filepath, or mapping of
        directory name -> dict {id: filepath}.
        Both levels of the dictionary are sorted by key.
    """
    mapping = defaultdict(dict)

    for item in self.path.iterdir():
        # Plain files are registered directly at the top level.
        if not item.is_dir():
            location, file_id = _create_path(
                path=item, relative=relative, dir_format=self
            )
            mapping[file_id] = str(location)
            continue

        # Directories get their own nested, key-sorted dictionary.
        sample_id = item.name
        for child in item.iterdir():
            location, file_id = _create_path(
                path=child, relative=relative, dir_format=self
            )
            mapping[sample_id][file_id] = str(location)
        mapping[sample_id] = dict(sorted(mapping[sample_id].items()))

    return dict(sorted(mapping.items()))

@annotations.set_path_maker
def annotations_path_maker(self, name, id, dir_name=""):
    """Return ``dir_name`` joined with the file name ``<id>_amr_<name>.tsv``."""
    file_name = f"{id}_amr_{name}.tsv"
    return os.path.join(dir_name, file_name)


def _create_path(path, relative, dir_format):
    """
    Derive the id and the filepath string for a single annotation file.

    The id is the file name without its extension and without a trailing
    "_amr_annotations" or "_amr_all_mutations" suffix. A file name that
    carries neither suffix is used unchanged as the id.

    Parameters
    ----------
    path : pathlib.Path
        Path to the file.
    relative : bool
        If True, the returned filepath is relative to ``dir_format.path``,
        otherwise it is absolute.
    dir_format : object
        Directory format whose ``path`` attribute is the base location used
        for relative filepaths.

    Returns
    -------
    tuple of (str, str)
        The filepath as a string and the id derived from the file name.

    Raises
    ------
    ValueError
        If ``relative`` is True and ``path`` is not located inside
        ``dir_format.path`` (raised by ``pathlib``'s ``relative_to``).
    """
    file_name = path.stem

    # Strip only a suffix that is actually present; the slice length is taken
    # from the suffix itself so an unexpected file name is never truncated.
    _id = file_name
    for suffix in ("_amr_annotations", "_amr_all_mutations"):
        if file_name.endswith(suffix):
            _id = file_name[: -len(suffix)]
            break

    absolute_path = path.absolute()
    if relative:
        file_path = absolute_path.relative_to(dir_format.path.absolute())
    else:
        file_path = absolute_path

    return str(file_path), _id
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Protein identifier Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description Hierarchy node
aph3pp-Ib_partial_5p_neg aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIALP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib
blaOXA-436_partial blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Protein identifier Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description Hierarchy node
aph3pp-Ib_partial_5p_neg aph(3'')-Ib aminoglycoside O-phosphotransferase APH(3'')-Ib core AMR AMR AMINOGLYCOSIDE STREPTOMYCIN PARTIALP 225 267 81.27 100.00 217 WP_001082319.1 aminoglycoside O-phosphotransferase APH(3'')-Ib NF032896.1 APH(3'') family aminoglycoside O-phosphotransferase aph(3'')-Ib
blaOXA-436_partial blaOXA OXA-48 family class D beta-lactamase core AMR AMR BETA-LACTAM BETA-LACTAM PARTIALP 233 265 87.92 100.00 233 WP_058842180.1 OXA-48 family carbapenem-hydrolyzing class D beta-lactamase OXA-436 NF012161.0 class D beta-lactamase blaOXA-48_fam
67 changes: 67 additions & 0 deletions q2_amrfinderplus/types/tests/test_types_formats_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
AMRFinderPlusAnnotationFormat,
AMRFinderPlusAnnotationsDirFmt,
AMRFinderPlusDatabaseDirFmt,
_create_path,
)


Expand Down Expand Up @@ -114,3 +115,69 @@ def test_amrfinderplus_annotations_dir_fmt_path_maker(self):
fmt = AMRFinderPlusAnnotationsDirFmt()
path = fmt.annotations_path_maker(name="annotations", id="id")
self.assertEqual(str(path), os.path.join(str(fmt), "id_amr_annotations.tsv"))

def test__create_path_annotations_absolute(self):
    """With relative=False the full file path and the bare id are returned."""
    fmt = AMRFinderPlusAnnotationsDirFmt(
        self.get_data_path("annotation/coordinates"), "r"
    )
    annotation_file = (
        fmt.path / "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv"
    )

    obs_path, obs_id = _create_path(
        path=annotation_file, relative=False, dir_format=fmt
    )

    self.assertEqual(obs_path, str(annotation_file))
    self.assertEqual(obs_id, "e026af61-d911-4de3-a957-7e8bf837f30d")

def test__create_path_all_mutations_relative(self):
    """With relative=True only the file name and the bare id are returned."""
    fmt = AMRFinderPlusAnnotationsDirFmt(
        self.get_data_path("all_mutations/coordinates"), "r"
    )
    mutations_file = (
        fmt.path / "e026af61-d911-4de3-a957-7e8bf837f30d_amr_all_mutations.tsv"
    )

    obs_path, obs_id = _create_path(
        path=mutations_file, relative=True, dir_format=fmt
    )

    self.assertEqual(
        obs_path, "e026af61-d911-4de3-a957-7e8bf837f30d_amr_all_mutations.tsv"
    )
    self.assertEqual(obs_id, "e026af61-d911-4de3-a957-7e8bf837f30d")

def test_amrfinderplus_annotation_dirfmt_samples_annotation_dict(self):
    """annotation_dict nests {id: filepath} under each sub-directory name."""
    dir_fmt = AMRFinderPlusAnnotationsDirFmt(
        self.get_data_path("annotation"), mode="r"
    )

    observed = dir_fmt.annotation_dict()

    root = str(dir_fmt)
    coordinates_file = os.path.join(
        root,
        "coordinates/"
        "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv",
    )
    no_coordinates_file = os.path.join(
        root,
        "no_coordinates/"
        "aa447c99-ecd9-4c4a-a53b-4df6999815dd_amr_annotations.tsv",
    )
    expected = {
        "coordinates": {
            "e026af61-d911-4de3-a957-7e8bf837f30d": coordinates_file,
        },
        "no_coordinates": {
            "aa447c99-ecd9-4c4a-a53b-4df6999815dd": no_coordinates_file,
        },
    }

    self.assertDictEqual(observed, expected)

def test_amrfinderplus_annotation_dirfmt_annotation_dict(self):
    """annotation_dict maps id to filepath for a directory holding only files."""
    dir_fmt = AMRFinderPlusAnnotationsDirFmt(
        self.get_data_path("annotation/coordinates"), mode="r"
    )

    observed = dir_fmt.annotation_dict()

    annotation_file = os.path.join(
        str(dir_fmt),
        "e026af61-d911-4de3-a957-7e8bf837f30d_amr_annotations.tsv",
    )
    expected = {"e026af61-d911-4de3-a957-7e8bf837f30d": annotation_file}

    self.assertDictEqual(observed, expected)
Loading