Skip to content

Commit

Permalink
added several types and formats for kmer analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
VinzentRisch committed Feb 13, 2024
1 parent db8b5ee commit ce33e1e
Show file tree
Hide file tree
Showing 11 changed files with 262 additions and 32 deletions.
34 changes: 28 additions & 6 deletions q2_amr/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,12 @@
CARDKmerTXTFormat,
CARDMAGsKmerAnalysisDirectoryFormat,
CARDMAGsKmerAnalysisFormat,
CARDReadsKmerAnalysisDirectoryFormat,
CARDReadsKmerAnalysisFormat,
CARDMAGsKmerAnalysisJSONFormat,
CARDReadsAlleleKmerAnalysisDirectoryFormat,
CARDReadsAlleleKmerAnalysisFormat,
CARDReadsGeneKmerAnalysisDirectoryFormat,
CARDReadsGeneKmerAnalysisFormat,
CARDReadsKmerAnalysisJSONFormat,
CARDWildcardIndexFormat,
GapDNAFASTAFormat,
)
Expand All @@ -51,6 +55,9 @@
CARDAnnotation,
CARDGeneAnnotation,
CARDKmerDatabase,
CARDMAGsKmerAnalysis,
CARDReadsAlleleKmerAnalysis,
CARDReadsGeneKmerAnalysis,
)

citations = Citations.load("citations.bib", package="q2_amr")
Expand Down Expand Up @@ -243,15 +250,23 @@
plugin.register_semantic_type_to_format(
SampleData[CARDGeneAnnotation], artifact_format=CARDGeneAnnotationDirectoryFormat
)

# Bind each SampleData-wrapped kmer-analysis semantic type (gene-level reads,
# allele-level reads, MAGs) to the directory format used as its artifact format.
plugin.register_semantic_type_to_format(
SampleData[CARDReadsGeneKmerAnalysis],
artifact_format=CARDReadsGeneKmerAnalysisDirectoryFormat,
)
plugin.register_semantic_type_to_format(
SampleData[CARDReadsAlleleKmerAnalysis],
artifact_format=CARDReadsAlleleKmerAnalysisDirectoryFormat,
)
plugin.register_semantic_type_to_format(
SampleData[CARDMAGsKmerAnalysis],
artifact_format=CARDMAGsKmerAnalysisDirectoryFormat,
)
plugin.register_formats(
CARDKmerDatabaseDirectoryFormat,
CARDKmerJSONFormat,
CARDKmerTXTFormat,
CARDMAGsKmerAnalysisFormat,
CARDMAGsKmerAnalysisDirectoryFormat,
CARDReadsKmerAnalysisFormat,
CARDReadsKmerAnalysisDirectoryFormat,
GapDNAFASTAFormat,
CARDWildcardIndexFormat,
CARDAnnotationTXTFormat,
Expand All @@ -264,6 +279,13 @@
CARDAnnotationStatsFormat,
CARDAlleleAnnotationDirectoryFormat,
CARDGeneAnnotationDirectoryFormat,
CARDMAGsKmerAnalysisFormat,
CARDMAGsKmerAnalysisJSONFormat,
CARDReadsAlleleKmerAnalysisFormat,
CARDReadsGeneKmerAnalysisFormat,
CARDReadsKmerAnalysisJSONFormat,
CARDReadsGeneKmerAnalysisDirectoryFormat,
CARDReadsAlleleKmerAnalysisDirectoryFormat,
)

importlib.import_module("q2_amr.types._transformer")
16 changes: 12 additions & 4 deletions q2_amr/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,12 @@
CARDKmerTXTFormat,
CARDMAGsKmerAnalysisDirectoryFormat,
CARDMAGsKmerAnalysisFormat,
CARDReadsKmerAnalysisDirectoryFormat,
CARDReadsKmerAnalysisFormat,
CARDMAGsKmerAnalysisJSONFormat,
CARDReadsAlleleKmerAnalysisDirectoryFormat,
CARDReadsAlleleKmerAnalysisFormat,
CARDReadsGeneKmerAnalysisDirectoryFormat,
CARDReadsGeneKmerAnalysisFormat,
CARDReadsKmerAnalysisJSONFormat,
CARDWildcardIndexFormat,
GapDNAFASTAFormat,
)
Expand Down Expand Up @@ -57,7 +61,11 @@
"CARDWildcardIndexFormat",
"CARDKmerDatabase",
"CARDMAGsKmerAnalysisFormat",
"CARDMAGsKmerAnalysisJSONFormat",
"CARDMAGsKmerAnalysisDirectoryFormat",
"CARDReadsKmerAnalysisFormat",
"CARDReadsKmerAnalysisDirectoryFormat",
"CARDReadsAlleleKmerAnalysisFormat",
"CARDReadsGeneKmerAnalysisFormat",
"CARDReadsKmerAnalysisJSONFormat",
"CARDReadsGeneKmerAnalysisDirectoryFormat",
"CARDReadsAlleleKmerAnalysisDirectoryFormat",
]
161 changes: 148 additions & 13 deletions q2_amr/types/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,34 +454,119 @@ def _validate_(self, level):
self._validate()


class CARDMAGsKmerAnalysisJSONFormat(model.TextFileFormat):
    """JSON file holding per-record kmer analysis results for MAGs.

    The file must contain a non-empty JSON object whose values are themselves
    objects (one per record). Validation inspects the keys of the first
    record only.
    """

    def _validate(self, n_records=None):
        """Check that the first record carries exactly the expected keys.

        Parameters
        ----------
        n_records : int, optional
            Accepted for interface compatibility; it is not used, only the
            first record is inspected.

        Raises
        ------
        ValidationError
            If the file is not valid UTF-8 JSON, if the top level is not a
            non-empty JSON object, if the first record is not a JSON object,
            or if the first record's keys differ from the expected keys.
        """
        keys_exp = [
            "ORF",
            "contig",
            "HSP",
            "ARO_model",
            "type_hit",
            "#_of_kmers_in_sequence",
            "#_of_AMR_kmers",
            "taxonomic_info",
            "genomic_info",
        ]

        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; report either one as a format violation instead of
        # letting a raw parser error escape from the validator.
        try:
            with open(str(self), encoding="utf-8") as json_file:
                records = json.load(json_file)
        except ValueError as err:
            raise ValidationError(
                "File is not valid JSON and therefore does not match "
                "CARDMAGsKmerAnalysisJSONFormat format: " + str(err)
            ) from err

        # An empty object has no first record to inspect; previously this
        # surfaced as a bare StopIteration (AttributeError for non-objects).
        if not isinstance(records, dict) or not records:
            raise ValidationError(
                "File does not match CARDMAGsKmerAnalysisJSONFormat format. "
                "Must be a non-empty JSON object with one entry per record."
            )

        first_record = next(iter(records.values()))
        if not isinstance(first_record, dict):
            raise ValidationError(
                "File does not match CARDMAGsKmerAnalysisJSONFormat format. "
                "Every record must be a JSON object."
            )

        keys_obs = list(first_record)

        # JSON objects are unordered, so compare the keys as sets rather than
        # requiring one particular serialization order.
        if set(keys_obs) != set(keys_exp):
            raise ValidationError(
                "Keys do not match CARDMAGsKmerAnalysisJSONFormat format. Must consist "
                "of the following values: "
                + ", ".join(keys_exp)
                + ".\n\nFound instead: "
                + ", ".join(keys_obs)
            )

    def _validate_(self, level):
        """Run the key check; ``level`` does not change what is checked."""
        self._validate()


class CARDMAGsKmerAnalysisDirectoryFormat(
MultiDirValidationMixin, model.DirectoryFormat
):
txt = model.FileCollection(
r"\d+mer_analysis_mags.txt$", format=CARDMAGsKmerAnalysisFormat
r".+\d+mer_analysis_rgi_summary\.txt$", format=CARDMAGsKmerAnalysisFormat
)
json = model.FileCollection(
r".+\d+mer_analysis\.json$", format=CARDMAGsKmerAnalysisJSONFormat
)

@txt.set_path_maker
def txt_path_maker(self, sample_id, bin_id):
pattern = r"\d+mer_analysis_mags.txt$"
pattern = r"\d+mer_analysis_rgi_summary\.txt$"
return f"{sample_id}/{bin_id}/{pattern}"

@json.set_path_maker
def json_path_maker(self, sample_id, bin_id):
pattern = r"\d+mer_analysis_mags\.json$"
return f"{sample_id}/{bin_id}/{pattern}"

class CARDReadsKmerAnalysisFormat(model.TextFileFormat):

class CARDReadsGeneKmerAnalysisFormat(model.TextFileFormat):
def _validate(self, n_records=None):
header_exp = [
"Reference Sequence / ARO term",
"ARO term",
"Mapped reads with kmer DB hits",
"CARD kmer Prediction",
"Subsequent fields",
"CARD*kmer Prediction",
"Single species (chromosome) reads",
"Single species (chromosome or plasmid) reads",
"Single species (plasmid) reads",
"Single species (no genomic info) reads",
"Single genus (chromosome) reads",
"Single genus (chromosome or plasmid) reads",
"Single genus (plasmid) reads",
"Single genus (no genomic info) reads",
"Promiscuous plasmid reads",
"Unknown taxonomy (chromosome) reads",
"Unknown taxonomy (chromosome or plasmid) reads",
"Unknown taxonomy (no genomic info) reads",
]

df = pd.read_csv(str(self), sep="\t")
header_obs = list(df.columns)
if not set(header_exp).issubset(set(header_obs)):
raise ValidationError(
"Header line does not match CARDReadsKmerAnalysisFormat. Must contain"
"the following values: "
"Header line does not match CARDReadsGeneKmerAnalysisFormat. Must "
"contain the following values: "
+ ", ".join(header_exp)
+ ".\n\nFound instead: "
+ ", ".join(header_obs)
)

def _validate_(self, level):
self._validate()


class CARDReadsAlleleKmerAnalysisFormat(model.TextFileFormat):
def _validate(self, n_records=None):
header_exp = [
"Reference Sequence",
"Mapped reads with kmer DB hits",
"CARD*kmer Prediction",
"Single species (chromosome) reads",
"Single species (chromosome or plasmid) reads",
"Single species (plasmid) reads",
"Single species (no genomic info) reads",
"Single genus (chromosome) reads",
"Single genus (chromosome or plasmid) reads",
"Single genus (plasmid) reads",
"Single genus (no genomic info) reads",
"Promiscuous plasmid reads",
"Unknown taxonomy (chromosome) reads",
"Unknown taxonomy (chromosome or plasmid) reads",
"Unknown taxonomy (no genomic info) reads",
]

df = pd.read_csv(str(self), sep="\t")
header_obs = list(df.columns)
if not set(header_exp).issubset(set(header_obs)):
raise ValidationError(
"Header line does not match CARDReadsAlleleKmerAnalysisFormat. Must "
"contain the following values: "
+ ", ".join(header_exp)
+ ".\n\nFound instead: "
+ ", ".join(header_obs)
Expand All @@ -491,14 +576,64 @@ def _validate_(self, level):
self._validate()


class CARDReadsKmerAnalysisDirectoryFormat(
class CARDReadsKmerAnalysisJSONFormat(model.TextFileFormat):
    """JSON file holding per-record kmer analysis results for reads.

    The file must contain a non-empty JSON object whose values are themselves
    objects (one per record). Validation inspects the keys of the first
    record only.
    """

    def _validate(self, n_records=None):
        """Check that the first record carries exactly the expected keys.

        Parameters
        ----------
        n_records : int, optional
            Accepted for interface compatibility; it is not used, only the
            first record is inspected.

        Raises
        ------
        ValidationError
            If the file is not valid UTF-8 JSON, if the top level is not a
            non-empty JSON object, if the first record is not a JSON object,
            or if the first record's keys differ from the expected keys.
        """
        keys_exp = [
            "reference",
            "#_of_kmers_in_sequence",
            "#_of_AMR_kmers",
            "SAM_flag",
            "MAPQ",
            "taxonomic_info",
            "genomic_info",
        ]

        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; report either one as a format violation instead of
        # letting a raw parser error escape from the validator.
        try:
            with open(str(self), encoding="utf-8") as json_file:
                records = json.load(json_file)
        except ValueError as err:
            raise ValidationError(
                "File is not valid JSON and therefore does not match "
                "CARDReadsKmerAnalysisJSONFormat format: " + str(err)
            ) from err

        # An empty object has no first record to inspect; previously this
        # surfaced as a bare StopIteration (AttributeError for non-objects).
        if not isinstance(records, dict) or not records:
            raise ValidationError(
                "File does not match CARDReadsKmerAnalysisJSONFormat format. "
                "Must be a non-empty JSON object with one entry per record."
            )

        first_record = next(iter(records.values()))
        if not isinstance(first_record, dict):
            raise ValidationError(
                "File does not match CARDReadsKmerAnalysisJSONFormat format. "
                "Every record must be a JSON object."
            )

        keys_obs = list(first_record)

        # JSON objects are unordered, so compare the keys as sets rather than
        # requiring one particular serialization order.
        if set(keys_obs) != set(keys_exp):
            raise ValidationError(
                "Keys do not match CARDReadsKmerAnalysisJSONFormat format. Must consist"
                " of the following values: "
                + ", ".join(keys_exp)
                + ".\n\nFound instead: "
                + ", ".join(keys_obs)
            )

    def _validate_(self, level):
        """Run the key check; ``level`` does not change what is checked."""
        self._validate()


class CARDReadsAlleleKmerAnalysisDirectoryFormat(
MultiDirValidationMixin, model.DirectoryFormat
):
txt = model.FileCollection(
r"\d+mer_analysis_analysis_reads.txt$", format=CARDReadsKmerAnalysisFormat
r".+\d+mer_analysis\.allele\.txt$", format=CARDReadsAlleleKmerAnalysisFormat
)
json = model.FileCollection(
r".+\d+mer_analysis\.json$", format=CARDReadsKmerAnalysisJSONFormat
)

@txt.set_path_maker
def txt_path_maker(self, sample_id, bin_id):
pattern = r"kmer_\d+mer_analysis_analysis_reads.txt$"
return f"{sample_id}/{bin_id}/{pattern}"
def txt_path_maker(self, sample_id):
pattern = r"\d+mer_analysis\.allele\.txt$"
return f"{sample_id}/{pattern}"

@json.set_path_maker
def json_path_maker(self, sample_id):
pattern = r"\d+mer_analysis\.json$"
return f"{sample_id}/{pattern}"


class CARDReadsGeneKmerAnalysisDirectoryFormat(
    MultiDirValidationMixin, model.DirectoryFormat
):
    """Directory format collecting gene-level reads kmer analysis tables.

    Members whose relative path matches the ``txt`` collection regex are
    treated as ``CARDReadsGeneKmerAnalysisFormat`` files.
    """

    txt = model.FileCollection(
        r".+\d+mer_analysis\.gene\.txt$",
        format=CARDReadsGeneKmerAnalysisFormat,
    )

    @txt.set_path_maker
    def txt_path_maker(self, sample_id):
        """Build the relative path of the gene-level table for ``sample_id``.

        NOTE(review): the file-name part returned here is the regex text
        itself, not a concrete file name -- confirm whether this path maker
        is ever used for writing.
        """
        # Sample directory first, then the file-name pattern appended verbatim.
        return f"{sample_id}/" + r"\d+mer_analysis\.gene\.txt$"
11 changes: 9 additions & 2 deletions q2_amr/types/_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,15 @@

CARDDatabase = SemanticType("CARDDatabase")
CARDKmerDatabase = SemanticType("CARDKmerDatabase")
CARDMAGsKmerAnalysis = SemanticType("CARDMAGsKmerAnalysis")
CARDReadsKmerAnalysis = SemanticType("CARDReadsKmerAnalysis")
CARDMAGsKmerAnalysis = SemanticType(
"CARDMAGsKmerAnalysis", variant_of=SampleData.field["type"]
)
CARDReadsAlleleKmerAnalysis = SemanticType(
"CARDReadsAlleleKmerAnalysis", variant_of=SampleData.field["type"]
)
CARDReadsGeneKmerAnalysis = SemanticType(
"CARDReadsGeneKmerAnalysis", variant_of=SampleData.field["type"]
)
CARDAnnotation = SemanticType("CARDAnnotation", variant_of=SampleData.field["type"])
CARDAlleleAnnotation = SemanticType(
"CARDAlleleAnnotation", variant_of=SampleData.field["type"]
Expand Down
3 changes: 3 additions & 0 deletions q2_amr/types/tests/data/61mer_analysis.allele.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Reference Sequence Mapped reads with kmer DB hits CARD*kmer Prediction Single species (chromosome) reads Single species (chromosome or plasmid) reads Single species (plasmid) reads Single species (no genomic info) reads Single genus (chromosome) reads Single genus (chromosome or plasmid) reads Single genus (plasmid) reads Single genus (no genomic info) reads Promiscuous plasmid reads Unknown taxonomy (chromosome) reads Unknown taxonomy (chromosome or plasmid) reads Unknown taxonomy (no genomic info) reads
ARO:3003550|ID:45|Name:mdtP|NCBI:AP009048.1 2 0 1 1 0
ARO:3000796|ID:121|Name:mdtF|NCBI:U00096.1 2 0 2 0 0
3 changes: 3 additions & 0 deletions q2_amr/types/tests/data/61mer_analysis.gene.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ARO term Mapped reads with kmer DB hits CARD*kmer Prediction Single species (chromosome) reads Single species (chromosome or plasmid) reads Single species (plasmid) reads Single species (no genomic info) reads Single genus (chromosome) reads Single genus (chromosome or plasmid) reads Single genus (plasmid) reads Single genus (no genomic info) reads Promiscuous plasmid reads Unknown taxonomy (chromosome) reads Unknown taxonomy (chromosome or plasmid) reads Unknown taxonomy (no genomic info) reads
mdtP 2 0 1 1 0
mdtF 2 0 2 0 0
2 changes: 0 additions & 2 deletions q2_amr/types/tests/data/kmer_analysis_bwt_summary.txt

This file was deleted.

1 change: 1 addition & 0 deletions q2_amr/types/tests/data/mags_61mer_analysis.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"k141_1617_82 # 105913 # 109473 # 1 # ID=42_82;partial=00;start_type=TTG;rbs_motif=AGGA;rbs_spacer=5-10bp;gc_cont=0.630": {"ORF": "k141_1617_82 # 105913 # 109473 # 1 # ID=42_82;partial=00;start_type=TTG;rbs_motif=AGGA;rbs_spacer=5-10bp;gc_cont=0.630", "contig": "k141_1617_82", "HSP": "gnl|BL_ORD_ID|4713|hsp_num:0", "ARO_model": "Bifidobacterium adolescentis rpoB mutants conferring resistance to rifampicin", "type_hit": "Perfect", "#_of_kmers_in_sequence": 3501, "#_of_AMR_kmers": 2274, "taxonomic_info": {"species": {"Bifidobacterium longum": 75, "Bifidobacterium dentium": 67, "Parascardovia denticolens": 3, "Bifidobacterium animalis": 7, "Bifidobacterium bifidum": 9}, "genus": {}}, "genomic_info": {"chr + plasmid": 0, "plasmid": 0, "chr": 234}}}
1 change: 1 addition & 0 deletions q2_amr/types/tests/data/reads_61mer_analysis.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"NC_000913.3_208_0/1/1": {"reference": "ARO:3003550|ID:45|Name:mdtP|NCBI:AP009048.1", "#_of_kmers_in_sequence": 66, "#_of_AMR_kmers": 66, "SAM_flag": 83, "MAPQ": 193, "taxonomic_info": {"species": {}, "genus": {}}, "genomic_info": {"chr + plasmid": 18, "plasmid": 0, "chr": 48}}}
Loading

0 comments on commit ce33e1e

Please sign in to comment.