diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 0d11b5d..844149e 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -41,8 +41,12 @@ CARDKmerTXTFormat, CARDMAGsKmerAnalysisDirectoryFormat, CARDMAGsKmerAnalysisFormat, - CARDReadsKmerAnalysisDirectoryFormat, - CARDReadsKmerAnalysisFormat, + CARDMAGsKmerAnalysisJSONFormat, + CARDReadsAlleleKmerAnalysisDirectoryFormat, + CARDReadsAlleleKmerAnalysisFormat, + CARDReadsGeneKmerAnalysisDirectoryFormat, + CARDReadsGeneKmerAnalysisFormat, + CARDReadsKmerAnalysisJSONFormat, CARDWildcardIndexFormat, GapDNAFASTAFormat, ) @@ -51,6 +55,9 @@ CARDAnnotation, CARDGeneAnnotation, CARDKmerDatabase, + CARDMAGsKmerAnalysis, + CARDReadsAlleleKmerAnalysis, + CARDReadsGeneKmerAnalysis, ) citations = Citations.load("citations.bib", package="q2_amr") @@ -243,15 +250,23 @@ plugin.register_semantic_type_to_format( SampleData[CARDGeneAnnotation], artifact_format=CARDGeneAnnotationDirectoryFormat ) - +plugin.register_semantic_type_to_format( + SampleData[CARDReadsGeneKmerAnalysis], + artifact_format=CARDReadsGeneKmerAnalysisDirectoryFormat, +) +plugin.register_semantic_type_to_format( + SampleData[CARDReadsAlleleKmerAnalysis], + artifact_format=CARDReadsAlleleKmerAnalysisDirectoryFormat, +) +plugin.register_semantic_type_to_format( + SampleData[CARDMAGsKmerAnalysis], + artifact_format=CARDMAGsKmerAnalysisDirectoryFormat, +) plugin.register_formats( CARDKmerDatabaseDirectoryFormat, CARDKmerJSONFormat, CARDKmerTXTFormat, - CARDMAGsKmerAnalysisFormat, CARDMAGsKmerAnalysisDirectoryFormat, - CARDReadsKmerAnalysisFormat, - CARDReadsKmerAnalysisDirectoryFormat, GapDNAFASTAFormat, CARDWildcardIndexFormat, CARDAnnotationTXTFormat, @@ -264,6 +279,13 @@ CARDAnnotationStatsFormat, CARDAlleleAnnotationDirectoryFormat, CARDGeneAnnotationDirectoryFormat, + CARDMAGsKmerAnalysisFormat, + CARDMAGsKmerAnalysisJSONFormat, + CARDReadsAlleleKmerAnalysisFormat, + CARDReadsGeneKmerAnalysisFormat, + CARDReadsKmerAnalysisJSONFormat, + CARDReadsGeneKmerAnalysisDirectoryFormat, + CARDReadsAlleleKmerAnalysisDirectoryFormat, ) importlib.import_module("q2_amr.types._transformer") diff --git a/q2_amr/types/__init__.py b/q2_amr/types/__init__.py index 7a6aca6..2b015b1 100644 --- a/q2_amr/types/__init__.py +++ b/q2_amr/types/__init__.py @@ -22,8 +22,12 @@ CARDKmerTXTFormat, CARDMAGsKmerAnalysisDirectoryFormat, CARDMAGsKmerAnalysisFormat, - CARDReadsKmerAnalysisDirectoryFormat, - CARDReadsKmerAnalysisFormat, + CARDMAGsKmerAnalysisJSONFormat, + CARDReadsAlleleKmerAnalysisDirectoryFormat, + CARDReadsAlleleKmerAnalysisFormat, + CARDReadsGeneKmerAnalysisDirectoryFormat, + CARDReadsGeneKmerAnalysisFormat, + CARDReadsKmerAnalysisJSONFormat, CARDWildcardIndexFormat, GapDNAFASTAFormat, ) @@ -57,7 +61,11 @@ "CARDWildcardIndexFormat", "CARDKmerDatabase", "CARDMAGsKmerAnalysisFormat", + "CARDMAGsKmerAnalysisJSONFormat", "CARDMAGsKmerAnalysisDirectoryFormat", - "CARDReadsKmerAnalysisFormat", - "CARDReadsKmerAnalysisDirectoryFormat", + "CARDReadsAlleleKmerAnalysisFormat", + "CARDReadsGeneKmerAnalysisFormat", + "CARDReadsKmerAnalysisJSONFormat", + "CARDReadsGeneKmerAnalysisDirectoryFormat", + "CARDReadsAlleleKmerAnalysisDirectoryFormat", ] diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py index 99a169c..6553096 100644 --- a/q2_amr/types/_format.py +++ b/q2_amr/types/_format.py @@ -454,34 +454,119 @@ def _validate_(self, level): self._validate() +class CARDMAGsKmerAnalysisJSONFormat(model.TextFileFormat): + def _validate(self, n_records=None): + keys_exp = [ + "ORF", + "contig", + "HSP", + "ARO_model", + "type_hit", + "#_of_kmers_in_sequence", + "#_of_AMR_kmers", + "taxonomic_info", + "genomic_info", + ] + with open(str(self)) as json_file: + dict = json.load(json_file) + + keys_obs = list(next(iter(dict.values())).keys()) + + if keys_obs != keys_exp: + raise ValidationError( + "Keys do not match CARDMAGsKmerAnalysisJSONFormat format. Must consist " + "of the following values: " + + ", ".join(keys_exp) + + ".\n\nFound instead: " + + ", ".join(keys_obs) + ) + + def _validate_(self, level): + self._validate() + + class CARDMAGsKmerAnalysisDirectoryFormat( MultiDirValidationMixin, model.DirectoryFormat ): txt = model.FileCollection( - r"\d+mer_analysis_mags.txt$", format=CARDMAGsKmerAnalysisFormat + r".+\d+mer_analysis_rgi_summary\.txt$", format=CARDMAGsKmerAnalysisFormat + ) + json = model.FileCollection( + r".+\d+mer_analysis\.json$", format=CARDMAGsKmerAnalysisJSONFormat ) @txt.set_path_maker def txt_path_maker(self, sample_id, bin_id): - pattern = r"\d+mer_analysis_mags.txt$" + pattern = r"\d+mer_analysis_rgi_summary\.txt$" return f"{sample_id}/{bin_id}/{pattern}" + @json.set_path_maker + def json_path_maker(self, sample_id, bin_id): + pattern = r"\d+mer_analysis_mags\.json$" + return f"{sample_id}/{bin_id}/{pattern}" -class CARDReadsKmerAnalysisFormat(model.TextFileFormat): + +class CARDReadsGeneKmerAnalysisFormat(model.TextFileFormat): def _validate(self, n_records=None): header_exp = [ - "Reference Sequence / ARO term", + "ARO term", "Mapped reads with kmer DB hits", - "CARD kmer Prediction", - "Subsequent fields", + "CARD*kmer Prediction", + "Single species (chromosome) reads", + "Single species (chromosome or plasmid) reads", + "Single species (plasmid) reads", + "Single species (no genomic info) reads", + "Single genus (chromosome) reads", + "Single genus (chromosome or plasmid) reads", + "Single genus (plasmid) reads", + "Single genus (no genomic info) reads", + "Promiscuous plasmid reads", + "Unknown taxonomy (chromosome) reads", + "Unknown taxonomy (chromosome or plasmid) reads", + "Unknown taxonomy (no genomic info) reads", ] df = pd.read_csv(str(self), sep="\t") header_obs = list(df.columns) if not set(header_exp).issubset(set(header_obs)): raise ValidationError( - "Header line does not match CARDReadsKmerAnalysisFormat. Must contain" - "the following values: " + "Header line does not match CARDReadsGeneKmerAnalysisFormat. Must " + "contain the following values: " + + ", ".join(header_exp) + + ".\n\nFound instead: " + + ", ".join(header_obs) + ) + + def _validate_(self, level): + self._validate() + + +class CARDReadsAlleleKmerAnalysisFormat(model.TextFileFormat): + def _validate(self, n_records=None): + header_exp = [ + "Reference Sequence", + "Mapped reads with kmer DB hits", + "CARD*kmer Prediction", + "Single species (chromosome) reads", + "Single species (chromosome or plasmid) reads", + "Single species (plasmid) reads", + "Single species (no genomic info) reads", + "Single genus (chromosome) reads", + "Single genus (chromosome or plasmid) reads", + "Single genus (plasmid) reads", + "Single genus (no genomic info) reads", + "Promiscuous plasmid reads", + "Unknown taxonomy (chromosome) reads", + "Unknown taxonomy (chromosome or plasmid) reads", + "Unknown taxonomy (no genomic info) reads", + ] + + df = pd.read_csv(str(self), sep="\t") + header_obs = list(df.columns) + if not set(header_exp).issubset(set(header_obs)): + raise ValidationError( + "Header line does not match CARDReadsAlleleKmerAnalysisFormat. Must " + "contain the following values: " + ", ".join(header_exp) + ".\n\nFound instead: " + ", ".join(header_obs) @@ -491,14 +576,64 @@ def _validate_(self, level): self._validate() -class CARDReadsKmerAnalysisDirectoryFormat( +class CARDReadsKmerAnalysisJSONFormat(model.TextFileFormat): + def _validate(self, n_records=None): + keys_exp = [ + "reference", + "#_of_kmers_in_sequence", + "#_of_AMR_kmers", + "SAM_flag", + "MAPQ", + "taxonomic_info", + "genomic_info", + ] + with open(str(self)) as json_file: + dict = json.load(json_file) + + keys_obs = list(next(iter(dict.values())).keys()) + + if keys_obs != keys_exp: + raise ValidationError( + "Keys do not match CARDReadsKmerAnalysisJSONFormat format. Must consist" + " of the following values: " + + ", ".join(keys_exp) + + ".\n\nFound instead: " + + ", ".join(keys_obs) + ) + + def _validate_(self, level): + self._validate() + + +class CARDReadsAlleleKmerAnalysisDirectoryFormat( MultiDirValidationMixin, model.DirectoryFormat ): txt = model.FileCollection( - r"\d+mer_analysis_analysis_reads.txt$", format=CARDReadsKmerAnalysisFormat + r".+\d+mer_analysis\.allele\.txt$", format=CARDReadsAlleleKmerAnalysisFormat + ) + json = model.FileCollection( + r".+\d+mer_analysis\.json$", format=CARDReadsKmerAnalysisJSONFormat ) @txt.set_path_maker - def txt_path_maker(self, sample_id, bin_id): - pattern = r"kmer_\d+mer_analysis_analysis_reads.txt$" - return f"{sample_id}/{bin_id}/{pattern}" + def txt_path_maker(self, sample_id): + pattern = r"\d+mer_analysis\.allele\.txt$" + return f"{sample_id}/{pattern}" + + @json.set_path_maker + def json_path_maker(self, sample_id): + pattern = r"\d+mer_analysis\.json$" + return f"{sample_id}/{pattern}" + + +class CARDReadsGeneKmerAnalysisDirectoryFormat( + MultiDirValidationMixin, model.DirectoryFormat +): + txt = model.FileCollection( + r".+\d+mer_analysis\.gene\.txt$", format=CARDReadsGeneKmerAnalysisFormat + ) + + @txt.set_path_maker + def txt_path_maker(self, sample_id): + pattern = r"\d+mer_analysis\.gene\.txt$" + return f"{sample_id}/{pattern}" diff --git a/q2_amr/types/_type.py b/q2_amr/types/_type.py index 6b9fe7c..073f39c 100644 --- a/q2_amr/types/_type.py +++ b/q2_amr/types/_type.py @@ -10,8 +10,15 @@ CARDDatabase = SemanticType("CARDDatabase") CARDKmerDatabase = SemanticType("CARDKmerDatabase") -CARDMAGsKmerAnalysis = SemanticType("CARDMAGsKmerAnalysis") -CARDReadsKmerAnalysis = SemanticType("CARDReadsKmerAnalysis") +CARDMAGsKmerAnalysis = SemanticType( + "CARDMAGsKmerAnalysis", variant_of=SampleData.field["type"] +) +CARDReadsAlleleKmerAnalysis = SemanticType( + "CARDReadsAlleleKmerAnalysis", variant_of=SampleData.field["type"] +) +CARDReadsGeneKmerAnalysis = SemanticType( + "CARDReadsGeneKmerAnalysis", variant_of=SampleData.field["type"] +) CARDAnnotation = SemanticType("CARDAnnotation", variant_of=SampleData.field["type"]) CARDAlleleAnnotation = SemanticType( "CARDAlleleAnnotation", variant_of=SampleData.field["type"] diff --git a/q2_amr/types/tests/data/61mer_analysis.allele.txt b/q2_amr/types/tests/data/61mer_analysis.allele.txt new file mode 100644 index 0000000..7eee60c --- /dev/null +++ b/q2_amr/types/tests/data/61mer_analysis.allele.txt @@ -0,0 +1,3 @@ +Reference Sequence Mapped reads with kmer DB hits CARD*kmer Prediction Single species (chromosome) reads Single species (chromosome or plasmid) reads Single species (plasmid) reads Single species (no genomic info) reads Single genus (chromosome) reads Single genus (chromosome or plasmid) reads Single genus (plasmid) reads Single genus (no genomic info) reads Promiscuous plasmid reads Unknown taxonomy (chromosome) reads Unknown taxonomy (chromosome or plasmid) reads Unknown taxonomy (no genomic info) reads +ARO:3003550|ID:45|Name:mdtP|NCBI:AP009048.1 2 0 1 1 0 +ARO:3000796|ID:121|Name:mdtF|NCBI:U00096.1 2 0 2 0 0 diff --git a/q2_amr/types/tests/data/61mer_analysis.gene.txt b/q2_amr/types/tests/data/61mer_analysis.gene.txt new file mode 100644 index 0000000..e5912f1 --- /dev/null +++ b/q2_amr/types/tests/data/61mer_analysis.gene.txt @@ -0,0 +1,3 @@ +ARO term Mapped reads with kmer DB hits CARD*kmer Prediction Single species (chromosome) reads Single species (chromosome or plasmid) reads Single species (plasmid) reads Single species (no genomic info) reads Single genus (chromosome) reads Single genus (chromosome or plasmid) reads Single genus (plasmid) reads Single genus (no genomic info) reads Promiscuous plasmid reads Unknown taxonomy (chromosome) reads Unknown taxonomy (chromosome or plasmid) reads Unknown taxonomy (no genomic info) reads +mdtP 2 0 1 1 0 +mdtF 2 0 2 0 0 diff --git a/q2_amr/types/tests/data/kmer_analysis_rgi_summary.txt b/q2_amr/types/tests/data/61mer_analysis_rgi_summary.txt similarity index 100% rename from q2_amr/types/tests/data/kmer_analysis_rgi_summary.txt rename to q2_amr/types/tests/data/61mer_analysis_rgi_summary.txt diff --git a/q2_amr/types/tests/data/kmer_analysis_bwt_summary.txt b/q2_amr/types/tests/data/kmer_analysis_bwt_summary.txt deleted file mode 100644 index 68869a5..0000000 --- a/q2_amr/types/tests/data/kmer_analysis_bwt_summary.txt +++ /dev/null @@ -1,2 +0,0 @@ -Reference Sequence / ARO term Mapped reads with kmer DB hits CARD kmer Prediction Subsequent fields -"Yes" "Yes" "Yes" "Yes" diff --git a/q2_amr/types/tests/data/mags_61mer_analysis.json b/q2_amr/types/tests/data/mags_61mer_analysis.json new file mode 100644 index 0000000..6eaea2b --- /dev/null +++ b/q2_amr/types/tests/data/mags_61mer_analysis.json @@ -0,0 +1 @@ +{"k141_1617_82 # 105913 # 109473 # 1 # ID=42_82;partial=00;start_type=TTG;rbs_motif=AGGA;rbs_spacer=5-10bp;gc_cont=0.630": {"ORF": "k141_1617_82 # 105913 # 109473 # 1 # ID=42_82;partial=00;start_type=TTG;rbs_motif=AGGA;rbs_spacer=5-10bp;gc_cont=0.630", "contig": "k141_1617_82", "HSP": "gnl|BL_ORD_ID|4713|hsp_num:0", "ARO_model": "Bifidobacterium adolescentis rpoB mutants conferring resistance to rifampicin", "type_hit": "Perfect", "#_of_kmers_in_sequence": 3501, "#_of_AMR_kmers": 2274, "taxonomic_info": {"species": {"Bifidobacterium longum": 75, "Bifidobacterium dentium": 67, "Parascardovia denticolens": 3, "Bifidobacterium animalis": 7, "Bifidobacterium bifidum": 9}, "genus": {}}, "genomic_info": {"chr + plasmid": 0, "plasmid": 0, "chr": 234}}} diff --git a/q2_amr/types/tests/data/reads_61mer_analysis.json b/q2_amr/types/tests/data/reads_61mer_analysis.json new file mode 100644 index 0000000..25108c6 --- /dev/null +++ b/q2_amr/types/tests/data/reads_61mer_analysis.json @@ -0,0 +1 @@ +{"NC_000913.3_208_0/1/1": {"reference": "ARO:3003550|ID:45|Name:mdtP|NCBI:AP009048.1", "#_of_kmers_in_sequence": 66, "#_of_AMR_kmers": 66, "SAM_flag": 83, "MAPQ": 193, "taxonomic_info": {"species": {}, "genus": {}}, "genomic_info": {"chr + plasmid": 18, "plasmid": 0, "chr": 48}}} diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py index 445309d..19f44d3 100644 --- a/q2_amr/types/tests/test_types_formats_transformers.py +++ b/q2_amr/types/tests/test_types_formats_transformers.py @@ -36,8 +36,14 @@ CARDKmerDatabaseDirectoryFormat, CARDKmerJSONFormat, CARDKmerTXTFormat, + CARDMAGsKmerAnalysisDirectoryFormat, CARDMAGsKmerAnalysisFormat, - CARDReadsKmerAnalysisFormat, + CARDMAGsKmerAnalysisJSONFormat, + CARDReadsAlleleKmerAnalysisDirectoryFormat, + CARDReadsAlleleKmerAnalysisFormat, + CARDReadsGeneKmerAnalysisDirectoryFormat, + CARDReadsGeneKmerAnalysisFormat, + CARDReadsKmerAnalysisJSONFormat, CARDWildcardIndexFormat, GapDNAFASTAFormat, ) @@ -334,11 +340,57 @@ def test_CARDAlleleAnnotationDirectoryFormat_to_qiime2_Metadata_transformer(self class TestKmerTypesAndFormats(AMRTypesTestPluginBase): def test_card_mags_kmer_analysis_validate_positive(self): - filepath = self.get_data_path("kmer_analysis_rgi_summary.txt") + filepath = self.get_data_path("61mer_analysis_rgi_summary.txt") format = CARDMAGsKmerAnalysisFormat(filepath, mode="r") format.validate() - def test_card_reads_kmer_analysis_validate_positive(self): - filepath = self.get_data_path("kmer_analysis_bwt_summary.txt") - format = CARDReadsKmerAnalysisFormat(filepath, mode="r") + def test_kmer_mags_analysis_json_format_validate_positive(self): + filepath = self.get_data_path("mags_61mer_analysis.json") + format = CARDMAGsKmerAnalysisJSONFormat(filepath, mode="r") + format.validate() + + def test_card_reads_allele_kmer_analysis_validate_positive(self): + filepath = self.get_data_path("61mer_analysis.allele.txt") + format = CARDReadsAlleleKmerAnalysisFormat(filepath, mode="r") + format.validate() + + def test_card_reads_gene_kmer_analysis_validate_positive(self): + filepath = self.get_data_path("61mer_analysis.gene.txt") + format = CARDReadsGeneKmerAnalysisFormat(filepath, mode="r") + format.validate() + + def test_kmer_reads_analysis_json_format_validate_positive(self): + filepath = self.get_data_path("reads_61mer_analysis.json") + format = CARDReadsKmerAnalysisJSONFormat(filepath, mode="r") + format.validate() + + def test_card_reads_gene_kmer_analysis_directory_format_validate_positive(self): + sample_dir = os.path.join(self.temp_dir.name, "sample1") + os.mkdir(sample_dir) + shutil.copy(self.get_data_path("61mer_analysis.gene.txt"), sample_dir) + format = CARDReadsGeneKmerAnalysisDirectoryFormat(self.temp_dir.name, mode="r") + format.validate() + + def test_card_reads_allele_kmer_analysis_directory_format_validate_positive(self): + sample_dir = os.path.join(self.temp_dir.name, "sample1") + os.mkdir(sample_dir) + shutil.copy(self.get_data_path("61mer_analysis.allele.txt"), sample_dir) + shutil.copy( + self.get_data_path("reads_61mer_analysis.json"), + os.path.join(sample_dir, "61mer_analysis.json"), + ) + format = CARDReadsAlleleKmerAnalysisDirectoryFormat( + self.temp_dir.name, mode="r" + ) + format.validate() + + def test_card_mags_kmer_analysis_directory_format_validate_positive(self): + sample_dir = os.path.join(self.temp_dir.name, "sample1") + os.mkdir(sample_dir) + shutil.copy(self.get_data_path("61mer_analysis_rgi_summary.txt"), sample_dir) + shutil.copy( + self.get_data_path("mags_61mer_analysis.json"), + os.path.join(sample_dir, "61mer_analysis.json"), + ) + format = CARDMAGsKmerAnalysisDirectoryFormat(self.temp_dir.name, mode="r") format.validate()