Skip to content

Commit

Permalink
merge 55_collate
Browse files Browse the repository at this point in the history
  • Loading branch information
VinzentRisch committed Apr 5, 2024
2 parents 4ca0445 + 6cb68a5 commit 83b9cd5
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 20 deletions.
35 changes: 23 additions & 12 deletions q2_amr/card/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,25 @@ def collate_reads_gene_kmer_analyses(
def _collate(partition_list):
collated_partitions = type(partition_list[0])()
# For every partition
for annotation in partition_list:
for partition in partition_list:
# For every sample
for sample in annotation.path.iterdir():
# If formats are annotations or kmer analyses from MAGs
for sample in partition.path.iterdir():
# If artifacts are annotations or kmer analyses from MAGs
if isinstance(
partition_list[0],
(CARDAnnotationDirectoryFormat, CARDMAGsKmerAnalysisDirectoryFormat),
):
# For every MAG
for mag in sample.iterdir():
# Create directories in collate
os.makedirs(
collated_partitions.path / sample.name / mag.name,
exist_ok=True,
)
# Create directories in collate. If dir already exists raise error
try:
os.makedirs(collated_partitions.path / sample.name / mag.name)
except FileExistsError as e:
raise FileExistsError(
f"The directory already exists: {e.filename}. MAG IDs must"
f" be unique across all artifacts. Each artifact in the"
f" list must be unique and cannot be repeated."
)

# Copy every file in the MAG directory to the collated directory
for file in mag.iterdir():
Expand All @@ -77,12 +81,19 @@ def _collate(partition_list):
/ file.name,
)

# If annotations or kmer analyses are from reads
# If artifacts are annotations or kmer analyses from reads
else:
# Create directories in collate object
os.makedirs(collated_partitions.path / sample.name, exist_ok=True)
# Create directories in collate. If dir already exists raise error
try:
os.makedirs(collated_partitions.path / sample.name)
except FileExistsError as e:
raise FileExistsError(
f"The directory already exists: {e.filename}. Sample IDs must"
f" be unique across all artifacts. Each artifact in the"
f" list must be unique and cannot be repeated."
)

# For every mag in the sample
# Copy every file in the sample directory to the collated directory
for file in sample.iterdir():
duplicate(file, collated_partitions.path / sample.name / file.name)

Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ORF_ID Contig Start Stop Orientation Cut_Off Pass_Bitscore Best_Hit_Bitscore Best_Hit_ARO Best_Identities ARO Model_type SNPs_in_Best_Hit_ARO Other_SNPs Drug Class Resistance Mechanism AMR Gene Family Predicted_DNA Predicted_Protein CARD_Protein_Sequence Percentage Length of Reference Sequence ID Model_ID Nudged Note
k141_1197_2 # 683 # 1345 # 1 # ID=49_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.588 k141_1197_2 683 1345 + Strict 200 326.635 vanX gene in vanO cluster 75.25 3002954 protein homolog model n/a n/a glycopeptide antibiotic antibiotic target alteration vanX; glycopeptide resistance gene cluster ATGAAGGGTGACTTCGTTTTCGTTGATGAGTGTGTTCCAGGAGTCCGCTGGGATGCCAAATACGCCACATCGGACAACTTCACCGGCAAACCGGTGGAGGGATATCTGGCCAACCGGATTGTCGGGACCAGGGCTTTGTGCTCAGCGCTGGAAAGCGTGCGGCAACGGGCTGCATCCCGCGGTTTCGGGTTGCTGCTGTGGGACGGCTACCGCCCGCAGCGCGCCGTGGATTCGTTCCTGCACTGGGCGAAACAACCAGAGGACGGCGCAACTAAACGCCGCCACTATCCAAATATTTCCCGAGCGGAAATGTTCGAAAGAGGATACGTAGCCTCCAAGTCCGGCCACAGCCGGGGCAGCACCGTCGATTTGACCCTGTATGACCTGGTTACCGGTGACCTCGTTCCCATGGGCGGCGGCCACGACTTGATGGATGAAATTTCGCATCACGGAGCGCCCGGCATCACCCGGGCCGAGACCGGCAACCGCCACACGCTGCGTTCGCTCATGGAGGCCTGCGGTTTCAGTTCCTACGATTCTGAGTGGTGGCATTACACCCTGAAGAACGAACCCTATCCGGACACTTATTTCGATTTTCCCGTTACGGATCCGCTTCCATCAGACGCCGCAACGGCCAGGGACCTTGTCTTCCAGAATGCATAG MKGDFVFVDECVPGVRWDAKYATSDNFTGKPVEGYLANRIVGTRALCSALESVRQRAASRGFGLLLWDGYRPQRAVDSFLHWAKQPEDGATKRRHYPNISRAEMFERGYVASKSGHSRGSTVDLTLYDLVTGDLVPMGGGHDLMDEISHHGAPGITRAETGNRHTLRSLMEACGFSSYDSEWWHYTLKNEPYPDTYFDFPVTDPLPSDAATARDLVFQNA MNDDFVYVDDWVPGVRWDAKYATWDNFTGKPVDGYLANRIVGTRALCAALEQAREKAASLGFGLLLWDGYRPRRAVDSFLRWSEQPEDGQTKQRHYPNIDRPEMLEKGYVATQSGHSRGGAVDLTLYHLATGELAPMGGDHDLMDPISHHRARGIKPIESKNRELLRSIMEDCGFDRYDCEWWHYTLKREPYPDVYFDFPIT 108.91 gnl|BL_ORD_ID|1674|hsp_num:0 1699
k141_10683_1 # 1 # 453 # 1 # ID=423_1;partial=10;start_type=Edge;rbs_motif=None;rbs_spacer=None;gc_cont=0.658 k141_10683_1 1 453 + Strict 50 90.8929 vanY gene in vanM cluster 38.62 3002961 protein homolog model n/a n/a glycopeptide antibiotic antibiotic target alteration vanY; glycopeptide resistance gene cluster GAGGCTGCAGGGGCCTACCGGCAAATGGCCGCGGAAGCGGGCGCCGCCGGAGTTCCCATGTCCGCGGTGAGCGGCTTTCGGACCGGAGCAGAGCAGGACCAGCTGTACGTCTCCTACACGGAGAACTTTGGGCCGGAGGCAGCCGACGCCATTTCGGCCCGTCCCGGGTACAGCGAGCATCAGACGGGGCTGGCCATCGACATCGCCAACCCGGACGGAACCTGCGCCCTGGAATCCTGCTTCGCCGAAACCTTGGCGGGTTCGTGGGCGGCCGCCAATGCCCAGCACTACGGCTTCATCATCCGTTATCCGGCAGGAGCCGAGCACATCACCGGGTACGCCCATGAACCGTGGCATCTGCGGTACGTGGGGACGGAACATGCCCGGACAATGCACGACGCCGGCACCACCTTGGAAGAATATCTGGGACTTCCTGCCGCGCCGGGTTACTGA EAAGAYRQMAAEAGAAGVPMSAVSGFRTGAEQDQLYVSYTENFGPEAADAISARPGYSEHQTGLAIDIANPDGTCALESCFAETLAGSWAAANAQHYGFIIRYPAGAEHITGYAHEPWHLRYVGTEHARTMHDAGTTLEEYLGLPAAPGY MVFQGNLLLVNNEYPVLEESIKTDVVNLFKHDELTKGYELLNREIYLSEKVAREFSEMVDAAEKEGVRHFSINSGFRNFDEQNALYQEMGSDYALPAGYSEHNLGLALDIGSTQMEMSEAPEGKWLEDNAWEYGFILRYPMDKTAITGIQYEPWHFRYVGLPHSAIIEEKNFALEEYLDFLKEQKSISGTIHGENYEISYYPITEKTDIEMPANLHYEISGNNMDGVIVTVYR 64.38 gnl|BL_ORD_ID|1675|hsp_num:0 1713
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"k141_1617_82 # 105913 # 109473 # 1 # ID=42_82;partial=00;start_type=TTG;rbs_motif=AGGA;rbs_spacer=5-10bp;gc_cont=0.630": {"ORF": "k141_1617_82 # 105913 # 109473 # 1 # ID=42_82;partial=00;start_type=TTG;rbs_motif=AGGA;rbs_spacer=5-10bp;gc_cont=0.630", "contig": "k141_1617_82", "HSP": "gnl|BL_ORD_ID|4713|hsp_num:0", "ARO_model": "Bifidobacterium adolescentis rpoB mutants conferring resistance to rifampicin", "type_hit": "Perfect", "#_of_kmers_in_sequence": 3501, "#_of_AMR_kmers": 2274, "taxonomic_info": {"species": {"Bifidobacterium longum": 75, "Bifidobacterium dentium": 67, "Parascardovia denticolens": 3, "Bifidobacterium animalis": 7, "Bifidobacterium bifidum": 9}, "genus": {}}, "genomic_info": {"chr + plasmid": 0, "plasmid": 0, "chr": 234}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ORF_ID Contig Cut_Off Best_Hit_ARO CARD*kmer Prediction Taxonomic kmers Genomic kmers
"NC_000962.3_273 # 314309 # 314854 # -1 # ID=1_273;partial=00;start_type=GTG;rbs_motif=AGG;rbs_spacer=4bp;gc_cont=0.679" NC_000962.3_273 Perfect AAC(2')-Ic Mycobacterium tuberculosis (chromosome) "Mycobacterium tuberculosis: 486; " "chr + plasmid: 0; plasmid: 0; chr: 486; "
55 changes: 48 additions & 7 deletions q2_amr/card/tests/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def test_collate_mags_annotations(self):
# Test collate for mags annotations
self._test_collate(
data_dir="annotate_mags_output",
files_to_assert=["bin1/amr_annotation.json", "bin1/amr_annotation.txt"],
files_to_assert=["amr_annotation.json", "amr_annotation.txt"],
samples=["sample1/bin1", "sample2/bin2"],
format=CARDAnnotationDirectoryFormat,
function=collate_mags_annotations,
)
Expand All @@ -42,6 +43,7 @@ def test_collate_reads_allele_annotations(self):
"overall_mapping_stats.txt",
"sorted.length_100.bam",
],
samples=["sample1", "sample2"],
format=CARDAlleleAnnotationDirectoryFormat,
function=collate_reads_allele_annotations,
)
Expand All @@ -51,6 +53,7 @@ def test_collate_reads_gene_annotations(self):
self._test_collate(
data_dir="annotate_reads_gene_output",
files_to_assert=["gene_mapping_data.txt"],
samples=["sample1", "sample2"],
format=CARDGeneAnnotationDirectoryFormat,
function=collate_reads_gene_annotations,
)
Expand All @@ -59,10 +62,8 @@ def test_collate_mags_kmer_analysis(self):
# Test collate for MAGs k-mer analysis
self._test_collate(
data_dir="kmer_analysis_mags",
files_to_assert=[
"bin1/61mer_analysis.json",
"bin1/61mer_analysis_rgi_summary.txt",
],
files_to_assert=["61mer_analysis.json", "61mer_analysis_rgi_summary.txt"],
samples=["sample1/bin1", "sample2/bin2"],
format=CARDMAGsKmerAnalysisDirectoryFormat,
function=collate_mags_kmer_analyses,
)
Expand All @@ -72,6 +73,7 @@ def test_collate_reads_allele_kmer_analysis(self):
self._test_collate(
data_dir="kmer_analysis_reads_allele",
files_to_assert=["61mer_analysis.json", "61mer_analysis.allele.txt"],
samples=["sample1", "sample2"],
format=CARDReadsAlleleKmerAnalysisDirectoryFormat,
function=collate_reads_allele_kmer_analyses,
)
Expand All @@ -81,11 +83,12 @@ def test_collate_reads_gene_kmer_analysis(self):
self._test_collate(
data_dir="kmer_analysis_reads_gene",
files_to_assert=["61mer_analysis.json", "61mer_analysis.gene.txt"],
samples=["sample1", "sample2"],
format=CARDReadsGeneKmerAnalysisDirectoryFormat,
function=collate_reads_gene_kmer_analyses,
)

def _test_collate(self, data_dir, files_to_assert, format, function):
def _test_collate(self, data_dir, files_to_assert, samples, format, function):
# Set up the list with annotations objects to collate
artifact_1 = self.setup_annotations(
dir_name=f"partitioned/{data_dir}_1", format=format
Expand All @@ -103,12 +106,50 @@ def _test_collate(self, data_dir, files_to_assert, format, function):
self.assertTrue(isinstance(collate, format))

# Assert if all the files have been moved to the collated object
for sample in ["sample1", "sample2"]:
for sample in samples:
for file in files_to_assert:
self.assertTrue(
os.path.exists(os.path.join(collate.path, sample, file))
)

def test_mags_file_exists_error(self):
    """Collating the same MAG annotation artifact twice raises FileExistsError.

    The artifact is passed twice, so the second pass tries to create a
    ``<sample>/<mag>`` directory that is already present in the collated
    output. The collate code is expected to raise the "MAG IDs must be
    unique" error instead of silently merging the directories.
    """
    # Set up the list with duplicated MAG artifacts
    # NOTE(review): assumes the fixture contains a "bin1" MAG directory,
    # which is what the regex below matches against - confirm.
    artifact = self.setup_annotations(
        dir_name="partitioned/annotate_mags_output_1",
        format=CARDAnnotationDirectoryFormat,
    )

    artifacts = [artifact, artifact]

    pattern = (
        r"The directory already exists: .*/bin1. MAG IDs must be "
        r"unique across all artifacts. Each artifact in the list must be "
        r"unique and cannot be repeated."
    )

    # Check if error is raised by the MAG-specific collate function, so the
    # function under test matches the directory format being passed in
    with self.assertRaisesRegex(FileExistsError, pattern):
        collate_mags_annotations(artifacts)

def test_reads_file_exists_error(self):
    """Collating the same reads k-mer artifact twice raises FileExistsError.

    The artifact is passed twice, so the second pass tries to create a
    ``<sample>`` directory that is already present in the collated output.
    The collate code is expected to raise the "Sample IDs must be unique"
    error.
    """
    # Set up the list with duplicated reads artifacts
    # NOTE(review): assumes the fixture contains a "sample1" directory,
    # which is what the regex below matches against - confirm.
    artifact = self.setup_annotations(
        dir_name="partitioned/kmer_analysis_reads_allele_1",
        format=CARDReadsAlleleKmerAnalysisDirectoryFormat,
    )

    artifacts = [artifact, artifact]

    pattern = (
        r"The directory already exists: .*/sample1. Sample IDs must be "
        r"unique across all artifacts. Each artifact in the list must be "
        r"unique and cannot be repeated."
    )

    # Check if error is raised
    with self.assertRaisesRegex(FileExistsError, pattern):
        collate_reads_allele_kmer_analyses(artifacts)

def setup_annotations(self, dir_name, format):
# Setup of the directory with dummy files and the needed directory format
annotations = format()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@
],
"q2_amr.card.tests": [
"data/*",
"data/*/*/*",
"data/*/*/*/*",
"data/*/*/*/*/*",
],
},
zip_safe=False,
Expand Down

0 comments on commit 83b9cd5

Please sign in to comment.