diff --git a/minos/scripts/metric_oddities.py b/minos/scripts/metric_oddities.py
index 7e4029c..47ec421 100644
--- a/minos/scripts/metric_oddities.py
+++ b/minos/scripts/metric_oddities.py
@@ -1,3 +1,4 @@
+import sys
 import csv
 from collections import Counter
 
@@ -6,18 +7,33 @@ class MetricOddityParser:
 
-    def __init__(self, metric_file, oddities, gene_filter=None):
-        self.table = Counter({oddity: 0 for oddity in oddities})
-        self.metric_file = metric_file
-        self.gene_filter = gene_filter
-
-    def run(self):
-        for row in csv.DictReader(open(self.metric_file), delimiter="\t"):
-            if gene_filter is None or row["tid"] in gene_filter:
-                counts = [oddity for oddity in self.table if eval(oddity.format(**row))]
-                self.table.update(counts)
-
-        for oddity, count in self.table.items():
-            print(oddity.replace("{", "").replace("}", ""), count, sep="\t")
-
+    def parse_metrics(self, metric_file, transcript_filter=None):
+        for row in csv.DictReader(open(metric_file), delimiter="\t"):
+            if transcript_filter is None or row["tid"] in transcript_filter:
+                counts = [oddity for oddity in self.oddities if eval(oddity.format(**row))]
+                source = row["original_source"]
+                self.data.setdefault(
+                    source,
+                    Counter({oddity: 0 for oddity in self.oddities})
+                ).update(counts)
+
+    def __init__(self, metric_file, oddities, transcript_filter=None):
+        self.data = dict()
+        self.oddities = oddities
+        self.parse_metrics(metric_file, transcript_filter=transcript_filter)
+
+    def write_table(self, collapse=True, stream=sys.stdout):
+        # collapse: True for loci, False for subloci and monoloci
+        header = list(self.data.keys()) if not collapse else ["counts"]
+        print("metric", *header, sep="\t", file=stream, flush=True)
+        for oddity in self.oddities:
+            total = 0
+            row = list()
+            for source, counts in self.data.items():
+                total += counts[oddity]
+                if not collapse:
+                    row.append(counts[oddity])
+            if collapse:
+                row.append(total)
+            print(oddity.replace("{", "").replace("}", ""), *row, sep="\t", file=stream, flush=True)
diff --git a/minos/zzz/minos_run.smk b/minos/zzz/minos_run.smk
index 653d733..816ec6b 100644
--- a/minos/zzz/minos_run.smk
+++ b/minos/zzz/minos_run.smk
@@ -167,6 +167,9 @@ rule all:
         os.path.join(config["outdir"], "mikado.subloci.gff3"),
         os.path.join(config["outdir"], "mikado.monoloci.gff3"),
         os.path.join(config["outdir"], "mikado.loci.gff3"),
+        os.path.join(RESULTS_DIR, "mikado.subloci.metrics_oddities.tsv"),
+        os.path.join(RESULTS_DIR, "mikado.monoloci.metrics_oddities.tsv"),
+        os.path.join(RESULTS_DIR, "mikado.loci.metrics_oddities.tsv"),
         [
             os.path.join(config["outdir"], POST_PICK_PREFIX + suffix)
             for suffix in {".gff", ".proteins.fasta", ".table.txt", ".cds.fasta", ".cds.fasta.lengths", ".cdna.fasta"}
@@ -876,21 +879,28 @@ rule minos_generate_final_table:
 
 rule minos_collate_metric_oddities:
     input:
-        loci = os.path.join(config["outdir"], "mikado.loci.gff3"),
-        subloci = os.path.join(config["outdir"], "mikado.subloci.gff3"),
-        monoloci = os.path.join(config["outdir"], "mikado.monoloci.gff3"),
+        loci = os.path.join(config["outdir"], "mikado.loci.metrics.tsv"),
+        subloci = os.path.join(config["outdir"], "mikado.subloci.metrics.tsv"),
+        monoloci = os.path.join(config["outdir"], "mikado.monoloci.metrics.tsv"),
+        old_new_rel = rules.minos_create_release_gffs.output[2],
         final_table = rules.minos_generate_final_table.output.final_table
     output:
-        rules.minos_final_sanity_check.output[0] + ".metric_oddities.tsv"
+        os.path.join(config["outdir"], "results", "mikado.loci.metrics_oddities.tsv"),
+        os.path.join(config["outdir"], "results", "mikado.subloci.metrics_oddities.tsv"),
+        os.path.join(config["outdir"], "results", "mikado.monoloci.metrics_oddities.tsv")
     run:
+        import csv
         from minos.scripts.metric_oddities import MetricOddityParser
-        tx2gene = {row[1]: row[0] for row in csv.reader(open(input.final_table), delimiter="\t") if not row[0].startswith("#")}
-        release_genes = set(tx2gene.values())
+        #tx2gene = {row[1]: row[0] for row in csv.reader(open(input.final_table), delimiter="\t") if not row[0].startswith("#")}
+        #release_genes = set(tx2gene.values())
+        release_transcripts = {row[0] for row in csv.reader(open(input.final_table), delimiter="\t") if not row[0].startswith("#")}
+        transcript_filter = {row[3] for row in csv.reader(open(input.old_new_rel), delimiter="\t") if not row[0].startswith("#") and row[1] in release_transcripts}
         with open(output[0], "w") as loci_oddities_out:
-            MetricOddityParser(input.loci, config["report_metric_oddities"], release_genes).run(stream=loci_oddities_out)
-
-
-
+            MetricOddityParser(input[0], config["report_metric_oddities"], transcript_filter=transcript_filter).write_table(collapse=True, stream=loci_oddities_out)
+        with open(output[1], "w") as subloci_oddities_out:
+            MetricOddityParser(input[1], config["report_metric_oddities"]).write_table(collapse=False, stream=subloci_oddities_out)
+        with open(output[2], "w") as monoloci_oddities_out:
+            MetricOddityParser(input[2], config["report_metric_oddities"]).write_table(collapse=False, stream=monoloci_oddities_out)
 
 rule split_proteins_prepare:
     input: