From bbaad270c70b84af943dc4369bf84bb0882b8377 Mon Sep 17 00:00:00 2001
From: "Christian Schudoma (cschu)"
Date: Tue, 9 Jun 2020 14:07:04 +0100
Subject: [PATCH] Adding in reporting on metric oddities (#24)

---
 etc/minos_config.yaml            | 19 +++++++++++++++++++
 minos/scripts/metric_oddities.py | 35 +++++++++++++++++++++++++++++++++++
 minos/zzz/minos_run.smk          | 31 ++++++++++++++++++++++++++++---
 setup.py                         |  2 +-
 4 files changed, 83 insertions(+), 4 deletions(-)
 create mode 100644 minos/scripts/metric_oddities.py

diff --git a/etc/minos_config.yaml b/etc/minos_config.yaml
index 037276a..17559b8 100644
--- a/etc/minos_config.yaml
+++ b/etc/minos_config.yaml
@@ -65,3 +65,22 @@ collapse_metrics_thresholds:
   #predicted_gene: "hom_acov_score lt 0.3 & cpc_score lt 0.25"
   #hi_confidence: "classification eq 1 | hom_acov_score ge 0.8 | (hom_acov_score ge 0.6 & transcript_score ge 0.4)"
   #discard: "protein_score eq 0 & transcript_score eq 0 & hom_acov_score eq 0 & expression_score lt 0.3"
+
+report_metric_oddities:
+  - "{five_utr_length} >= 10000"
+  - "{five_utr_num} >= 5"
+  - "{three_utr_length} >= 10000"
+  - "{three_utr_num} >= 4"
+  - "not {is_complete}"
+  - "not {has_start_codon}"
+  - "not {has_stop_codon}"
+  - "{max_exon_length} >= 10000"
+  - "{max_intron_length} >= 500000"
+  - "{min_exon_length} <= 5"
+  - "{min_intron_length} <= 5"
+  - "{selected_cds_fraction} <= 0.3"
+  - "{canonical_intron_proportion} != 1"
+  - "{non_verified_introns_num} >= 1"
+  - "not {only_non_canonical_splicing}"
+  - "{proportion_verified_introns} <= 0.5"
+  - "{suspicious_splicing}"
diff --git a/minos/scripts/metric_oddities.py b/minos/scripts/metric_oddities.py
new file mode 100644
index 0000000..7e4029c
--- /dev/null
+++ b/minos/scripts/metric_oddities.py
@@ -0,0 +1,35 @@
+"""Count how often configured metric oddities occur in a tab-separated metrics table."""
+import csv
+from collections import Counter
+
+# for testing
+METRIC_ODDITIES = ['{five_utr_length} >= 10000', '{five_utr_num} >= 5', '{three_utr_length} >= 10000', '{three_utr_num} >= 4', 'not {is_complete}', 'not {has_start_codon}', 'not {has_stop_codon}', '{max_exon_length} >= 10000', '{max_intron_length} >= 500000', '{min_exon_length} <= 5', '{min_intron_length} <= 5', '{selected_cds_fraction} <= 0.3', '{canonical_intron_proportion} != 1', '{non_verified_introns_num} >= 1', 'not {only_non_canonical_splicing}', '{proportion_verified_introns} <= 0.5', '{suspicious_splicing}']
+
+
+class MetricOddityParser:
+	"""Count, for each oddity expression, the rows of a metrics table that satisfy it.
+
+	An oddity is a Python expression template, e.g. "{five_utr_length} >= 10000";
+	its {column} placeholders are filled from each row before it is evaluated.
+	"""
+
+	def __init__(self, metric_file, oddities, gene_filter=None):
+		# start every oddity at 0 so that oddities without any hit are still reported
+		self.table = Counter({oddity: 0 for oddity in oddities})
+		self.metric_file = metric_file
+		self.gene_filter = gene_filter
+
+	def run(self, stream=None):
+		"""Tally the oddities and write one "<oddity><TAB><count>" line per oddity to stream.
+
+		stream defaults to stdout; rows whose "tid" is not in gene_filter are skipped.
+		"""
+		with open(self.metric_file) as metrics_in:
+			for row in csv.DictReader(metrics_in, delimiter="\t"):
+				if self.gene_filter is None or row["tid"] in self.gene_filter:
+					# NOTE: eval() runs the configured expression as code -- only use trusted configs
+					counts = [oddity for oddity in self.table if eval(oddity.format(**row))]
+					self.table.update(counts)
+
+		for oddity, count in self.table.items():
+			print(oddity.replace("{", "").replace("}", ""), count, sep="\t", file=stream)
diff --git a/minos/zzz/minos_run.smk b/minos/zzz/minos_run.smk
index bdbcaaa..653d733 100644
--- a/minos/zzz/minos_run.smk
+++ b/minos/zzz/minos_run.smk
@@ -155,7 +155,8 @@ localrules:
 	busco_copy_results,
 	busco_concat_protein_metrics,
 	busco_summary,
-	minos_create_release_metrics
+	minos_create_release_metrics,
+	minos_collate_metric_oddities
 
 
 rule all:
@@ -164,6 +165,7 @@ rule all:
 		os.path.join(EXTERNAL_METRICS_DIR, "metrics_info.txt"),
 		os.path.join(config["outdir"], "MIKADO_SERIALISE_DONE"),
 		os.path.join(config["outdir"], "mikado.subloci.gff3"),
+		os.path.join(config["outdir"], "mikado.monoloci.gff3"),
 		os.path.join(config["outdir"], "mikado.loci.gff3"),
 		[
 			os.path.join(config["outdir"], POST_PICK_PREFIX + suffix)
@@ -586,7 +588,8 @@ rule minos_mikado_pick:
 		db = rules.minos_mikado_serialise.output[1]
 	output:
 		loci = os.path.join(config["outdir"], "mikado.loci.gff3"),
-		subloci = os.path.join(config["outdir"], "mikado.subloci.gff3")
+		subloci = os.path.join(config["outdir"], "mikado.subloci.gff3"),
+		monoloci = os.path.join(config["outdir"], "mikado.monoloci.gff3")
 	params:
 		program_call = config["program_calls"]["mikado"].format(container=config["mikado-container"], program="pick"),
 		program_params = config["params"]["mikado"]["pick"],
@@ -596,7 +599,11 @@ rule minos_mikado_pick:
 	resources:
 		mem_mb = lambda wildcards, attempt: HPC_CONFIG.get_memory("minos_mikado_pick") * attempt
 	shell:
-		"{params.program_call} {params.program_params} -od {params.outdir} --procs {threads} --json-conf {input.config} --subloci-out $(basename {output.subloci}) -db {input.db} {input.gtf}"
+		"{params.program_call} {params.program_params}" + \
+		" -od {params.outdir} --procs {threads} --json-conf {input.config}" + \
+		" --subloci-out $(basename {output.subloci})" + \
+		" --monoloci-out $(basename {output.monoloci})" + \
+		" -db {input.db} {input.gtf}"
 
 rule minos_parse_mikado_pick:
 	input:
@@ -867,6 +874,24 @@ rule minos_generate_final_table:
 		from minos.scripts.generate_final_table import generate_final_table
 		generate_final_table(input.seq_table, input.bt_conf_table, input.stats_table, output.final_table, output.summary)
 
+rule minos_collate_metric_oddities:
+	input:
+		loci = os.path.join(config["outdir"], "mikado.loci.gff3"),
+		subloci = os.path.join(config["outdir"], "mikado.subloci.gff3"),
+		monoloci = os.path.join(config["outdir"], "mikado.monoloci.gff3"),
+		final_table = rules.minos_generate_final_table.output.final_table
+	output:
+		rules.minos_final_sanity_check.output[0] + ".metric_oddities.tsv"
+	run:
+		import csv
+		from minos.scripts.metric_oddities import MetricOddityParser
+		# NOTE(review): input.loci is a GFF3 file, but MetricOddityParser reads a tab-separated table with a "tid" column -- confirm the intended input
+		with open(input.final_table) as final_table_in:
+			tx2gene = {row[1]: row[0] for row in csv.reader(final_table_in, delimiter="\t") if not row[0].startswith("#")}
+		release_genes = set(tx2gene.values())
+		with open(output[0], "w") as loci_oddities_out:
+			MetricOddityParser(input.loci, config["report_metric_oddities"], release_genes).run(stream=loci_oddities_out)
+
 rule split_proteins_prepare:
 	input:
 		rules.minos_gffread_extract_sequences.output[0]
diff --git a/setup.py b/setup.py
index 12ce44c..18d27ac 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@ description = long_description = description.read()
 
 
 name="minos"
-version = "1.5"
+version = "1.6"
 
 if sys.version_info.major != 3:
     raise EnvironmentError("""minos is a python module that requires python3, and is not compatible with python2. Also, it is now 2020 and support for 2.x has ceased.""")