From 6893626d3cb18debc49ea3fe25bfeb3e93467b1e Mon Sep 17 00:00:00 2001
From: Dan Fornika <dfornika@gmail.com>
Date: Fri, 6 Sep 2024 16:43:17 -0700
Subject: [PATCH] Add json version of qualimap genome results parser

---
 bin/qualimap_bamqc_genome_results_to_json.py | 340 +++++++++++++++++++
 modules/alignment_variants.nf                |   9 +-
 2 files changed, 348 insertions(+), 1 deletion(-)
 create mode 100755 bin/qualimap_bamqc_genome_results_to_json.py

diff --git a/bin/qualimap_bamqc_genome_results_to_json.py b/bin/qualimap_bamqc_genome_results_to_json.py
new file mode 100755
index 0000000..0d45464
--- /dev/null
+++ b/bin/qualimap_bamqc_genome_results_to_json.py
@@ -0,0 +1,340 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+import os
+import sys
+
+
+def get_section_lines(report_path, section_name):
+    """
+    """
+    section_lines = []
+
+    with open(report_path, 'r') as f:
+        # skip lines until we reach the input section
+        # indicated by the line starting with '>>>>>>> Input'
+        for line in f:
+            if line.startswith(f'>>>>>>> {section_name}'):
+                break
+        for line in f:
+            if line.startswith('>>>>>>>'):
+                break
+            if line.strip() != '':
+                section_lines.append(line.strip())
+        
+    return section_lines
+
+
+def parse_input_section(report_path):
+    """
+    """
+    section_lines = get_section_lines(report_path, 'Input')
+    input_section = {}
+    for line in section_lines:
+        key, value = line.split('=')
+        key = key.strip().replace(' ', '_').lower()
+        value = value.strip()
+        input_section[key] = value
+
+    return input_section
+
+
+def parse_reference_section(report_path):
+    """
+    """
+    section_lines = get_section_lines(report_path, 'Reference')
+    reference_section = {}
+    keys_with_units = [
+        'number_of_bases',
+    ]
+    int_keys = [
+        'number_of_bases',
+        'number_of_contigs',
+    ]
+    for line in section_lines:
+        key, value = line.split('=')
+        key = key.strip().replace(' ', '_').lower()
+        value = value.strip()
+        if key in keys_with_units:
+            value, unit = value.split()
+        if key in int_keys:
+            value = value.replace(',', '')
+            value = int(value)
+        reference_section[key] = value
+
+    return reference_section
+
+
+def parse_int_with_percentage(value):
+    """
+    Convert a string like "20,096,173 (99.89%)" to a tuple (20096173, 99.89)
+    """
+    value, percentage = value.split()
+    value = value.replace(',', '')
+    value = int(value)
+    percentage = percentage.strip('()').replace('%', '')
+    percentage = float(percentage)
+
+    return value, percentage
+
+def parse_globals_section(report_path):
+    """
+    """
+    section_lines = get_section_lines(report_path, 'Globals')
+    globals_section = {}
+    int_keys = [
+        'number_of_windows',
+        'number_of_reads',
+        'number_of_secondary_alignments',
+        'number_of_mapped_paired_reads_first_in_pair',
+        'number_of_mapped_paired_reads_second_in_pair',
+        'number_of_mapped_paired_reads_both_in_pair',
+        'number_of_mapped_paired_reads_singletons',
+        'number_of_overlapping_read_pairs',
+        'number_of_duplicated_reads_flagged',
+    ]
+    int_with_unit_keys = [
+        'number_of_mapped_bases',
+        'number_of_sequenced_bases',
+        'number_of_aligned_bases',
+    ]
+    int_with_percentage_keys = [
+        'number_of_mapped_reads',
+        'number_of_supplementary_alignments',
+    ]
+    for line in section_lines:
+        key, value = line.split('=')
+        key = key.strip().replace(' ', '_').replace('(', '').replace(')', '').lower()
+        value = value.strip()
+        if key in int_keys:
+            value = value.replace(',', '')
+            value = int(value)
+        elif key in int_with_percentage_keys:
+            value, percentage = parse_int_with_percentage(value)
+            percentage_key = key.replace('number_of_', 'percent_')
+            globals_section[percentage_key] = percentage
+        elif key in int_with_unit_keys:
+            value, unit = value.split()
+            value = value.replace(',', '')
+            value = int(value)
+        globals_section[key] = value
+
+    return globals_section
+
+
+def parse_insert_size_section(report_path):
+    """
+    """
+    insert_size_section = {}
+    section_lines = get_section_lines(report_path, 'Insert size')
+    float_keys = [
+        'mean_insert_size',
+        'std_insert_size',   
+    ]
+    int_keys = [
+        'median_insert_size',
+    ]
+    for line in section_lines:
+        key, value = line.split('=')
+        key = key.strip().replace(' ', '_').lower()
+        value = value.strip()
+        if key in float_keys:
+            value = value.replace(',', '')
+            value = float(value)
+        elif key in int_keys:
+            value = value.replace(',', '')
+            value = int(value)
+        insert_size_section[key] = value
+
+    return insert_size_section
+
+
+def parse_actg_content_section(report_path):
+    """
+    """
+    actg_content_section = {}
+    section_lines = get_section_lines(report_path, 'ACTG content')
+    int_with_units_and_percentage_keys = [
+        'number_of_a_bases',
+        'number_of_c_bases',
+        'number_of_g_bases',
+        'number_of_t_bases',
+        'number_of_n_bases',
+    ]
+    percentage_keys = [
+        'gc_percentage',
+    ]
+    for line in section_lines:
+        key, value = line.split('=')
+        key = key.strip().replace(' ', '_').replace("'", "_base").lower()
+        value = value.strip()
+        if key in int_with_units_and_percentage_keys:
+            value_split = value.split()
+            value_without_units = ' '.join([value_split[0], value_split[2]])
+            value, percentage = parse_int_with_percentage(value_without_units)
+            percentage_key = key.replace('number_of_', 'percent_')
+            actg_content_section[percentage_key] = percentage
+        elif key in percentage_keys:
+            value = value.replace('%', '')
+            value = float(value)
+        actg_content_section[key] = value
+
+    return actg_content_section
+
+
+def parse_mismatches_and_indels_section(report_path):
+    """
+    """
+    mismatches_and_indels_section = {}
+    section_lines = get_section_lines(report_path, 'Mismatches and indels')
+    int_keys = [
+        'number_of_mismatches',
+        'number_of_insertions',
+        'number_of_deletions',
+    ]
+    float_keys = [
+        'general_error_rate',
+    ]
+    percentage_keys = [
+        'mapped_reads_with_insertion_percentage',
+        'mapped_reads_with_deletion_percentage',
+        'homopolymer_indels',
+    ]
+    for line in section_lines:
+        key, value = line.split('=')
+        key = key.strip().replace(' ', '_').lower()
+        value = value.strip()
+        if key in int_keys:
+            value = value.replace(',', '')
+            value = int(value)
+        elif key in float_keys:
+            value = float(value)
+        elif key in percentage_keys:
+            value = value.replace('%', '')
+            value = float(value)
+            if key == 'homopolymer_indels':
+                key = 'homopolymer_indels_percentage'
+        mismatches_and_indels_section[key] = value
+
+    return mismatches_and_indels_section
+
+def parse_percent_coverage_line(line):
+    """
+    Convert lines that look like:
+
+    There is a 100% of reference with a coverageData >= 1X
+
+    ...into a tuple (1, 100)
+    """
+    line_split = line.split()
+    coverage = line_split[-1]
+    percentage = line_split[3]
+    coverage = int(coverage.strip('X'))
+    percentage = float(percentage.strip('%'))
+
+    return coverage, percentage
+
+def parse_coverage_section(report_path):
+    """
+    """
+    coverage_section = {
+        'coverage_profile': [],
+    }
+    section_lines = get_section_lines(report_path, 'Coverage')
+    float_x_keys = [
+        'mean_coverage',
+        'std_coverage',
+        'paired-end_adapted_mean_coverage',
+    ]
+    for line in section_lines:
+        if line.startswith('There is a'):
+            coverage, value = parse_percent_coverage_line(line)
+            coverage_profile_record = {
+                'depth_threshold': coverage,
+                'percent_reference_covered': value,
+            }
+            coverage_section['coverage_profile'].append(coverage_profile_record)
+        else:
+            key, value = line.split('=')
+            key = key.strip().replace(' ', '_').replace('coverageData', 'coverage').lower()
+            value = value.strip()
+            if key in float_x_keys:
+                value = value.replace('X', '')
+                value = float(value)
+        coverage_section[key] = value
+
+    return coverage_section
+
+
+def parse_coverage_per_contig_section(report_path):
+    """
+    """
+    coverage_per_contig_section = []
+    section_lines = get_section_lines(report_path, 'Coverage per contig')
+    section_headers = [
+        'name',
+        'length',
+        'mapped_bases',
+        'mean_coverage',
+        'standard_deviation',
+    ]
+    int_keys = [
+        'length',
+        'mapped_bases',
+    ]
+    float_keys = [
+        'mean_coverage',
+        'standard_deviation',
+    ]
+    for line in section_lines:
+        line_split = line.split('\t')
+        contig_data = {}
+        for i, header in enumerate(section_headers):
+            value = line_split[i]
+            if header in int_keys:
+                value = int(value)
+            elif header in float_keys:
+                value = float(value)
+            contig_data[header] = value
+        coverage_per_contig_section.append(contig_data)
+
+    return coverage_per_contig_section
+
+
+def main(args):
+
+    parsed_genome_results = {}
+    input_section = parse_input_section(args.input)
+    parsed_genome_results['input'] = input_section
+
+    reference_section = parse_reference_section(args.input)
+    parsed_genome_results['reference'] = reference_section
+
+    globals_section = parse_globals_section(args.input)
+    parsed_genome_results['globals'] = globals_section
+
+    insert_size_section = parse_insert_size_section(args.input)
+    parsed_genome_results['insert_size'] = insert_size_section
+
+    actg_content_section = parse_actg_content_section(args.input)
+    parsed_genome_results['actg_content'] = actg_content_section
+
+    mismatches_and_indels_section = parse_mismatches_and_indels_section(args.input)
+    parsed_genome_results['mismatches_and_indels'] = mismatches_and_indels_section
+
+    coverage_section = parse_coverage_section(args.input)
+    parsed_genome_results['coverage'] = coverage_section
+
+    coverage_per_contig_section = parse_coverage_per_contig_section(args.input)
+    parsed_genome_results['coverage_per_contig'] = coverage_per_contig_section
+
+    print(json.dumps(parsed_genome_results, indent=4))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Description of your script')
+    parser.add_argument('--input', type=str, required=True, help='Path to the input file')
+    args = parser.parse_args()
+    main(args)
+    
diff --git a/modules/alignment_variants.nf b/modules/alignment_variants.nf
index 43b5f30..30b02ea 100644
--- a/modules/alignment_variants.nf
+++ b/modules/alignment_variants.nf
@@ -142,7 +142,7 @@ process qualimap_bamqc {
     tag { sample_id + ' / ' + short_long }
 
     publishDir  "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_qualimap_alignment_qc.csv"
-    publishDir  "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_qualimap_genome_results.txt"
+    publishDir  "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_qualimap_genome_results.{txt,json}"
     publishDir  "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${short_long}_qualimap_report.pdf"
 
     input:
@@ -152,6 +152,7 @@ process qualimap_bamqc {
     tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_alignment_qc.csv"), emit: alignment_qc
     tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_report.pdf"), emit: report, optional: true
     tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_genome_results.txt"), emit: genome_results, optional: true
+    tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_genome_results.json"), emit: genome_results_json, optional: true
     tuple val(sample_id), path("${sample_id}_${short_long}_qualimap_bamqc_provenance.yml"), emit: provenance
     
     script:
@@ -192,7 +193,13 @@ process qualimap_bamqc {
 	--read-type ${short_long} \
 	--failed \
 	> ${sample_id}_${short_long}_qualimap_alignment_qc.csv
+
+    echo '{}' > ${sample_id}_${short_long}_qualimap_genome_results.json
     else
+	qualimap_bamqc_genome_results_to_json.py \
+	--input ${sample_id}_${short_long}_bamqc/genome_results.txt \
+	> ${sample_id}_${short_long}_qualimap_genome_results.json
+	
 	qualimap_bamqc_genome_results_to_csv.py \
 	-s ${sample_id} \
 	--read-type ${short_long} \