From a0b2ba38096c8e9d95ed01f5d8669a654ad6495d Mon Sep 17 00:00:00 2001
From: Susana Posada-Cespedes
Date: Fri, 5 Jun 2020 17:22:28 +0200
Subject: [PATCH] Functionality to detect alignment bias

---
 envs/{test_snv.yaml => testbench.yaml} |   1 +
 rules/benchmark.smk                    |  11 +-
 rules/testbench.smk                    |  45 +++++++++
 scripts/alignmentBias.py               | 135 +++++++++++++++++++++++++
 vpipeBench.snake                       |   1 +
 5 files changed, 192 insertions(+), 1 deletion(-)
 rename envs/{test_snv.yaml => testbench.yaml} (93%)
 create mode 100755 scripts/alignmentBias.py

diff --git a/envs/test_snv.yaml b/envs/testbench.yaml
similarity index 93%
rename from envs/test_snv.yaml
rename to envs/testbench.yaml
index f35a5e075..f4bf09984
--- a/envs/test_snv.yaml
+++ b/envs/testbench.yaml
@@ -8,4 +8,5 @@ dependencies:
   - sh
   - biopython
   - scipy
+  - pysam
   - mafft=7.305
diff --git a/rules/benchmark.smk b/rules/benchmark.smk
index 0e3911577..0ff2aaee6 100644
--- a/rules/benchmark.smk
+++ b/rules/benchmark.smk
@@ -40,6 +40,8 @@ class VpipeBenchConfig(VpipeConfig):
             'simBench': __RECORD__(
                 value=f"{VPIPE_BASEDIR}/scripts/simBench.py", type=str),
             'art': __RECORD__(value="art_illumina", type=str),
+            'alignmentBias': __RECORD__(
+                value=f"{VPIPE_BASEDIR}/scripts/alignmentBias.py", type=str),
             'testBench': __RECORD__(
                 value=f"{VPIPE_BASEDIR}/scripts/testBench.py", type=str),
             'alignmentIntervals' : __RECORD__(
@@ -81,10 +83,17 @@ class VpipeBenchConfig(VpipeConfig):
             'mem': __RECORD__(value=5000, type=int),
             'time': __RECORD__(value=90, type=int),
         }),
+        ('alignment_bias', {
+            'mem': __RECORD__(value=2000, type=int),
+            'time': __RECORD__(value=60, type=int),
+            'conda': __RECORD__(
+                value=f'{VPIPE_BASEDIR}/envs/testbench.yaml', type=str),
+        }),
         ('test_snv', {
             'mem': __RECORD__(value=2000, type=int),
             'time': __RECORD__(value=60, type=int),
-            'conda': __RECORD__(value='', type=str),
+            'conda': __RECORD__(
+                value=f'{VPIPE_BASEDIR}/envs/testbench.yaml', type=str),
             're_msa': __RECORD__(value=False, type=bool),
         }),
diff --git a/rules/testbench.smk b/rules/testbench.smk
index 803b56d48..38c16a1db 100644
--- a/rules/testbench.smk
+++ b/rules/testbench.smk
@@ -20,6 +20,51 @@ def snvfiles(wildcards):
             sys.exit(1)
     return snv_files
 
+rule alignment_bias:
+    input:
+        REF = reference_file,
+        BAM = "{sample_dir}/{sample_name}/{date}/alignments/REF_aln.bam",
+        R1gz = "{sample_dir}/{sample_name}/{date}/preprocessed_data/R1.fastq.gz",
+        HAPLOTYPE_SEQS = "{sample_dir}/{sample_name}/{date}/references/haplotypes/haplotypes.fasta",
+    output:
+        "{sample_dir}/{sample_name}/{date}/alignments/alignment_bias.tsv"
+    params:
+        scratch = '2000',
+        mem = config.alignment_bias['mem'],
+        time = config.alignment_bias['time'],
+        PAIRED = '-p' if config.input['paired'] else '',
+        ID = lambda wildcards: f'{wildcards.sample_name}-{wildcards.date}',
+        ALIGNMENT_BIAS = config.applications['alignmentBias'],
+    log:
+        outfile = "{sample_dir}/{sample_name}/{date}/alignments/alignment_bias.out.log",
+        errfile = "{sample_dir}/{sample_name}/{date}/alignments/alignment_bias.err.log"
+    conda:
+        config.alignment_bias['conda']
+    threads:
+        1
+    shell:
+        """
+        {params.ALIGNMENT_BIAS} -r {input.REF} -b {input.BAM} -f <(zcat {input.R1gz}) --hap {input.HAPLOTYPE_SEQS} {params.PAIRED} -N {params.ID} -o {output}
+        """
+
+rule aggregate_alignment_bias:
+    input:
+        expand("{dataset}/alignments/alignment_bias.tsv", dataset=datasets)
+    output:
+        "stats/alignment_bias.tsv"
+    params:
+        scratch = '1250',
+        mem = config.aggregate['mem'],
+        time = config.aggregate['time']
+    log:
+        outfile = "stats/alignment_bias.out.log",
+        errfile = "stats/alignment_bias.err.log"
+    shell:
+        """
+        awk 'FNR!=1' {input} > {output}
+        sed -i '1iSampleID\tHaplotypeID\tDivergence\tPercent-aligned\tPercent-bases-aligned' {output}
+        """
+
 
 rule aggregate_beforeSB:
     input:
diff --git a/scripts/alignmentBias.py b/scripts/alignmentBias.py
new file mode 100755
index 000000000..2ae5141ef
--- /dev/null
+++ b/scripts/alignmentBias.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+
+import argparse
+import pysam
+from Bio import SeqIO, pairwise2
+
+__author__ = "Susana Posada-Cespedes"
+__license__ = "Apache2.0"
+__maintainer__ = "Ivan Topolsky"
+__email__ = "v-pipe@bsse.ethz.ch"
+
+
+def parse_args():
+    """ Set up the parsing of command-line arguments """
+
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    requiredNamed = parser.add_argument_group('required named arguments')
+    requiredNamed.add_argument(
+        "-r", required=True, default=None, metavar='FASTA',
+        dest='reference_file', type=str, help="Reference sequence"
+    )
+    requiredNamed.add_argument(
+        "-b", required=True, default=None, metavar='BAM', dest='bamfile',
+        help="BAM file"
+    )
+    requiredNamed.add_argument(
+        "-f", required=True, default=None, metavar='FASTQ', dest='fastq_file',
+        help="FASTQ file containing reads after QC"
+    )
+    requiredNamed.add_argument(
+        "--hap", required=True, default=None, metavar='FASTA',
+        dest='haplotype_seqs', help="FASTA file containing haplotype sequences"
+    )
+    requiredNamed.add_argument(
+        "-N", required=False, default='sample', metavar='STR',
+        dest='sampleID', help="Patient/sample identifier"
+    )
+    parser.add_argument(
+        "-p", required=False, default=False, action='store_true',
+        dest='paired', help="Indicate whether reads are paired-end"
+    )
+    parser.add_argument(
+        "-o", required=False, default='alignment_bias.tsv', metavar='FILENAME',
+        dest='output_file', type=str,
+        help="Output file"
+    )
+
+    return parser.parse_args()
+
+
+def hamming_dist(s1, s2):
+    """Return the Hamming distance between sequences"""
+    assert len(s1) == len(s2), ("Hamming distance is undefined for sequences "
+                                "differing in their lengths")
+    return sum(el1 != el2 for el1, el2 in zip(s1, s2))
+
+
+def main():
+    args = parse_args()
+    reference = SeqIO.read(args.reference_file, "fasta")
+    reference_len = len(reference)
+    start = None
+    end = None
+
+    hap_cnt = {}
+    hap_coverage = {}
+    with pysam.AlignmentFile(args.bamfile, 'rb') as alnfile:
+        for read in alnfile.fetch(reference=reference.id, start=start, end=end):
+            query_name = read.query_name
+            hapID = query_name.split('-')[0]
+            if hapID in hap_cnt:
+                hap_cnt[hapID] += 1
+            else:
+                hap_cnt[hapID] = 1
+
+            # Get the percent of aligned bases per read
+            cigar_arr = read.get_cigar_stats()[0]
+            aux = (cigar_arr[0] + cigar_arr[1])
+            assert aux == read.query_alignment_length
+            percent_aligned = read.query_alignment_length / read.infer_read_length()
+            assert percent_aligned <= 1, f"{cigar_arr}"
+            if hapID in hap_coverage:
+                hap_coverage[hapID] += percent_aligned
+            else:
+                hap_coverage[hapID] = percent_aligned
+
+    for k, v in hap_cnt.items():
+        aux = hap_coverage[k] / v
+        assert aux <= 1, f"{aux}"
+        hap_coverage[k] = aux
+
+    # Parse FASTQ file to compute the expected number of reads per haplotype
+    hap_expected = {}
+    total = 0
+    for read in SeqIO.parse(args.fastq_file, "fastq"):
+        total += 1
+        hapID = read.id.split('-')[0]
+        if hapID in hap_expected:
+            hap_expected[hapID] += 1
+        else:
+            hap_expected[hapID] = 1
+
+    percent_aligned = {}
+    for k, v in hap_expected.items():
+        if k in hap_cnt:
+            if args.paired:
+                percent_aligned[k] = hap_cnt[k] / (v * 2)
+            else:
+                percent_aligned[k] = hap_cnt[k] / v
+        else:
+            percent_aligned[k] = 0
+            hap_coverage[k] = 0
+
+    # Compute divergence from the reference sequence
+    divergence = {}
+    affine_pen = pairwise2.affine_penalty(-1, -0.1, True)
+    with open(args.haplotype_seqs, 'r') as infile:
+        for record in SeqIO.parse(infile, 'fasta'):
+            alignment = pairwise2.align.globalmc(reference.seq, record.seq, 1, 0,
+                                                 affine_pen, affine_pen,
+                                                 one_alignment_only=True)
+            divergence[record.id] = hamming_dist(alignment[0][0],
+                                                 alignment[0][1]) / alignment[0][4]
+
+    with open(args.output_file, 'w') as outfile:
+        outfile.write("SampleID\tHaplotypeID\tDivergence\tPercent-aligned\t"
+                      "Percent-bases-aligned\n")
+        for k, v in percent_aligned.items():
+            outfile.write(f"{args.sampleID}\t{k}\t{divergence[k]}\t{v}\t"
+                          f"{hap_coverage[k]}\n")
+
+if __name__ == '__main__':
+    main()
diff --git a/vpipeBench.snake b/vpipeBench.snake
index b4fda9116..8450d099c 100644
--- a/vpipeBench.snake
+++ b/vpipeBench.snake
@@ -26,6 +26,7 @@ rule allbench:
     input:
         RESULTS,
         "variants/SNV_calling_performance.tsv",
+        "stats/alignment_bias.tsv",
         "stats/coverage_intervals.tsv",
        "stats/read_counts.tsv"
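
Usage sketch (illustrative): the command below shows how the new script could be run outside of Snakemake. The flags follow the argparse options defined in scripts/alignmentBias.py; the input paths and the sample ID are placeholders, and the BAM must be coordinate-sorted and indexed because the script iterates over it with pysam's fetch(). Read names are expected to encode the originating haplotype before the first '-', since the script splits query names on that character.

    # hypothetical invocation; file names are placeholders
    ./scripts/alignmentBias.py -r reference.fasta -b REF_aln.bam \
        -f R1.fastq --hap haplotypes.fasta -p -N sample1-20200605 \
        -o alignment_bias.tsv

The resulting TSV has one row per haplotype with the columns SampleID, HaplotypeID, Divergence, Percent-aligned and Percent-bases-aligned; rule aggregate_alignment_bias then concatenates the per-sample tables, dropping their headers and re-inserting a single header line.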