From a0b2ba38096c8e9d95ed01f5d8669a654ad6495d Mon Sep 17 00:00:00 2001
From: Susana Posada-Cespedes
Date: Fri, 5 Jun 2020 17:22:28 +0200
Subject: [PATCH] Functionality to detect alignment bias

---
 envs/{test_snv.yaml => testbench.yaml} |   1 +
 rules/benchmark.smk                    |  11 +-
 rules/testbench.smk                    |  45 +++++++++
 scripts/alignmentBias.py               | 135 +++++++++++++++++++++++++
 vpipeBench.snake                       |   1 +
 5 files changed, 192 insertions(+), 1 deletion(-)
 rename envs/{test_snv.yaml => testbench.yaml} (93%)
 create mode 100755 scripts/alignmentBias.py

diff --git a/envs/test_snv.yaml b/envs/testbench.yaml
similarity index 93%
rename from envs/test_snv.yaml
rename to envs/testbench.yaml
index f35a5e075..f4bf09984
--- a/envs/test_snv.yaml
+++ b/envs/testbench.yaml
@@ -8,4 +8,5 @@ dependencies:
   - sh
   - biopython
   - scipy
+  - pysam
   - mafft=7.305
diff --git a/rules/benchmark.smk b/rules/benchmark.smk
index 0e3911577..0ff2aaee6 100644
--- a/rules/benchmark.smk
+++ b/rules/benchmark.smk
@@ -40,6 +40,8 @@ class VpipeBenchConfig(VpipeConfig):
             'simBench': __RECORD__(
                 value=f"{VPIPE_BASEDIR}/scripts/simBench.py", type=str),
             'art': __RECORD__(value="art_illumina", type=str),
+            'alignmentBias': __RECORD__(
+                value=f"{VPIPE_BASEDIR}/scripts/alignmentBias.py", type=str),
             'testBench': __RECORD__(
                 value=f"{VPIPE_BASEDIR}/scripts/testBench.py", type=str),
             'alignmentIntervals' : __RECORD__(
@@ -81,10 +83,17 @@ class VpipeBenchConfig(VpipeConfig):
             'mem': __RECORD__(value=5000, type=int),
             'time': __RECORD__(value=90, type=int),
         }),
+        ('alignment_bias', {
+            'mem': __RECORD__(value=2000, type=int),
+            'time': __RECORD__(value=60, type=int),
+            'conda': __RECORD__(
+                value=f'{VPIPE_BASEDIR}/envs/testbench.yaml', type=str),
+        }),
         ('test_snv', {
             'mem': __RECORD__(value=2000, type=int),
             'time': __RECORD__(value=60, type=int),
-            'conda': __RECORD__(value='', type=str),
+            'conda': __RECORD__(
+                value=f'{VPIPE_BASEDIR}/envs/testbench.yaml', type=str),
             're_msa': __RECORD__(value=False, type=bool),
         }),
diff --git a/rules/testbench.smk b/rules/testbench.smk
index 803b56d48..38c16a1db 100644
--- a/rules/testbench.smk
+++ b/rules/testbench.smk
@@ -20,6 +20,51 @@ def snvfiles(wildcards):
             sys.exit(1)
     return snv_files
 
+rule alignment_bias:
+    input:
+        REF = reference_file,
+        BAM = "{sample_dir}/{sample_name}/{date}/alignments/REF_aln.bam",
+        R1gz = "{sample_dir}/{sample_name}/{date}/preprocessed_data/R1.fastq.gz",
+        HAPLOTYPE_SEQS = "{sample_dir}/{sample_name}/{date}/references/haplotypes/haplotypes.fasta",
+    output:
+        "{sample_dir}/{sample_name}/{date}/alignments/alignment_bias.tsv"
+    params:
+        scratch = '2000',
+        mem = config.alignment_bias['mem'],
+        time = config.alignment_bias['time'],
+        PAIRED = '-p' if config.input['paired'] else '',
+        ID = lambda wildcards: f'{wildcards.sample_name}-{wildcards.date}',
+        ALIGNMENT_BIAS = config.applications['alignmentBias'],
+    log:
+        outfile = "{sample_dir}/{sample_name}/{date}/alignments/alignment_bias.out.log",
+        errfile = "{sample_dir}/{sample_name}/{date}/alignments/alignment_bias.err.log"
+    conda:
+        config.alignment_bias['conda']
+    threads:
+        1
+    shell:
+        """
+        {params.ALIGNMENT_BIAS} -r {input.REF} -b {input.BAM} -f <(zcat {input.R1gz}) --hap {input.HAPLOTYPE_SEQS} {params.PAIRED} -N {params.ID} -o {output}
+        """
+
+rule aggregate_alignment_bias:
+    input:
+        expand("{dataset}/alignments/alignment_bias.tsv", dataset=datasets)
+    output:
+        "stats/alignment_bias.tsv"
+    params:
+        scratch = '1250',
+        mem = config.aggregate['mem'],
+        time = config.aggregate['time']
+    log:
+        outfile = "stats/alignment_bias.out.log",
+        errfile = "stats/alignment_bias.err.log"
+    shell:
+        """
+        awk 'FNR!=1' {input} > {output}
+        sed -i '1iSampleID\tHaplotypeID\tDivergence\tPercent-aligned\tPercent-bases-aligned' {output}
+        """
+
 
 rule aggregate_beforeSB:
     input:
diff --git a/scripts/alignmentBias.py b/scripts/alignmentBias.py
new file mode 100755
index 000000000..2ae5141ef
--- /dev/null
+++ b/scripts/alignmentBias.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+
+import argparse
+import pysam
+from Bio import SeqIO, pairwise2
+
+__author__ = "Susana Posada-Cespedes"
+__license__ = "Apache2.0"
+__maintainer__ = "Ivan Topolsky"
+__email__ = "v-pipe@bsse.ethz.ch"
+
+
+def parse_args():
+    """ Set up the parsing of command-line arguments """
+
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    requiredNamed = parser.add_argument_group('required named arguments')
+    requiredNamed.add_argument(
+        "-r", required=True, default=None, metavar='FASTA',
+        dest='reference_file', type=str, help="Reference sequence"
+    )
+    requiredNamed.add_argument(
+        "-b", required=True, default=None, metavar='BAM', dest='bamfile',
+        help="BAM file"
+    )
+    requiredNamed.add_argument(
+        "-f", required=True, default=None, metavar='FASTQ', dest='fastq_file',
+        help="FASTQ file containing reads after QC"
+    )
+    requiredNamed.add_argument(
+        "--hap", required=True, default=None, metavar='FASTA',
+        dest='haplotype_seqs', help="FASTA file containing haplotype sequences"
+    )
+    requiredNamed.add_argument(
+        "-N", required=False, default='sample', metavar='STR',
+        dest='sampleID', help="Patient/sample identifier"
+    )
+    parser.add_argument(
+        "-p", required=False, default=False, action='store_true',
+        dest='paired', help="Indicate whether reads are paired-end"
+    )
+    parser.add_argument(
+        "-o", required=False, default='alignment_bias.tsv', metavar='FILENAME',
+        dest='output_file', type=str,
+        help="Output file"
+    )
+
+    return parser.parse_args()
+
+
+def hamming_dist(s1, s2):
+    """Return the Hamming distance between sequences"""
+    assert len(s1) == len(s2), ("Hamming distance is undefined for sequences "
+                                "differing in their lengths")
+    return sum(el1 != el2 for el1, el2 in zip(s1, s2))
+
+
+def main():
+    args = parse_args()
+    reference = SeqIO.read(args.reference_file, "fasta")
+    reference_len = len(reference)
+    start = None
+    end = None
+
+    hap_cnt = {}
+    hap_coverage = {}
+    with pysam.AlignmentFile(args.bamfile, 'rb') as alnfile:
+        for read in alnfile.fetch(reference=reference.id, start=start, end=end):
+            query_name = read.query_name
+            hapID = query_name.split('-')[0]
+            if hapID in hap_cnt:
+                hap_cnt[hapID] += 1
+            else:
+                hap_cnt[hapID] = 1
+
+            # Get the percent of aligned bases per read
+            cigar_arr = read.get_cigar_stats()[0]
+            aux = (cigar_arr[0] + cigar_arr[1])
+            assert aux == read.query_alignment_length
+            percent_aligned = read.query_alignment_length / read.infer_read_length()
+            assert percent_aligned <= 1, f"{cigar_arr}"
+            if hapID in hap_coverage:
+                hap_coverage[hapID] += percent_aligned
+            else:
+                hap_coverage[hapID] = percent_aligned
+
+    for k, v in hap_cnt.items():
+        aux = hap_coverage[k] / v
+        assert aux <= 1, f"{aux}"
+        hap_coverage[k] = aux
+
+    # Parse FASTQ file to compute the expected number of reads per haplotype
+    hap_expected = {}
+    total = 0
+    for read in SeqIO.parse(args.fastq_file, "fastq"):
+        total += 1
+        hapID = read.id.split('-')[0]
+        if hapID in hap_expected:
+            hap_expected[hapID] += 1
+        else:
+            hap_expected[hapID] = 1
+
+    percent_aligned = {}
+    for k, v in hap_expected.items():
+        if k in hap_cnt:
+            if args.paired:
+                percent_aligned[k] = hap_cnt[k] / (v * 2)
+            else:
+                percent_aligned[k] = hap_cnt[k] / v
+        else:
+            percent_aligned[k] = 0
+            hap_coverage[k] = 0
+
+    # Compute divergence from the reference sequence
+    divergence = {}
+    affine_pen = pairwise2.affine_penalty(-1, -0.1, True)
+    with open(args.haplotype_seqs, 'r') as infile:
+        for record in SeqIO.parse(infile, 'fasta'):
+            alignment = pairwise2.align.globalmc(reference.seq, record.seq, 1, 0,
+                                                 affine_pen, affine_pen,
+                                                 one_alignment_only=True)
+            divergence[record.id] = hamming_dist(alignment[0][0],
+                                                 alignment[0][1]) / alignment[0][4]
+
+    with open(args.output_file, 'w') as outfile:
+        outfile.write("SampleID\tHaplotypeID\tDivergence\tPercent-aligned\t"
+                      "Percent-bases-aligned\n")
+        for k, v in percent_aligned.items():
+            outfile.write(f"{args.sampleID}\t{k}\t{divergence[k]}\t{v}\t"
+                          f"{hap_coverage[k]}\n")
+
+if __name__ == '__main__':
+    main()
diff --git a/vpipeBench.snake b/vpipeBench.snake
index b4fda9116..8450d099c 100644
--- a/vpipeBench.snake
+++ b/vpipeBench.snake
@@ -26,6 +26,7 @@ rule allbench:
     input:
         RESULTS,
         "variants/SNV_calling_performance.tsv",
+        "stats/alignment_bias.tsv",
         "stats/coverage_intervals.tsv",
        "stats/read_counts.tsv"
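
Usage sketch (illustrative): the command below shows how the new script could be run outside of Snakemake. The flags follow the argparse options defined in scripts/alignmentBias.py; the input paths and the sample ID are placeholders, and the BAM must be coordinate-sorted and indexed because the script iterates over it with pysam's fetch(). Read names are expected to encode the originating haplotype before the first '-', since the script splits query names on that character.

    # hypothetical invocation; file names are placeholders
    ./scripts/alignmentBias.py -r reference.fasta -b REF_aln.bam \
        -f R1.fastq --hap haplotypes.fasta -p -N sample1-20200605 \
        -o alignment_bias.tsv

The resulting TSV has one row per haplotype with the columns SampleID, HaplotypeID, Divergence, Percent-aligned and Percent-bases-aligned; rule aggregate_alignment_bias then concatenates the per-sample tables, dropping their headers and re-inserting a single header line.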