From 597e528b632ceb431ddd23bbe10a8c6d750d1883 Mon Sep 17 00:00:00 2001 From: Archa Jain Date: Mon, 11 Sep 2017 16:30:53 -0700 Subject: [PATCH 1/3] Add different forward/reverse end shifts. --- bin/bam_cov.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/bin/bam_cov.py b/bin/bam_cov.py index 50d757f2..736f19cc 100755 --- a/bin/bam_cov.py +++ b/bin/bam_cov.py @@ -84,6 +84,8 @@ def main(): parser.add_option('-s', dest='smooth_sd', default=32, type='float', help='Gaussian standard deviation to smooth coverage estimates with [Default: %default]') parser.add_option('-t', dest='shift', default=0, type='int', help='Fragment shift [Default: %default]') parser.add_option('-u', dest='unsorted', default=False, action='store_true', help='Alignments are unsorted [Default: %default]') + parser.add_option('-v', dest='shift_forward_end', default=0, type='int', help='Fragment shift for forward end read [Default: %default]') + parser.add_option('-w', dest='shift_reverse_end', default=0, type='int', help='Fragment shift for reverse end read [Default: %default]') (options,args) = parser.parse_args() if len(args) != 2: @@ -110,7 +112,9 @@ def main(): options.duplicate_max *= 2 # initialize - genome_coverage = GenomeCoverage(chrom_lengths, smooth_sd=options.smooth_sd, duplicate_max=options.duplicate_max, multi_max=options.multi_max, shift=options.shift, fasta_file=options.fasta_file) + genome_coverage = GenomeCoverage(chrom_lengths, smooth_sd=options.smooth_sd, duplicate_max=options.duplicate_max, + multi_max=options.multi_max, shift=options.shift, shift_forward=options.shift_forward_end, + shift_reverse=options.shift_reverse_end, fasta_file=options.fasta_file) # estimate fragment shift if options.strand_corr_shift: @@ -366,7 +370,7 @@ class GenomeCoverage: shift (int): Alignment shift to maximize cross-strand coverage correlation ''' - def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift=0, fasta_file=None): + def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift=0, shift_forward=0, shift_reverse=0, fasta_file=None): self.chrom_lengths = chrom_lengths self.genome_length = sum(chrom_lengths.values()) self.unique_counts = np.zeros(self.genome_length, dtype='uint16') @@ -376,6 +380,8 @@ def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, self.duplicate_max = duplicate_max self.multi_max = multi_max self.shift = shift + self.shift_forward = shift_forward + self.shift_reverse = shift_reverse self.adaptive_max = adaptive_max self.adaptive_cdf = .05 @@ -967,21 +973,26 @@ def read_bam(self, bam_file, genome_sorted=False): read_id = (align.query_name, align.is_read1) # determine shift - if self.shift == 0: - # don't shift anyone - align_shift = 0 + if self.shift_forward and self.shift_reverse: + align_shift_forward = self.shift_forward + align_shift_reverse = self.shift_reverse else: - if align.is_proper_pair: - # shift proper pairs according to mate - align_shift = abs(align.template_length) // 2 + if self.shift == 0: + # don't shift anyone + align_shift_forward = 0 else: - # shift others by estimated amount - align_shift = self.shift + if align.is_proper_pair: + # shift proper pairs according to mate + align_shift_forward = abs(align.template_length) // 2 + else: + # shift others by estimated amount + align_shift_forward = self.shift + align_shift_reverse = align_shift_forward # determine alignment position - chrom_pos = align.reference_start + align_shift + chrom_pos = align.reference_start + align_shift_forward if align.is_reverse: - chrom_pos = align.reference_end - align_shift + chrom_pos = align.reference_end - align_shift_reverse # determine genome index gi = self.genome_index(align.reference_id, chrom_pos) From 044dded62a13cf818bc785d17dcf115bf2d215c1 Mon Sep 17 00:00:00 2001 From: David Kelley Date: Sat, 16 Sep 2017 15:34:14 -0700 Subject: [PATCH 2/3] extra comments --- bin/bam_cov.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/bam_cov.py b/bin/bam_cov.py index 736f19cc..e9f75185 100755 --- a/bin/bam_cov.py +++ b/bin/bam_cov.py @@ -978,9 +978,10 @@ def read_bam(self, bam_file, genome_sorted=False): align_shift_reverse = self.shift_reverse else: if self.shift == 0: - # don't shift anyone + # don't shift align_shift_forward = 0 else: + # assuming we want the fragment center if align.is_proper_pair: # shift proper pairs according to mate align_shift_forward = abs(align.template_length) // 2 From 2436b9db926ce34d10980d4c7df10d3caba81510 Mon Sep 17 00:00:00 2001 From: David Kelley Date: Sat, 16 Sep 2017 15:51:37 -0700 Subject: [PATCH 3/3] clarifying end versus center shifts --- bin/bam_cov.py | 64 +++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/bin/bam_cov.py b/bin/bam_cov.py index e9f75185..cf9c8e17 100755 --- a/bin/bam_cov.py +++ b/bin/bam_cov.py @@ -73,16 +73,15 @@ def main(): usage = 'usage: %prog [options] ' parser = OptionParser(usage) parser.add_option('-a', dest='adaptive_max', default=8, type='int', help='Maximum coverage at a single position, adaptively-determined [Default: %default]') - parser.add_option('-c', dest='cut_bias_kmer', default=None, action='store_true', help='Normalize coverage for a cutting bias model for k-mers [Default: %default]') + parser.add_option('-b', dest='cut_bias_kmer', default=None, action='store_true', help='Normalize coverage for a cutting bias model for k-mers [Default: %default]') + parser.add_option('-c', dest='shift_center', default=False, action='store_true', help='Shift the event to the fragment center, learning the distribution for single end reads [Default: %default]') parser.add_option('-d', dest='duplicate_max', default=None, type='int', help='Maximum coverage at a single position for initial clipping [Default: %default]') parser.add_option('-f', dest='fasta_file', default='%s/assembly/hg19.fa'%os.environ['HG19'], help='FASTA to obtain sequence to control for GC% [Default: %default]') parser.add_option('-g', dest='gc', default=False, action='store_true', help='Control for local GC% [Default: %default]') parser.add_option('-m', dest='multi_em', default=0, type='int', help='Iterations of EM to distribute multi-mapping reads [Default: %default]') parser.add_option('--multi_max', dest='multi_max', default=1, type='int', help='Maximum coverage at a single position from multi-mapping reads [Default: %default]') parser.add_option('-o', dest='out_dir', default='bam_cov', help='Output directory [Default: %default]') - parser.add_option('-r', dest='strand_corr_shift', default=False, action='store_true', help='Learn the fragment shift by fitting a strand cross correlation model [Default: %default]') parser.add_option('-s', dest='smooth_sd', default=32, type='float', help='Gaussian standard deviation to smooth coverage estimates with [Default: %default]') - parser.add_option('-t', dest='shift', default=0, type='int', help='Fragment shift [Default: %default]') parser.add_option('-u', dest='unsorted', default=False, action='store_true', help='Alignments are unsorted [Default: %default]') parser.add_option('-v', dest='shift_forward_end', default=0, type='int', help='Fragment shift for forward end read [Default: %default]') parser.add_option('-w', dest='shift_reverse_end', default=0, type='int', help='Fragment shift for reverse end read [Default: %default]') @@ -112,12 +111,17 @@ def main(): options.duplicate_max *= 2 # initialize - genome_coverage = GenomeCoverage(chrom_lengths, smooth_sd=options.smooth_sd, duplicate_max=options.duplicate_max, - multi_max=options.multi_max, shift=options.shift, shift_forward=options.shift_forward_end, - shift_reverse=options.shift_reverse_end, fasta_file=options.fasta_file) + genome_coverage = GenomeCoverage(chrom_lengths, + smooth_sd=options.smooth_sd, + duplicate_max=options.duplicate_max, + multi_max=options.multi_max, + shift_center=options.shift_center, + shift_forward=options.shift_forward_end, + shift_reverse=options.shift_reverse_end, + fasta_file=options.fasta_file) # estimate fragment shift - if options.strand_corr_shift: + if options.shift_center: if sp == 'single': genome_coverage.learn_shift_single(bam_file, out_dir=options.out_dir) else: @@ -370,7 +374,7 @@ class GenomeCoverage: shift (int): Alignment shift to maximize cross-strand coverage correlation ''' - def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift=0, shift_forward=0, shift_reverse=0, fasta_file=None): + def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift_center=False, shift_forward=0, shift_reverse=0, fasta_file=None): self.chrom_lengths = chrom_lengths self.genome_length = sum(chrom_lengths.values()) self.unique_counts = np.zeros(self.genome_length, dtype='uint16') @@ -379,7 +383,8 @@ def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, self.smooth_sd = smooth_sd self.duplicate_max = duplicate_max self.multi_max = multi_max - self.shift = shift + + self.shift_center = shift_center self.shift_forward = shift_forward self.shift_reverse = shift_reverse @@ -855,14 +860,15 @@ def learn_shift_pair(self, bam_file): template_lengths.append(abs(align.template_length)) # compute mean - self.shift = int(np.round(np.mean(template_lengths) / 2)) + self.shift_forward = int(np.round(np.mean(template_lengths) / 2)) print(' Done in %ds.' % (time.time()-t0)) - print('Shift: %d' % self.shift, flush=True) + print('Shift: %d' % self.shift_forward, flush=True) def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None): - ''' Learn the optimal fragment shift that maximizes across strand correlation ''' + ''' Learn the optimal fragment shift that maximizes across strand correlation + (to be applied for single end discordant alignments.) ''' t0 = time.time() print('Learning shift from single-end sequences.', end='', flush=True) @@ -934,10 +940,10 @@ def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None strand_corrs_smooth = gaussian_filter1d(strand_corrs, sigma=12, truncate=3) # find max - self.shift = (shift_min + np.argmax(strand_corrs_smooth)) // 2 + self.shift_forward = (shift_min + np.argmax(strand_corrs_smooth)) // 2 print(' Done in %ds.' % (time.time()-t0)) - print('Shift: %d' % self.shift, flush=True) + print('Shift: %d' % self.shift_forward, flush=True) if out_dir is not None: # plot training fit @@ -946,7 +952,7 @@ def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None # print table shift_out = open('%s/shift_table.txt' % out_dir, 'w') for si in range(len(shifts)): - print(shifts[si], strand_corrs[si], strand_corrs_smooth[si], '*'*int(2*self.shift==shifts[si]), file=shift_out) + print(shifts[si], strand_corrs[si], strand_corrs_smooth[si], '*'*int(2*self.shift_forward==shifts[si]), file=shift_out) shift_out.close() @@ -972,24 +978,22 @@ def read_bam(self, bam_file, genome_sorted=False): if not align.is_unmapped: read_id = (align.query_name, align.is_read1) - # determine shift - if self.shift_forward and self.shift_reverse: - align_shift_forward = self.shift_forward - align_shift_reverse = self.shift_reverse - else: - if self.shift == 0: - # don't shift - align_shift_forward = 0 + if self.shift_center: + if align.is_proper_pair: + # shift proper pairs according to mate + align_shift_forward = abs(align.template_length) // 2 else: - # assuming we want the fragment center - if align.is_proper_pair: - # shift proper pairs according to mate - align_shift_forward = abs(align.template_length) // 2 - else: - # shift others by estimated amount - align_shift_forward = self.shift + # shift others by estimated amount + align_shift_forward = self.shift_forward + + # match reverse to forward align_shift_reverse = align_shift_forward + else: + # apply user-specific shifts + align_shift_forward = self.shift_forward + align_shift_reverse = self.shift_reverse + # determine alignment position chrom_pos = align.reference_start + align_shift_forward if align.is_reverse: