From 597e528b632ceb431ddd23bbe10a8c6d750d1883 Mon Sep 17 00:00:00 2001
From: Archa Jain <archa@sci-vm-021.calicolabs.local>
Date: Mon, 11 Sep 2017 16:30:53 -0700
Subject: [PATCH 1/3] Add different forward/reverse end shifts.

---
 bin/bam_cov.py | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/bin/bam_cov.py b/bin/bam_cov.py
index 50d757f2..736f19cc 100755
--- a/bin/bam_cov.py
+++ b/bin/bam_cov.py
@@ -84,6 +84,8 @@ def main():
     parser.add_option('-s', dest='smooth_sd', default=32, type='float', help='Gaussian standard deviation to smooth coverage estimates with [Default: %default]')
     parser.add_option('-t', dest='shift', default=0, type='int', help='Fragment shift [Default: %default]')
     parser.add_option('-u', dest='unsorted', default=False, action='store_true', help='Alignments are unsorted [Default: %default]')
+    parser.add_option('-v', dest='shift_forward_end', default=0, type='int', help='Fragment shift for forward end read [Default: %default]')
+    parser.add_option('-w', dest='shift_reverse_end', default=0, type='int', help='Fragment shift for reverse end read [Default: %default]')
     (options,args) = parser.parse_args()
 
     if len(args) != 2:
@@ -110,7 +112,9 @@ def main():
         options.duplicate_max *= 2
 
     # initialize
-    genome_coverage = GenomeCoverage(chrom_lengths, smooth_sd=options.smooth_sd, duplicate_max=options.duplicate_max, multi_max=options.multi_max, shift=options.shift, fasta_file=options.fasta_file)
+    genome_coverage = GenomeCoverage(chrom_lengths, smooth_sd=options.smooth_sd, duplicate_max=options.duplicate_max,
+        multi_max=options.multi_max, shift=options.shift, shift_forward=options.shift_forward_end,
+        shift_reverse=options.shift_reverse_end, fasta_file=options.fasta_file)
 
     # estimate fragment shift
     if options.strand_corr_shift:
@@ -366,7 +370,7 @@ class GenomeCoverage:
      shift (int): Alignment shift to maximize cross-strand coverage correlation
     '''
 
-    def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift=0, fasta_file=None):
+    def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift=0, shift_forward=0, shift_reverse=0, fasta_file=None):
         self.chrom_lengths = chrom_lengths
         self.genome_length = sum(chrom_lengths.values())
         self.unique_counts = np.zeros(self.genome_length, dtype='uint16')
@@ -376,6 +380,8 @@ def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2,
         self.duplicate_max = duplicate_max
         self.multi_max = multi_max
         self.shift = shift
+        self.shift_forward = shift_forward
+        self.shift_reverse = shift_reverse
 
         self.adaptive_max = adaptive_max
         self.adaptive_cdf = .05
@@ -967,21 +973,26 @@ def read_bam(self, bam_file, genome_sorted=False):
                 read_id = (align.query_name, align.is_read1)
 
                 # determine shift
-                if self.shift == 0:
-                    # don't shift anyone
-                    align_shift = 0
+                if self.shift_forward and self.shift_reverse:
+                    align_shift_forward = self.shift_forward
+                    align_shift_reverse = self.shift_reverse
                 else:
-                    if align.is_proper_pair:
-                        # shift proper pairs according to mate
-                        align_shift = abs(align.template_length) // 2
+                    if self.shift == 0:
+                        # don't shift anyone
+                        align_shift_forward = 0
                     else:
-                        # shift others by estimated amount
-                        align_shift = self.shift
+                        if align.is_proper_pair:
+                            # shift proper pairs according to mate
+                            align_shift_forward = abs(align.template_length) // 2
+                        else:
+                            # shift others by estimated amount
+                            align_shift_forward = self.shift
+                    align_shift_reverse = align_shift_forward
 
                 # determine alignment position
-                chrom_pos = align.reference_start + align_shift
+                chrom_pos = align.reference_start + align_shift_forward
                 if align.is_reverse:
-                    chrom_pos = align.reference_end - align_shift
+                    chrom_pos = align.reference_end - align_shift_reverse
 
                 # determine genome index
                 gi = self.genome_index(align.reference_id, chrom_pos)

From 044dded62a13cf818bc785d17dcf115bf2d215c1 Mon Sep 17 00:00:00 2001
From: David Kelley <dkelley@fas.harvard.edu>
Date: Sat, 16 Sep 2017 15:34:14 -0700
Subject: [PATCH 2/3] extra comments

---
 bin/bam_cov.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bin/bam_cov.py b/bin/bam_cov.py
index 736f19cc..e9f75185 100755
--- a/bin/bam_cov.py
+++ b/bin/bam_cov.py
@@ -978,9 +978,10 @@ def read_bam(self, bam_file, genome_sorted=False):
                     align_shift_reverse = self.shift_reverse
                 else:
                     if self.shift == 0:
-                        # don't shift anyone
+                        # don't shift
                         align_shift_forward = 0
                     else:
+                        # assuming we want the fragment center
                         if align.is_proper_pair:
                             # shift proper pairs according to mate
                             align_shift_forward = abs(align.template_length) // 2

From 2436b9db926ce34d10980d4c7df10d3caba81510 Mon Sep 17 00:00:00 2001
From: David Kelley <drk@calicolabs.com>
Date: Sat, 16 Sep 2017 15:51:37 -0700
Subject: [PATCH 3/3] clarifying end versus center shifts

---
 bin/bam_cov.py | 64 +++++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/bin/bam_cov.py b/bin/bam_cov.py
index e9f75185..cf9c8e17 100755
--- a/bin/bam_cov.py
+++ b/bin/bam_cov.py
@@ -73,16 +73,15 @@ def main():
     usage = 'usage: %prog [options] <bam_file> <output_file>'
     parser = OptionParser(usage)
     parser.add_option('-a', dest='adaptive_max', default=8, type='int', help='Maximum coverage at a single position, adaptively-determined [Default: %default]')
-    parser.add_option('-c', dest='cut_bias_kmer', default=None, action='store_true', help='Normalize coverage for a cutting bias model for k-mers [Default: %default]')
+    parser.add_option('-b', dest='cut_bias_kmer', default=None, action='store_true', help='Normalize coverage for a cutting bias model for k-mers [Default: %default]')
+    parser.add_option('-c', dest='shift_center', default=False, action='store_true', help='Shift the event to the fragment center, learning the distribution for single end reads [Default: %default]')
     parser.add_option('-d', dest='duplicate_max', default=None, type='int', help='Maximum coverage at a single position for initial clipping [Default: %default]')
     parser.add_option('-f', dest='fasta_file', default='%s/assembly/hg19.fa'%os.environ['HG19'], help='FASTA to obtain sequence to control for GC% [Default: %default]')
     parser.add_option('-g', dest='gc', default=False, action='store_true', help='Control for local GC% [Default: %default]')
     parser.add_option('-m', dest='multi_em', default=0, type='int', help='Iterations of EM to distribute multi-mapping reads [Default: %default]')
     parser.add_option('--multi_max', dest='multi_max', default=1, type='int', help='Maximum coverage at a single position from multi-mapping reads [Default: %default]')
     parser.add_option('-o', dest='out_dir', default='bam_cov', help='Output directory [Default: %default]')
-    parser.add_option('-r', dest='strand_corr_shift', default=False, action='store_true', help='Learn the fragment shift by fitting a strand cross correlation model [Default: %default]')
     parser.add_option('-s', dest='smooth_sd', default=32, type='float', help='Gaussian standard deviation to smooth coverage estimates with [Default: %default]')
-    parser.add_option('-t', dest='shift', default=0, type='int', help='Fragment shift [Default: %default]')
     parser.add_option('-u', dest='unsorted', default=False, action='store_true', help='Alignments are unsorted [Default: %default]')
     parser.add_option('-v', dest='shift_forward_end', default=0, type='int', help='Fragment shift for forward end read [Default: %default]')
     parser.add_option('-w', dest='shift_reverse_end', default=0, type='int', help='Fragment shift for reverse end read [Default: %default]')
@@ -112,12 +111,17 @@ def main():
         options.duplicate_max *= 2
 
     # initialize
-    genome_coverage = GenomeCoverage(chrom_lengths, smooth_sd=options.smooth_sd, duplicate_max=options.duplicate_max,
-        multi_max=options.multi_max, shift=options.shift, shift_forward=options.shift_forward_end,
-        shift_reverse=options.shift_reverse_end, fasta_file=options.fasta_file)
+    genome_coverage = GenomeCoverage(chrom_lengths,
+        smooth_sd=options.smooth_sd,
+        duplicate_max=options.duplicate_max,
+        multi_max=options.multi_max,
+        shift_center=options.shift_center,
+        shift_forward=options.shift_forward_end,
+        shift_reverse=options.shift_reverse_end,
+        fasta_file=options.fasta_file)
 
     # estimate fragment shift
-    if options.strand_corr_shift:
+    if options.shift_center:
         if sp == 'single':
             genome_coverage.learn_shift_single(bam_file, out_dir=options.out_dir)
         else:
@@ -370,7 +374,7 @@ class GenomeCoverage:
      shift (int): Alignment shift to maximize cross-strand coverage correlation
     '''
 
-    def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift=0, shift_forward=0, shift_reverse=0, fasta_file=None):
+    def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift_center=False, shift_forward=0, shift_reverse=0, fasta_file=None):
         self.chrom_lengths = chrom_lengths
         self.genome_length = sum(chrom_lengths.values())
         self.unique_counts = np.zeros(self.genome_length, dtype='uint16')
@@ -379,7 +383,8 @@ def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2,
         self.smooth_sd = smooth_sd
         self.duplicate_max = duplicate_max
         self.multi_max = multi_max
-        self.shift = shift
+
+        self.shift_center = shift_center
         self.shift_forward = shift_forward
         self.shift_reverse = shift_reverse
 
@@ -855,14 +860,15 @@ def learn_shift_pair(self, bam_file):
                 template_lengths.append(abs(align.template_length))
 
         # compute mean
-        self.shift = int(np.round(np.mean(template_lengths) / 2))
+        self.shift_forward = int(np.round(np.mean(template_lengths) / 2))
 
         print(' Done in %ds.' % (time.time()-t0))
-        print('Shift: %d' % self.shift, flush=True)
+        print('Shift: %d' % self.shift_forward, flush=True)
 
 
     def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None):
-        ''' Learn the optimal fragment shift that maximizes across strand correlation '''
+        ''' Learn the optimal fragment shift that maximizes cross-strand correlation
+             (to be applied to single-end and discordant alignments). '''
 
         t0 = time.time()
         print('Learning shift from single-end sequences.', end='', flush=True)
@@ -934,10 +940,10 @@ def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None
         strand_corrs_smooth = gaussian_filter1d(strand_corrs, sigma=12, truncate=3)
 
         # find max
-        self.shift = (shift_min + np.argmax(strand_corrs_smooth)) // 2
+        self.shift_forward = (shift_min + np.argmax(strand_corrs_smooth)) // 2
 
         print(' Done in %ds.' % (time.time()-t0))
-        print('Shift: %d' % self.shift, flush=True)
+        print('Shift: %d' % self.shift_forward, flush=True)
 
         if out_dir is not None:
             # plot training fit
@@ -946,7 +952,7 @@ def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None
             # print table
             shift_out = open('%s/shift_table.txt' % out_dir, 'w')
             for si in range(len(shifts)):
-                print(shifts[si], strand_corrs[si], strand_corrs_smooth[si], '*'*int(2*self.shift==shifts[si]), file=shift_out)
+                print(shifts[si], strand_corrs[si], strand_corrs_smooth[si], '*'*int(2*self.shift_forward==shifts[si]), file=shift_out)
             shift_out.close()
 
 
@@ -972,24 +978,22 @@ def read_bam(self, bam_file, genome_sorted=False):
             if not align.is_unmapped:
                 read_id = (align.query_name, align.is_read1)
 
-                # determine shift
-                if self.shift_forward and self.shift_reverse:
-                    align_shift_forward = self.shift_forward
-                    align_shift_reverse = self.shift_reverse
-                else:
-                    if self.shift == 0:
-                        # don't shift
-                        align_shift_forward = 0
+                if self.shift_center:
+                    if align.is_proper_pair:
+                        # shift proper pairs according to mate
+                        align_shift_forward = abs(align.template_length) // 2
                     else:
-                        # assuming we want the fragment center
-                        if align.is_proper_pair:
-                            # shift proper pairs according to mate
-                            align_shift_forward = abs(align.template_length) // 2
-                        else:
-                            # shift others by estimated amount
-                            align_shift_forward = self.shift
+                        # shift others by estimated amount
+                        align_shift_forward = self.shift_forward
+
+                    # match reverse to forward
                     align_shift_reverse = align_shift_forward
 
+                else:
+                    # apply user-specified shifts
+                    align_shift_forward = self.shift_forward
+                    align_shift_reverse = self.shift_reverse
+
                 # determine alignment position
                 chrom_pos = align.reference_start + align_shift_forward
                 if align.is_reverse: