Merge branch 'master' of https://github.com/calico/basenji
ok
davek44 committed Sep 19, 2017
2 parents 918492b + 159c684 commit 344e02a
Showing 16 changed files with 353 additions and 556 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -37,7 +37,7 @@ To verify the install, launch python and run
---------------------------------------------------------------------------------------------------
### Documentation

At this stage, Basenji is something in between personal research code and accessible software for wide use. The primary challenge is uncertainty what the best role for this type of toolkit is going to be in functional genomics and statistical genetics. The computational requirements don't make it easy either. Thus, this package is under active development, and I encourage anyone to get in touch to relate your experience and request clarifications or additional features, documentation, or tutorials.
At this stage, Basenji is something in between personal research code and accessible software for wide use. The primary challenge is uncertainty in what the best role for this type of toolkit is going to be in functional genomics and statistical genetics. The computational requirements don't make it easy either. Thus, this package is under active development, and I encourage anyone to get in touch to relate your experience and request clarifications or additional features, documentation, or tutorials.

- [Preprocess](docs/preprocess.md)
- [bam_cov.py](docs/preprocess.md#bam_cov)
@@ -71,4 +71,4 @@ These are a work in progress, so forgive incompleteness for the moment. If there
- [Test gene expression predictions.](tutorials/genes.ipynb)
- Study
- [Execute an in silico saturated mutagenesis](tutorials/sat_mut.ipynb)
- [Compute SNP Activity Difference (SAD) and Expression Difference (SED) scores.](tutorials/sad.ipynb)
- [Compute SNP Activity Difference (SAD) and Expression Difference (SED) scores.](tutorials/sad.ipynb)
60 changes: 38 additions & 22 deletions bin/bam_cov.py
@@ -73,17 +73,18 @@ def main():
usage = 'usage: %prog [options] <bam_file> <output_file>'
parser = OptionParser(usage)
parser.add_option('-a', dest='adaptive_max', default=8, type='int', help='Maximum coverage at a single position, adaptively-determined [Default: %default]')
parser.add_option('-c', dest='cut_bias_kmer', default=None, action='store_true', help='Normalize coverage for a cutting bias model for k-mers [Default: %default]')
parser.add_option('-b', dest='cut_bias_kmer', default=None, action='store_true', help='Normalize coverage for a cutting bias model for k-mers [Default: %default]')
parser.add_option('-c', dest='shift_center', default=False, action='store_true', help='Shift the event to the fragment center, learning the distribution for single end reads [Default: %default]')
parser.add_option('-d', dest='duplicate_max', default=None, type='int', help='Maximum coverage at a single position for initial clipping [Default: %default]')
parser.add_option('-f', dest='fasta_file', default='%s/assembly/hg19.fa'%os.environ['HG19'], help='FASTA to obtain sequence to control for GC% [Default: %default]')
parser.add_option('-g', dest='gc', default=False, action='store_true', help='Control for local GC% [Default: %default]')
parser.add_option('-m', dest='multi_em', default=0, type='int', help='Iterations of EM to distribute multi-mapping reads [Default: %default]')
parser.add_option('--multi_max', dest='multi_max', default=1, type='int', help='Maximum coverage at a single position from multi-mapping reads [Default: %default]')
parser.add_option('-o', dest='out_dir', default='bam_cov', help='Output directory [Default: %default]')
parser.add_option('-r', dest='strand_corr_shift', default=False, action='store_true', help='Learn the fragment shift by fitting a strand cross correlation model [Default: %default]')
parser.add_option('-s', dest='smooth_sd', default=32, type='float', help='Gaussian standard deviation to smooth coverage estimates with [Default: %default]')
parser.add_option('-t', dest='shift', default=0, type='int', help='Fragment shift [Default: %default]')
parser.add_option('-u', dest='unsorted', default=False, action='store_true', help='Alignments are unsorted [Default: %default]')
parser.add_option('-v', dest='shift_forward_end', default=0, type='int', help='Fragment shift for forward end read [Default: %default]')
parser.add_option('-w', dest='shift_reverse_end', default=0, type='int', help='Fragment shift for reverse end read [Default: %default]')
(options,args) = parser.parse_args()

if len(args) != 2:
@@ -110,10 +111,17 @@ def main():
options.duplicate_max *= 2

# initialize
genome_coverage = GenomeCoverage(chrom_lengths, smooth_sd=options.smooth_sd, duplicate_max=options.duplicate_max, multi_max=options.multi_max, shift=options.shift, fasta_file=options.fasta_file)
genome_coverage = GenomeCoverage(chrom_lengths,
smooth_sd=options.smooth_sd,
duplicate_max=options.duplicate_max,
multi_max=options.multi_max,
shift_center=options.shift_center,
shift_forward=options.shift_forward_end,
shift_reverse=options.shift_reverse_end,
fasta_file=options.fasta_file)

# estimate fragment shift
if options.strand_corr_shift:
if options.shift_center:
if sp == 'single':
genome_coverage.learn_shift_single(bam_file, out_dir=options.out_dir)
else:
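
Given the options defined above, a typical invocation that centers events on fragment midpoints (-c) and estimates the shift automatically might look like the following; the file names here are hypothetical:

python bin/bam_cov.py -c -o bam_cov_out alignments.bam coverage.h5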
@@ -366,7 +374,7 @@ class GenomeCoverage:
shift (int): Alignment shift to maximize cross-strand coverage correlation
'''

def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift=0, fasta_file=None):
def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift_center=False, shift_forward=0, shift_reverse=0, fasta_file=None):
self.chrom_lengths = chrom_lengths
self.genome_length = sum(chrom_lengths.values())
self.unique_counts = np.zeros(self.genome_length, dtype='uint16')
@@ -375,7 +383,10 @@ def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2,
self.smooth_sd = smooth_sd
self.duplicate_max = duplicate_max
self.multi_max = multi_max
self.shift = shift

self.shift_center = shift_center
self.shift_forward = shift_forward
self.shift_reverse = shift_reverse

self.adaptive_max = adaptive_max
self.adaptive_cdf = .05
@@ -849,14 +860,15 @@ def learn_shift_pair(self, bam_file):
template_lengths.append(abs(align.template_length))

# compute mean
self.shift = int(np.round(np.mean(template_lengths) / 2))
self.shift_forward = int(np.round(np.mean(template_lengths) / 2))

print(' Done in %ds.' % (time.time()-t0))
print('Shift: %d' % self.shift, flush=True)
print('Shift: %d' % self.shift_forward, flush=True)


def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None):
''' Learn the optimal fragment shift that maximizes across strand correlation '''
''' Learn the optimal fragment shift that maximizes across strand correlation
(to be applied for single end discordant alignments.) '''

t0 = time.time()
print('Learning shift from single-end sequences.', end='', flush=True)
@@ -928,10 +940,10 @@ def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None
strand_corrs_smooth = gaussian_filter1d(strand_corrs, sigma=12, truncate=3)

# find max
self.shift = (shift_min + np.argmax(strand_corrs_smooth)) // 2
self.shift_forward = (shift_min + np.argmax(strand_corrs_smooth)) // 2

print(' Done in %ds.' % (time.time()-t0))
print('Shift: %d' % self.shift, flush=True)
print('Shift: %d' % self.shift_forward, flush=True)

if out_dir is not None:
# plot training fit
@@ -940,7 +952,7 @@ def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None
# print table
shift_out = open('%s/shift_table.txt' % out_dir, 'w')
for si in range(len(shifts)):
print(shifts[si], strand_corrs[si], strand_corrs_smooth[si], '*'*int(2*self.shift==shifts[si]), file=shift_out)
print(shifts[si], strand_corrs[si], strand_corrs_smooth[si], '*'*int(2*self.shift_forward==shifts[si]), file=shift_out)
shift_out.close()


@@ -966,22 +978,26 @@ def read_bam(self, bam_file, genome_sorted=False):
if not align.is_unmapped:
read_id = (align.query_name, align.is_read1)

# determine shift
if self.shift == 0:
# don't shift anyone
align_shift = 0
else:
if self.shift_center:
if align.is_proper_pair:
# shift proper pairs according to mate
align_shift = abs(align.template_length) // 2
align_shift_forward = abs(align.template_length) // 2
else:
# shift others by estimated amount
align_shift = self.shift
align_shift_forward = self.shift_forward

# match reverse to forward
align_shift_reverse = align_shift_forward

else:
# apply user-specific shifts
align_shift_forward = self.shift_forward
align_shift_reverse = self.shift_reverse

# determine alignment position
chrom_pos = align.reference_start + align_shift
chrom_pos = align.reference_start + align_shift_forward
if align.is_reverse:
chrom_pos = align.reference_end - align_shift
chrom_pos = align.reference_end - align_shift_reverse

# determine genome index
gi = self.genome_index(align.reference_id, chrom_pos)
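In the new logic above, -c shifts proper pairs to their fragment midpoint using template_length, while single-end or discordant alignments fall back to a shift learned by maximizing the correlation between forward- and reverse-strand coverage. The following is a minimal sketch of that estimation, assuming hypothetical numpy arrays fwd_cov and rev_cov of per-position strand coverage (the real learn_shift_single streams counts from the BAM file and also writes a fit plot and table):

import numpy as np
from scipy.ndimage import gaussian_filter1d

def estimate_shift(fwd_cov, rev_cov, shift_min=50, shift_max=350):
    # correlate forward coverage with reverse coverage displaced by each candidate shift
    shifts = list(range(shift_min, shift_max + 1))
    strand_corrs = [np.corrcoef(fwd_cov[:-s], rev_cov[s:])[0, 1] for s in shifts]

    # smooth the correlation curve before taking the argmax, as above
    strand_corrs_smooth = gaussian_filter1d(np.array(strand_corrs), sigma=12, truncate=3)

    # halve the best fragment-length displacement to shift each read to the fragment center
    return (shift_min + int(np.argmax(strand_corrs_smooth))) // 2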
8 changes: 4 additions & 4 deletions bin/basenji_sad.py
@@ -143,10 +143,10 @@ def main():
target_labels = []
for line in open(options.targets_file):
a = line.strip().split('\t')
target_ids.append(a[1])
target_ids.append(a[0])
target_labels.append(a[2])

header_cols = ('rsid', 'index', 'score', 'ref', 'alt', 'ref_pred', 'alt pred', 'sad', 'target_index', 'target_id', 'target_label')
header_cols = ('rsid', 'index', 'score', 'ref', 'alt', 'ref_upred', 'alt_upred', 'usad', 'usar', 'ref_xpred', 'alt_xpred', 'xsad', 'xsar', 'target_index', 'target_id', 'target_label')
if options.csv:
sad_out = open('%s/sad_table.csv' % options.out_dir, 'w')
print(','.join(header_cols), file=sad_out)
@@ -238,10 +238,10 @@ def main():
# profile the max difference position
max_li = 0
max_sad = alt_preds[max_li,ti] - ref_preds[max_li,ti]
max_sar = np.log2(alt_preds[max_li,ti]) - np.log2(ref_preds[max_li,ti])
max_sar = np.log2(alt_preds[max_li,ti]+1) - np.log2(ref_preds[max_li,ti]+1)
for li in range(ref_preds.shape[0]):
sad_li = alt_preds[li,ti] - ref_preds[li,ti]
sar_li = np.log2(alt_preds[li,ti]) - np.log2(ref_preds[li,ti])
sar_li = np.log2(alt_preds[li,ti]+1) - np.log2(ref_preds[li,ti]+1)
# if abs(sad_li) > abs(max_sad):
if abs(sar_li) > abs(max_sar):
max_li = li
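The substantive change in this hunk is the +1 pseudocount inside the log2 ratios, which keeps the SAR score finite when a predicted coverage value is zero. A small illustrative sketch (function name and values are hypothetical):

import numpy as np

def sad_sar(ref_pred, alt_pred):
    sad = alt_pred - ref_pred                            # additive difference
    sar = np.log2(alt_pred + 1) - np.log2(ref_pred + 1)  # pseudocounted log2 fold change
    return sad, sar

# with ref_pred=0.0 the unpseudocounted log2 ratio would be -inf;
# here sad_sar(0.0, 0.5) returns (0.5, log2(1.5) ~ 0.585)
print(sad_sar(0.0, 0.5))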
3 changes: 3 additions & 0 deletions tutorials/.gitignore
@@ -1 +1,4 @@
.ipynb_checkpoints
data/gm12878_test
pim1_sat
rfx6_sad
39 changes: 39 additions & 0 deletions tutorials/data/gm12878_wigs.txt
@@ -0,0 +1,39 @@
ENCSR000EJD_3_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EJD_3_1.h5 DNASE:GM12878
ENCSR000EMT_2_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EMT_2_1.h5 DNASE:GM12878
ENCSR000EMT_1_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EMT_1_1.h5 DNASE:GM12878
ENCSR000EJD_1_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EJD_1_1.h5 DNASE:GM12878
ENCSR000EJD_2_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EJD_2_1.h5 DNASE:GM12878
ENCSR057BWO_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR057BWO_2_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000AKE_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKE_1_1.h5 HISTONE:H3K36me3 GM12878
ENCSR000AKF_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKF_2_1.h5 HISTONE:H3K4me1 GM12878
ENCSR000AOV_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOV_2_1.h5 HISTONE:H2AFZ GM12878
ENCSR000AKI_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKI_2_1.h5 HISTONE:H4K20me1 GM12878
ENCSR000AKA_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKA_2_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000AOX_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOX_2_1.h5 HISTONE:H3K9me3 GM12878
ENCSR000DRW_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRW_1_1.h5 HISTONE:H3K36me3 GM12878
ENCSR000AOW_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOW_1_1.h5 HISTONE:H3K79me2 GM12878
ENCSR000AKD_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKD_1_1.h5 HISTONE:H3K27me3 GM12878
ENCSR000AKE_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKE_2_1.h5 HISTONE:H3K36me3 GM12878
ENCSR000DRW_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRW_2_1.h5 HISTONE:H3K36me3 GM12878
ENCSR000AKA_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKA_1_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000AKH_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKH_1_1.h5 HISTONE:H3K9ac GM12878
ENCSR000AOX_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOX_1_1.h5 HISTONE:H3K9me3 GM12878
ENCSR000AKG_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKG_1_1.h5 HISTONE:H3K4me2 GM12878
ENCSR057BWO_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR057BWO_1_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000DRY_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRY_2_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000DRY_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRY_1_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000DRX_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRX_2_1.h5 HISTONE:H3K27me3 GM12878
ENCSR000AKG_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKG_2_1.h5 HISTONE:H3K4me2 GM12878
ENCSR000AKD_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKD_2_1.h5 HISTONE:H3K27me3 GM12878
ENCSR000AKC_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKC_2_1.h5 HISTONE:H3K27ac GM12878
ENCSR000AOX_3_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOX_3_1.h5 HISTONE:H3K9me3 GM12878
ENCSR000DRX_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRX_1_1.h5 HISTONE:H3K27me3 GM12878
ENCSR000AKF_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKF_1_1.h5 HISTONE:H3K4me1 GM12878
ENCSR000AOW_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOW_2_1.h5 HISTONE:H3K79me2 GM12878
ENCSR000AKI_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKI_1_1.h5 HISTONE:H4K20me1 GM12878
ENCSR000AKH_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKH_2_1.h5 HISTONE:H3K9ac GM12878
ENCSR000AKC_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKC_1_1.h5 HISTONE:H3K27ac GM12878
ENCSR000AOV_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOV_1_1.h5 HISTONE:H2AFZ GM12878
CNhs12332 ../../../data/human/cage/fantom/coverage/CNhs12332.h5 CAGE:B lymphoblastoid cell line: GM12878 ENCODE, biol_rep2
CNhs12333 ../../../data/human/cage/fantom/coverage/CNhs12333.h5 CAGE:B lymphoblastoid cell line: GM12878 ENCODE, biol_rep3
CNhs12331 ../../../data/human/cage/fantom/coverage/CNhs12331.h5 CAGE:B lymphoblastoid cell line: GM12878 ENCODE, biol_rep1
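
Each row of gm12878_wigs.txt appears to carry three tab-delimited fields: a sample identifier, a path to a coverage HDF5 file, and a free-text label. A minimal parsing sketch, mirroring how basenji_sad.py reads its targets file above (the column roles are an assumption):

# parse identifier, coverage path, and label from each tab-delimited row
target_ids, target_h5s, target_labels = [], [], []
for line in open('tutorials/data/gm12878_wigs.txt'):
    a = line.strip().split('\t')
    target_ids.append(a[0])       # e.g. ENCSR000EJD_3_1
    target_h5s.append(a[1])       # e.g. .../covse/ENCSR000EJD_3_1.h5
    target_labels.append(a[2])    # e.g. DNASE:GM12878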
1 change: 1 addition & 0 deletions tutorials/data/pim1.bed
@@ -0,0 +1 @@
chr6 37006697 37268841
2 changes: 2 additions & 0 deletions tutorials/data/pim1.fa

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions tutorials/data/rs339331.vcf
@@ -0,0 +1,2 @@
##fileformat=VCFv4.0
chr6 117210052 rs339331 T C 100
Binary file added tutorials/models/gm12878.tf.data-00000-of-00001
Binary file added tutorials/models/gm12878.tf.index
Binary file added tutorials/models/gm12878.tf.meta
54 changes: 0 additions & 54 deletions tutorials/models/params_large.txt

This file was deleted.

