Skip to content

Commit

Permalink
Version 1.4.6
Browse files Browse the repository at this point in the history
  • Loading branch information
marshall-lab committed Aug 2, 2020
1 parent 4ec63e9 commit 21fb45f
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 68 deletions.
9 changes: 9 additions & 0 deletions changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
[v1.4.6]
* New feature: paired-end reads are now handled natively at the fastq level. Use the --paired flag to enable, and your paired reads should be automagically detected and aligned
* Changed: BAM file format is now used natively without SAM intermediates at all levels (also fixes the recent samtools version handling bug)
* Changed: paired-end reads are selected based on the 0x2 bit on the SAM file format bitwise FLAG (reads aligned in proper pair)
* Fixed: paired-end read counts are enumerated correctly
* Removed: --keep_sam and --only_sam flags (as SAM format is no longer used)
* Added: --keep_original_bams flag to retain the non-extended reads BAM file when using single-end sequencing
* Changed: warning on missing chromosomal identities in GATC alignment made more friendly and less confusing

[v1.4.5]
* Added --ps_debug option to explicitly display pseudocounts calculation

Expand Down
181 changes: 113 additions & 68 deletions damidseq_pipeline
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/perl -w

# Copyright © 2013-17, Owen Marshall
# Copyright © 2013-20, Owen Marshall

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
Expand All @@ -25,8 +25,8 @@ use 5.010;

$|++;

my $version = "1.4.5";
print STDERR "\ndamidseq_pipeline v$version\nCopyright 2013-17, Owen Marshall\n\n";
my $version = "1.4.6";
print STDERR "\ndamidseq_pipeline v$version\nCopyright 2013-20, Owen Marshall\n\n";

# Global options
my %vars = (
Expand All @@ -41,6 +41,7 @@ my %vars = (
'bedtools_path' => '',
'gatc_frag_file' => '',
'kde_plot' => 0,
'keep_original_bams' => 0,
'bowtie2_genome_dir' => '',
'threads' => 7,
'full_data_files' => 0,
Expand All @@ -50,7 +51,7 @@ my %vars = (
'norm_override'=> 0,
'output_format'=>'bedgraph',
'method_subtract' => 0,
#'paired_ends' => 0,
'paired' => 0,
'pseudocounts' => 0,
'ps_factor' => 10,
'ps_debug' => 0,
Expand All @@ -67,8 +68,6 @@ my %vars = (
'dam' => '',
'out_name' => '',
'no_file_filters' => '',
'keep_sam' => 0,
'only_sam' => 0
);

my %vars_details = (
Expand All @@ -83,6 +82,7 @@ my %vars_details = (
'bedtools_path' => 'path to BEDTools executable (leave blank if in path)',
'gatc_frag_file' => 'GFF file containing all instances of the sequence GATC',
'kde_plot' => 'create an Rplot of the kernel density fit for normalisation (requires R)',
'keep_original_bams' => 'Keep unextended BAM files if using single-end reads',
'bowtie2_genome_dir' => 'Directory and basename for bowtie2 .bt2 indices',
'threads' => 'threads for bowtie2 to use',
'full_data_files' => 'Output full binned ratio files (not only GATC array)',
Expand All @@ -93,7 +93,7 @@ my %vars_details = (
'output_format' => 'Output tracks in this format [gff/bedgraph]',
'method_subtract' => 'Output values are (Dam_fusion - Dam) instead of log2(Dam_fusion/Dam) (not recommended)',
'pseudocounts' => 'Add this value of psuedocounts instead (default: optimal number of pseudocounts determined algorithmically)',
#'paired_ends' => 'Process paired-end reads (experimental, use with caution)',
'paired' => 'Process paired-end reads (experimental, use with caution)',
'ps_factor' => 'Value of c in c*(reads/bins) formula for calculating pseudocounts (default = 10)',
'ps_debug' => 'Print extra debugging info on pseudocount calculations in log file',
'min_norm_value' => 'Minimum log2 value to limit normalisation search at (default = -5)',
Expand All @@ -109,8 +109,6 @@ my %vars_details = (
'dam' => 'Specify file to use as Dam control',
'out_name' => 'Use this as the fusion-protein name when saving the final ratio',
'no_file_filters' => 'Do not trim filenames for output',
'keep_sam' => 'Do not delete .sam file',
'only_sam' => 'Do not generate bam file'
);

my @vars_unsaved = (
Expand Down Expand Up @@ -180,18 +178,18 @@ executable_check();
###

align_sequences();
load_gatc_frags() unless ($vars{'extension_method'} eq 'full' && $vars{'just_align'});
load_gatc_frags() unless (($vars{'extension_method'} eq 'full' || $vars{'paired'}) && $vars{'just_align'});

if ($vars{'extension_method'} eq 'full') {
extend_reads_full();
} else {
extend_reads_gatc();
}

calc_bins();

exit 0 if $vars{'just_align'};

calc_bins();

if ($vars{'norm_method'} eq 'kde') {
find_quants();
find_norm_factor();
Expand Down Expand Up @@ -267,6 +265,12 @@ sub executable_check {
printout("\nWarning: samtools executable found but unable to determine the version. We'll assume it's v1.2 or greater and hope for the best ...\n");
$samtools_version = 1.2;
}

# samtools confusing version numbers workaround
my ($big, $little) = $samtools_version =~ m/(\d+)\.(\d+)/;
if (defined($big) && defined($little)) {
$samtools_version = $big*1000 + $little;
}
}


Expand Down Expand Up @@ -357,25 +361,62 @@ EOT
}
} elsif (@in_files && !(-e $index_file)) {
foreach my $l (@in_files) {
my ($name,$dir,$ext) = fileparse($l, qr/\.[^.]*/);
($name) = $name =~ m/(.*?)(_|-ext|\.)/i unless $vars{'no_file_filters'};
my ($name_long,$dir,$ext) = fileparse($l, qr/\..*/);
my $name;
if ($vars{'no_file_filters'}) {
$name = $name_long;
} else {
($name) = $name_long =~ m/(.*?)(_|-ext|\.)/i;
}
$name ||= $name_long;

$vars{'bamfiles'} = 1 if $ext =~ /bam/;

if ($files{$name}) {
#same name already in use
my $i = 1;
while ($files{"$name.$i"}) {
$i++;
if ($vars{'paired'} && !$vars{'bamfiles'}) {
$vars{'extend_reads'} = 0;
# find paired end fastq files
my ($pmatch) = $name_long =~ m/(.*[R|_])[1|2](?=[_\d]*$)/;

unless ($pmatch) {
print "Error: $name_long not paired? File ignored ...\n";
next;
}
$name = "$name.$i";
}

print STDOUT "$name\t$l\n";
$files{$name}[0]=$l;

my $pmatch1 = $pmatch."1";
my $pmatch2 = $pmatch."2";

$name =~ s/[R|_][_\d]+$//;

if ($name_long =~ m/$pmatch1/) {
$files{$name}[0]=$l;
} elsif ($name_long =~ m/$pmatch2/) {
$files{$name}[1]=$l;
} else {
die("Error: paired-end reads set but cannot find paired-end flag: $name_long\nExiting.\n\n");
}
} else {
if ($files{$name}) {
#same name already in use
my $i = 1;
while ($files{"$name.$i"}) {
$i++;
}
$name = "$name.$i";
}
$files{$name}[0]=$l;
}
}

foreach my $name (sort(keys %files)) {
unless ($vars{'just_align'}) {
check_dam_name($name, $l);
check_dam_name($name, @{$files{$name}}[0]);
}
print "$name:\n";
foreach my $p ( @{$files{$name}} ) {
print "\t$p\n";
}
}

} elsif (-e $index_file) {
open (NORM, "<$index_file") || die "Unable to open index file for reading: $!\n";
my @norm = <NORM>;
Expand Down Expand Up @@ -442,7 +483,7 @@ EOT

sub check_dam_name {
my ($name, $file) = @_;
if ($vars{'dam'}) {
if ($vars{'dam'} && !($vars{'paired'})) {
$damname = $name if $vars{'dam'} eq $file;
} elsif ($name =~ m/^dam/i) {
die("\nERROR: more than one Dam sample detected. Please only use one Dam control per run -- specify the files to process on the command-line\n\n") if $damname;
Expand Down Expand Up @@ -565,11 +606,24 @@ sub align_sequences {
printout("\n*** Aligning files with bowtie2 ***\n");
foreach my $fn (keys %files) {
my $pair1 = $files{$fn}[0];
my $pair2 = $files{$fn}[1];

printout("Now working on $fn ...\n");

if ($vars{'bowtie'}) {
$bowtie_output{$fn} = `$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -U $pair1 -S $fn.sam 2>&1`;
#$bowtie_output{$fn} = $vars{'paired'} ?
#`$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -1 $pair1 -2 $pair2 -S $fn.sam 2>&1` :
#`$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -U $pair1 -S $fn.sam 2>&1`;

#print "$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -U $pair1 -S 2>tmp.out | $vars{'samtools_path'}samtools view -Shb - > $fn.bam\n";

$vars{'paired'} ?
`$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -1 $pair1 -2 $pair2 2>tmp.out | $vars{'samtools_path'}samtools view -Shb - > $fn.bam` :
`$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -U $pair1 2>tmp.out | $vars{'samtools_path'}samtools view -Shb - > $fn.bam`;

open(my $tmp, '<', "tmp.out") || die "Cannot open temporary bowtie output for reading: $!\n";
$bowtie_output{$fn} = join("",<$tmp>);
unlink("tmp.out");

unless ($bowtie_output{$fn}) {
printout("Error: no data returned from bowtie2. Exiting.\n\n");
Expand All @@ -588,10 +642,10 @@ sub extend_reads_full {
printout("*** Extending reads to $vars{'len'} bases (full extension) ***\n");
foreach my $fn (keys %files) {
printout("Reading input file: $fn ...\n");
open (IN, "<$fn.sam") || die "Unable to read $fn: $!\n";
open (IN, "$vars{'samtools_path'}samtools view -h $fn.bam |") || die "Unable to read $fn: $!\n";

printout(" Processing data ...\n");
open (OUT, ">$fn-ext$vars{'len'}.sam");
open (OUT, "| $vars{'samtools_path'}samtools view -Shb - > $fn-ext$vars{'len'}.bam");

my $c=0;
my $seqs;
Expand Down Expand Up @@ -636,9 +690,10 @@ sub extend_reads_full {
close OUT;

# the original SAM files are huge, so we nuke them
unlink("$fn.sam") unless $vars{'keep_sam'};
#unlink("$fn.sam") unless $vars{'keep_sam'};

printout(" Seqs extended (>q".$vars{'q'}.") = $seqs\n");
unlink("$fn.bam") unless $vars{'keep_original_bams'};
}
}
}
Expand All @@ -649,10 +704,10 @@ sub extend_reads_gatc {
printout("*** Extending reads up to $vars{'len'} bases ***\n");
foreach my $fn (keys %files) {
printout("Reading input file: $fn ...\n");
open (IN, "<$fn.sam") || die "Unable to read $fn: $!\n";
open (IN, "$vars{'samtools_path'}samtools view -h $fn.bam |") || die "Unable to read $fn: $!\n";

printout(" Processing data ...\n");
open (OUT, ">$fn-ext$vars{'len'}.sam");
open (OUT, "| $vars{'samtools_path'}samtools view -Shb - > $fn-ext$vars{'len'}.bam");

my $c=0;
my $seqs;
Expand Down Expand Up @@ -764,9 +819,10 @@ sub extend_reads_gatc {
}

# the original SAM files are huge, so we nuke them
unlink("$fn.sam") unless $vars{'keep_sam'};
#unlink("$fn.sam") unless $vars{'keep_sam'};

printout(" Seqs extended (>q".$vars{'q'}.") = $seqs\n\n");
unlink("$fn.bam") unless $vars{'keep_original_bams'};
}
}
}
Expand All @@ -780,47 +836,17 @@ sub calc_bins {

printout("\nNow working on $fn ...\n");

unless ($vars{'bamfiles'}) {
printout("Generating .bam file ...\n");
`$vars{'samtools_path'}samtools view -Sb $fn.sam > $fn.unsort.bam`;

unlink("$fn.sam") unless $vars{'keep_sam'};

# sort the bam file ...
printout(" sorting ...\n");
if ($samtools_version < 1.3) {
`$vars{'samtools_path'}samtools sort $fn.unsort.bam $fn`;
} else {
`$vars{'samtools_path'}samtools sort -T . -o $fn.bam $fn.unsort.bam`;
}

unlink("$fn.unsort.bam");
}

# Obtain readcounts via samtools ...
my $seqs = `$vars{'samtools_path'}samtools view -c $fn.bam`;
chomp($seqs);
if ($seqs == 0) {
printout("Error: no reads obtained from bamfile ... exiting.\n\n");
exit 1;
}

$counts{$n}=$seqs;
printout(" $seqs reads\n");

# short-circuit for just_align option
next if $vars{'just_align'};

if ($n eq $damname) {
$denom = $counts{$n};
}

printout(" Generating bins from $fn.bam ...\n");

my %cov;
my %gff;
my %bases;
my $lines;
my $reads = 0;
# We now manually calculate coverage, rather than using bedtools' -coverage option, for both memory considerations and ease of use.
# (bedtools' -coverage memory consumption is pretty crazy, getting up to 8Gb for ~25M reads ... this implementation uses about 1/10th of that memory.)
# Speed considerations are similar (this seems to be slightly faster ...)
Expand All @@ -846,7 +872,8 @@ sub calc_bins {
if ($pnext) {
# paired ends
$paired_flag = 1;
next unless $tlen && abs($tlen) < 1500;
next unless $flag & 2;
#next unless $tlen && abs($tlen) < 1500;
next unless $rnext eq '=';
($pstart, $pend) = sort($pos, $pos + $tlen);
} else {
Expand All @@ -859,8 +886,14 @@ sub calc_bins {
$pend = $pos+$readlen;
}

if ($paired_flag) {
$reads++ if $flag & 64;
} else {
$reads++;
}

if ($lines % 20000 == 0) {
my $pc = sprintf("%0.1f", ($lines*100 / $counts{$n}));
my $pc = sprintf("%0.1f", ($lines*100 / $seqs));
print STDERR " $pc% processed ...\r"
}

Expand All @@ -874,8 +907,20 @@ sub calc_bins {
}
close COV;

$counts{$n}=$reads;
printout(" $reads mapped fragments\n");

if ($reads == 0) {
printout("Error: no mapped reads obtained from bamfile ... exiting.\n\n");
exit 1;
}

if ($n eq $damname) {
$denom = $counts{$n};
}

if ($paired_flag) {
printout(" Warning: paired-end reads detected. Support for paired-end reads is experimental ...\n Please report any bugs encountered at owenjm.github.io/damidseq_pipeline\n\n");
printout(" Paired-end reads detected in bamfile\n");
}

# sanity check on coverage
Expand Down Expand Up @@ -992,7 +1037,7 @@ sub calc_bins {

if (@non_chrs) {
my $missing = join(', ', @non_chrs);
print STDERR " Warning: alignment contains chromosome identities not found in GATC file (see log file for details; this is normal if unmapped assemblies and the mitochondrial genome have been excluded from the GATC file.)";
print STDERR " Alignment contains chromosome identities not found in GATC file (normal if unmapped assemblies and the mitochondrial genome have been excluded from the GATC file; see log file for details)";
print LOG " Warning: alignment contains chromosome identities not found in GATC file:\n $missing\n";
}

Expand Down

0 comments on commit 21fb45f

Please sign in to comment.