diff --git a/README.md b/README.md index 6514f7c..dafe81d 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,24 @@ # Introduction -Processing DamID-seq data involves extending single-end reads, aligning the reads to the genome and determining the coverage, similar to processing regular ChIP-seq datasets. However, as DamID data is represented as a log2 ratio of (Dam-fusion/Dam), normalisation of the sample and Dam-only control is necessary and adding pseudocounts to mitigate the effect of background counts is highly recommended. +[damidseq_pipeline](https://github.com/owenjm/damidseq_pipeline/releases) is a single script that automatically handles sequence alignment, read extension, binned counts, normalisation, pseudocount addition and final ratio file generation. The script uses FASTQ or BAM files as input, and outputs the final log2 ratio files in bedGraph format. -[damidseq_pipeline](https://github.com/owenjm/damidseq_pipeline/tarball/master) is a single script that automatically handles sequence alignment, read extension, binned counts, normalisation, pseudocount addition and final ratio file generation. The script uses FASTQ or BAM files as input, and outputs the final log2 ratio files in bedGraph (or optionally GFF) format. - -The output ratio files can easily be converted to TDF for viewing in [IGV](http://www.broadinstitute.org/software/igv/) using igvtools. The files can be processed for peak calling using [find_peaks](http://github.com/owenjm/find_peaks) or, if using RNA pol II DamID, transcribed genes can be determined using [polii.gene.call](http://github.com/owenjm/polii.gene.call). +## Features +* Fully automated processing of NGS DamID-seq datasets, from FASTQ input to bedGraph output +* Handles both single- and paired-end datasets +* Can be used with either FASTQ or pre-aligned BAM input files +* Multiple methods of normalisation provided +* As of v1.5.3 and greater, can also handle and process ChIP-seq NGS data ## Citation If you find this software useful, please cite: -Marshall OJ and Brand AH. (2015) damidseq_pipeline: an automated pipeline for processing DamID sequencing datasets. *Bioinformatics.* Oct 15;31(20):3371-3. doi: 10.1093/bioinformatics/btv386. ([pubmed](http://www.ncbi.nlm.nih.gov/pubmed/26112292); [full text, open access](http://bioinformatics.oxfordjournals.org/content/early/2015/07/13/bioinformatics.btv386.long)) - -# Download +Marshall OJ and Brand AH. (2015) damidseq_pipeline: an automated pipeline for processing DamID sequencing datasets. *Bioinformatics.* 31(20): 3371--3. +([pubmed](http://www.ncbi.nlm.nih.gov/pubmed/26112292); [full text, open access](https://academic.oup.com/bioinformatics/article/31/20/3371/196153)) -**Important: if using a version of the pipeline less than v1.3, please update as older versions did not correctly account for reads on the minus strand during read extension.** The difference in the final output files is minimal (the Spearman's correlation between files generated with v1.2.6 or less or v1.3 is >0.95) but files generated by the new method should be more accurate. +# Download and installation -Download the latest version of the pipeline script and associated files: -* [As a tarball](https://github.com/owenjm/damidseq_pipeline/tarball/master) -* [As a zipfile](https://github.com/owenjm/damidseq_pipeline/zipball/master) +[Download the latest version](https://github.com/owenjm/damidseq_pipeline/releases) of the pipeline script and associated files. Prebuilt GATC fragment files used by the script are available for the following genomes: * [*Drosophila melanogaster* r5.57](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/Dmel_r5.57.GATC.gff.gz) @@ -26,7 +26,7 @@ Prebuilt GATC fragment files used by the script are available for the following * [*Mus musculus* GRCm38](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/MmGRCm38.GATC.gff.gz) or * [Human GRCh38](https://github.com/owenjm/damidseq_pipeline/raw/gh-pages/pipeline_gatc_files/HsGRCh38.GATC.gff.gz). -# Requirements +## Requirements * [Bowtie 2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) v2.1 or above (and appropriate genome indices -- see below) (not required if using pre-aligned BAM files) * [SAMtools](http://samtools.sourceforge.net) v0.1.9 or above @@ -34,7 +34,7 @@ Prebuilt GATC fragment files used by the script are available for the following * a *nix operating system (e.g. linux, Mac OSX) with Perl v5.10 or greater installed. We recommend using Ubuntu Linux in a virtual machine if using Windows. * Sequencing data in FASTQ or BAM format -# Installation +## Installation 1. Extract the pipeline script archive, make the damid_pipeline file executable and place it in your path 1. Install [Bowtie 2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) @@ -42,10 +42,9 @@ Prebuilt GATC fragment files used by the script are available for the following Alternatively, build the Bowtie 2 index files manually: 1. Download the latest FASTA genome primary_assembly (or toplevel) file from [Ensembl](http://ftp.ensembl.org/pub/current_fasta/) - e.g. [the current release for *Mus musculus*](http://ftp.ensembl.org/pub/current_fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz) + e.g. [the current release for *Mus musculus*](http://ftp.ensembl.org/pub/current_fasta/mus_musculus/dna/) - (alternatively, for *Drosophila*, download from the Flybase FTP site (ftp://ftp.flybase.net/releases/current/) - e.g. ftp://ftp.flybase.net/releases/FB2014_03/dmel_r5.57/fasta/dmel-all-chromosome-r5.57.fasta.gz ) + (alternatively, for *Drosophila*, download from [the Flybase FTP site](ftp://ftp.flybase.net/releases/current/) 1. Extract the .gz file 1. Run bowtie2-build in the directory containing the extracted .fasta file. For the examples above: @@ -65,69 +64,66 @@ Prebuilt GATC fragment files used by the script are available for the following perl gatc.track.maker.pl --name=dmel_r5.57 dmel-all-chromosome-r5.57.fasta -## Running the script the first time +## Initial setup -In order to run correctly, the script needs to know the locations of two paths, specified using the following command-line options: +In order to run correctly, damidseq_pipeline needs to know the locations of two paths, specified using the following command-line options (it will prompt you for these if you fail to specify them): 1. The directory and basename of the bowtie2 index files (obtained or built in step 3 above) - (specified with the --bowtie2_genome_dir option) + (specified with the `--bowtie2_genome_dir` option) e.g. in the example above, use --bowtie2_genome_dir=[path_to_.bt2_files]/dmel_r5.57 1. The GATC fragment .gff file (downloaded from the pre-built files listed in step 5, or built following the instructions above) - (specified with the --gatc_frag_file option) + (specified with the `--gatc_frag_file` option) In order to setup the pipeline to process the *D. melanogaster* genome, for example, the first-run command would be: damidseq_pipeline --gatc_frag_file=path/to/Dmel_r5.57.GATC.gff.gz --bowtie2_genome_dir=path/to/dmel_r5.57/dmel_r.5.57 -If these paths do not already exist and the script is run with these options and correct values, the paths will be saved for all future runs unless overridden on the command-line. - -(To clear all user-saved values, run the script with the --reset_defaults option.) +Once run once with these options and correct values, the paths will be saved for all future runs unless overridden on the command-line. **You do not need to specify these options each time** +{: .notice--info} -# Running the script +(To clear all user-saved values, including these values, run with the `--reset_defaults` option.) -Run the script in a directory with sequencing files in FASTQ or BAM format. The default behaviour is to process all files in FASTQ format, and if none are found, all files in BAM format. Alternatively, individual files may be specified on the command line if the user does not wish to process all available files present in the directory (for example, if the sequencing lane contained multiple replicates). +# Using damidseq_pipeline -The script will by default determine sample names from the file names, and expects a single filename to start with "Dam" in order to automatically assign the Dam-only control. +Run damidseq_pipeline in a directory containing sequencing files in FASTQ or BAM format. The default behaviour is to process all files in FASTQ format, and if none are found, all files in BAM format. -To see all available options, run the script with --help command-line option: - - damidseq_pipeline --help - -This will give you a list of adjustable parameters and their default and current values if applicable. We recommend keeping these at the default value in most cases; however, these can be modified on the command-line with --option=value (no spaces). - -To save modified values for all future runs, run the script with the parameter you wish to change together with the --save_defaults command-line option: - - damidseq_pipeline --save_defaults +Alternatively, individual files may be specified on the command line if the user does not wish to process all available files present in the directory (for example, if the sequencing lane contained multiple replicates). -If bowtie2 and samtools are not in your path, you can specify these on the command-line also. +## Sample names +Sample names are assigned from filenames. If a single filename being processed begins with "Dam", this will be assigned as the Dam-only control. -## Output and downstream data processing +If no sample filename, or multiple filenames, begin with "Dam", use `--dam=[filename]` to specify the Dam-only control sample manually. If a Dam-only control cannot be automatically determined, damidseq_pipeline will exit and prompt you to specify one. -The final output will be a single ratio file: Sample-vs-DAM.gatc.bedgraph. The .gatc.bedgraph file represents the data at GATC fragment resolution (based on the reference genome) and should be used for all subsequent analysis. +## Paired-end sequencing files +To process paired-end FASTQ files, use the `--paired` option and the pipeline will search for, and match, paired reads. -The [bedGraph format](http://genome.ucsc.edu/goldenpath/help/bedgraph.html) is used by default. The pipeline script can output the final ratio files in [GFF format](http://www.ensembl.org/info/website/upload/gff.html) instead if the --output_format=gff command-line switch is used. +BAM files generated from paired-end data are automatically detected and processed, without requiring this option. -### Visualising the DNA binding profiles +## Processing ChIP-seq data +As of v1.5.3, damidseq_pipeline can also handle ChIP-seq data via the `--chipseq` flag. This option will remove PCR duplicate reads, only process uniquely mapping reads, and output binned coverage tracks in RPM (reads per million mapped reads). -The bedgraph output file can be can be converted to .tdf format via igvtools, either using the graphical interface provided by [IGV](http://www.broadinstitute.org/software/igv/) or via the command-line. A legacy [gff2tdf.pl](http://github.com/owenjm/damid_pipeline/blob/master/gff2tdf.pl?raw=true) script is also provided for converting GFF files (the default output file format prior to v1.4) to TDF. +Warning -- do not use this option with DamID-seq data. DamID-seq is all about the PCR duplicates! +{: .notice--warning} -### Calling significant peaks from the data +## Other options +To see all available options, run the script with `--help` command-line option: -The [find_peaks](http://github.com/owenjm/find_peaks) software will process the output .gatc.bedgraph ratio file and call significant peaks present in the dataset. Please see the find_peaks page for more details. + damidseq_pipeline --help -### Calling transcribed genes from RNA pol II datasets +This will give you a list of adjustable parameters and their default and current values if applicable. We recommend keeping these at the default value in most cases; however, these can be modified on the command-line with `--option=value` (no spaces). -The [polii.gene.call](http://github.com/owenjm/polii.gene.call) Rscript will call transcribed genes (i.e. gene bodies with significantly enriched pol II occupancy) from the output .gatc.bedgraph file. Please see the polii.gene.call page for more details. +## Saving default option values +To save modified values for all future runs, run the script with the parameter you wish to change together with the `--save_defaults` command-line option: -### Other useful scripts and utilities + damidseq_pipeline --save_defaults -A collection of useful R and Perl scripts for comparing and analysing DamID-seq data is maintained at [http://github.com/owenjm/damid_misc]. +If bowtie2 and samtools are not in your path, you can specify these on the command-line also. ## Working with multiple genomes -If the user expects to process data from multiple genomes, separate genome specifications can be saved by using the --save_defaults=[name] along with the --bowtie2_genome_dir and --gatc_frag_file options (and any other custom options that the user wishes to set as default for this genome, e.g. the bin width). For e.g.: +If the user expects to process data from multiple genomes, separate genome specifications can be saved by using the `--save_defaults=[name]` along with the `--bowtie2_genome_dir` and `--gatc_frag_file` options (and any other custom options that the user wishes to set as default for this genome, e.g. the bin width). For e.g.: damidseq_pipeline --save_defaults=fly --gatc_frag_file=path/to/Dmel_r5.57.GATC.gff.gz --bowtie2_genome_dir=path/to/dmel_r5.57/dmel_r.5.57 damidseq_pipeline --save_defaults=mouse --bins=500 --gatc_frag_file=path/to/MmGRCm38.GATC.gff.gz --bowtie2_genome_dir=path/to/Mm_GRCm38/GRCm38 @@ -138,17 +134,28 @@ Once set up, different genome definitions can be quickly loaded using the --load All currently saved genome definitions can be listed using --load_defaults=list. -## Processing FASTQ files with adaptor codes +## Example dataset -The damidseq_pipeline will attempt to automatically determine the sample name from the filenames provided. +An example set of two small (3000 reads each) fastq.gz files and an index.txt file are provided in the zip archive (as "example.zip"), or you can [download these separately](http://github.com/owenjm/damid_pipeline/blob/master/example.zip?raw=true). Running the pipeline script on these files should successfully produce a polII-vs-Dam.gatc.bedgraph ratio file as output. -Some sequencing facilities, however, may provide only adaptor index IDs rather than sample names. If sequencing filenames are provided with index IDs rather than names, a file "index.txt" in the working directory can be provided with the format of [index] [sample_name], eg: +# Output and downstream data processing - A6 Dam - A12 polII +The final output will be a single ratio file: Sample-vs-Dam.gatc.bedgraph. The .gatc.bedgraph file represents the data at GATC fragment resolution (based on the reference genome) and should be used for all subsequent analysis. -where A6 is the sequencing adaptor index. The sample name cannot contain spaces and there has to be one (and only one) sample called "Dam"; the adaptor index must to be referenced in the fastq filename (e.g. for "A6", either "Index6" or "A006" are expected in the filename). Please see the [provided example](http://github.com/owenjm/damid_pipeline/blob/master/example.zip?raw=true) for an illustration of the index.txt file format and matching file names. +The [bedGraph format](http://genome.ucsc.edu/goldenpath/help/bedgraph.html) is used by default. The pipeline script can output the final ratio files in [GFF format](http://www.ensembl.org/info/website/upload/gff.html) instead if the `--output_format=gff` command-line switch is used. -## Example dataset +### Visualising the DNA binding profiles -An example set of two small (3000 reads each) fastq.gz files and an index.txt file are provided in the zip archive (as "example.zip"), or you can [download these separately](http://github.com/owenjm/damid_pipeline/blob/master/example.zip?raw=true). Running the pipeline script on these files should successfully produce a polII-vs-Dam.gatc.bedgraph ratio file as output. +The bedgraph output files can be can viewed directly in genome browsers such as [IGV](http://www.broadinstitute.org/software/igv/). For publication-quality figures, we recommend [pyGenomeTracks](https://pygenometracks.readthedocs.io/). + +### Calling significant peaks from the data + +The [find_peaks](http://github.com/owenjm/find_peaks) software will process the output .gatc.bedgraph ratio file and call significant peaks present in the dataset. Please see the find_peaks page for more details. + +### Calling transcribed genes from RNA pol II datasets + +The [polii.gene.call](http://github.com/owenjm/polii.gene.call) Rscript will call transcribed genes (i.e. gene bodies with significantly enriched pol II occupancy) from the output .gatc.bedgraph file. Please see the polii.gene.call page for more details. + +### Other useful scripts and utilities + +A collection of useful R and Perl scripts for comparing and analysing DamID-seq data [is maintained here](http://github.com/owenjm/damid_misc). diff --git a/changelog b/changelog index 1c59021..fa55b0e 100644 --- a/changelog +++ b/changelog @@ -1,3 +1,15 @@ + +[v1.5.3] +* New feature: --coverage flag outputs coverage bedGraph track of each sample (useful with Dam-only tracks for CATaDa analysis). --just_coverage exits after coverage tracks are written. +* New feature: Kernel density normalisation routines sped up ~10x via Inline::C. This is enabled by default, but will only work if the Inline::C module is installed; damidseq_pipeline will fall back on the Perl coded routine if this is not present. If you want to control explicitly, do so via the --kde_inline flag. +* New feature: ChIP-seq-specific processing routines added: --remdups removes PCR duplicates, --unique only maps unique reads. Enable all of these with --chipseq (also turns of GATC-level binning, GATC-based read extension and will only output coverage tracks not ratios) +* New feature: New normalisation method, "rawbins". This writes the normalisation data to TSV format and runs a user-provided script for custom normalisation. Your script should read the data file and output the normalisation factor. Advanced users only, use with caution. +* New feature: consider all occupancy as RPM (reads per million mapped reads), set as default. Control with --scores_as_rpm flag. +* Changed: bowtie2 command handling refactored more elegantly +* Added: ability to add custom flags to bowtie2 command with --bowtie2_add_flags +* Fixed: small changes to read mapping code for more accurate mapping +* Changed: filename filters are now turned off by default (use --no_file_filters=0 to turn back on) + [v1.4.6] * New feature: paired-end reads are now handled natively at the fastq level. Use the --paired flag to enable, and your paired reads should be automagically detected and aligned * Changed: BAM file format is now used natively without SAM intermediates at all levels (also fixes the recent samtools version handling bug) diff --git a/damidseq_pipeline b/damidseq_pipeline index 1ce9c53..1a0f34c 100755 --- a/damidseq_pipeline +++ b/damidseq_pipeline @@ -1,6 +1,6 @@ #!/usr/bin/perl -w -# Copyright © 2013-20, Owen Marshall +# Copyright © 2013-21, Owen Marshall # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -25,8 +25,8 @@ use 5.010; $|++; -my $version = "1.4.6"; -print STDERR "\ndamidseq_pipeline v$version\nCopyright 2013-20, Owen Marshall\n\n"; +my $version = "1.5.3"; +print STDERR "\ndamidseq_pipeline v$version\nCopyright 2013-22, Owen Marshall\n\n"; # Global options my %vars = ( @@ -43,8 +43,10 @@ my %vars = ( 'kde_plot' => 0, 'keep_original_bams' => 0, 'bowtie2_genome_dir' => '', + 'bowtie2_add_flags' => '', 'threads' => 7, 'full_data_files' => 0, + 'as_gatc' => 1, 'qscore1min' => 0.4, 'qscore1max' => 1.0, 'qscore2max' => 0.9, @@ -52,6 +54,7 @@ my %vars = ( 'output_format'=>'bedgraph', 'method_subtract' => 0, 'paired' => 0, + 'scores_as_rpm' => 1, 'pseudocounts' => 0, 'ps_factor' => 10, 'ps_debug' => 0, @@ -59,6 +62,8 @@ my %vars = ( 'max_norm_value' => 5, 'norm_steps' => 300, 'just_align' => 0, + 'just_coverage' => 0, + 'coverage' => 0, 'norm_method' => 'kde', 'save_defaults' => 0, 'load_defaults' => '', @@ -67,7 +72,14 @@ my %vars = ( #'filter' => 0, 'dam' => '', 'out_name' => '', - 'no_file_filters' => '', + 'no_file_filters' => 1, + 'write_raw' => 0, + 'rawbins_cmd' => '', + 'markdup' => 0, + 'remdups' => 0, + 'unique' => 0, + 'kde_inline' => 1, + 'chipseq' => 0, ); my %vars_details = ( @@ -84,8 +96,10 @@ my %vars_details = ( 'kde_plot' => 'create an Rplot of the kernel density fit for normalisation (requires R)', 'keep_original_bams' => 'Keep unextended BAM files if using single-end reads', 'bowtie2_genome_dir' => 'Directory and basename for bowtie2 .bt2 indices', + 'bowtie2_add_flags' => 'Additional flags to use for bowtie2 alignment', 'threads' => 'threads for bowtie2 to use', 'full_data_files' => 'Output full binned ratio files (not only GATC array)', + 'as_gatc' => 'Output files at GATC resolution', 'qscore1min' => 'min decile for normalising from Dam array', 'qscore1max' => 'max decile for normalising from Dam array', 'qscore2max' => 'max decile for normalising from fusion-protein array', @@ -93,14 +107,17 @@ my %vars_details = ( 'output_format' => 'Output tracks in this format [gff/bedgraph]', 'method_subtract' => 'Output values are (Dam_fusion - Dam) instead of log2(Dam_fusion/Dam) (not recommended)', 'pseudocounts' => 'Add this value of psuedocounts instead (default: optimal number of pseudocounts determined algorithmically)', - 'paired' => 'Process paired-end reads (experimental, use with caution)', + 'paired' => 'Process paired-end reads', + 'scores_as_rpm' => 'Calculate coverage scores (before normalisation) as reads per million (RPM)', 'ps_factor' => 'Value of c in c*(reads/bins) formula for calculating pseudocounts (default = 10)', 'ps_debug' => 'Print extra debugging info on pseudocount calculations in log file', 'min_norm_value' => 'Minimum log2 value to limit normalisation search at (default = -5)', 'max_norm_value' => 'Maximum log2 value to limit normalisation search at (default = +5)', 'norm_steps' => 'Number of points in normalisation routine (default = 300)', 'just_align' => 'Just align the FASTQ files, generate BAM files, and exit', - 'norm_method' => "Normalisation method to use; options are:\n\rkde: use kernel density estimation of log2 GATC fragment ratio (default)\n\rrpm: use readcounts per million reads (not recommended for most use cases)", + 'just_coverage' => 'Align, generate BAMs and calculate coverage; do generate ratio files', + 'coverage' => 'Save individual coverage bedgraphs', + 'norm_method' => "Normalisation method to use; options are:\n\rkde: use kernel density estimation of log2 GATC fragment ratio (default)\n\rrpm: use readcounts per million reads (not recommended for most use cases)\n\rrawbins: process raw counts with external normalisation command (set with --rawbins_cmd)", 'save_defaults' => "Save runtime parameters as default\n\r(provide a name to differentiate different genomes -- these can be loaded with 'load_defaults')", 'load_defaults' => "Load this saved set of defaults\n\r(use 'list' to list current saved options)", 'reset_defaults' => 'Delete user-defined parameters', @@ -109,12 +126,22 @@ my %vars_details = ( 'dam' => 'Specify file to use as Dam control', 'out_name' => 'Use this as the fusion-protein name when saving the final ratio', 'no_file_filters' => 'Do not trim filenames for output', + 'write_raw' => 'Write TSV file with raw counts for each Dam, sample pair', + 'rawbins_cmd' => "Command to process raw tsv counts for --norm_method=rawbins\n\rOutput of command should be the normalisation factor (use with caution)", + 'markdup' => "Mark PCR duplicate reads with samtools.\n\r*NB: for ChIP-seq alignment only; do NOT use with DamID data.", + 'remdups' => "Remove PCR duplicate reads (implies --markdup).\n\r*NB: for ChIP-seq alignment only; do NOT use with DamID data.", + 'unique' => "Only map uniquely aligning reads", + 'kde_inline' => 'Use Inline::C for KDE normalisation calculations', + 'chipseq' => "Process ChIP-seq data\n\r(Equivalent to --just_coverage --as_gatc=0 --remdups --unique --extension_method=full --full_data_files)", + ); my @vars_unsaved = ( 'save_defaults', 'reset_defaults', 'load_defaults', + 'just_align', + 'just_coverage', 'dam', 'out_name' ); @@ -152,6 +179,14 @@ my $frags; my $no_paths; my $windows_file; my $samtools_version; +my $kden_sr; +my $no_inline_C; + +# Inline_C sanity checks +$kden_sr = \&kden_calc_perl unless $vars{'kde_inline'}; +if ($vars{'kde_inline'} && $no_inline_C) { + print "** Inline::C not found; falling back on Perl KDE implementation ...\n"; +} # Read parameters if exist process_cli(0); @@ -166,6 +201,9 @@ save_defaults(); # Log file init_log_file(); +# Final sanity checks +final_sanity_check(); + # Read input files process_input_files(); @@ -178,7 +216,7 @@ executable_check(); ### align_sequences(); -load_gatc_frags() unless (($vars{'extension_method'} eq 'full' || $vars{'paired'}) && $vars{'just_align'}); +load_gatc_frags() unless (!$vars{'as_gatc'} || (($vars{'extension_method'} eq 'full' || $vars{'paired'}) && $vars{'just_align'})); if ($vars{'extension_method'} eq 'full') { extend_reads_full(); @@ -190,11 +228,25 @@ exit 0 if $vars{'just_align'}; calc_bins(); +if ($vars{'write_raw'}) { + generate_raw_files(); + exit 0; +} + +if ($vars{'just_coverage'} || $vars{'coverage'}) { + generate_cov_files(); + exit 0 if $vars{'just_coverage'}; +} + if ($vars{'norm_method'} eq 'kde') { find_quants(); find_norm_factor(); } +if ($vars{'norm_method'} eq 'rawbins') { + find_norm_factor_raw(); +} + normalize(); generate_ratio(); @@ -273,6 +325,30 @@ sub executable_check { } } +sub final_sanity_check { + if ($vars{'norm_method'} eq 'rawbins') { + unless ($vars{'rawbins_cmd'}) { + print STDERR "Error: rawbins normalisation method selected, but no custom command supplied.\n\nPlease use the --rawbins_cmd to specify the command to provide normalisation factors from raw data.\n\n"; + exit 1; + } + } + + if ($vars{'chipseq'}) { + $vars{'remdups'} = 1; + $vars{'markdup'} = 1; + $vars{'as_gatc'} = 0; + $vars{'just_coverage'} = 1; + } + + unless ($vars{'as_gatc'}) { + $vars{'extension_method'}="full"; + $vars{'full_data_files'}=1; + printout("\n** Warning: no GATC conversion requested; extension_method set to 'full'\n"); + } + + $vars{'markdup'} = 1 if $vars{'remdups'}; +} + sub process_cli { # CLI processing @@ -408,7 +484,7 @@ EOT } foreach my $name (sort(keys %files)) { - unless ($vars{'just_align'}) { + unless ($vars{'just_align'} || $vars{'just_coverage'}) { check_dam_name($name, @{$files{$name}}[0]); } print "$name:\n"; @@ -449,7 +525,7 @@ EOT printout("$index{$i}: Index $i\n"); $files{$index{$i}}[0]=$l; - unless ($vars{'just_align'}) { + unless ($vars{'just_align'} || $vars{'just_coverage'}) { if ($index{$i} =~ m/^dam/i) { die("\nERROR: more than one Dam sample detected. Please only use one Dam control per run -- specify the files to process on the command-line\n\n") if $damname; $damname = $index{$i} @@ -464,7 +540,7 @@ EOT } # Check that there's a Dam control sample ... - unless ($vars{'just_align'}) { + unless ($vars{'just_align'} || $vars{'just_coverage'}) { unless ($damname) { printout("\nError: no Dam control sample detected!\nPlease use include a filename starting with 'Dam', or specify the Dam control file to use with the --dam=[filename] switch\n"); exit 1; @@ -605,33 +681,36 @@ sub align_sequences { printout("\n*** Aligning files with bowtie2 ***\n"); foreach my $fn (keys %files) { + + my $samtools_pipe = "$vars{'samtools_path'}samtools view -Shb - > $fn.bam"; + if ($vars{'markdup'}) { + # additional processing to mark duplicate reads for ChIP-seq alignments + ## DO NOT USE for DamID read processing! + ## (as DamID uses PCR duplication to create signal!) + + $samtools_pipe = "$vars{'samtools_path'}samtools collate -\@$vars{'threads'} -O - | $vars{'samtools_path'}samtools fixmate -O bam,level=0 -m - - | $vars{'samtools_path'}samtools sort -O bam,level=0 -\@$vars{'threads'} - | $vars{'samtools_path'}samtools markdup -\@$vars{'threads'} -s - $fn.bam" + } + my $pair1 = $files{$fn}[0]; my $pair2 = $files{$fn}[1]; printout("Now working on $fn ...\n"); - if ($vars{'bowtie'}) { - #$bowtie_output{$fn} = $vars{'paired'} ? - #`$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -1 $pair1 -2 $pair2 -S $fn.sam 2>&1` : - #`$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -U $pair1 -S $fn.sam 2>&1`; - - #print "$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -U $pair1 -S 2>tmp.out | $vars{'samtools_path'}samtools view -Shb - > $fn.bam\n"; - - $vars{'paired'} ? - `$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -1 $pair1 -2 $pair2 2>tmp.out | $vars{'samtools_path'}samtools view -Shb - > $fn.bam` : - `$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} -U $pair1 2>tmp.out | $vars{'samtools_path'}samtools view -Shb - > $fn.bam`; - - open(my $tmp, '<', "tmp.out") || die "Cannot open temporary bowtie output for reading: $!\n"; - $bowtie_output{$fn} = join("",<$tmp>); - unlink("tmp.out"); - - unless ($bowtie_output{$fn}) { - printout("Error: no data returned from bowtie2. Exiting.\n\n"); - exit 1; - } - - printout("$bowtie_output{$fn}\n"); + $vars{'paired'} ? + `$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} $vars{'bowtie2_add_flags'} -1 $pair1 -2 $pair2 2>tmp.out | $samtools_pipe` : + `$vars{'bowtie2_path'}bowtie2 -p $vars{'threads'} -x $vars{'bowtie2_genome_dir'} $vars{'bowtie2_add_flags'} -U $pair1 2>tmp.out | $samtools_pipe`; + + open(my $tmp, '<', "tmp.out") || die "Cannot open temporary bowtie output for reading: $!\n"; + $bowtie_output{$fn} = join("",<$tmp>); + unlink("tmp.out"); + + unless ($bowtie_output{$fn}) { + printout("Error: no data returned from bowtie2. Exiting.\n\n"); + exit 1; } + + printout("$bowtie_output{$fn}\n"); + } } } @@ -650,12 +729,18 @@ sub extend_reads_full { my $c=0; my $seqs; while () { + if (/^\@/) { + print OUT; + next; + } + chomp; my ($qname, $flag, $chr, $pos, $mapq, $cigar, $rnext, $pnext, $tlen, $seq, $qual) = split('\s+'); $c++; unless ($c%100000) { - print " $c lines processed ...\r"; + my $lp = sprintf("%0.2f",$c / 1000000); + print " $lp"."M reads processed ...\r"; } unless ($seq) { @@ -668,13 +753,13 @@ sub extend_reads_full { $seqs++; my $readlen; - foreach my $cig ($cigar =~ m/(\d+)[M|I|S|=|X]/g) { + foreach my $cig ($cigar =~ m/(\d+)[M|D|N|=|X]/g) { $readlen += $cig; } $cigar="$vars{'len'}M"; - if ($flag == 16) { + if ($flag & 16) { # minus strand $pos -= $vars{'len'} - $readlen; $pos = max(1,$pos); @@ -726,7 +811,7 @@ sub extend_reads_gatc { $c++; unless ($c%100000) { my $lp = sprintf("%0.2f",$c / 1000000); - print " $lp"."M lines processed ...\r"; + print " $lp"."M reads processed ...\r"; } next unless $seq; @@ -741,7 +826,8 @@ sub extend_reads_gatc { $seqs++; my $readlen; - foreach my $cig ($cigar =~ m/(\d+)[M|I|S|=|X]/g) { + # used to use [M|I|S|=|X], but this is length of seq, not length of template + foreach my $cig ($cigar =~ m/(\d+)[M|D|N|=|X]/g) { $readlen += $cig; } @@ -750,7 +836,7 @@ sub extend_reads_gatc { # find next GATC after reads # It takes far too long to grep an entire chromosome's worth of GATC fragments for GATC sites internal to the extension length. The following is an inelegant but very effective solution for speeding the process up: essentially, GATC fragments are binned into a hash when read in, and the hash is used as a lookup table. There may be issues with long extension lengths, but it's fairly versatile. - if ($flag == 16) { + if ($flag & 16) { ### minus strand my $fiveprime_search = $pos - ($vars{'len'} - $readlen); @@ -818,9 +904,6 @@ sub extend_reads_gatc { print LOG " Warning: alignment contains chromosome identities not found in GATC file:\n $missing\n\n"; } - # the original SAM files are huge, so we nuke them - #unlink("$fn.sam") unless $vars{'keep_sam'}; - printout(" Seqs extended (>q".$vars{'q'}.") = $seqs\n\n"); unlink("$fn.bam") unless $vars{'keep_original_bams'}; } @@ -866,6 +949,15 @@ sub calc_bins { next; } + # Filter out all additional comments + next if $qname =~ m/^@/; + + # Remove PCR duplicates if requested + next if $vars{'remdups'} && ($flag & 1024); + + # Remove non-uniquely mapping reads if requested + next if $vars{'unique'} && $mapq <= 1; + next if $cigar eq "*"; my ($pstart, $pend); @@ -879,7 +971,7 @@ sub calc_bins { } else { # single ends my $readlen; - foreach my $cig ($cigar =~ m/(\d+)[M|I|S|=|X]/g) { + foreach my $cig ($cigar =~ m/(\d+)[M|D|N|=|X]/g) { $readlen += $cig; } $pstart = $pos; @@ -915,7 +1007,7 @@ sub calc_bins { exit 1; } - if ($n eq $damname) { + if ($damname && ($n eq $damname)) { $denom = $counts{$n}; } @@ -929,33 +1021,37 @@ sub calc_bins { exit 1; } - # sanity check on chromosome names - unless ( map {my $t = $_; grep {$_ eq $t} keys(%cov)} keys(%gatc) ) { - if ( map {s/chr//gi; my $t = $_; grep {s/chr//gi; $_ eq $t} keys(%cov)} keys(%gatc) ) { - printout(" Warning: chromosome name mismatch detected -- removing 'chr' from definitions and continuing ...\n"); - if (grep {m/chr/} keys %cov) { - my %new_cov; - @new_cov{ map {s/chr//; $_} keys %cov} = values %cov; - %cov = %new_cov; - - my %new_bases; - @new_bases{ map {s/chr//; $_} keys %bases} = values %bases; - %bases = %new_bases; - - } elsif (grep {m/chr/} keys %gatc) { - my %new_gatc; - @new_gatc{ map {s/chr//; $_} keys %gatc} = values %gatc; - %gatc = %new_gatc; + # sanity check on chromosome names if using GATC file + if ($vars{'as_gatc'}) { + unless ( map {my $t = $_; grep {$_ eq $t} keys(%cov)} keys(%gatc) ) { + if ( map {s/chr//gi; my $t = $_; grep {s/chr//gi; $_ eq $t} keys(%cov)} keys(%gatc) ) { + printout(" Warning: chromosome name mismatch detected -- removing 'chr' from definitions and continuing ...\n"); + if (grep {m/chr/} keys %cov) { + my %new_cov; + @new_cov{ map {s/chr//; $_} keys %cov} = values %cov; + %cov = %new_cov; + + my %new_bases; + @new_bases{ map {s/chr//; $_} keys %bases} = values %bases; + %bases = %new_bases; + + } elsif (grep {m/chr/} keys %gatc) { + my %new_gatc; + @new_gatc{ map {s/chr//; $_} keys %gatc} = values %gatc; + %gatc = %new_gatc; + } else { + printout(" Error: can't find a 'chr' key definition in either hash (please contact the author for support)\n"); + exit 1; + } } else { - printout(" Error: can't find a 'chr' key definition in either hash (please contact the author for support)\n"); - exit 1; + printout(" Error: chromsome definitions between bam and GATC file do not match!\n"); } - } else { - printout(" Error: chromsome definitions between bam and GATC file do not match!\n"); } } my $read_bins=0; + my $real_reads_mil = $reads/1e6; + foreach my $chr (keys %cov) { next unless $bases{$chr}; for (my $bin = 0; $bin < $bases{$chr}+$vars{'bins'}; $bin += $vars{'bins'}) { @@ -963,6 +1059,7 @@ sub calc_bins { $read_bins++; my $end = $bin+($vars{'bins'}-1); my $score = $cov{$chr}{$bin} || 0; + $score /= $real_reads_mil if $vars{'scores_as_rpm'}; push @{$gff{$chr}}, [$bin, $end, $score]; if ($vars{'full_data_files'}) { @@ -978,6 +1075,8 @@ sub calc_bins { my $index = 0; my @non_chrs; + next unless $vars{'as_gatc'}; + printout(" Converting to GATC resolution ... \n"); foreach my $chr (keys %gff) { unless ($gatc{$chr}) { @@ -1057,19 +1156,21 @@ sub normalize { $norm = $vars{'norm_override'}; } elsif ($vars{'norm_method'} eq 'rpm') { printout(" (using RPM normalisation ...)\n"); - $norm = $counts{$damname}/$counts{$n}; + $norm = $vars{"scores_as_rpm"} ? 1 : $counts{$damname}/$counts{$n}; printout(" ... normalising by ".sprintf("%0.2f",$norm)."\n"); } else { $norm = $norm_factors{$n}; printout(" ... normalising by ".sprintf("%0.2f",$norm)."\n"); } - foreach my $chr (keys %array) { - foreach my $bin (keys %{ $array{$chr}}) { - if (defined($array{$chr}{$bin}{$n})) { - $array{$chr}{$bin}{$n} *= $norm; - } else { - $array{$chr}{$bin}{$n} = 0; + if ($vars{'as_gatc'}) { + foreach my $chr (keys %array) { + foreach my $bin (keys %{ $array{$chr}}) { + if (defined($array{$chr}{$bin}{$n})) { + $array{$chr}{$bin}{$n} *= $norm; + } else { + $array{$chr}{$bin}{$n} = 0; + } } } } @@ -1102,11 +1203,118 @@ sub generate_ratio { printout("Now working on $n ...\n"); - write_file($n, 1); + write_file($n, 1) if $vars{'as_gatc'}; write_file($n, 0) if $vars{'full_data_files'}; } } +sub generate_raw_files { + printout("\n*** Writing raw files ***\n"); + foreach my $n (keys %files) { + next if $n eq $damname; + + printout("Now working on $n ...\n"); + + write_raw_bins($n, 1) if $vars{'as_gatc'}; + write_raw_bins($n, 0) if $vars{'full_data_files'}; + } +} + +sub generate_cov_files { + printout("\n*** Writing coverage files ***\n"); + foreach my $n (keys %files) { + printout("Now working on $n ...\n"); + my $write_gatc = $vars{'as_gatc'} ? 1 : 0; + write_cov($n, $write_gatc); + } +} + +sub calc_pseudocounts { + my ($n,$psc_min ) = @_; + my $ps = $vars{'ps_factor'}*$psc_min/$bins{$n}; + $ps /= $psc_min/1e6 if $vars{'scores_as_rpm'}; + my $pseudocounts = $vars{'pseudocounts'} ? $vars{'pseudocounts'} : $ps; + return($pseudocounts); +} + +sub write_raw_bins { + my ($n, $write_gatcs) = @_; + my %data; + + # find the lowest number of compared counts -- this will now use a different number of pseudocounts per sample + my $psc_min = ($counts{$n}, $denom)[$counts{$n} > $denom]; + my $pseudocounts = calc_pseudocounts($n,$psc_min); # pseudocounts value is related to total reads/number of bins + + my $dref = ($write_gatcs ? \%array : \%full_tracks); + + printout(" Reading Dam ...\n"); + foreach my $chr (keys %$dref) { + foreach my $start (keys %{ $dref -> {$chr}}) { + my $score = $dref -> {$chr} -> {$start} -> {$damname}; + my $end = $dref -> {$chr} -> {$start} -> {'end'}; + push @{ $data{$chr}{$start}}, $end; + push @{ $data{$chr}{$start}}, $score; + } + } + + printout(" Reading $n ...\n"); + foreach my $chr (keys %$dref) { + foreach my $start (keys %{ $dref -> {$chr}}) { + my $score = $dref -> {$chr} -> {$start} -> {$n}; + push @{ $data{$chr}{$start}}, $score; + } + } + + my $track_type = $write_gatcs ? 'gatc' : ''; + my $protname = $vars{'out_name'} ? $vars{'out_name'} : $n; + + my $fname = "$protname-vs-Dam.$track_type.raw"; + my $fout="$fname.tsv"; + + printout("\nWriting raw data to file ...\n"); + + open (OUT, '>', $fout) || die "Unable to open $fout for writing: $!\n"; + print OUT join("\t", split(/\s/,"chr start end Dam $protname") ), "\n"; # header + + foreach my $chr (sort keys %data) { + foreach my $start (sort {$a <=> $b} keys %{ $data{$chr}}) { + my ($end, $score1, $score2) = @{ $data{$chr}{$start}}; + print OUT join("\t", $chr, $start, $end, $score1 + $pseudocounts, $score2 + $pseudocounts), "\n"; + } + } + close OUT; +} + +sub write_cov { + my ($n, $write_gatcs) = @_; + + my %data; + + my $dref = ($write_gatcs ? \%array : \%full_tracks); + + my $track_type = $write_gatcs ? 'gatc.' : "$vars{'bins'}bp."; + my $protname = $vars{'out_name'} ? $vars{'out_name'} : $n; + my $fname = "$protname.$track_type"; + my $fout=$fname.( $vars{'output_format'} =~ m/^bed/ ? 'bedgraph' : 'gff' ); + + open (my $OUT, '>', $fout) || die "Unable to open $fout for writing: $!\n"; + print $OUT qq(track type=bedGraph name="$protname" description="$protname DamIDseq Coverage"\n) if $vars{'output_format'} =~ m/^bed/; + + open(my $FAILED, '>', "regions_without_scores.txt") || die "Unable to open failed_regions file for writing: $!\n"; + + foreach my $chr (sort keys %$dref) { + foreach my $start (sort {$a <=> $b} keys %{ $dref -> {$chr}}) { + my $score = $dref -> {$chr} -> {$start} -> {$n}; + my $end = $dref -> {$chr} -> {$start} -> {'end'}; + unless (defined($score)) { + print $FAILED "$chr:$start-$end\n"; + next; + } + print $OUT join("\t", $chr, $start, $end, $score), "\n"; + } + } +} + sub write_file { my ($n, $write_gatcs) = @_; @@ -1115,7 +1323,7 @@ sub write_file { # find the lowest number of compared counts -- this will now use a different number of pseudocounts per sample my $psc_min = ($counts{$n}, $denom)[$counts{$n} > $denom]; - my $pseudocounts = ($vars{'pseudocounts'} ? $vars{'pseudocounts'} : $vars{'ps_factor'}*$psc_min/$bins{$n}); # pseudocounts value is related to total reads/number of bins + my $pseudocounts = calc_pseudocounts($n,$psc_min); # pseudocounts value is related to total reads/number of bins my $dref = ($write_gatcs ? \%array : \%full_tracks); @@ -1140,18 +1348,22 @@ sub write_file { my $track_type = $write_gatcs ? 'gatc.' : ''; my $name_spacer = $vars{'method_subtract'} ? ".sub." : "."; my $protname = $vars{'out_name'} ? $vars{'out_name'} : $n; + my $normtype = "$vars{'norm_method'}-norm."; - my $fname = "$protname-vs-Dam$name_spacer$track_type"; + my $fname = "$protname-vs-Dam$name_spacer$normtype$track_type"; my $fout=$fname.( $vars{'output_format'} =~ m/^bed/ ? 'bedgraph' : 'gff' ); - my $print_psc = sprintf("%0.2f",$pseudocounts); - printout(" ... adding $print_psc pseudocounts to each sample\n"); + if ($vars{'scores_as_rpm'}) { + printout(sprintf(" ... adding %0.2fe-3 pseudocounts to each sample\n",$pseudocounts*1e3)); + } else { + printout(sprintf(" ... adding %0.2f pseudocounts to each sample\n",$pseudocounts)); + } if ($vars{'ps_debug'}) { if ($vars{'pseudocounts'}) { printout " (Pseudocounts set via --pseudocounts=$vars{'pseudocounts'} command-line option"; } else { - printout " ($print_psc pseudocounts = $vars{'ps_factor'} [ps_factor] * $psc_min [min_reads] / $bins{$n} [bins])"; + printout sprintf(" (%0.2f pseudocounts = %f [ps_factor] * %f [min_reads] / %f [bins])",$pseudocounts,$vars{'ps_factor'},$psc_min,$bins{$n}); } } @@ -1196,6 +1408,25 @@ sub generate_score { return $score; } +sub find_norm_factor_raw { + printout("*** Calculating normalisation factor (rawbins method) ***\n"); + + generate_raw_files(); + + foreach my $n (keys %files) { + next if $n eq $damname; + my $protname = $vars{'out_name'} ? $vars{'out_name'} : $n; + + printout("Now working on $n ...\n"); + + my $norm = `$vars{'rawbins_cmd'} $protname-vs-Dam.gatc.raw.tsv`; + + printout(" Norm factor = ".sprintf("%0.2f",$norm)."\n"); + $norm_factors{$n}=$norm; + } +} + + sub find_norm_factor { printout("*** Calculating normalisation factor ***\n"); @@ -1333,15 +1564,18 @@ sub stdev { sub kden { # gaussian kernel density estimator + # optionally and ideally uses Inline::C to speed up central calculations my @x = @_; my $s = 0; my $n = $#x; + my $sq2pi = sqrt(2*$pi); # KDE is measured over an equidistant grid: my $xmin = max((min(@x))[1],$vars{'min_norm_value'}); my $xmax = min((max(@x))[1],$vars{'max_norm_value'}); my $steps = $vars{'norm_steps'}; + my $ys = ($xmax-$xmin)/$steps; ## bandwidth via Silverman's estimator (eq 3.31 in Silverman 1986) my ($mean, $sd) = stdev(@x); @@ -1350,22 +1584,24 @@ sub kden { ## kernel density my @sample; foreach my $st (0 .. $steps) { - my $y = $xmin + $st*($xmax-$xmin)/$steps; - my $pc = sprintf("%0.0f",100*$st/$steps); - print STDERR " Fitting kernel density $pc% complete ...\r"; - - my $s = 0; - for my $i (0..$n) { - $s += exp(-1/2*(($x[$i]-$y)/$h)**2)/(sqrt(2*$pi)); - } - my $fh = $s/($n*$h); - + print STDERR sprintf(" Fitting kernel density: %0.1f%% complete ...\r", 100*$st/$steps) unless $st%5; + my $y = $xmin + $st*$ys; + my $fh = &$kden_sr($y, $h, $sq2pi, \@x); push @sample, [$y, $fh]; } - return (@sample); } +sub kden_calc_perl { + # Fallback code if Inline::C not present ... + my ($y, $h, $sq2pi, $xr) = @_; + my $s = 0; + my $n = $#$xr; + map { $s += exp(-1/2*((@$xr[$_]-$y)/$h)**2)/$sq2pi} (0..$n); + my $fh = $s/($n*$h); + return($fh); +} + sub iqr { my @x = @_; @x = sort {$a <=> $b} @x; @@ -1586,4 +1822,40 @@ sub uniq { return unless @_; my %seen; return grep { !$seen{$_}++ } @_; -} \ No newline at end of file +} + + +############################### +### Begin block for Inline_C +### + +BEGIN { + eval { + require Inline::C; + Inline->import('C' => q { + #include + double kden_calc (double y, double h, double sq2pi, SV* terms) { + int n = 0; + + /* Make sure we have an array ref with values */ + if ((!SvROK(terms)) + || (SvTYPE(SvRV(terms)) != SVt_PVAV) + || ((n = av_len((AV *)SvRV(terms))) < 0)) return 0; + + double result = 0; + + for (int i = 0; i <= n; i++) { + result += exp( (pow((( SvNV(* av_fetch((AV *)SvRV(terms), i, 0)) - y) / h),2) * -0.5) ) / sq2pi; + } + + return result / (n * h); + } + }); + $kden_sr = \&kden_calc; + 1; + } or do { + print STDERR "Inline::C not found; perl fallback used instead\n"; + $kden_sr = \&kden_calc_perl; + $no_inline_C = 1; + } +}