diff --git a/CHANGELOG.md b/CHANGELOG.md index 6bc6537..ca9958f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ## CARLISLE development version -- Bug fixes (#127, @epehrsson) +- Bug fixes: (#127, @epehrsson) - Removes single-sample group check for DESeq. - Increases memory for DESeq. - Ensures control replicate number is an integer. @@ -8,9 +8,12 @@ - Fixes `no_dedup` variable names in library normalization scripts. - Containerize rules that require R (`deseq`, `go_enrichment`, and `spikein_assessment`) to fix installation issues with common R library path. (#129, @kelly-sovacool) - The `Rlib_dir` and `Rpkg_config` config options have been removed as they are no longer needed. +- New visualizations: (#132, @epehrsson) + - New rules `cov_correlation`, `homer_enrich`, `combine_homer`, `count_peaks` + - Add peak caller to MACS2 peak xls filename - New parameters in the config file to make certain rules optional: (#133, @kelly-sovacool) - GO enrichment is controlled by `run_go_enrichment` (default: `false`) - - rose is controlled by `run_rose` (default: `false`) + - ROSE is controlled by `run_rose` (default: `false`) - Fig bug that added nonexistent directories to the singularity bind paths. (#135, @kelly-sovacool) ## CARLISLE v2.5.0 diff --git a/config/config.yaml b/config/config.yaml index dfff518..f2c5bee 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -164,4 +164,4 @@ adapters: "PIPELINE_HOME/resources/other/adapters.fa" ##################################################################################### containers: base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v6" - carlisle_r: "docker://nciccbr/carlisle_r:v1" + carlisle_r: "docker://nciccbr/carlisle_r:v2" \ No newline at end of file diff --git a/docker/carlisle_r/Dockerfile b/docker/carlisle_r/Dockerfile index 972f18f..d1be7b4 100644 --- a/docker/carlisle_r/Dockerfile +++ b/docker/carlisle_r/Dockerfile @@ -9,13 +9,13 @@ ARG REPONAME="000000" ENV REPONAME=${REPONAME} # install conda packages -COPY packages.txt /data2/ -RUN mamba install \ - --no-channel-priority \ - -c bioconda -c conda-forge -c r \ - --file /data2/packages.txt -ENV PATH="/opt2/conda/bin:$PATH" +COPY environment.yml /data2/ +ENV CONDA_ENV=carlisle +RUN mamba env create -n ${CONDA_ENV} -f /data2/environment.yml && \ + echo "conda activate ${CONDA_ENV}" > ~/.bashrc +ENV PATH="/opt2/conda/envs/${CONDA_ENV}/bin:$PATH" ENV R_LIBS_USER=/opt2/conda/lib/R/library/ + # install ELBOW manually, fails with mamba RUN wget --no-check-certificate https://bioconductor.riken.jp/packages/3.4/bioc/src/contrib/ELBOW_1.10.0.tar.gz && \ R -e 'install.packages("ELBOW_1.10.0.tar.gz", repos = NULL, type="source", INSTALL_opts = "--no-lock")' diff --git a/docker/carlisle_r/environment.yml b/docker/carlisle_r/environment.yml new file mode 100644 index 0000000..cf26bbe --- /dev/null +++ b/docker/carlisle_r/environment.yml @@ -0,0 +1,38 @@ +channels: + - bioconda + - conda-forge + - r +dependencies: + - bioconductor-bsgenome.hsapiens.ncbi.t2t.chm13v2.0 + - bioconductor-chipenrich + - bioconductor-chipseeker + - bioconductor-ComplexHeatmap + - bioconductor-deseq2 + - bioconductor-edger + - bioconductor-enhancedvolcano + - bioconductor-genomicfeatures + - bioconductor-htsfilter + - bioconductor-org.Hs.eg.db + - bioconductor-org.Mm.eg.db + - bioconductor-rtracklayer + - bioconductor-txdb.hsapiens.ucsc.hg19.knowngene + - bioconductor-TxDb.Hsapiens.UCSC.hg38.knownGene + - bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene + - deeptools + - r-argparse + - r-circlize + - r-DT + - 
r-ggfortify + - r-ggvenn + - r-htmltools + - r-latticeextra + - r-openxlsx + - r-pander + - r-pdp + - r-plotly + - r-plyr + - r-rcolorbrewer + - r-reshape2 + - r-tidyverse + - r-xfun>=0.43 + - r-yaml diff --git a/docker/carlisle_r/meta.yml b/docker/carlisle_r/meta.yml index 9bfd934..f6131a5 100644 --- a/docker/carlisle_r/meta.yml +++ b/docker/carlisle_r/meta.yml @@ -1,4 +1,4 @@ dockerhub_namespace: nciccbr image_name: carlisle_r -version: v1 +version: v2 container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/docker/carlisle_r/packages.txt b/docker/carlisle_r/packages.txt deleted file mode 100644 index fdf0456..0000000 --- a/docker/carlisle_r/packages.txt +++ /dev/null @@ -1,28 +0,0 @@ -bioconductor-bsgenome.hsapiens.ncbi.t2t.chm13v2.0 -bioconductor-chipenrich -bioconductor-chipseeker -bioconductor-deseq2 -bioconductor-edger -bioconductor-enhancedvolcano -bioconductor-genomicfeatures -bioconductor-htsfilter -bioconductor-org.Hs.eg.db -bioconductor-org.Mm.eg.db -bioconductor-rtracklayer -bioconductor-txdb.hsapiens.ucsc.hg19.knowngene -bioconductor-TxDb.Hsapiens.UCSC.hg38.knownGene -bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene -r-argparse -r-DT -r-ggfortify -r-ggvenn -r-htmltools -r-latticeextra -r-pander -r-pdp -r-plotly -r-rcolorbrewer -r-reshape2 -r-tidyverse -r-xfun>=0.43 -r-yaml \ No newline at end of file diff --git a/resources/cluster_biowulf.yaml b/resources/cluster_biowulf.yaml index 746e83a..4d83b6e 100644 --- a/resources/cluster_biowulf.yaml +++ b/resources/cluster_biowulf.yaml @@ -78,3 +78,5 @@ DESeq: time: 00-02:00:00 threads: 2 ################################################################### +cov_correlation: + threads: 4 diff --git a/workflow/Snakefile b/workflow/Snakefile index aa7fd79..9654dd4 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -144,6 +144,26 @@ def get_motifs(wildcards): files.extend(anno_s) return files +def get_enrich(wildcards): + files=[] + if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): + anno_m=expand(join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png"),peak_caller="macs2",qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M), + files.extend(anno_m) + if ("gopeaks_narrow" in PEAKTYPE) or ("gopeaks_broad" in PEAKTYPE): + anno_g=expand(join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png"),peak_caller="gopeaks",qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_G), + files.extend(anno_g) + if ("seacr_stringent" in PEAKTYPE) or ("seacr_relaxed" in PEAKTYPE): + anno_s=expand(join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png"),peak_caller="seacr",qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_S), + files.extend(anno_s) + return files + +def get_combined(wildcards): + files = [] + if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): + combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M) + files.extend(combined_m) + return files + def get_rose(wildcards): files=[] if config['run_rose']: @@ -200,6 +220,12 @@ rule all: # ALIGN / deeptools_heatmap unpack(run_deeptools_heatmap), + # ALIGN / deeptools_corr + 
expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.tab"),dupstatus=DUPSTATUS), + expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PCA.tab"),dupstatus=DUPSTATUS), + expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.Pearson_heatmap.png"),dupstatus=DUPSTATUS), + expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png"),dupstatus=DUPSTATUS), + # ALIGN / create_library_norm_scales unpack(run_library_norm), @@ -213,6 +239,8 @@ rule all: unpack(run_macs2), unpack(run_seacr), unpack(run_gopeaks), + join(RESULTSDIR,"peaks","Peak counts.xlsx"), + join(RESULTSDIR,"peaks","all.peaks.txt"), # QC unpack(run_qc), @@ -222,6 +250,8 @@ rule all: # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment unpack(get_motifs), + unpack(get_enrich), + unpack(get_combined), unpack(get_rose), unpack(get_enrichment) @@ -292,4 +322,4 @@ onerror: expand(join(RESULTSDIR,"deeptools","clean", "{group}.{dupstatus}.metagene.mat.gz"),group=[a+b for a in TREATMENTS+["all_samples"] for b in ["", ".prot"]],dupstatus=DUPSTATUS), expand(join(RESULTSDIR,"deeptools","clean", "{group}.{dupstatus}.TSS.mat.gz"),group=[a+b for a in TREATMENTS+["all_samples"] for b in ["", ".prot"]],dupstatus=DUPSTATUS), expand(join(RESULTSDIR,"deeptools","clean", "{group}.{dupstatus}.geneinfo.bed"),group=[a+b for a in TREATMENTS+["all_samples"] for b in ["", ".prot"]],dupstatus=DUPSTATUS), -""" \ No newline at end of file +""" diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 7a0b8c9..ca3e533 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -468,4 +468,48 @@ rule deeptools_heatmap: plotHeatmap -m {input.TSSmat} -out {output.TSSheat} --colorMap 'RdBu_r' --zMin auto --zMax auto --yAxisLabel 'average RPGC' --regionsLabel 'genes' --legendLocation 'none' plotProfile -m {input.metamat} -out {output.metaline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-right plotProfile -m {input.TSSmat} -out {output.TSSline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-left - """ \ No newline at end of file + """ + +rule cov_correlation: + """ + Create replicate correlation plots from filtered BAM files + """ + input: + bams=expand(join(RESULTSDIR,"bam","{replicate}.{{dupstatus}}.bam"),replicate=REPLICATES), + align_table=join(RESULTSDIR,"alignment_stats","alignment_stats.tsv") + output: + counts=join(RESULTSDIR,"deeptools","all.{dupstatus}.readCounts.npz"), + pearson_corr=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.tab"), + pearson_plot=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.png"), + pca=join(RESULTSDIR,"deeptools","all.{dupstatus}.PCA.tab"), + hc=join(RESULTSDIR,"deeptools","all.{dupstatus}.Pearson_heatmap.png"), + pca_format=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png") + params: + rscript=join(SCRIPTSDIR,"_plot_correlation.R"), + dupstatus="{dupstatus}" + container: config['containers']['carlisle_r'] + threads: getthreads("cov_correlation") + shell: + """ + # Calculate genome-wide coverage + multiBamSummary bins \ + --bamfiles {input.bams} \ + --smartLabels \ + -out {output.counts} \ + -p {threads} + + # Plot heatmap - Pearson + plotCorrelation \ + -in {output.counts} \ + --corMethod pearson --skipZeros \ + --whatToPlot heatmap --plotNumbers \ + -o {output.pearson_plot} \ + --removeOutliers \ + --outFileCorMatrix {output.pearson_corr} + + # Plot PCA + plotPCA -in {output.counts} --outFileNameData 
{output.pca} + + # Plot heatmap and PCA (formatted) + Rscript {params.rscript} {output.pearson_corr} {output.pca} {input.align_table} {params.dupstatus} {output.hc} {output.pca_format} + """ diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 389b18c..966e0b9 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -18,7 +18,7 @@ def get_peak_file(wildcards): bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"gopeaks","peak_output",wildcards.treatment_control_list + "." + wildcards.dupstatus + ".broad.peaks.bed") return bed -localrules: create_contrast_peakcaller_files +localrules: create_contrast_peakcaller_files, homer_enrich, combine_homer rule findMotif: """ Developed from code: https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk @@ -51,7 +51,7 @@ rule findMotif: else annotatePeaks.pl {input.peak_file} {params.genome} -annStats {output.annotation_summary} > {output.annotation} fi - + # hs1 is not part of HOMER's config genome db. Must add it as a separate param if [[ {params.genome} == "hs1" ]]; then findMotifsGenome.pl {input.peak_file} {params.fa} {params.outDir} -size {params.motif_size} -p {threads} -preparsedDir {params.preparsedDir} @@ -60,12 +60,58 @@ rule findMotif: fi """ +def get_annotation_files(wildcards): + """ + treatment_control_list depends on the peak caller + """ + return expand(join(RESULTSDIR,"peaks", wildcards.qthresholds, wildcards.peak_caller, "annotation","homer", + "{treatment_control_list}" + "." + wildcards.dupstatus + "." + wildcards.peak_caller_type + ".annotation.summary"), + treatment_control_list = TREATMENT_LIST_M if wildcards.peak_caller.startswith('macs2') else TREATMENT_LIST_SG) + + +rule homer_enrich: + """ + Plot enrichment over genic features + """ + input: + annotation_summary=get_annotation_files + output: + enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png") + params: + annotation_dir=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer"), + peak_mode="{peak_caller_type}", + dupstatus="{dupstatus}", + rscript=join(SCRIPTSDIR,"_plot_feature_enrichment.R") + container: config['containers']['carlisle_r'] + shell: + """ + Rscript {params.rscript} {params.annotation_dir} {params.peak_mode} {params.dupstatus} {output.enrich_png} + """ + +rule combine_homer: + """ + Add MACS2 q-value and FC to HOMER peak annotation + """ + input: + annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"), + peaks_file=join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls") + output: + combined_tsv=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"), + combined_xlsx=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx") + container: config['containers']['carlisle_r'] + params: + rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R") + shell: + """ + Rscript {params.rscript} {input.peaks_file} {input.annotation} {output.combined_tsv} {output.combined_xlsx} + """ + rule rose: """ - Developed from code: + Developed from code: 
https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk - outdated verison of rose: https://github.com/younglab/ROSE - + outdated version of rose: https://github.com/younglab/ROSE + # SEACR bed file output format <1> <2> <3> <4> <5> <6> @@ -79,8 +125,8 @@ rule rose: https://github.com/macs3-project/MACS/blob/master/docs/callpeak.md <1> <2> <3> <4> <5> <6> <7> <8> <9> <10> <-log10pvalue> <-log10qvalue> - ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. - ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100) + ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. + ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100) ### by using the following 1-liner awk: awk -v OFS="\t" '{$5=$5>1000?1000:$5} {print}' NAME_peaks.narrowPeak ## broadPeak does not have 10th column ### Since in the broad peak calling mode, the peak summit won't be called, the values in the 5th, and 7-9th columns are the mean value across all positions in the peak region @@ -132,20 +178,20 @@ rule rose: """ # set tmp set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) TMPDIR="/dev/shm/$dirname" mkdir -p $TMPDIR fi - + # set ROSE specific paths PATHTO=/usr/local/apps/ROSE/1.3.1 PYTHONPATH=/usr/local/apps/ROSE/1.3.1/src/lib export PYTHONPATH export PATH=$PATH:$PATHTO/bin - + # pull treatment and control ids treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'` control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'` @@ -159,11 +205,11 @@ rule rose: samtools view -b ${{treat_bam}} {params.regions} > $TMPDIR/subset.bam samtools index $TMPDIR/subset.bam grep -v "NC_" {input.peak_file} > $TMPDIR/subset.bed - + # prep for ROSE # macs2 output is prepared for ROSE formatting # seacr and gopeaks must be edited to correct for formatting - + ## correct GOPEAKS ### original: ### output: <$sampleid_uniquenumber> <0> <.> @@ -174,7 +220,7 @@ rule rose: nl --number-format=rz --number-width=3 $TMPDIR/subset.bed | awk -v sample_id="${{treatment}}_" \'{{print sample_id$1"\\t0\\t."}}\' > $TMPDIR/col.txt paste -d "\t" $TMPDIR/save.bed $TMPDIR/col.txt > $TMPDIR/subset.bed fi - + ## correct SEACR ### original: ### output: <$sampleid_uniquenumber> <.> @@ -196,11 +242,11 @@ rule rose: if [[ ${{num_of_peaks}} -gt 5 ]]; then echo "## More than 5 usable peaks detected ${{num_of_peaks}} - Running rose" cd {params.workdir} - + # if macs2 control is off, there will be no macs2 control to annotate if [[ {params.control_flag} == "N" ]] & [[ {params.peak_caller_type} == "macs2_narrow" ]] || [[ {params.peak_caller_type} == "macs2_broad" ]]; then echo "#### No control was used" - rose_files="$TMPDIR/subset.bam" + rose_files="$TMPDIR/subset.bam" else echo "#### A control used ${{cntrl_bam}}" rose_files="$TMPDIR/subset.bam ${{cntrl_bam}}" @@ -213,7 +259,7 @@ rule rose: -r ${{rose_files}} \ -t {params.tss_distance} \ -s {params.stitch_distance} \ - -o {params.file_base} + -o {params.file_base} else 
ROSE_main.py \ -i {output.no_tss_bed} \ @@ -223,16 +269,16 @@ rule rose: -s {params.stitch_distance} \ -o {params.file_base} fi - + # rose to bed file echo "## Convert bed" # developed from https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/scripts/roseTable2Bed.sh grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==0 {{for(i=2; i<=NF; i++){{printf $i; printf (i $TMPDIR/regular bedtools sort -i $TMPDIR/regular > {output.regular} - + grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==1 {{for(i=2; i<=NF; i++){{printf $i; printf (i $TMPDIR/super bedtools sort -i $TMPDIR/super > {output.super} - + # cut rose output files, create summits echo "## Cut and summit" cut -f1-3 {output.regular} > {output.regular_great} @@ -303,7 +349,7 @@ if config["run_contrasts"]: sample_list=`awk '{{print $3}}' {input.contrast_file}` clean_sample_list=`echo $sample_list | sed "s/\s/xxx/g"` - # rum script + # rum script Rscript {params.rscript_wrapper} \\ --rmd {params.rmd} \\ --carlisle_functions {params.carlisle_functions} \\ diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk index fb98690..173fd12 100644 --- a/workflow/rules/peakcalls.smk +++ b/workflow/rules/peakcalls.smk @@ -8,6 +8,32 @@ # 6)bed # /results/fragments/53_H3K4me3_1.dedup.fragments.bed +localrules: count_peaks + +def get_all_peak_files(wildcards): + files=[] + if "macs2_narrow" in PEAKTYPE: + n=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS), + files.extend(n) + if "macs2_broad" in PEAKTYPE: + b=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS), + files.extend(b) + if "seacr_stringent" in PEAKTYPE: + s=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","peak_output","{treatment_control_list}.{dupstatus}.stringent.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS), + files.extend(s) + if "seacr_relaxed" in PEAKTYPE: + r=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","peak_output","{treatment_control_list}.{dupstatus}.relaxed.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS), + files.extend(r) + if "gopeaks_narrow" in PEAKTYPE: + n=expand(join(RESULTSDIR,"peaks","{qthresholds}","gopeaks","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS), + files.extend(n) + if "gopeaks_broad" in PEAKTYPE: + b=expand(join(RESULTSDIR,"peaks","{qthresholds}","gopeaks","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS), + files.extend(b) + + files_list=list(itertools.chain.from_iterable(files)) + return files_list + rule macs2_narrow: ''' MACS2 can be run with and without a control. 
This featured is controlled in the config.yaml filen @@ -20,7 +46,7 @@ rule macs2_narrow: peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"), summit_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.summits.bed"), bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bigbed.gz"), - xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.xls"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_narrow.peaks.xls"), params: frag_bed_path=join(RESULTSDIR,"fragments"), tc_file="{treatment_control_list}", @@ -109,7 +135,7 @@ rule macs2_broad: output: peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"), bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bigbed.gz"), - xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.xls"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_broad.peaks.xls"), params: frag_bed_path=join(RESULTSDIR,"fragments"), tc_file="{treatment_control_list}", @@ -432,4 +458,20 @@ rule gopeaks_broad: cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/broad.bed bedToBigBed -type=bed3 ${{TMPDIR}}/broad.bed {input.genome_len} ${{TMPDIR}}/broad.bigbed bgzip -c ${{TMPDIR}}/broad.bigbed > {output.bg_file} - """ \ No newline at end of file + """ + +rule count_peaks: + input: + peaks=get_all_peak_files + output: + peak_count=join(RESULTSDIR,"peaks","all.peaks.txt"), + peak_table=join(RESULTSDIR,"peaks","Peak counts.xlsx"), + params: + outdir=join(RESULTSDIR,"peaks"), + rscript=join(SCRIPTSDIR,"_plot_peak_counts.R") + container: config['containers']['carlisle_r'] + shell: + """ + wc -l {input.peaks} > {output.peak_count} + Rscript {params.rscript} {output.peak_count} {params.outdir} + """ diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R new file mode 100644 index 0000000..88b8d1b --- /dev/null +++ b/workflow/scripts/_combine_macs2_homer.R @@ -0,0 +1,28 @@ +# Add peak statistics to HOMER annotations file + +library(openxlsx) +library(readr) +library(dplyr) +library(tidyr) + +args <- commandArgs(trailingOnly = TRUE) + +peaks_file <- args[1] +homer_file <- args[2] +combined_tsv <- args[3] +combined_xlsx <- args[4] + +# Load peak file +# the file extension is '.xls' but it's actually just a tsv file +peaks <- read_tsv(peaks_file, comment = "#") %>% + select(c("name", "fold_enrichment", "-log10(qvalue)")) + +# Load HOMER annotations +homer <- read_tsv(homer_file) %>% + # rename first column to match peaks file + rename(name = 1) + +# Combine tables and write out +combined <- full_join(homer, peaks, by = "name") +write_tsv(combined, file = combined_tsv) +openxlsx::write.xlsx(combined, file = combined_xlsx) diff --git a/workflow/scripts/_plot_correlation.R b/workflow/scripts/_plot_correlation.R new file mode 100644 index 0000000..3727320 --- /dev/null +++ b/workflow/scripts/_plot_correlation.R @@ -0,0 +1,92 @@ +# 
Plot correlation heatmap and PCA + +library(ggplot2) +library(reshape2) +library(ComplexHeatmap) +library(RColorBrewer) +library(circlize) + +args = commandArgs(trailingOnly=TRUE) + +in.corr = args[1] +in.pca = args[2] +in.read_depth = args[3] +dupstatus = args[4] +out.corr = args[5] +out.pca = args[6] + +# Read deeptools output +print("Load heatmap") +heatmap = read.table(in.corr,check.names = FALSE) +colnames(heatmap) = gsub("[.]no_dedup","",gsub("[.]dedup","",colnames(heatmap))) +rownames(heatmap) = gsub("[.]no_dedup","",gsub("[.]dedup","",rownames(heatmap))) +print(heatmap) + +print("Load PCA") +pca = read.table(in.pca,header=TRUE,check.names = FALSE) +print(pca) + +# Create metadata table +print("Create metadata table") +metadata = data.frame(Replicate=colnames(heatmap)) +metadata$Sample = gsub("_[1-9]$","",metadata$Replicate) + +# Add read depth +read_depth = read.table(in.read_depth,sep='\t',header=TRUE) +metadata = merge(metadata,read_depth[,c("sample_name","nreads","no_dedup_nreads_genome","dedup_nreads_genome")], + by.x="Replicate",by.y="sample_name") +if(dupstatus == "dedup"){ + metadata$Depth = metadata$dedup_nreads_genome/1000000 +}else{ + metadata$Depth = metadata$no_dedup_nreads_genome/1000000 +} +metadata = metadata[match(rownames(heatmap),metadata$Replicate),] +print(metadata) + +# Create colors +sample_colors = setNames(colorRampPalette(brewer.pal(11, "Spectral"))(length(unique(metadata$Sample))), + unique(metadata$Sample)) +depth_colors = colorRamp2(breaks=c(0,30,60),colors=c("white","grey50","black")) + +colors = list(Sample=sample_colors, + Depth=depth_colors) + +# Heatmap + +## Plot heatmap +print("Print heatmap") +png(out.corr,width=600,height=600) +print(Heatmap(heatmap, + top_annotation = HeatmapAnnotation(df=metadata[,c("Sample","Depth")], + col=colors[c("Sample","Depth")]), + show_row_dend = FALSE, + show_row_names = FALSE, + col = colorRamp2(c(0,1), c("white", "blue")), + heatmap_legend_param = list( + title = "Pearson\ncorrelation" + ), + column_dend_height = unit(0.8,"inches"), + name="Pearson correlation, genome-wide coverage (10kb bins)")) +dev.off() + +# PCA +print("Print PCA") + +## Calculate variance from eigenvalue +eigenvalue = pca[,c("Component","Eigenvalue")] +eigenvalue$Variance = eigenvalue$Eigenvalue/sum(eigenvalue$Eigenvalue)*100 + +pca = melt(pca[,1:dim(pca)[2]-1],id.var="Component",variable.name="Replicate",value.name="Loading") +pca = dcast(pca,Replicate~Component,value.var="Loading") +pca$Replicate = gsub("[.]no_dedup","",gsub("[.]dedup","",pca$Replicate)) +pca = merge(pca,metadata,by="Replicate") + +## Plot PCA +png(out.pca) +print(ggplot(pca,aes(x=`1`,y=`2`,color=Sample)) + geom_point(size=3) + + xlab(paste("PC1 (",round(eigenvalue$Variance[1],1),"%)",sep="")) + + ylab(paste("PC2 (",round(eigenvalue$Variance[2],1),"%)",sep="")) + + scale_color_manual(values=sample_colors) + + ggtitle("PCA, genome-wide coverage (10kb bins)") + + theme_classic() + theme(legend.position="bottom")) +dev.off() diff --git a/workflow/scripts/_plot_feature_enrichment.R b/workflow/scripts/_plot_feature_enrichment.R new file mode 100644 index 0000000..343976f --- /dev/null +++ b/workflow/scripts/_plot_feature_enrichment.R @@ -0,0 +1,47 @@ +# Plot HOMER enrichment + +library(reshape2) +library(ggplot2) +library(plyr) +library(ComplexHeatmap) +library(circlize) + +theme_set(theme_bw()) + +args = commandArgs(trailingOnly=TRUE) + +peaks.dir = args[1] +peak_mode = args[2] +dupstatus = args[3] +fig = args[4] + +# Load HOMER annotation summary +homer.files = 
list.files(path=peaks.dir,pattern="annotation.summary", + recursive=TRUE,full.names = TRUE) +homer = lapply(homer.files,function(x) read.table(x,sep='\t',header=TRUE,check.names = FALSE,nrows=14)) +names(homer) = gsub("gopeaks/|macs2/|seacr/","", + gsub(".annotation.summary","", + gsub("/annotation/homer","", + gsub("^.*results/peaks/","",homer.files)))) +homer = lapply(homer,function(x) x[1:rownames(x[which(x$Annotation == "Annotation"),])-1,]) +homer = ldply(homer,.id="File") + +homer = cbind(homer,colsplit(homer$File,"/",c("Threshold","Sample"))) +homer = cbind(homer,colsplit(homer$Sample,"[.]",c("Comparison","Duplication","Caller"))) +homer = cbind(homer,colsplit(homer$Comparison,"_vs_",c("Replicate","Control"))) +homer = homer[which(homer$Duplication == dupstatus & homer$Caller == peak_mode),] +homer$`LogP enrichment (+values depleted)` = as.numeric(homer$`LogP enrichment (+values depleted)`) + +# Plot enrichment heatmap +homer = dcast(homer,Replicate~Annotation,value.var="LogP enrichment (+values depleted)") +rownames(homer) = homer$Replicate +homer = homer[,2:dim(homer)[2]] + +png(fig,width = 700, height = 400) +hm = Heatmap(homer, + name="LogP enrichment\n(+values depleted)", + col=colorRamp2(breaks=c(-70000,-50,0,50,70000), + colors=c("red","yellow","white","green","blue")), + width=unit(100, "mm")) +draw(hm,heatmap_legend_side = "left") +dev.off() diff --git a/workflow/scripts/_plot_peak_counts.R b/workflow/scripts/_plot_peak_counts.R new file mode 100644 index 0000000..eee5774 --- /dev/null +++ b/workflow/scripts/_plot_peak_counts.R @@ -0,0 +1,64 @@ +# Plot number of peaks called per sample by caller, duplication status, and threshold + +library(reshape2) +library(ggplot2) +library(plyr) +library(openxlsx) + +theme_set(theme_bw()) + +args = commandArgs(trailingOnly=TRUE) + +input = args[1] + +# Load number of peaks +print("Load peaks") +peaks = read.table(input,sep="") +colnames(peaks) = c("Peaks","File") +peaks = peaks[which(peaks$File != "total"),] +peaks$File = gsub(".peaks.bed$","",gsub("/peak_output","",gsub("^.*results/peaks/","",peaks$File))) +peaks = cbind(peaks,colsplit(peaks$File,"/",c("Threshold","Caller","Sample"))) +peaks = cbind(peaks,colsplit(peaks$Sample,"[.]",c("Comparison","Duplication","Mode"))) +peaks$Caller = paste(peaks$Caller,peaks$Mode,sep="_") +peaks = peaks[,c("Comparison","Caller","Duplication","Threshold","Peaks")] +peaks = cbind(peaks,colsplit(peaks$Comparison,"_vs_",c("Replicate","Control"))) +peaks$Sample = gsub("_[1-9]$","",peaks$Replicate) + +setwd(args[2]) + +# Write out table +print("Write table") +peak_output = dcast(peaks,Threshold+Duplication+Comparison~Caller,value.var="Peaks") +peak_output = peak_output[order(peak_output$Threshold,peak_output$Duplication,peak_output$Comparison),] +write.xlsx(peak_output,file="Peak counts.xlsx") + +# For each threshold, plot number of peaks +for (threshold in unique(peaks$Threshold)){ + print(threshold) +peaks_threshold = peaks[which(peaks$Threshold == threshold),] + +# Compare peaks with and without duplication +peaks_threshold_dup = dcast(peaks_threshold,Replicate+Caller~Duplication,value.var="Peaks") +cor=round(as.numeric(unlist(cor.test(peaks_threshold_dup$dedup,peaks_threshold_dup$no_dedup))["estimate.cor"]),2) +peaks.max=max(max(peaks_threshold_dup$dedup),max(peaks_threshold_dup$no_dedup)) + +png(paste("duplication_corr.",threshold,".png",sep=""),width = 350, height = 300) + print(ggplot(peaks_threshold_dup,aes(x=log10(no_dedup),y=log10(dedup))) + geom_point(aes(color=Caller)) + + 
geom_abline(intercept=0,slope=1,linetype="dashed") + + xlab("log10(Peaks), no deduplication") + ylab("log10(Peaks), deduplication") + + xlim(0,log10(peaks.max)+0.5) + ylim(0,log10(peaks.max)+0.5) + + ggtitle(paste("Peaks by read duplication, q-value threshold ",threshold,sep=""), + subtitle=paste("Pearson correlation: ",cor,sep=""))) +dev.off() + +# Plot summary across callers +width=(length(unique(peaks_threshold$Sample))*50)+100 +height=(length(unique(peaks_threshold$Caller))*100)+100 +png(paste("peaks_by_caller.",threshold,".png",sep=""),width = width, height = height) + print(ggplot(peaks_threshold[which(peaks_threshold$Duplication == "dedup"),],aes(x=Sample,y=Peaks)) + + geom_boxplot() + + facet_wrap(~Caller,nrow=length(unique(peaks_threshold$Caller)),scales="free_y") + + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + + ggtitle(paste("Peaks,\nq-value threshold ",threshold,",\ndeduplicated reads",sep=""))) +dev.off() +}
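The new `get_all_peak_files()` input function in `workflow/rules/peakcalls.smk` gathers the peak BED files for every enabled caller. Because each `expand(...)` result is captured with a trailing comma, `files` ends up as a list of lists, which is why the function flattens it with `itertools.chain.from_iterable` before returning. A minimal standalone sketch of that behaviour, with `expand()` stubbed out and hypothetical wildcard values (not part of the patch):

```python
# Sketch only: mimic the flattening done in get_all_peak_files().
import itertools
from os.path import join

def expand_stub(template, **wildcards):
    """Stand-in for Snakemake's expand(): every combination of the wildcard values."""
    keys = list(wildcards)
    combos = itertools.product(*(wildcards[k] for k in keys))
    return [template.format(**dict(zip(keys, combo))) for combo in combos]

RESULTSDIR = "results"                       # hypothetical values for illustration
QTRESHOLDS = ["0.05"]
TREATMENT_LIST_M = ["53_H3K4me3_1_vs_igG"]
DUPSTATUS = ["dedup", "no_dedup"]

files = []
n = expand_stub(join(RESULTSDIR, "peaks", "{qthresholds}", "macs2", "peak_output",
                     "{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),
                qthresholds=QTRESHOLDS,
                treatment_control_list=TREATMENT_LIST_M,
                dupstatus=DUPSTATUS),        # trailing comma -> n is a 1-element tuple
files.extend(n)                              # so this appends the inner list itself

flat = list(itertools.chain.from_iterable(files))
print(flat)
# ['results/peaks/0.05/macs2/peak_output/53_H3K4me3_1_vs_igG.dedup.narrow.peaks.bed',
#  'results/peaks/0.05/macs2/peak_output/53_H3K4me3_1_vs_igG.no_dedup.narrow.peaks.bed']
```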
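Per the changelog entry above, the MACS2 peak `.xls` files now carry the peak caller in their name (e.g. `*.dedup.macs2_narrow.peaks.xls`), and these are what the new `combine_homer` rule consumes. Despite the extension, the file is tab-separated with `#`-prefixed comment lines. A rough pandas equivalent of the join performed by `workflow/scripts/_combine_macs2_homer.R`, with hypothetical file names, shown only to clarify the data flow (the pipeline itself uses the R script):

```python
# Sketch only: join MACS2 peak statistics onto the HOMER annotation table by peak name.
import pandas as pd

# The ".xls" is really a TSV whose header is preceded by '#' comment lines.
peaks = pd.read_csv("53_H3K4me3_1_vs_igG.dedup.macs2_narrow.peaks.xls",
                    sep="\t", comment="#")
peaks = peaks[["name", "fold_enrichment", "-log10(qvalue)"]]

homer = pd.read_csv("53_H3K4me3_1_vs_igG.dedup.macs2_narrow.annotation.txt", sep="\t")
homer = homer.rename(columns={homer.columns[0]: "name"})  # first column holds the peak ID

combined = homer.merge(peaks, on="name", how="outer")     # full join, as in the R script
combined.to_csv("53_H3K4me3_1_vs_igG.dedup.macs2_narrow.annotation_qvalue.tsv",
                sep="\t", index=False)
```

The R script additionally writes the same table as an `.xlsx` workbook via `openxlsx`.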
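Rule `count_peaks` simply runs `wc -l` over all of those peak BED files and writes the result to `results/peaks/all.peaks.txt`; `workflow/scripts/_plot_peak_counts.R` then drops the trailing `total` row and derives threshold, caller, comparison, duplication status, and peak mode from each path. An illustrative re-parse of that format in Python, with made-up counts and a hypothetical sample name:

```python
# Sketch only: parse the `wc -l` table written by rule count_peaks,
# mirroring the fields that _plot_peak_counts.R extracts.
example = """\
     123 results/peaks/0.05/macs2/peak_output/53_H3K4me3_1_vs_igG.dedup.narrow.peaks.bed
      98 results/peaks/0.05/gopeaks/peak_output/53_H3K4me3_1_vs_igG.dedup.narrow.peaks.bed
     221 total
"""

rows = []
for line in example.splitlines():
    count, path = line.split(None, 1)
    if path == "total":              # wc -l appends a grand-total row; the R script drops it
        continue
    qthreshold, caller, _, peakfile = path.split("results/peaks/", 1)[1].split("/")
    comparison, dupstatus, mode, _, _ = peakfile.split(".")
    rows.append({"Comparison": comparison,
                 "Caller": f"{caller}_{mode}",       # e.g. macs2_narrow, gopeaks_narrow
                 "Duplication": dupstatus,
                 "Threshold": qthreshold,
                 "Peaks": int(count)})

for row in rows:
    print(row)
```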