diff --git a/workflow/Snakefile b/workflow/Snakefile index d03f307..9654dd4 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -160,7 +160,7 @@ def get_enrich(wildcards): def get_combined(wildcards): files = [] if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): - combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M) + combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M) files.extend(combined_m) return files diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 14549ee..6b9f040 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -51,7 +51,7 @@ rule findMotif: else annotatePeaks.pl {input.peak_file} {params.genome} -annStats {output.annotation_summary} > {output.annotation} fi - + # hs1 is not part of HOMER's config genome db. Must add it as a separate param if [[ {params.genome} == "hs1" ]]; then findMotifsGenome.pl {input.peak_file} {params.fa} {params.outDir} -size {params.motif_size} -p {threads} -preparsedDir {params.preparsedDir} @@ -67,13 +67,13 @@ def get_annotation_files(wildcards): return expand(join(RESULTSDIR,"peaks", wildcards.qthresholds, wildcards.peak_caller, "annotation","homer", "{treatment_control_list}" + "." + wildcards.dupstatus + "." + wildcards.peak_caller_type + ".annotation.summary"), treatment_control_list = TREATMENT_LIST_M if wildcards.peak_caller.startswith('macs2') else TREATMENT_LIST_SG) - + rule homer_enrich: """ Plot enrichment over genic features """ - input: + input: annotation_summary=get_annotation_files output: enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png") @@ -94,23 +94,24 @@ rule combine_homer: """ input: annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"), - xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls") + peaks_file=join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls") output: - combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx") + combined_tsv=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"), + combined_xlsx=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx") container: config['containers']['carlisle_r'] params: rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R") shell: """ - Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined} + Rscript {params.rscript} {input.peaks_file} {input.annotation} {output.combined} """ rule rose: """ - Developed from code: + Developed from code: https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk - outdated verison of rose: https://github.com/younglab/ROSE - + outdated version of rose: https://github.com/younglab/ROSE + # SEACR bed file output format <1> <2> <3> <4> <5> <6> @@ -124,8 +125,8 @@ rule rose: https://github.com/macs3-project/MACS/blob/master/docs/callpeak.md <1> <2> <3> <4> <5> <6> <7> <8> <9> <10> <-log10pvalue> <-log10qvalue> - ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. - ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100) + ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. + ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100) ### by using the following 1-liner awk: awk -v OFS="\t" '{$5=$5>1000?1000:$5} {print}' NAME_peaks.narrowPeak ## broadPeak does not have 10th column ### Since in the broad peak calling mode, the peak summit won't be called, the values in the 5th, and 7-9th columns are the mean value across all positions in the peak region @@ -177,20 +178,20 @@ rule rose: """ # set tmp set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) TMPDIR="/dev/shm/$dirname" mkdir -p $TMPDIR fi - + # set ROSE specific paths PATHTO=/usr/local/apps/ROSE/1.3.1 PYTHONPATH=/usr/local/apps/ROSE/1.3.1/src/lib export PYTHONPATH export PATH=$PATH:$PATHTO/bin - + # pull treatment and control ids treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'` control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'` @@ -204,11 +205,11 @@ rule rose: samtools view -b ${{treat_bam}} {params.regions} > $TMPDIR/subset.bam samtools index $TMPDIR/subset.bam grep -v "NC_" {input.peak_file} > $TMPDIR/subset.bed - + # prep for ROSE # macs2 output is prepared for ROSE formatting # seacr and gopeaks must be edited to correct for formatting - + ## correct GOPEAKS ### original: ### output: <$sampleid_uniquenumber> <0> <.> @@ -219,7 +220,7 @@ rule rose: nl --number-format=rz --number-width=3 $TMPDIR/subset.bed | awk -v sample_id="${{treatment}}_" \'{{print sample_id$1"\\t0\\t."}}\' > $TMPDIR/col.txt paste -d "\t" $TMPDIR/save.bed $TMPDIR/col.txt > $TMPDIR/subset.bed fi - + ## correct SEACR ### original: ### output: <$sampleid_uniquenumber> <.> @@ -241,11 +242,11 @@ rule rose: if [[ ${{num_of_peaks}} -gt 5 ]]; then echo "## More than 5 usable peaks detected ${{num_of_peaks}} - Running rose" cd {params.workdir} - + # if macs2 control is off, there will be no macs2 control to annotate if [[ {params.control_flag} == "N" ]] & [[ {params.peak_caller_type} == "macs2_narrow" ]] || [[ {params.peak_caller_type} == "macs2_broad" ]]; then echo "#### No control was used" - rose_files="$TMPDIR/subset.bam" + rose_files="$TMPDIR/subset.bam" else echo "#### A control used ${{cntrl_bam}}" rose_files="$TMPDIR/subset.bam ${{cntrl_bam}}" @@ -258,7 +259,7 @@ rule rose: -r ${{rose_files}} \ -t {params.tss_distance} \ -s {params.stitch_distance} \ - -o {params.file_base} + -o {params.file_base} else ROSE_main.py \ -i {output.no_tss_bed} \ @@ -268,16 +269,16 @@ rule rose: -s {params.stitch_distance} \ -o {params.file_base} fi - + # rose to bed file echo "## Convert bed" # developed from https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/scripts/roseTable2Bed.sh grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==0 {{for(i=2; i<=NF; i++){{printf $i; printf (i $TMPDIR/regular bedtools sort -i $TMPDIR/regular > {output.regular} - + grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==1 {{for(i=2; i<=NF; i++){{printf $i; printf (i $TMPDIR/super bedtools sort -i $TMPDIR/super > {output.super} - + # cut rose output files, create summits echo "## Cut and summit" cut -f1-3 {output.regular} > {output.regular_great} @@ -348,7 +349,7 @@ if config["run_contrasts"]: sample_list=`awk '{{print $3}}' {input.contrast_file}` clean_sample_list=`echo $sample_list | sed "s/\s/xxx/g"` - # rum script + # rum script Rscript {params.rscript_wrapper} \\ --rmd {params.rmd} \\ --carlisle_functions {params.carlisle_functions} \\ diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R index aaf1a76..c36804e 100644 --- a/workflow/scripts/_combine_macs2_homer.R +++ b/workflow/scripts/_combine_macs2_homer.R @@ -1,19 +1,27 @@ # Add peak statistics to HOMER annotations file -library(openxlsx) +library(readr) +library(dplyr) +library(tidyr) -args = commandArgs(trailingOnly=TRUE) +args <- commandArgs(trailingOnly = TRUE) -peaks.file = args[1] -homer.file = args[2] -combined.file = args[3] +peaks_file <- args[1] +homer_file <- args[2] +combined_tsv <- args[3] +combined_xlsx <- args[4] # Load peak file -peaks = read.table(peaks.file,skip = 23,header=TRUE,sep='\t',check.names = FALSE) +# the file extension is '.xls' but it's actually just a tsv file +peaks <- read_tsv(peaks_file, comment = "#") %>% + select(c("name", "fold_enrichment", "-log10(qvalue)")) # Load HOMER annotations -homer = read.table(homer.file,header=TRUE,sep='\t',check.names=FALSE,comment.char="",quote="") +homer <- read_tsv(homer_file) %>% + # rename first column to match peaks file + rename(name = 1) # Combine tables and write out -combined = merge(homer,peaks[,c("name","fold_enrichment","-log10(qvalue)")],by.x=1,by.y="name") -write.xlsx(combined,file=combined.file) +combined <- full_join(homer, peaks, by = "name") +write_tsv(combined, file = combined_tsv) +openxlsx::write.xlsx(combined, file = combined_xlsx)