diff --git a/workflow/Snakefile b/workflow/Snakefile index 1669890..011e479 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -44,6 +44,8 @@ def run_macs2(wildcards): files.extend(n) n2=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bigbed.gz"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS), files.extend(n2) + n3=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.summits.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS), + files.extend(n3) if "macs2_broad" in PEAKTYPE: b=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS), files.extend(b) @@ -197,30 +199,30 @@ rule all: # ALIGN / deeptools_heatmap unpack(run_deeptools_heatmap), - # # ALIGN / create_library_norm_scales - # unpack(run_library_norm), + # ALIGN / create_library_norm_scales + unpack(run_library_norm), - # # ALIGN / alignstats - # expand(join(RESULTSDIR,"alignment_stats","{replicate}.alignment_stats.yaml"),replicate=REPLICATES), + # ALIGN / alignstats + expand(join(RESULTSDIR,"alignment_stats","{replicate}.alignment_stats.yaml"),replicate=REPLICATES), - # # ALIGN / gather_alignstats - # join(RESULTSDIR,"alignment_stats","alignment_stats.tsv"), + # ALIGN / gather_alignstats + join(RESULTSDIR,"alignment_stats","alignment_stats.tsv"), - # # PEAKCALLS / macs2_narrow, macs2_broad, seacr_stringent, seacr_relaxed, gopeaks_narrow, gopeaks_broad - # unpack(run_macs2), - # unpack(run_seacr), - # unpack(run_gopeaks), + # PEAKCALLS / macs2_narrow, macs2_broad, seacr_stringent, seacr_relaxed, gopeaks_narrow, gopeaks_broad + unpack(run_macs2), + unpack(run_seacr), + unpack(run_gopeaks), - # # QC - # unpack(run_qc), + # QC + unpack(run_qc), - # # DIFF / create_contrast_data_files - # unpack(run_contrasts), + # DIFF / create_contrast_data_files + unpack(run_contrasts), - # # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment - # unpack(get_motifs), - # unpack(get_rose), - # unpack(get_enrichment) + # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment + unpack(get_motifs), + unpack(get_rose), + unpack(get_enrichment) # create jobby tables ## command for jobby: bash scripts/_run_jobby_on_snakemake_log logs/snakemake.log _jobby.py | tee logs/snakemake.log.jobby | cut -f2,3,18 > logs/snakemake.log.jobby.short diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index dcc4be2..fb6ddf3 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -1,7 +1,7 @@ def get_peak_file(wildcards): # MACS2 OPTIONS if wildcards.peak_caller_type == "macs2_narrow": - bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"macs2","peak_output", wildcards.treatment_control_list + "." + wildcards.dupstatus + ".narrow.peaks.bed") + bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"macs2","peak_output", wildcards.treatment_control_list + "." + wildcards.dupstatus + ".narrow.summits.bed") if wildcards.peak_caller_type == "macs2_broad": bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"macs2","peak_output",wildcards.treatment_control_list + "." + wildcards.dupstatus + ".broad.peaks.bed") diff --git a/workflow/rules/diff.smk b/workflow/rules/diff.smk index e9b6139..c9392a9 100644 --- a/workflow/rules/diff.smk +++ b/workflow/rules/diff.smk @@ -191,70 +191,89 @@ rule DESeq: mkdir -p ${{TMPDIR_AUC}}/knit_root_dir cd $TMPDIR_AUC - Rscript {params.rscript_diff} \\ - --rmd {params.rmd} \\ - --countsmatrix {input.cm_auc} \\ - --sampleinfo {input.si} \\ - --dupstatus {params.dupstatus} \\ - --condition1 ${{condition1}} \\ - --condition2 ${{condition2}} \\ - --fdr_cutoff {params.fdr_cutoff} \\ - --log2fc_cutoff {params.log2fc_cutoff} \\ - --results {output.results_auc} \\ - --report {output.html_auc} \\ - --elbowlimits {output.elbowlimits_auc} \\ - --spiked {params.spiked} \\ - --rawcountsprescaled \\ - --scalesfbymean \\ - --contrast_data {input.contrast_data} \\ - --tmpdir $TMPDIR_AUC \\ - --species {params.species} \\ - --gtf {params.gtf} + # if there is only one sample per group, DESEQ2 is not the tool to use + # check the sampleinfo file to determine how many samples are in each group. If any samplegroupp has a count of 1 + # create empty files with message that the file cannot be created due to conditions provided + search_list=`awk '{{print $2}}' {input.si} | grep -v "group" | sort | uniq -c | sort -nr | grep -o 1` + check="1" + _contains () {{ # Check if space-separated list $1 contains line $2 + echo "$1" | tr ' ' '\n' | grep -F -x -q "$2" + }} + if _contains "${{check}}" "${{search_list}}"; then + echo "There is only one sample per group, which is not allowed in DESEQ2" + touch {output.results_auc} + touch {output.html_auc} + touch {output.elbowlimits_auc} + touch {output.results_frag} + touch {output.html_frag} + touch {output.elbowlimits_frag} + touch {output.pdf} + else + Rscript {params.rscript_diff} \\ + --rmd {params.rmd} \\ + --countsmatrix {input.cm_auc} \\ + --sampleinfo {input.si} \\ + --dupstatus {params.dupstatus} \\ + --condition1 ${{condition1}} \\ + --condition2 ${{condition2}} \\ + --fdr_cutoff {params.fdr_cutoff} \\ + --log2fc_cutoff {params.log2fc_cutoff} \\ + --results {output.results_auc} \\ + --report {output.html_auc} \\ + --elbowlimits {output.elbowlimits_auc} \\ + --spiked {params.spiked} \\ + --rawcountsprescaled \\ + --scalesfbymean \\ + --contrast_data {input.contrast_data} \\ + --tmpdir $TMPDIR_AUC \\ + --species {params.species} \\ + --gtf {params.gtf} - # change elbow limits to provided log2fc if limit is set to .na.real - sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_auc} - sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_auc} + # change elbow limits to provided log2fc if limit is set to .na.real + sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_auc} + sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_auc} - ## Run FRAG method - mkdir -p ${{TMPDIR_FRAG}} - mkdir -p ${{TMPDIR_FRAG}}/intermediates_dir - mkdir -p ${{TMPDIR_FRAG}}/knit_root_dir - cd $TMPDIR_FRAG + ## Run FRAG method + mkdir -p ${{TMPDIR_FRAG}} + mkdir -p ${{TMPDIR_FRAG}}/intermediates_dir + mkdir -p ${{TMPDIR_FRAG}}/knit_root_dir + cd $TMPDIR_FRAG - # Do not use --rawcountsprescaled as these counts are not prescaled! - Rscript {params.rscript_diff} \\ - --rmd {params.rmd} \\ - --countsmatrix {input.cm_frag} \\ - --sampleinfo {input.si} \\ - --dupstatus {params.dupstatus} \\ - --condition1 ${{condition1}} \\ - --condition2 ${{condition2}} \\ - --fdr_cutoff {params.fdr_cutoff} \\ - --log2fc_cutoff {params.log2fc_cutoff} \\ - --results {output.results_frag} \\ - --report {output.html_frag} \\ - --elbowlimits {output.elbowlimits_frag} \\ - --spiked {params.spiked} \\ - --scalesfbymean \\ - --contrast_data {input.contrast_data} \\ - --tmpdir $TMPDIR_FRAG \\ - --species {params.species} \\ - --gtf {params.gtf} + # Do not use --rawcountsprescaled as these counts are not prescaled! + Rscript {params.rscript_diff} \\ + --rmd {params.rmd} \\ + --countsmatrix {input.cm_frag} \\ + --sampleinfo {input.si} \\ + --dupstatus {params.dupstatus} \\ + --condition1 ${{condition1}} \\ + --condition2 ${{condition2}} \\ + --fdr_cutoff {params.fdr_cutoff} \\ + --log2fc_cutoff {params.log2fc_cutoff} \\ + --results {output.results_frag} \\ + --report {output.html_frag} \\ + --elbowlimits {output.elbowlimits_frag} \\ + --spiked {params.spiked} \\ + --scalesfbymean \\ + --contrast_data {input.contrast_data} \\ + --tmpdir $TMPDIR_FRAG \\ + --species {params.species} \\ + --gtf {params.gtf} - # change elbow limits to provided log2fc if limit is set to .na.real - sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_frag} - sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_frag} + # change elbow limits to provided log2fc if limit is set to .na.real + sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_frag} + sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_frag} - ## Run venns - mkdir -p ${{TMPDIR_VENN}} - mkdir -p ${{TMPDIR_VENN}}/intermediates_dir - mkdir -p ${{TMPDIR_VENN}}/knit_root_dir - cd $TMPDIR_VENN - Rscript {params.rscript_venn} \\ - --aucresults {output.results_auc} \\ - --fragmentsresults {output.results_frag} \\ - --pdf {output.pdf} \\ - --title "${{condition1}}_vs_${{condition2}}__{params.dupstatus}__{params.peak_caller_type}" + ## Run venns + mkdir -p ${{TMPDIR_VENN}} + mkdir -p ${{TMPDIR_VENN}}/intermediates_dir + mkdir -p ${{TMPDIR_VENN}}/knit_root_dir + cd $TMPDIR_VENN + Rscript {params.rscript_venn} \\ + --aucresults {output.results_auc} \\ + --fragmentsresults {output.results_frag} \\ + --pdf {output.pdf} \\ + --title "${{condition1}}_vs_${{condition2}}__{params.dupstatus}__{params.peak_caller_type}" + fi """ rule diffbb: @@ -287,21 +306,33 @@ rule diffbb: mkdir -p $TMPDIR fi - python {params.script} \\ - --results {input.results} \\ - --fdr_cutoff {params.fdr} \\ - --log2FC_cutoff {params.lfc} \\ - --elbowyaml {input.elbowlimits} \\ - --bed {output.bed} + ## add check to ensure that DESEQ2 ran to completion + ## mainly used in tinytest scenarios, but also used if + ## Nsamples/group is 1 + check=`wc -c {input.results} | cut -f1 -d" "` + if [[ $check == 0 ]]; then + echo "There is only 1 sample per group - this is not allowed in DESEQ2 and leads to incomplete DIFFBB results" + touch {output.bed} + touch {output.bigbed} + touch {output.fbed} + touch {output.fbigbed} + else + python {params.script} \\ + --results {input.results} \\ + --fdr_cutoff {params.fdr} \\ + --log2FC_cutoff {params.lfc} \\ + --elbowyaml {input.elbowlimits} \\ + --bed {output.bed} - bedToBigBed -type=bed9 {output.bed} {input.genome_len} {output.bigbed} + bedToBigBed -type=bed9 {output.bed} {input.genome_len} {output.bigbed} - python {params.script} \\ - --results {input.fresults} \\ - --fdr_cutoff {params.fdr} \\ - --log2FC_cutoff {params.lfc} \\ - --elbowyaml {input.felbowlimits} \\ - --bed {output.fbed} + python {params.script} \\ + --results {input.fresults} \\ + --fdr_cutoff {params.fdr} \\ + --log2FC_cutoff {params.lfc} \\ + --elbowyaml {input.felbowlimits} \\ + --bed {output.fbed} - bedToBigBed -type=bed9 {output.fbed} {input.genome_len} {output.fbigbed} + bedToBigBed -type=bed9 {output.fbed} {input.genome_len} {output.fbigbed} + fi """ \ No newline at end of file diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk index 271162a..fb98690 100644 --- a/workflow/rules/peakcalls.smk +++ b/workflow/rules/peakcalls.smk @@ -18,7 +18,9 @@ rule macs2_narrow: genome_len = join(BOWTIE2_INDEX,"genome.len"), output: peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"), + summit_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.summits.bed"), bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bigbed.gz"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.xls"), params: frag_bed_path=join(RESULTSDIR,"fragments"), tc_file="{treatment_control_list}", @@ -86,6 +88,8 @@ rule macs2_narrow: # mv output and rename for consistency mv $TMPDIR/${{file_base}}_peaks.narrowPeak {output.peak_file} + mv $TMPDIR/${{file_base}}_summits.bed {output.summit_file} + mv $TMPDIR/${{file_base}}_peaks.xls {output.xls_file} # create bigbed files, zip cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/narrow.bed @@ -105,6 +109,7 @@ rule macs2_broad: output: peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"), bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bigbed.gz"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.xls"), params: frag_bed_path=join(RESULTSDIR,"fragments"), tc_file="{treatment_control_list}", @@ -173,6 +178,7 @@ rule macs2_broad: # mv output and rename for consistency mv $TMPDIR/${{file_base}}_peaks.broadPeak {output.peak_file} + mv $TMPDIR/${{file_base}}_peaks.xls {output.xls_file} # create bigbed files cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/broad.bed