Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: use summits bed for homer input; save temporary files; fix deseq2 bug #108

Merged
merged 5 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 20 additions & 18 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def run_macs2(wildcards):
files.extend(n)
n2=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bigbed.gz"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS),
files.extend(n2)
n3=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.summits.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS),
files.extend(n3)
if "macs2_broad" in PEAKTYPE:
b=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS),
files.extend(b)
Expand Down Expand Up @@ -197,30 +199,30 @@ rule all:
# ALIGN / deeptools_heatmap
unpack(run_deeptools_heatmap),

# # ALIGN / create_library_norm_scales
# unpack(run_library_norm),
# ALIGN / create_library_norm_scales
unpack(run_library_norm),

# # ALIGN / alignstats
# expand(join(RESULTSDIR,"alignment_stats","{replicate}.alignment_stats.yaml"),replicate=REPLICATES),
# ALIGN / alignstats
expand(join(RESULTSDIR,"alignment_stats","{replicate}.alignment_stats.yaml"),replicate=REPLICATES),

# # ALIGN / gather_alignstats
# join(RESULTSDIR,"alignment_stats","alignment_stats.tsv"),
# ALIGN / gather_alignstats
join(RESULTSDIR,"alignment_stats","alignment_stats.tsv"),

# # PEAKCALLS / macs2_narrow, macs2_broad, seacr_stringent, seacr_relaxed, gopeaks_narrow, gopeaks_broad
# unpack(run_macs2),
# unpack(run_seacr),
# unpack(run_gopeaks),
# PEAKCALLS / macs2_narrow, macs2_broad, seacr_stringent, seacr_relaxed, gopeaks_narrow, gopeaks_broad
unpack(run_macs2),
unpack(run_seacr),
unpack(run_gopeaks),

# # QC
# unpack(run_qc),
# QC
unpack(run_qc),

# # DIFF / create_contrast_data_files
# unpack(run_contrasts),
# DIFF / create_contrast_data_files
unpack(run_contrasts),

# # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment
# unpack(get_motifs),
# unpack(get_rose),
# unpack(get_enrichment)
# ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment
unpack(get_motifs),
unpack(get_rose),
unpack(get_enrichment)

# create jobby tables
## command for jobby: bash scripts/_run_jobby_on_snakemake_log logs/snakemake.log _jobby.py | tee logs/snakemake.log.jobby | cut -f2,3,18 > logs/snakemake.log.jobby.short
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/annotations.smk
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
def get_peak_file(wildcards):
# MACS2 OPTIONS
if wildcards.peak_caller_type == "macs2_narrow":
bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"macs2","peak_output", wildcards.treatment_control_list + "." + wildcards.dupstatus + ".narrow.peaks.bed")
bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"macs2","peak_output", wildcards.treatment_control_list + "." + wildcards.dupstatus + ".narrow.summits.bed")
if wildcards.peak_caller_type == "macs2_broad":
bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"macs2","peak_output",wildcards.treatment_control_list + "." + wildcards.dupstatus + ".broad.peaks.bed")

Expand Down
177 changes: 104 additions & 73 deletions workflow/rules/diff.smk
Original file line number Diff line number Diff line change
Expand Up @@ -191,70 +191,89 @@ rule DESeq:
mkdir -p ${{TMPDIR_AUC}}/knit_root_dir
cd $TMPDIR_AUC

Rscript {params.rscript_diff} \\
--rmd {params.rmd} \\
--countsmatrix {input.cm_auc} \\
--sampleinfo {input.si} \\
--dupstatus {params.dupstatus} \\
--condition1 ${{condition1}} \\
--condition2 ${{condition2}} \\
--fdr_cutoff {params.fdr_cutoff} \\
--log2fc_cutoff {params.log2fc_cutoff} \\
--results {output.results_auc} \\
--report {output.html_auc} \\
--elbowlimits {output.elbowlimits_auc} \\
--spiked {params.spiked} \\
--rawcountsprescaled \\
--scalesfbymean \\
--contrast_data {input.contrast_data} \\
--tmpdir $TMPDIR_AUC \\
--species {params.species} \\
--gtf {params.gtf}
# if there is only one sample per group, DESEQ2 is not the tool to use
    # check the sampleinfo file to determine how many samples are in each group. If any sample group has a count of 1
# create empty files with message that the file cannot be created due to conditions provided
search_list=`awk '{{print $2}}' {input.si} | grep -v "group" | sort | uniq -c | sort -nr | grep -o 1`
check="1"
_contains () {{ # Check if space-separated list $1 contains line $2
echo "$1" | tr ' ' '\n' | grep -F -x -q "$2"
}}
if _contains "${{check}}" "${{search_list}}"; then
echo "There is only one sample per group, which is not allowed in DESEQ2"
touch {output.results_auc}
touch {output.html_auc}
touch {output.elbowlimits_auc}
touch {output.results_frag}
touch {output.html_frag}
touch {output.elbowlimits_frag}
touch {output.pdf}
else
Rscript {params.rscript_diff} \\
--rmd {params.rmd} \\
--countsmatrix {input.cm_auc} \\
--sampleinfo {input.si} \\
--dupstatus {params.dupstatus} \\
--condition1 ${{condition1}} \\
--condition2 ${{condition2}} \\
--fdr_cutoff {params.fdr_cutoff} \\
--log2fc_cutoff {params.log2fc_cutoff} \\
--results {output.results_auc} \\
--report {output.html_auc} \\
--elbowlimits {output.elbowlimits_auc} \\
--spiked {params.spiked} \\
--rawcountsprescaled \\
--scalesfbymean \\
--contrast_data {input.contrast_data} \\
--tmpdir $TMPDIR_AUC \\
--species {params.species} \\
--gtf {params.gtf}

# change elbow limits to provided log2fc if limit is set to .na.real
sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_auc}
sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_auc}
# change elbow limits to provided log2fc if limit is set to .na.real
sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_auc}
sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_auc}

## Run FRAG method
mkdir -p ${{TMPDIR_FRAG}}
mkdir -p ${{TMPDIR_FRAG}}/intermediates_dir
mkdir -p ${{TMPDIR_FRAG}}/knit_root_dir
cd $TMPDIR_FRAG
## Run FRAG method
mkdir -p ${{TMPDIR_FRAG}}
mkdir -p ${{TMPDIR_FRAG}}/intermediates_dir
mkdir -p ${{TMPDIR_FRAG}}/knit_root_dir
cd $TMPDIR_FRAG

# Do not use --rawcountsprescaled as these counts are not prescaled!
Rscript {params.rscript_diff} \\
--rmd {params.rmd} \\
--countsmatrix {input.cm_frag} \\
--sampleinfo {input.si} \\
--dupstatus {params.dupstatus} \\
--condition1 ${{condition1}} \\
--condition2 ${{condition2}} \\
--fdr_cutoff {params.fdr_cutoff} \\
--log2fc_cutoff {params.log2fc_cutoff} \\
--results {output.results_frag} \\
--report {output.html_frag} \\
--elbowlimits {output.elbowlimits_frag} \\
--spiked {params.spiked} \\
--scalesfbymean \\
--contrast_data {input.contrast_data} \\
--tmpdir $TMPDIR_FRAG \\
--species {params.species} \\
--gtf {params.gtf}
# Do not use --rawcountsprescaled as these counts are not prescaled!
Rscript {params.rscript_diff} \\
--rmd {params.rmd} \\
--countsmatrix {input.cm_frag} \\
--sampleinfo {input.si} \\
--dupstatus {params.dupstatus} \\
--condition1 ${{condition1}} \\
--condition2 ${{condition2}} \\
--fdr_cutoff {params.fdr_cutoff} \\
--log2fc_cutoff {params.log2fc_cutoff} \\
--results {output.results_frag} \\
--report {output.html_frag} \\
--elbowlimits {output.elbowlimits_frag} \\
--spiked {params.spiked} \\
--scalesfbymean \\
--contrast_data {input.contrast_data} \\
--tmpdir $TMPDIR_FRAG \\
--species {params.species} \\
--gtf {params.gtf}

# change elbow limits to provided log2fc if limit is set to .na.real
sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_frag}
sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_frag}
# change elbow limits to provided log2fc if limit is set to .na.real
sed -i "s/low_limit: .na.real/low_limit: -{params.log2fc_cutoff}/" {output.elbowlimits_frag}
sed -i "s/up_limit: .na.real/up_limit: {params.log2fc_cutoff}/g" {output.elbowlimits_frag}

## Run venns
mkdir -p ${{TMPDIR_VENN}}
mkdir -p ${{TMPDIR_VENN}}/intermediates_dir
mkdir -p ${{TMPDIR_VENN}}/knit_root_dir
cd $TMPDIR_VENN
Rscript {params.rscript_venn} \\
--aucresults {output.results_auc} \\
--fragmentsresults {output.results_frag} \\
--pdf {output.pdf} \\
--title "${{condition1}}_vs_${{condition2}}__{params.dupstatus}__{params.peak_caller_type}"
## Run venns
mkdir -p ${{TMPDIR_VENN}}
mkdir -p ${{TMPDIR_VENN}}/intermediates_dir
mkdir -p ${{TMPDIR_VENN}}/knit_root_dir
cd $TMPDIR_VENN
Rscript {params.rscript_venn} \\
--aucresults {output.results_auc} \\
--fragmentsresults {output.results_frag} \\
--pdf {output.pdf} \\
--title "${{condition1}}_vs_${{condition2}}__{params.dupstatus}__{params.peak_caller_type}"
fi
"""

rule diffbb:
Expand Down Expand Up @@ -287,21 +306,33 @@ rule diffbb:
mkdir -p $TMPDIR
fi

python {params.script} \\
--results {input.results} \\
--fdr_cutoff {params.fdr} \\
--log2FC_cutoff {params.lfc} \\
--elbowyaml {input.elbowlimits} \\
--bed {output.bed}
## add check to ensure that DESEQ2 ran to completion
## mainly used in tinytest scenarios, but also used if
## Nsamples/group is 1
check=`wc -c {input.results} | cut -f1 -d" "`
if [[ $check == 0 ]]; then
echo "There is only 1 sample per group - this is not allowed in DESEQ2 and leads to incomplete DIFFBB results"
touch {output.bed}
touch {output.bigbed}
touch {output.fbed}
touch {output.fbigbed}
else
python {params.script} \\
--results {input.results} \\
--fdr_cutoff {params.fdr} \\
--log2FC_cutoff {params.lfc} \\
--elbowyaml {input.elbowlimits} \\
--bed {output.bed}

bedToBigBed -type=bed9 {output.bed} {input.genome_len} {output.bigbed}
bedToBigBed -type=bed9 {output.bed} {input.genome_len} {output.bigbed}

python {params.script} \\
--results {input.fresults} \\
--fdr_cutoff {params.fdr} \\
--log2FC_cutoff {params.lfc} \\
--elbowyaml {input.felbowlimits} \\
--bed {output.fbed}
python {params.script} \\
--results {input.fresults} \\
--fdr_cutoff {params.fdr} \\
--log2FC_cutoff {params.lfc} \\
--elbowyaml {input.felbowlimits} \\
--bed {output.fbed}

bedToBigBed -type=bed9 {output.fbed} {input.genome_len} {output.fbigbed}
bedToBigBed -type=bed9 {output.fbed} {input.genome_len} {output.fbigbed}
fi
"""
6 changes: 6 additions & 0 deletions workflow/rules/peakcalls.smk
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ rule macs2_narrow:
genome_len = join(BOWTIE2_INDEX,"genome.len"),
output:
peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),
summit_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.summits.bed"),
bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bigbed.gz"),
xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.xls"),
params:
frag_bed_path=join(RESULTSDIR,"fragments"),
tc_file="{treatment_control_list}",
Expand Down Expand Up @@ -86,6 +88,8 @@ rule macs2_narrow:

# mv output and rename for consistency
mv $TMPDIR/${{file_base}}_peaks.narrowPeak {output.peak_file}
mv $TMPDIR/${{file_base}}_summits.bed {output.summit_file}
mv $TMPDIR/${{file_base}}_peaks.xls {output.xls_file}

# create bigbed files, zip
cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/narrow.bed
Expand All @@ -105,6 +109,7 @@ rule macs2_broad:
output:
peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),
bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bigbed.gz"),
xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.xls"),
params:
frag_bed_path=join(RESULTSDIR,"fragments"),
tc_file="{treatment_control_list}",
Expand Down Expand Up @@ -173,6 +178,7 @@ rule macs2_broad:

# mv output and rename for consistency
mv $TMPDIR/${{file_base}}_peaks.broadPeak {output.peak_file}
mv $TMPDIR/${{file_base}}_peaks.xls {output.xls_file}

# create bigbed files
cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/broad.bed
Expand Down